👓 feat: Vision Support for Assistants (#2195)
* refactor(assistants/chat): use promises to speed up initialization, initialize shared variables, and include `attachedFileIds` in StreamRunManager
* chore: additional typedefs
* fix(OpenAIClient): handle edge case where the attachments promise is resolved
* feat: createVisionPrompt
* feat: Vision Support for Assistants
Parent: 1f0fb497f8
Commit: 798e8763d0

16 changed files with 376 additions and 100 deletions
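At a high level, the change lets the chat route start a vision completion for attached images as soon as the request arrives and hand the unresolved promise to the run manager, so the image description is only awaited if the assistant actually invokes the image-vision tool. A minimal sketch of that pattern follows; the helper and parameter names are illustrative stand-ins (for example, `buildVisionMessages` stands in for the commit's createVisionPrompt), not the actual LibreChat call sites:

// Hedged sketch, not the actual LibreChat wiring: start the vision completion
// eagerly and return the unresolved promise. `openai` is an OpenAI v4 client.
function startVisionRequest({ openai, userText, attachedImages, buildVisionMessages }) {
  if (!attachedImages?.length) {
    // Nothing to describe; StreamRunManager tolerates a missing promise.
    return undefined;
  }
  // Not awaited here: StreamRunManager stores the promise (first hunk below) and
  // the tool handler awaits it only if the assistant calls the vision tool.
  return openai.chat.completions.create({
    model: 'gpt-4-vision-preview',
    messages: buildVisionMessages(userText, attachedImages),
  });
}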
@@ -59,6 +59,10 @@ class StreamRunManager {
     this.messages = [];
     /** @type {string} */
     this.text = '';
+    /** @type {Set<string>} */
+    this.attachedFileIds = fields.attachedFileIds;
+    /** @type {undefined | Promise<ChatCompletion>} */
+    this.visionPromise = fields.visionPromise;
 
     /**
      * @type {Object.<AssistantStreamEvents, (event: AssistantStreamEvent) => Promise<void>>}
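For orientation, the caller would pass the two new fields in when constructing the manager. This is a sketch only; everything other than the `attachedFileIds` and `visionPromise` fields shown in the hunk is an assumption:

// Sketch only: the other constructor fields (request, response, handlers, etc.) are elided.
const manager = new StreamRunManager({
  // ...existing fields...
  attachedFileIds: new Set(['file_abc123']), // hypothetical file id
  visionPromise,                             // pending ChatCompletion from the sketch above; may be undefined
});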
@@ -468,21 +468,28 @@ async function checkMessageGaps({ openai, latestMessageId, thread_id, run_id, co
 /**
  * Records token usage for a given completion request.
  *
  * @param {Object} params - The parameters for recording usage.
  * @param {number} params.prompt_tokens - The number of prompt tokens used.
  * @param {number} params.completion_tokens - The number of completion tokens used.
  * @param {string} params.model - The model used by the assistant run.
  * @param {string} params.user - The user's ID.
  * @param {string} params.conversationId - LibreChat conversation ID.
+ * @param {string} [params.context='message'] - The context of the usage. Defaults to 'message'.
  * @return {Promise<TMessage[]>} A promise that resolves to the updated messages
  */
-const recordUsage = async ({ prompt_tokens, completion_tokens, model, user, conversationId }) => {
+const recordUsage = async ({
+  prompt_tokens,
+  completion_tokens,
+  model,
+  user,
+  conversationId,
+  context = 'message',
+}) => {
   await spendTokens(
     {
       user,
       model,
-      context: 'message',
+      context,
       conversationId,
     },
     { promptTokens: prompt_tokens, completionTokens: completion_tokens },
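The new `context` parameter lets callers label usage with something other than the default 'message'; the vision handler added later in this commit relies on the default by omitting it. A hedged example call, with placeholder values throughout:

// Example only; ids, model, and token counts are placeholders.
await recordUsage({
  prompt_tokens: 412,
  completion_tokens: 57,
  model: 'gpt-4-vision-preview',
  user: 'user_123',
  conversationId: 'convo_456',
  context: 'message', // optional; defaults to 'message'
});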
@@ -4,14 +4,17 @@ const { StructuredTool } = require('langchain/tools');
 const { zodToJsonSchema } = require('zod-to-json-schema');
 const { Calculator } = require('langchain/tools/calculator');
 const {
+  Tools,
   ContentTypes,
   imageGenTools,
+  actionDelimiter,
+  ImageVisionTool,
   openapiToFunction,
   validateAndParseOpenAPISpec,
-  actionDelimiter,
 } = require('librechat-data-provider');
 const { loadActionSets, createActionTool, domainParser } = require('./ActionService');
 const { processFileURL } = require('~/server/services/Files/process');
+const { recordUsage } = require('~/server/services/Threads');
 const { loadTools } = require('~/app/clients/tools/util');
 const { redactMessage } = require('~/config/parsers');
 const { sleep } = require('~/server/utils');
@@ -83,6 +86,8 @@ function loadAndFormatTools({ directory, filter = new Set() }) {
     tools.push(formattedTool);
   }
 
+  tools.push(ImageVisionTool);
+
   return tools.reduce((map, tool) => {
     map[tool.function.name] = tool;
     return map;
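ImageVisionTool itself is defined in librechat-data-provider and is not shown in this diff. For orientation, a function-type assistant tool entry of this kind has roughly the following shape; the name, description, and parameters below are assumptions, not the package's actual values:

// Representative shape only; the real definition lives in librechat-data-provider.
const exampleVisionTool = {
  type: 'function',
  function: {
    name: 'image_vision', // assumed name
    description: 'Get a detailed description of the images attached to the current message.',
    parameters: { type: 'object', properties: {}, required: [] },
  },
};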
@@ -100,8 +105,8 @@ function loadAndFormatTools({ directory, filter = new Set() }) {
  */
 function formatToOpenAIAssistantTool(tool) {
   return {
-    type: 'function',
-    function: {
+    type: Tools.function,
+    [Tools.function]: {
       name: tool.name,
       description: tool.description,
       parameters: zodToJsonSchema(tool.schema),
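To see what formatToOpenAIAssistantTool produces, here is an illustrative zod-backed tool and its rough output, assuming Tools.function resolves to the string 'function'; the tool itself is hypothetical:

// Illustrative input; only zod and zod-to-json-schema are required.
const { z } = require('zod');

const weatherTool = {
  name: 'get_weather',
  description: 'Look up the current weather for a city',
  schema: z.object({ city: z.string() }),
};

// formatToOpenAIAssistantTool(weatherTool) yields roughly:
// {
//   type: 'function',
//   function: {
//     name: 'get_weather',
//     description: 'Look up the current weather for a city',
//     parameters: zodToJsonSchema(weatherTool.schema), // a plain JSON Schema object
//   },
// }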
@@ -109,13 +114,42 @@ function formatToOpenAIAssistantTool(tool) {
   };
 }
 
+/**
+ * Processes a required image-vision action by awaiting the client's pending vision request and returning its result as a tool output.
+ * @param {OpenAIClient} client - OpenAI or StreamRunManager Client.
+ * @param {RequiredAction} currentAction - The current required action.
+ * @returns {Promise<ToolOutput>} The output of the vision tool call.
+ */
+const processVisionRequest = async (client, currentAction) => {
+  if (!client.visionPromise) {
+    return {
+      tool_call_id: currentAction.toolCallId,
+      output: 'No image details found.',
+    };
+  }
+
+  /** @type {ChatCompletion | undefined} */
+  const completion = await client.visionPromise;
+  if (completion?.usage) {
+    recordUsage({
+      user: client.req.user.id,
+      model: client.req.body.model,
+      conversationId: (client.responseMessage ?? client.finalMessage).conversationId,
+      ...completion.usage,
+    });
+  }
+  const output = completion?.choices?.[0]?.message?.content ?? 'No image details found.';
+  return {
+    tool_call_id: currentAction.toolCallId,
+    output,
+  };
+};
+
 /**
  * Processes the required actions returned from a run.
  *
  * @param {OpenAIClient} client - OpenAI or StreamRunManager Client.
  * @param {RequiredAction[]} requiredActions - The required actions to submit outputs for.
  * @returns {Promise<ToolOutputs>} The outputs of the tools.
  *
  */
 async function processRequiredActions(client, requiredActions) {
   logger.debug(
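processVisionRequest resolves to a { tool_call_id, output } pair, which the run manager later submits back alongside the other tool outputs. The submission call itself is outside this diff; with the OpenAI Node SDK it looks roughly like this, where the ids and output text are placeholders:

// Rough shape of the eventual submission; not shown in this commit.
await openai.beta.threads.runs.submitToolOutputs('thread_abc', 'run_def', {
  tool_outputs: [
    { tool_call_id: 'call_abc123', output: 'The image shows a red bicycle leaning against a brick wall.' },
  ],
});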
@@ -152,6 +186,10 @@ async function processRequiredActions(client, requiredActions) {
 
   for (let i = 0; i < requiredActions.length; i++) {
     const currentAction = requiredActions[i];
+    if (currentAction.tool === ImageVisionTool.function.name) {
+      promises.push(processVisionRequest(client, currentAction));
+      continue;
+    }
     let tool = ToolMap[currentAction.tool] ?? ActionToolMap[currentAction.tool];
 
     const handleToolOutput = async (output) => {