👓 feat: Vision Support for Assistants (#2195)

* refactor(assistants/chat): use promises to speed up initialization, initialize shared variables, and pass `attachedFileIds` to StreamRunManager

* chore: additional typedefs

* fix(OpenAIClient): handle the edge case where `attachments` is already resolved rather than a pending promise

* feat: createVisionPrompt

* feat: Vision Support for Assistants
Danny Avila authored on 2024-03-24 23:43:00 -04:00, committed by GitHub
parent 1f0fb497f8
commit 798e8763d0
16 changed files with 376 additions and 100 deletions
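For orientation before the per-file diffs, a simplified sketch of the initialization flow this commit introduces. The injected callbacks stand in for the real `addVisionPrompt`, `getRequestFileIds`, `checkBalanceBeforeRun`, and `initThread` defined in the route diff below; this is an illustration, not the committed code.

// Simplified sketch: setup work that previously ran sequentially is now awaited together.
async function initializeRun({ addVisionPrompt, getRequestFileIds, checkBalanceBeforeRun, initThread }) {
  const initializeThread = async () => {
    // addVisionPrompt starts the gpt-4-vision-preview completion and keeps it as a pending
    // `visionPromise`; getRequestFileIds collects attached/thread file ids. They run side by side.
    await Promise.all([addVisionPrompt(), getRequestFileIds()]);
    return initThread();
  };

  // Thread initialization and the balance check also run concurrently.
  await Promise.all([initializeThread(), checkBalanceBeforeRun()]);

  // The pending visionPromise is handed to the run (openai.visionPromise / StreamRunManager)
  // and is only awaited when the run invokes the `image_vision` tool.
}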

@@ -92,7 +92,11 @@ class OpenAIClient extends BaseClient {
     }

     this.defaultVisionModel = this.options.visionModel ?? 'gpt-4-vision-preview';
-    this.options.attachments?.then((attachments) => this.checkVisionRequest(attachments));
+    if (typeof this.options.attachments?.then === 'function') {
+      this.options.attachments.then((attachments) => this.checkVisionRequest(attachments));
+    } else {
+      this.checkVisionRequest(this.options.attachments);
+    }

     const { OPENROUTER_API_KEY, OPENAI_FORCE_PROMPT } = process.env ?? {};
     if (OPENROUTER_API_KEY && !this.azure) {
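The duck-typed `then` check above addresses the edge case called out in the commit message: `attachments` may arrive as a pending promise or as an already-resolved value. A standalone sketch of the same pattern, with illustrative names that are not part of the commit:

// Handle `attachments` whether it is a promise or an already-resolved array.
function resolveAttachments(attachments, onReady) {
  if (typeof attachments?.then === 'function') {
    // A thenable: defer the vision check until it settles.
    attachments.then((resolved) => onReady(resolved));
  } else {
    // Already-resolved value (or undefined): check immediately.
    onReady(attachments);
  }
}

// Both calls end up invoking the callback with the same array.
resolveAttachments(Promise.resolve([{ file_id: 'img-1' }]), (files) => console.log(files));
resolveAttachments([{ file_id: 'img-1' }], (files) => console.log(files));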

@@ -0,0 +1,34 @@
+/**
+ * Generates a prompt instructing the user to describe an image in detail, tailored to different types of visual content.
+ * @param {boolean} pluralized - Whether to pluralize the prompt for multiple images.
+ * @returns {string} - The generated vision prompt.
+ */
+const createVisionPrompt = (pluralized = false) => {
+  return `Please describe the image${
+    pluralized ? 's' : ''
+  } in detail, covering relevant aspects such as:
+
+For photographs, illustrations, or artwork:
+- The main subject(s) and their appearance, positioning, and actions
+- The setting, background, and any notable objects or elements
+- Colors, lighting, and overall mood or atmosphere
+- Any interesting details, textures, or patterns
+- The style, technique, or medium used (if discernible)
+
+For screenshots or images containing text:
+- The content and purpose of the text
+- The layout, formatting, and organization of the information
+- Any notable visual elements, such as logos, icons, or graphics
+- The overall context or message conveyed by the screenshot
+
+For graphs, charts, or data visualizations:
+- The type of graph or chart (e.g., bar graph, line chart, pie chart)
+- The variables being compared or analyzed
+- Any trends, patterns, or outliers in the data
+- The axis labels, scales, and units of measurement
+- The title, legend, and any additional context provided
+
+Be as specific and descriptive as possible while maintaining clarity and concision.`;
+};
+
+module.exports = createVisionPrompt;
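A minimal usage sketch for the helper above, mirroring how the assistants route later pairs it with a gpt-4-vision-preview completion. The image URLs and wiring here are illustrative; the committed integration is in the route diff further down.

const OpenAI = require('openai');
const createVisionPrompt = require('./createVisionPrompt');

const openai = new OpenAI({ apiKey: process.env.OPENAI_API_KEY });

// Ask a vision-capable model to pre-describe uploaded images so the description
// can later be surfaced through the `image_vision` tool.
async function describeImages(imageUrls) {
  const plural = imageUrls.length > 1;
  const content = [
    { type: 'text', text: createVisionPrompt(plural) },
    ...imageUrls.map((url) => ({ type: 'image_url', image_url: { url } })),
  ];
  const completion = await openai.chat.completions.create({
    model: 'gpt-4-vision-preview',
    messages: [{ role: 'user', content }],
    max_tokens: 4000,
  });
  return completion.choices[0]?.message?.content;
}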

@@ -4,6 +4,7 @@ const handleInputs = require('./handleInputs');
 const instructions = require('./instructions');
 const titlePrompts = require('./titlePrompts');
 const truncateText = require('./truncateText');
+const createVisionPrompt = require('./createVisionPrompt');
 const createContextHandlers = require('./createContextHandlers');

 module.exports = {
@@ -13,5 +14,6 @@ module.exports = {
   ...instructions,
   ...titlePrompts,
   truncateText,
+  createVisionPrompt,
   createContextHandlers,
 };

@@ -4,9 +4,11 @@ const {
   Constants,
   RunStatus,
   CacheKeys,
+  FileSources,
   ContentTypes,
   EModelEndpoint,
   ViolationTypes,
+  ImageVisionTool,
   AssistantStreamEvents,
 } = require('librechat-data-provider');
 const {
@@ -17,9 +19,10 @@ const {
   addThreadMetadata,
   saveAssistantMessage,
 } = require('~/server/services/Threads');
+const { sendResponse, sendMessage, sleep, isEnabled, countTokens } = require('~/server/utils');
 const { runAssistant, createOnTextProgress } = require('~/server/services/AssistantService');
 const { addTitle, initializeClient } = require('~/server/services/Endpoints/assistants');
-const { sendResponse, sendMessage, sleep, isEnabled, countTokens } = require('~/server/utils');
+const { formatMessage, createVisionPrompt } = require('~/app/clients/prompts');
 const { createRun, StreamRunManager } = require('~/server/services/Runs');
 const { getTransactions } = require('~/models/Transaction');
 const checkBalance = require('~/models/checkBalance');
@@ -100,6 +103,16 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
   let parentMessageId = _parentId;
   /** @type {TMessage[]} */
   let previousMessages = [];
+  /** @type {import('librechat-data-provider').TConversation | null} */
+  let conversation = null;
+  /** @type {string[]} */
+  let file_ids = [];
+  /** @type {Set<string>} */
+  let attachedFileIds = new Set();
+  /** @type {TMessage | null} */
+  let requestMessage = null;
+  /** @type {undefined | Promise<ChatCompletion>} */
+  let visionPromise;

   const userMessageId = v4();
   const responseMessageId = v4();
@@ -258,7 +271,10 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
       throw new Error('Missing assistant_id');
     }

-    if (isEnabled(process.env.CHECK_BALANCE)) {
+    const checkBalanceBeforeRun = async () => {
+      if (!isEnabled(process.env.CHECK_BALANCE)) {
+        return;
+      }
       const transactions =
         (await getTransactions({
           user: req.user.id,
@@ -288,7 +304,7 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
           amount: promptTokens,
         },
       });
-    }
+    };

     /** @type {{ openai: OpenAIClient }} */
     const { openai: _openai, client } = await initializeClient({
@@ -300,15 +316,11 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
     openai = _openai;

-    // if (thread_id) {
-    //   previousMessages = await checkMessageGaps({ openai, thread_id, conversationId });
-    // }
-
     if (previousMessages.length) {
       parentMessageId = previousMessages[previousMessages.length - 1].messageId;
     }

-    const userMessage = {
+    let userMessage = {
       role: 'user',
       content: text,
       metadata: {
@@ -316,75 +328,7 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
       },
     };

-    let thread_file_ids = [];
-    if (convoId) {
-      const convo = await getConvo(req.user.id, convoId);
-      if (convo && convo.file_ids) {
-        thread_file_ids = convo.file_ids;
-      }
-    }
-
-    const file_ids = files.map(({ file_id }) => file_id);
-    if (file_ids.length || thread_file_ids.length) {
-      userMessage.file_ids = file_ids;
-      openai.attachedFileIds = new Set([...file_ids, ...thread_file_ids]);
-    }
-
-    // TODO: may allow multiple messages to be created beforehand in a future update
-    const initThreadBody = {
-      messages: [userMessage],
-      metadata: {
-        user: req.user.id,
-        conversationId,
-      },
-    };
-
-    const result = await initThread({ openai, body: initThreadBody, thread_id });
-    thread_id = result.thread_id;
-
-    createOnTextProgress({
-      openai,
-      conversationId,
-      userMessageId,
-      messageId: responseMessageId,
-      thread_id,
-    });
-
-    const requestMessage = {
-      user: req.user.id,
-      text,
-      messageId: userMessageId,
-      parentMessageId,
-      // TODO: make sure client sends correct format for `files`, use zod
-      files,
-      file_ids,
-      conversationId,
-      isCreatedByUser: true,
-      assistant_id,
-      thread_id,
-      model: assistant_id,
-    };
-
-    previousMessages.push(requestMessage);
-
-    await saveUserMessage({ ...requestMessage, model });
-
-    const conversation = {
-      conversationId,
-      // TODO: title feature
-      title: 'New Chat',
-      endpoint: EModelEndpoint.assistants,
-      promptPrefix: promptPrefix,
-      instructions: instructions,
-      assistant_id,
-      // model,
-    };
-
-    if (file_ids.length) {
-      conversation.file_ids = file_ids;
-    }
-
-    /** @type {CreateRunBody} */
+    /** @type {CreateRunBody | undefined} */
     const body = {
       assistant_id,
       model,
@@ -398,6 +342,143 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
       body.instructions = instructions;
     }

+    const getRequestFileIds = async () => {
+      let thread_file_ids = [];
+      if (convoId) {
+        const convo = await getConvo(req.user.id, convoId);
+        if (convo && convo.file_ids) {
+          thread_file_ids = convo.file_ids;
+        }
+      }
+
+      file_ids = files.map(({ file_id }) => file_id);
+      if (file_ids.length || thread_file_ids.length) {
+        userMessage.file_ids = file_ids;
+        attachedFileIds = new Set([...file_ids, ...thread_file_ids]);
+      }
+    };
+
+    const addVisionPrompt = async () => {
+      if (!req.body.endpointOption.attachments) {
+        return;
+      }
+
+      const assistant = await openai.beta.assistants.retrieve(assistant_id);
+      const visionToolIndex = assistant.tools.findIndex(
+        (tool) => tool.function.name === ImageVisionTool.function.name,
+      );
+
+      if (visionToolIndex === -1) {
+        return;
+      }
+
+      const attachments = await req.body.endpointOption.attachments;
+      let visionMessage = {
+        role: 'user',
+        content: '',
+      };
+
+      const files = await client.addImageURLs(visionMessage, attachments);
+      if (!visionMessage.image_urls?.length) {
+        return;
+      }
+
+      const imageCount = visionMessage.image_urls.length;
+      const plural = imageCount > 1;
+      visionMessage.content = createVisionPrompt(plural);
+      visionMessage = formatMessage({ message: visionMessage, endpoint: EModelEndpoint.openAI });
+
+      visionPromise = openai.chat.completions.create({
+        model: 'gpt-4-vision-preview',
+        messages: [visionMessage],
+        max_tokens: 4000,
+      });
+
+      const pluralized = plural ? 's' : '';
+      body.additional_instructions = `${
+        body.additional_instructions ? `${body.additional_instructions}\n` : ''
+      }The user has uploaded ${imageCount} image${pluralized}.
+Use the \`${ImageVisionTool.function.name}\` tool to retrieve ${
+  plural ? '' : 'a '
+}detailed text description${pluralized} for ${plural ? 'each' : 'the'} image${pluralized}.`;
+
+      return files;
+    };
+
+    const initializeThread = async () => {
+      /** @type {[ undefined | MongoFile[]]}*/
+      const [processedFiles] = await Promise.all([addVisionPrompt(), getRequestFileIds()]);
+
+      // TODO: may allow multiple messages to be created beforehand in a future update
+      const initThreadBody = {
+        messages: [userMessage],
+        metadata: {
+          user: req.user.id,
+          conversationId,
+        },
+      };
+
+      if (processedFiles) {
+        for (const file of processedFiles) {
+          if (file.source !== FileSources.openai) {
+            attachedFileIds.delete(file.file_id);
+            const index = file_ids.indexOf(file.file_id);
+            if (index > -1) {
+              file_ids.splice(index, 1);
+            }
+          }
+        }
+
+        userMessage.file_ids = file_ids;
+      }
+
+      const result = await initThread({ openai, body: initThreadBody, thread_id });
+      thread_id = result.thread_id;
+
+      createOnTextProgress({
+        openai,
+        conversationId,
+        userMessageId,
+        messageId: responseMessageId,
+        thread_id,
+      });
+
+      requestMessage = {
+        user: req.user.id,
+        text,
+        messageId: userMessageId,
+        parentMessageId,
+        // TODO: make sure client sends correct format for `files`, use zod
+        files,
+        file_ids,
+        conversationId,
+        isCreatedByUser: true,
+        assistant_id,
+        thread_id,
+        model: assistant_id,
+      };
+
+      previousMessages.push(requestMessage);
+
+      /* asynchronous */
+      saveUserMessage({ ...requestMessage, model });
+
+      conversation = {
+        conversationId,
+        title: 'New Chat',
+        endpoint: EModelEndpoint.assistants,
+        promptPrefix: promptPrefix,
+        instructions: instructions,
+        assistant_id,
+        // model,
+      };
+
+      if (file_ids.length) {
+        conversation.file_ids = file_ids;
+      }
+    };
+
+    const promises = [initializeThread(), checkBalanceBeforeRun()];
+    await Promise.all(promises);
+
     const sendInitialResponse = () => {
       sendMessage(res, {
         sync: true,
@@ -421,6 +502,8 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
     const processRun = async (retry = false) => {
       if (req.app.locals[EModelEndpoint.azureOpenAI]?.assistants) {
+        openai.attachedFileIds = attachedFileIds;
+        openai.visionPromise = visionPromise;
         if (retry) {
           response = await runAssistant({
             openai,
@@ -463,9 +546,11 @@ router.post('/', validateModel, buildEndpointOption, setHeaders, async (req, res
         req,
         res,
         openai,
-        thread_id,
-        responseMessage: openai.responseMessage,
         handlers,
+        thread_id,
+        visionPromise,
+        attachedFileIds,
+        responseMessage: openai.responseMessage,
         // streamOptions: {
         // },

@@ -59,6 +59,10 @@ class StreamRunManager {
     this.messages = [];
     /** @type {string} */
     this.text = '';
+    /** @type {Set<string>} */
+    this.attachedFileIds = fields.attachedFileIds;
+    /** @type {undefined | Promise<ChatCompletion>} */
+    this.visionPromise = fields.visionPromise;

     /**
      * @type {Object.<AssistantStreamEvents, (event: AssistantStreamEvent) => Promise<void>>}

@@ -468,21 +468,28 @@ async function checkMessageGaps({ openai, latestMessageId, thread_id, run_id, co
 /**
  * Records token usage for a given completion request.
  *
  * @param {Object} params - The parameters for initializing a thread.
  * @param {number} params.prompt_tokens - The number of prompt tokens used.
  * @param {number} params.completion_tokens - The number of completion tokens used.
  * @param {string} params.model - The model used by the assistant run.
  * @param {string} params.user - The user's ID.
  * @param {string} params.conversationId - LibreChat conversation ID.
+ * @param {string} [params.context='message'] - The context of the usage. Defaults to 'message'.
  * @return {Promise<TMessage[]>} A promise that resolves to the updated messages
  */
-const recordUsage = async ({ prompt_tokens, completion_tokens, model, user, conversationId }) => {
+const recordUsage = async ({
+  prompt_tokens,
+  completion_tokens,
+  model,
+  user,
+  conversationId,
+  context = 'message',
+}) => {
   await spendTokens(
     {
       user,
       model,
-      context: 'message',
+      context,
       conversationId,
     },
     { promptTokens: prompt_tokens, completionTokens: completion_tokens },

@@ -4,14 +4,17 @@ const { StructuredTool } = require('langchain/tools');
 const { zodToJsonSchema } = require('zod-to-json-schema');
 const { Calculator } = require('langchain/tools/calculator');
 const {
+  Tools,
   ContentTypes,
   imageGenTools,
+  actionDelimiter,
+  ImageVisionTool,
   openapiToFunction,
   validateAndParseOpenAPISpec,
-  actionDelimiter,
 } = require('librechat-data-provider');
 const { loadActionSets, createActionTool, domainParser } = require('./ActionService');
 const { processFileURL } = require('~/server/services/Files/process');
+const { recordUsage } = require('~/server/services/Threads');
 const { loadTools } = require('~/app/clients/tools/util');
 const { redactMessage } = require('~/config/parsers');
 const { sleep } = require('~/server/utils');
@@ -83,6 +86,8 @@ function loadAndFormatTools({ directory, filter = new Set() }) {
     tools.push(formattedTool);
   }

+  tools.push(ImageVisionTool);
+
   return tools.reduce((map, tool) => {
     map[tool.function.name] = tool;
     return map;
@@ -100,8 +105,8 @@ function loadAndFormatTools({ directory, filter = new Set() }) {
  */
 function formatToOpenAIAssistantTool(tool) {
   return {
-    type: 'function',
-    function: {
+    type: Tools.function,
+    [Tools.function]: {
       name: tool.name,
       description: tool.description,
       parameters: zodToJsonSchema(tool.schema),
@@ -109,13 +114,42 @@ function formatToOpenAIAssistantTool(tool) {
   };
 }

+/**
+ * Processes the required actions by calling the appropriate tools and returning the outputs.
+ * @param {OpenAIClient} client - OpenAI or StreamRunManager Client.
+ * @param {RequiredAction} requiredActions - The current required action.
+ * @returns {Promise<ToolOutput>} The outputs of the tools.
+ */
+const processVisionRequest = async (client, currentAction) => {
+  if (!client.visionPromise) {
+    return {
+      tool_call_id: currentAction.toolCallId,
+      output: 'No image details found.',
+    };
+  }
+
+  /** @type {ChatCompletion | undefined} */
+  const completion = await client.visionPromise;
+  if (completion.usage) {
+    recordUsage({
+      user: client.req.user.id,
+      model: client.req.body.model,
+      conversationId: (client.responseMessage ?? client.finalMessage).conversationId,
+      ...completion.usage,
+    });
+  }
+
+  const output = completion?.choices?.[0]?.message?.content ?? 'No image details found.';
+  return {
+    tool_call_id: currentAction.toolCallId,
+    output,
+  };
+};
+
 /**
  * Processes return required actions from run.
  *
  * @param {OpenAIClient} client - OpenAI or StreamRunManager Client.
  * @param {RequiredAction[]} requiredActions - The required actions to submit outputs for.
  * @returns {Promise<ToolOutputs>} The outputs of the tools.
  *
  */
 async function processRequiredActions(client, requiredActions) {
   logger.debug(
@@ -152,6 +186,10 @@ async function processRequiredActions(client, requiredActions) {
   for (let i = 0; i < requiredActions.length; i++) {
     const currentAction = requiredActions[i];
+    if (currentAction.tool === ImageVisionTool.function.name) {
+      promises.push(processVisionRequest(client, currentAction));
+      continue;
+    }
+
     let tool = ToolMap[currentAction.tool] ?? ActionToolMap[currentAction.tool];

     const handleToolOutput = async (output) => {

@@ -172,6 +172,7 @@ function generateConfig(key, baseURL, assistants = false) {
   config.retrievalModels = defaultRetrievalModels;
   config.capabilities = [
     Capabilities.code_interpreter,
+    Capabilities.image_vision,
     Capabilities.retrieval,
     Capabilities.actions,
     Capabilities.tools,

@@ -32,6 +32,18 @@
  * @memberof typedefs
  */

+/**
+ * @exports ChatCompletionContentPartImage
+ * @typedef {import('openai').OpenAI.ChatCompletionContentPartImage} ChatCompletionContentPartImage
+ * @memberof typedefs
+ */
+
+/**
+ * @exports ChatCompletion
+ * @typedef {import('openai').OpenAI.ChatCompletion} ChatCompletion
+ * @memberof typedefs
+ */
+
 /**
  * @exports OpenAIRequestOptions
  * @typedef {import('openai').OpenAI.RequestOptions} OpenAIRequestOptions

@@ -1,3 +1,4 @@
+import { Capabilities } from 'librechat-data-provider';
 import type { Assistant } from 'librechat-data-provider';
 import type { Option, ExtendedFile } from './types';

@@ -6,8 +7,9 @@ export type TAssistantOption =
   | (Option & Assistant & { files?: Array<[string, ExtendedFile]> });

 export type Actions = {
-  code_interpreter: boolean;
-  retrieval: boolean;
+  [Capabilities.code_interpreter]: boolean;
+  [Capabilities.image_vision]: boolean;
+  [Capabilities.retrieval]: boolean;
 };

 export type AssistantForm = {

@@ -1,4 +1,9 @@
-import { ToolCallTypes, ContentTypes, imageGenTools } from 'librechat-data-provider';
+import {
+  ToolCallTypes,
+  ContentTypes,
+  imageGenTools,
+  isImageVisionTool,
+} from 'librechat-data-provider';
 import type { TMessageContentParts, TMessage } from 'librechat-data-provider';
 import type { TDisplayProps } from '~/common';
 import { ErrorMessage } from './MessageContent';
@@ -96,6 +101,25 @@ export default function Part({
     part[ContentTypes.TOOL_CALL].type === ToolCallTypes.FUNCTION
   ) {
     const toolCall = part[ContentTypes.TOOL_CALL];
+    if (isImageVisionTool(toolCall)) {
+      if (isSubmitting && showCursor) {
+        return (
+          <Container>
+            <div className="markdown prose dark:prose-invert light dark:text-gray-70 my-1 w-full break-words">
+              <DisplayMessage
+                text={''}
+                isCreatedByUser={message.isCreatedByUser}
+                message={message}
+                showCursor={showCursor}
+              />
+            </div>
+          </Container>
+        );
+      }
+      return null;
+    }
+
     return (
       <ToolCall
         initialProgress={toolCall.progress ?? 0.1}

@@ -8,6 +8,7 @@ import {
   Capabilities,
   EModelEndpoint,
   actionDelimiter,
+  ImageVisionTool,
   defaultAssistantFormValues,
 } from 'librechat-data-provider';
 import type { AssistantForm, AssistantPanelProps } from '~/common';
@@ -82,6 +83,10 @@ export default function AssistantPanel({
     () => assistants?.capabilities?.includes(Capabilities.code_interpreter),
     [assistants],
   );
+  const imageVisionEnabled = useMemo(
+    () => assistants?.capabilities?.includes(Capabilities.image_vision),
+    [assistants],
+  );

   useEffect(() => {
     if (model && !retrievalModels.has(model)) {
@@ -157,6 +162,9 @@ export default function AssistantPanel({
     if (data.retrieval) {
       tools.push({ type: Tools.retrieval });
     }
+    if (data.image_vision) {
+      tools.push(ImageVisionTool);
+    }

     const {
       name,
@@ -374,6 +382,37 @@ export default function AssistantPanel({
              </label>
            </div>
          )}
+          {imageVisionEnabled && (
+            <div className="flex items-center">
+              <Controller
+                name={Capabilities.image_vision}
+                control={control}
+                render={({ field }) => (
+                  <Checkbox
+                    {...field}
+                    checked={field.value}
+                    onCheckedChange={field.onChange}
+                    className="relative float-left mr-2 inline-flex h-4 w-4 cursor-pointer"
+                    value={field?.value?.toString()}
+                  />
+                )}
+              />
+              <label
+                className="form-check-label text-token-text-primary w-full cursor-pointer"
+                htmlFor={Capabilities.image_vision}
+                onClick={() =>
+                  setValue(Capabilities.image_vision, !getValues(Capabilities.image_vision), {
+                    shouldDirty: true,
+                  })
+                }
+              >
+                <div className="flex items-center">
+                  {localize('com_assistants_image_vision')}
+                  <QuestionMark />
+                </div>
+              </label>
+            </div>
+          )}
          {retrievalEnabled && (
            <div className="flex items-center">
              <Controller
@@ -417,9 +456,9 @@ export default function AssistantPanel({
            ${actionsEnabled ? localize('com_assistants_actions') : ''}`}
          </label>
          <div className="space-y-1">
-            {functions.map((func) => (
+            {functions.map((func, i) => (
              <AssistantTool
-                key={func}
+                key={`${func}-${i}-${assistant_id}`}
                tool={func}
                allTools={allTools}
                assistant_id={assistant_id}

@@ -3,6 +3,8 @@ import { useCallback, useEffect, useRef } from 'react';
 import {
   defaultAssistantFormValues,
   defaultOrderQuery,
+  isImageVisionTool,
+  Capabilities,
   FileSources,
 } from 'librechat-data-provider';
 import type { UseFormReset } from 'react-hook-form';
@@ -13,7 +15,7 @@ import SelectDropDown from '~/components/ui/SelectDropDown';
 import { useListAssistantsQuery } from '~/data-provider';
 import { useFileMapContext } from '~/Providers';
 import { useLocalize } from '~/hooks';
-import { cn } from '~/utils/';
+import { cn } from '~/utils';

 const keys = new Set(['name', 'id', 'description', 'instructions', 'model']);
@@ -87,20 +89,21 @@ export default function AssistantSelect({
      };

      const actions: Actions = {
-        code_interpreter: false,
-        retrieval: false,
+        [Capabilities.code_interpreter]: false,
+        [Capabilities.image_vision]: false,
+        [Capabilities.retrieval]: false,
      };

      assistant?.tools
-        ?.filter((tool) => tool.type !== 'function')
-        ?.map((tool) => tool.type)
+        ?.filter((tool) => tool.type !== 'function' || isImageVisionTool(tool))
+        ?.map((tool) => tool?.function?.name || tool.type)
        .forEach((tool) => {
          actions[tool] = true;
        });

      const functions =
        assistant?.tools
-          ?.filter((tool) => tool.type === 'function')
+          ?.filter((tool) => tool.type === 'function' && !isImageVisionTool(tool))
          ?.map((tool) => tool.function?.name ?? '') ?? [];

      const formValues: Partial<AssistantForm & Actions> = {

@@ -16,6 +16,7 @@ export default {
     'If you upload files under Knowledge, conversations with your Assistant may include file contents.',
   com_assistants_knowledge_disabled:
     'Assistant must be created, and Code Interpreter or Retrieval must be enabled and saved before uploading files as Knowledge.',
+  com_assistants_image_vision: 'Image Vision',
   com_assistants_code_interpreter: 'Code Interpreter',
   com_assistants_code_interpreter_files:
     'The following files are only available for Code Interpreter:',

@@ -82,6 +82,7 @@ export type TValidatedAzureConfig = {

 export enum Capabilities {
   code_interpreter = 'code_interpreter',
+  image_vision = 'image_vision',
   retrieval = 'retrieval',
   actions = 'actions',
   tools = 'tools',
@@ -100,6 +101,7 @@ export const assistantEndpointSchema = z.object({
     .optional()
     .default([
       Capabilities.code_interpreter,
+      Capabilities.image_vision,
       Capabilities.retrieval,
       Capabilities.actions,
       Capabilities.tools,

@@ -1,5 +1,6 @@
 import { z } from 'zod';
-import type { TMessageContentParts } from './types/assistants';
+import { Tools } from './types/assistants';
+import type { TMessageContentParts, FunctionTool, FunctionToolCall } from './types/assistants';
 import type { TFile } from './types/files';

 export const isUUID = z.string().uuid();
@@ -25,9 +26,26 @@ export const defaultAssistantFormValues = {
   model: '',
   functions: [],
   code_interpreter: false,
+  image_vision: false,
   retrieval: false,
 };

+export const ImageVisionTool: FunctionTool = {
+  type: Tools.function,
+  [Tools.function]: {
+    name: 'image_vision',
+    description: 'Get detailed text descriptions for all current image attachments.',
+    parameters: {
+      type: 'object',
+      properties: {},
+      required: [],
+    },
+  },
+};
+
+export const isImageVisionTool = (tool: FunctionTool | FunctionToolCall) =>
+  tool.type === 'function' && tool.function?.name === ImageVisionTool?.function?.name;
+
 export const endpointSettings = {
   [EModelEndpoint.google]: {
     model: {
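A small consumer-side sketch of the new `ImageVisionTool` and `isImageVisionTool` exports, similar to what AssistantPanel does when the Image Vision checkbox is toggled. The helper below is illustrative and not part of the package:

const { ImageVisionTool, isImageVisionTool } = require('librechat-data-provider');

// Add or remove the image_vision function tool on an assistant's tool list.
function setImageVision(tools, enabled) {
  const withoutVision = tools.filter((tool) => !isImageVisionTool(tool));
  return enabled ? [...withoutVision, ImageVisionTool] : withoutVision;
}

// Example: an assistant that already has code_interpreter keeps it either way.
const updated = setImageVision([{ type: 'code_interpreter' }], true);
console.log(updated.map((tool) => (tool.type === 'function' ? tool.function.name : tool.type)));
// -> ['code_interpreter', 'image_vision']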