🔍 feat: Mistral OCR API / Upload Files as Text (#6274)

* refactor: move `loadAuthValues` to `~/services/Tools/credentials` * feat: add createAxiosInstance function to configure axios with proxy support * WIP: First pass mistral ocr * refactor: replace getConvoFiles with getToolFiles for improved file retrieval logic * refactor: improve document formatting in encodeAndFormat function * refactor: remove unused resendFiles parameter from buildOptions function (this option comes from the agent config) * fix: update getFiles call to include files with `text` property as well * refactor: move file handling to `initializeAgentOptions` * refactor: enhance addImageURLs method to handle OCR text and improve message formatting * refactor: update message formatting to handle OCR text in various content types * refactor: remove unused resendFiles property from compactAgentsSchema * fix: add error handling for Mistral OCR document upload and logging * refactor: integrate OCR capability into file upload options and configuration * refactor: skip processing for text source files in delete request, as they are directly tied to database * feat: add metadata field to ExtendedFile type and update PanelColumns and PanelTable components for localization and metadata handling * fix: source icon styling * wip: first pass, frontend file context agent resources * refactor: add hover card with contextual information for File Context (OCR) in FileContext component * feat: enhance file processing by integrating file retrieval for OCR resources in agent initialization * feat: implement OCR config; fix: agent resource deletion for ocr files * feat: enhance agent initialization by adding OCR capability check in resource priming * ci: fix `~/config` module mock * ci: add OCR property expectation in AppService tests * refactor: simplify OCR config loading by removing environment variable extraction, to be done when OCR is actually performed * ci: add unit test to ensure environment variable references are not parsed in OCR config * refactor: disable 
base64 image inclusion in OCR request * refactor: enhance OCR configuration handling by validating environment variables and providing defaults * refactor: use file stream from disk for mistral ocr api
2026-03-19 06:06:34 +01:00 · 2025-03-10 17:23:46 -04:00 · 2025-03-10 17:23:46 -04:00 · ded3cd8876
commit ded3cd8876
parent 9db00edfc4
48 changed files with 1621 additions and 131 deletions
--- a/api/server/controllers/agents/callbacks.js
+++ b/api/server/controllers/agents/callbacks.js
@ -10,8 +10,8 @@ const {
  ChatModelStreamHandler,
 } = require('@librechat/agents');
 const { processCodeOutput } = require('~/server/services/Files/Code/process');
+const { loadAuthValues } = require('~/server/services/Tools/credentials');
 const { saveBase64Image } = require('~/server/services/Files/process');
-const { loadAuthValues } = require('~/app/clients/tools/util');
 const { logger, sendEvent } = require('~/config');

 /** @typedef {import('@librechat/agents').Graph} Graph */
--- a/api/server/controllers/agents/client.js
+++ b/api/server/controllers/agents/client.js
@ -223,14 +223,23 @@ class AgentClient extends BaseClient {
    };
  }

+  /**
+   * Runs `encodeAndFormat` on the attachments, then sets `message.image_urls` and, when text is returned, `message.ocr`.
+   * @param {TMessage} message - Mutated in place: `image_urls` is set (or `undefined` when empty); `ocr` only when text is non-empty.
+   * @param {Array<MongoFile>} attachments - Forwarded to `encodeAndFormat` with the request, agent provider and `VisionModes.agents`.
+   * @returns {Promise<Array<Partial<MongoFile>>>} The `files` value returned by `encodeAndFormat`.
+   */
  async addImageURLs(message, attachments) {
-    const { files, image_urls } = await encodeAndFormat(
+    const { files, text, image_urls } = await encodeAndFormat(
      this.options.req,
      attachments,
      this.options.agent.provider,
      VisionModes.agents,
    );
    message.image_urls = image_urls.length ? image_urls : undefined;
+    if (text && text.length) {
+      message.ocr = text;
+    }
    return files;
  }

@ -308,7 +317,21 @@ class AgentClient extends BaseClient {
        assistantName: this.options?.modelLabel,
      });

-      const needsTokenCount = this.contextStrategy && !orderedMessages[i].tokenCount;
+      if (message.ocr && i !== orderedMessages.length - 1) {
+        if (typeof formattedMessage.content === 'string') {
+          formattedMessage.content = message.ocr + '\n' + formattedMessage.content;
+        } else {
+          const textPart = formattedMessage.content.find((part) => part.type === 'text');
+          textPart
+            ? (textPart.text = message.ocr + '\n' + textPart.text)
+            : formattedMessage.content.unshift({ type: 'text', text: message.ocr });
+        }
+      } else if (message.ocr && i === orderedMessages.length - 1) {
+        systemContent = [systemContent, message.ocr].join('\n');
+      }
+
+      const needsTokenCount =
+        (this.contextStrategy && !orderedMessages[i].tokenCount) || message.ocr;

      /* If tokens were never counted, or, is a Vision request and the message has files, count again */
      if (needsTokenCount || (this.isVisionModel && (message.image_urls || message.files))) {