🖼️ fix: Extract File Context & Persist Attachments (#10069)

- problem: `addImageURLs` had a side effect that was previously relied on to populate both the `ocr` message field (now `fileContext`) and `client.options.attachments`, which records the user's uploaded message attachments on the user message when it is saved to the database and returned at the end of the request lifecycle
- solution: created dedicated handling for file context, and made sure to populate `allFiles` with non-provider attachments
2025-12-17 00:40:14 +01:00 · 2025-10-10 12:35:37 +03:00 · 2025-10-10 12:35:37 +03:00 · 07d0abc9fd
commit 07d0abc9fd
parent fbe341a171
5 changed files with 128 additions and 50 deletions
--- a/api/app/clients/BaseClient.js
+++ b/api/app/clients/BaseClient.js
@ -3,6 +3,7 @@ const fetch = require('node-fetch');
 const { logger } = require('@librechat/data-schemas');
 const {
  getBalanceConfig,
+  extractFileContext,
  encodeAndFormatAudios,
  encodeAndFormatVideos,
  encodeAndFormatDocuments,
@ -10,6 +11,7 @@ const {
 const {
  Constants,
  ErrorTypes,
+  FileSources,
  ContentTypes,
  excludedKeys,
  EModelEndpoint,
@ -21,6 +23,7 @@ const { getMessages, saveMessage, updateMessage, saveConvo, getConvo } = require
 const { getStrategyFunctions } = require('~/server/services/Files/strategies');
 const { checkBalance } = require('~/models/balanceMethods');
 const { truncateToolCallOutputs } = require('./prompts');
+const countTokens = require('~/server/utils/countTokens');
 const { getFiles } = require('~/models/File');
 const TextStream = require('./TextStream');

@ -1245,27 +1248,62 @@ class BaseClient {
    return audioResult.files;
  }

+  /**
+   * Extracts text context from attachments and sets it on the message.
+   * This handles text that was already extracted from files (OCR, transcriptions, document text, etc.)
+   * @param {TMessage} message - The message to add context to
+   * @param {MongoFile[]} attachments - Array of file attachments
+   * @returns {Promise<void>}
+   */
+  async addFileContextToMessage(message, attachments) {
+    const fileContext = await extractFileContext({
+      attachments,
+      req: this.options?.req,
+      tokenCountFn: (text) => countTokens(text),
+    });
+
+    if (fileContext) {
+      message.fileContext = fileContext;
+    }
+  }
+
  async processAttachments(message, attachments) {
    const categorizedAttachments = {
      images: [],
-      documents: [],
      videos: [],
      audios: [],
+      documents: [],
    };

+    const allFiles = [];
+
    for (const file of attachments) {
+      /** @type {FileSources} */
+      const source = file.source ?? FileSources.local;
+      if (source === FileSources.text) {
+        allFiles.push(file);
+        continue;
+      }
+      if (file.embedded === true || file.metadata?.fileIdentifier != null) {
+        allFiles.push(file);
+        continue;
+      }
+
      if (file.type.startsWith('image/')) {
        categorizedAttachments.images.push(file);
      } else if (file.type === 'application/pdf') {
        categorizedAttachments.documents.push(file);
+        allFiles.push(file);
      } else if (file.type.startsWith('video/')) {
        categorizedAttachments.videos.push(file);
+        allFiles.push(file);
      } else if (file.type.startsWith('audio/')) {
        categorizedAttachments.audios.push(file);
+        allFiles.push(file);
      }
    }

-    const [imageFiles, documentFiles, videoFiles, audioFiles] = await Promise.all([
+    const [imageFiles] = await Promise.all([
      categorizedAttachments.images.length > 0
        ? this.addImageURLs(message, categorizedAttachments.images)
        : Promise.resolve([]),
@ -1280,7 +1318,8 @@ class BaseClient {
        : Promise.resolve([]),
    ]);

-    const allFiles = [...imageFiles, ...documentFiles, ...videoFiles, ...audioFiles];
+    allFiles.push(...imageFiles);
+
    const seenFileIds = new Set();
    const uniqueFiles = [];

@ -1345,6 +1384,7 @@ class BaseClient {
        {},
      );

+      await this.addFileContextToMessage(message, files);
      await this.processAttachments(message, files);

      this.message_file_map[message.messageId] = files;
--- a/api/server/controllers/agents/client.js
+++ b/api/server/controllers/agents/client.js
@ -211,16 +211,13 @@ class AgentClient extends BaseClient {
   * @returns {Promise<Array<Partial<MongoFile>>>}
   */
  async addImageURLs(message, attachments) {
-    const { files, text, image_urls } = await encodeAndFormat(
+    const { files, image_urls } = await encodeAndFormat(
      this.options.req,
      attachments,
      this.options.agent.provider,
      VisionModes.agents,
    );
    message.image_urls = image_urls.length ? image_urls : undefined;
-    if (text && text.length) {
-      message.ocr = text;
-    }
    return files;
  }

@ -248,19 +245,18 @@ class AgentClient extends BaseClient {

    if (this.options.attachments) {
      const attachments = await this.options.attachments;
+      const latestMessage = orderedMessages[orderedMessages.length - 1];

      if (this.message_file_map) {
-        this.message_file_map[orderedMessages[orderedMessages.length - 1].messageId] = attachments;
+        this.message_file_map[latestMessage.messageId] = attachments;
      } else {
        this.message_file_map = {
-          [orderedMessages[orderedMessages.length - 1].messageId]: attachments,
+          [latestMessage.messageId]: attachments,
        };
      }

-      const files = await this.processAttachments(
-        orderedMessages[orderedMessages.length - 1],
-        attachments,
-      );
+      await this.addFileContextToMessage(latestMessage, attachments);
+      const files = await this.processAttachments(latestMessage, attachments);

      this.options.attachments = files;
    }
@ -280,21 +276,21 @@ class AgentClient extends BaseClient {
        assistantName: this.options?.modelLabel,
      });

-      if (message.ocr && i !== orderedMessages.length - 1) {
+      if (message.fileContext && i !== orderedMessages.length - 1) {
        if (typeof formattedMessage.content === 'string') {
-          formattedMessage.content = message.ocr + '\n' + formattedMessage.content;
+          formattedMessage.content = message.fileContext + '\n' + formattedMessage.content;
        } else {
          const textPart = formattedMessage.content.find((part) => part.type === 'text');
          textPart
-            ? (textPart.text = message.ocr + '\n' + textPart.text)
-            : formattedMessage.content.unshift({ type: 'text', text: message.ocr });
+            ? (textPart.text = message.fileContext + '\n' + textPart.text)
+            : formattedMessage.content.unshift({ type: 'text', text: message.fileContext });
        }
-      } else if (message.ocr && i === orderedMessages.length - 1) {
-        systemContent = [systemContent, message.ocr].join('\n');
+      } else if (message.fileContext && i === orderedMessages.length - 1) {
+        systemContent = [systemContent, message.fileContext].join('\n');
      }

      const needsTokenCount =
-        (this.contextStrategy && !orderedMessages[i].tokenCount) || message.ocr;
+        (this.contextStrategy && !orderedMessages[i].tokenCount) || message.fileContext;

      /* If tokens were never counted, or, is a Vision request and the message has files, count again */
      if (needsTokenCount || (this.isVisionModel && (message.image_urls || message.files))) {
--- a/api/server/services/Files/images/encode.js
+++ b/api/server/services/Files/images/encode.js
@ -1,16 +1,14 @@
 const axios = require('axios');
+const { logAxiosError } = require('@librechat/api');
 const { logger } = require('@librechat/data-schemas');
-const { logAxiosError, processTextWithTokenLimit } = require('@librechat/api');
 const {
  FileSources,
  VisionModes,
  ImageDetail,
  ContentTypes,
  EModelEndpoint,
-  mergeFileConfig,
 } = require('librechat-data-provider');
 const { getStrategyFunctions } = require('~/server/services/Files/strategies');
-const countTokens = require('~/server/utils/countTokens');

 /**
 * Converts a readable stream to a base64 encoded string.
@ -88,15 +86,14 @@ const blobStorageSources = new Set([FileSources.azure_blob, FileSources.s3]);
 * @param {Array<MongoFile>} files - The array of files to encode and format.
 * @param {EModelEndpoint} [endpoint] - Optional: The endpoint for the image.
 * @param {string} [mode] - Optional: The endpoint mode for the image.
- * @returns {Promise<{ text: string; files: MongoFile[]; image_urls: MessageContentImageUrl[] }>} - A promise that resolves to the result object containing the encoded images and file details.
+ * @returns {Promise<{ files: MongoFile[]; image_urls: MessageContentImageUrl[] }>} - A promise that resolves to the result object containing the encoded images and file details.
 */
 async function encodeAndFormat(req, files, endpoint, mode) {
  const promises = [];
  /** @type {Record<FileSources, Pick<ReturnType<typeof getStrategyFunctions>, 'prepareImagePayload' | 'getDownloadStream'>>} */
  const encodingMethods = {};
-  /** @type {{ text: string; files: MongoFile[]; image_urls: MessageContentImageUrl[] }} */
+  /** @type {{ files: MongoFile[]; image_urls: MessageContentImageUrl[] }} */
  const result = {
-    text: '',
    files: [],
    image_urls: [],
  };
@ -105,29 +102,9 @@ async function encodeAndFormat(req, files, endpoint, mode) {
    return result;
  }

-  const fileTokenLimit =
-    req.body?.fileTokenLimit ?? mergeFileConfig(req.config?.fileConfig).fileTokenLimit;
-
  for (let file of files) {
    /** @type {FileSources} */
    const source = file.source ?? FileSources.local;
-    if (source === FileSources.text && file.text) {
-      let fileText = file.text;
-
-      const { text: limitedText, wasTruncated } = await processTextWithTokenLimit({
-        text: fileText,
-        tokenLimit: fileTokenLimit,
-        tokenCountFn: (text) => countTokens(text),
-      });
-
-      if (wasTruncated) {
-        logger.debug(
-          `[encodeAndFormat] Text content truncated for file: ${file.filename} due to token limits`,
-        );
-      }
-
-      result.text += `${!result.text ? 'Attached document(s):\n```md' : '\n\n---\n\n'}# "${file.filename}"\n${limitedText}\n`;
-    }

    if (!file.height) {
      promises.push([file, null]);
@ -165,10 +142,6 @@ async function encodeAndFormat(req, files, endpoint, mode) {
    promises.push(preparePayload(req, file));
  }

-  if (result.text) {
-    result.text += '\n```';
-  }
-
  const detail = req.body.imageDetail ?? ImageDetail.auto;

  /** @type {Array<[MongoFile, string]>} */
--- a/packages/api/src/files/context.ts
+++ b/packages/api/src/files/context.ts
@ -0,0 +1,68 @@
+import { logger } from '@librechat/data-schemas';
+import { FileSources, mergeFileConfig } from 'librechat-data-provider';
+import type { fileConfigSchema } from 'librechat-data-provider';
+import type { IMongoFile } from '@librechat/data-schemas';
+import type { z } from 'zod';
+import { processTextWithTokenLimit } from '~/utils/text';
+
+/**
+ * Builds the "Attached document(s)" text block from attachments whose `source` is `FileSources.text`.
+ * Each file's stored `text` is passed through `processTextWithTokenLimit` and emitted under a `# "<filename>"` heading.
+ * @param params - The parameters object
+ * @param params.attachments - Array of file attachments; only entries with a text source and non-empty `text` contribute
+ * @param params.req - Request object; `body.fileTokenLimit` takes precedence over the merged file config's `fileTokenLimit`
+ * @param params.tokenCountFn - Function to count tokens in text
+ * @returns The formatted file context text, or undefined if there are no attachments, no token limit, or no text found
+ */
+export async function extractFileContext({
+  attachments,
+  req,
+  tokenCountFn,
+}: {
+  attachments: IMongoFile[];
+  req?: {
+    body?: { fileTokenLimit?: number };
+    config?: { fileConfig?: z.infer<typeof fileConfigSchema> };
+  };
+  tokenCountFn: (text: string) => number;
+}): Promise<string | undefined> {
+  if (!attachments || attachments.length === 0) {
+    return undefined;
+  }
+
+  const fileConfig = mergeFileConfig(req?.config?.fileConfig);
+  const fileTokenLimit = req?.body?.fileTokenLimit ?? fileConfig.fileTokenLimit;
+
+  if (!fileTokenLimit) {
+    // Without a (non-zero) token limit no file text is processed at all
+    return undefined;
+  }
+
+  let resultText = '';
+
+  for (const file of attachments) {
+    const source = file.source ?? FileSources.local;
+    if (source === FileSources.text && file.text) {
+      const { text: limitedText, wasTruncated } = await processTextWithTokenLimit({
+        text: file.text,
+        tokenLimit: fileTokenLimit,
+        tokenCountFn,
+      });
+
+      if (wasTruncated) {
+        logger.debug(
+          `[extractFileContext] Text content truncated for file: ${file.filename} due to token limits`,
+        );
+      }
+      // Newline after the opening fence keeps the first heading out of the fence's info string
+      resultText += `${!resultText ? 'Attached document(s):\n```md\n' : '\n\n---\n\n'}# "${file.filename}"\n${limitedText}\n`;
+    }
+  }
+
+  if (resultText) {
+    resultText += '\n```';
+    return resultText;
+  }
+
+  return undefined;
+}
--- a/packages/api/src/files/index.ts
+++ b/packages/api/src/files/index.ts
@ -1,4 +1,5 @@
 export * from './audio';
+export * from './context';
 export * from './encode';
 export * from './mistral/crud';
 export * from './ocr';