diff --git a/api/app/clients/BaseClient.js b/api/app/clients/BaseClient.js index 32c76523f7..5c6561396e 100644 --- a/api/app/clients/BaseClient.js +++ b/api/app/clients/BaseClient.js @@ -3,6 +3,7 @@ const fetch = require('node-fetch'); const { logger } = require('@librechat/data-schemas'); const { getBalanceConfig, + extractFileContext, encodeAndFormatAudios, encodeAndFormatVideos, encodeAndFormatDocuments, @@ -10,6 +11,7 @@ const { const { Constants, ErrorTypes, + FileSources, ContentTypes, excludedKeys, EModelEndpoint, @@ -21,6 +23,7 @@ const { getMessages, saveMessage, updateMessage, saveConvo, getConvo } = require const { getStrategyFunctions } = require('~/server/services/Files/strategies'); const { checkBalance } = require('~/models/balanceMethods'); const { truncateToolCallOutputs } = require('./prompts'); +const countTokens = require('~/server/utils/countTokens'); const { getFiles } = require('~/models/File'); const TextStream = require('./TextStream'); @@ -1245,27 +1248,62 @@ class BaseClient { return audioResult.files; } + /** + * Extracts text context from attachments and sets it on the message. + * This handles text that was already extracted from files (OCR, transcriptions, document text, etc.) + * @param {TMessage} message - The message to add context to + * @param {MongoFile[]} attachments - Array of file attachments + * @returns {Promise} + */ + async addFileContextToMessage(message, attachments) { + const fileContext = await extractFileContext({ + attachments, + req: this.options?.req, + tokenCountFn: (text) => countTokens(text), + }); + + if (fileContext) { + message.fileContext = fileContext; + } + } + async processAttachments(message, attachments) { const categorizedAttachments = { images: [], - documents: [], videos: [], audios: [], + documents: [], }; + const allFiles = []; + for (const file of attachments) { + /** @type {FileSources} */ + const source = file.source ?? FileSources.local; + if (source === FileSources.text) { + allFiles.push(file); + continue; + } + if (file.embedded === true || file.metadata?.fileIdentifier != null) { + allFiles.push(file); + continue; + } + if (file.type.startsWith('image/')) { categorizedAttachments.images.push(file); } else if (file.type === 'application/pdf') { categorizedAttachments.documents.push(file); + allFiles.push(file); } else if (file.type.startsWith('video/')) { categorizedAttachments.videos.push(file); + allFiles.push(file); } else if (file.type.startsWith('audio/')) { categorizedAttachments.audios.push(file); + allFiles.push(file); } } - const [imageFiles, documentFiles, videoFiles, audioFiles] = await Promise.all([ + const [imageFiles] = await Promise.all([ categorizedAttachments.images.length > 0 ? this.addImageURLs(message, categorizedAttachments.images) : Promise.resolve([]), @@ -1280,7 +1318,8 @@ class BaseClient { : Promise.resolve([]), ]); - const allFiles = [...imageFiles, ...documentFiles, ...videoFiles, ...audioFiles]; + allFiles.push(...imageFiles); + const seenFileIds = new Set(); const uniqueFiles = []; @@ -1345,6 +1384,7 @@ class BaseClient { {}, ); + await this.addFileContextToMessage(message, files); await this.processAttachments(message, files); this.message_file_map[message.messageId] = files; diff --git a/api/server/controllers/agents/client.js b/api/server/controllers/agents/client.js index a9f5543a61..a648488d14 100644 --- a/api/server/controllers/agents/client.js +++ b/api/server/controllers/agents/client.js @@ -211,16 +211,13 @@ class AgentClient extends BaseClient { * @returns {Promise>>} */ async addImageURLs(message, attachments) { - const { files, text, image_urls } = await encodeAndFormat( + const { files, image_urls } = await encodeAndFormat( this.options.req, attachments, this.options.agent.provider, VisionModes.agents, ); message.image_urls = image_urls.length ? image_urls : undefined; - if (text && text.length) { - message.ocr = text; - } return files; } @@ -248,19 +245,18 @@ class AgentClient extends BaseClient { if (this.options.attachments) { const attachments = await this.options.attachments; + const latestMessage = orderedMessages[orderedMessages.length - 1]; if (this.message_file_map) { - this.message_file_map[orderedMessages[orderedMessages.length - 1].messageId] = attachments; + this.message_file_map[latestMessage.messageId] = attachments; } else { this.message_file_map = { - [orderedMessages[orderedMessages.length - 1].messageId]: attachments, + [latestMessage.messageId]: attachments, }; } - const files = await this.processAttachments( - orderedMessages[orderedMessages.length - 1], - attachments, - ); + await this.addFileContextToMessage(latestMessage, attachments); + const files = await this.processAttachments(latestMessage, attachments); this.options.attachments = files; } @@ -280,21 +276,21 @@ class AgentClient extends BaseClient { assistantName: this.options?.modelLabel, }); - if (message.ocr && i !== orderedMessages.length - 1) { + if (message.fileContext && i !== orderedMessages.length - 1) { if (typeof formattedMessage.content === 'string') { - formattedMessage.content = message.ocr + '\n' + formattedMessage.content; + formattedMessage.content = message.fileContext + '\n' + formattedMessage.content; } else { const textPart = formattedMessage.content.find((part) => part.type === 'text'); textPart - ? (textPart.text = message.ocr + '\n' + textPart.text) - : formattedMessage.content.unshift({ type: 'text', text: message.ocr }); + ? (textPart.text = message.fileContext + '\n' + textPart.text) + : formattedMessage.content.unshift({ type: 'text', text: message.fileContext }); } - } else if (message.ocr && i === orderedMessages.length - 1) { - systemContent = [systemContent, message.ocr].join('\n'); + } else if (message.fileContext && i === orderedMessages.length - 1) { + systemContent = [systemContent, message.fileContext].join('\n'); } const needsTokenCount = - (this.contextStrategy && !orderedMessages[i].tokenCount) || message.ocr; + (this.contextStrategy && !orderedMessages[i].tokenCount) || message.fileContext; /* If tokens were never counted, or, is a Vision request and the message has files, count again */ if (needsTokenCount || (this.isVisionModel && (message.image_urls || message.files))) { diff --git a/api/server/services/Files/images/encode.js b/api/server/services/Files/images/encode.js index 34128e3152..7609ed388a 100644 --- a/api/server/services/Files/images/encode.js +++ b/api/server/services/Files/images/encode.js @@ -1,16 +1,14 @@ const axios = require('axios'); +const { logAxiosError } = require('@librechat/api'); const { logger } = require('@librechat/data-schemas'); -const { logAxiosError, processTextWithTokenLimit } = require('@librechat/api'); const { FileSources, VisionModes, ImageDetail, ContentTypes, EModelEndpoint, - mergeFileConfig, } = require('librechat-data-provider'); const { getStrategyFunctions } = require('~/server/services/Files/strategies'); -const countTokens = require('~/server/utils/countTokens'); /** * Converts a readable stream to a base64 encoded string. @@ -88,15 +86,14 @@ const blobStorageSources = new Set([FileSources.azure_blob, FileSources.s3]); * @param {Array} files - The array of files to encode and format. * @param {EModelEndpoint} [endpoint] - Optional: The endpoint for the image. * @param {string} [mode] - Optional: The endpoint mode for the image. - * @returns {Promise<{ text: string; files: MongoFile[]; image_urls: MessageContentImageUrl[] }>} - A promise that resolves to the result object containing the encoded images and file details. + * @returns {Promise<{ files: MongoFile[]; image_urls: MessageContentImageUrl[] }>} - A promise that resolves to the result object containing the encoded images and file details. */ async function encodeAndFormat(req, files, endpoint, mode) { const promises = []; /** @type {Record, 'prepareImagePayload' | 'getDownloadStream'>>} */ const encodingMethods = {}; - /** @type {{ text: string; files: MongoFile[]; image_urls: MessageContentImageUrl[] }} */ + /** @type {{ files: MongoFile[]; image_urls: MessageContentImageUrl[] }} */ const result = { - text: '', files: [], image_urls: [], }; @@ -105,29 +102,9 @@ async function encodeAndFormat(req, files, endpoint, mode) { return result; } - const fileTokenLimit = - req.body?.fileTokenLimit ?? mergeFileConfig(req.config?.fileConfig).fileTokenLimit; - for (let file of files) { /** @type {FileSources} */ const source = file.source ?? FileSources.local; - if (source === FileSources.text && file.text) { - let fileText = file.text; - - const { text: limitedText, wasTruncated } = await processTextWithTokenLimit({ - text: fileText, - tokenLimit: fileTokenLimit, - tokenCountFn: (text) => countTokens(text), - }); - - if (wasTruncated) { - logger.debug( - `[encodeAndFormat] Text content truncated for file: ${file.filename} due to token limits`, - ); - } - - result.text += `${!result.text ? 'Attached document(s):\n```md' : '\n\n---\n\n'}# "${file.filename}"\n${limitedText}\n`; - } if (!file.height) { promises.push([file, null]); @@ -165,10 +142,6 @@ async function encodeAndFormat(req, files, endpoint, mode) { promises.push(preparePayload(req, file)); } - if (result.text) { - result.text += '\n```'; - } - const detail = req.body.imageDetail ?? ImageDetail.auto; /** @type {Array<[MongoFile, string]>} */ diff --git a/packages/api/src/files/context.ts b/packages/api/src/files/context.ts new file mode 100644 index 0000000000..24418ce49d --- /dev/null +++ b/packages/api/src/files/context.ts @@ -0,0 +1,68 @@ +import { logger } from '@librechat/data-schemas'; +import { FileSources, mergeFileConfig } from 'librechat-data-provider'; +import type { fileConfigSchema } from 'librechat-data-provider'; +import type { IMongoFile } from '@librechat/data-schemas'; +import type { z } from 'zod'; +import { processTextWithTokenLimit } from '~/utils/text'; + +/** + * Extracts text context from attachments and returns formatted text. + * This handles text that was already extracted from files (OCR, transcriptions, document text, etc.) + * @param params - The parameters object + * @param params.attachments - Array of file attachments + * @param params.req - Express request object for config access + * @param params.tokenCountFn - Function to count tokens in text + * @returns The formatted file context text, or undefined if no text found + */ +export async function extractFileContext({ + attachments, + req, + tokenCountFn, +}: { + attachments: IMongoFile[]; + req?: { + body?: { fileTokenLimit?: number }; + config?: { fileConfig?: z.infer }; + }; + tokenCountFn: (text: string) => number; +}): Promise { + if (!attachments || attachments.length === 0) { + return undefined; + } + + const fileConfig = mergeFileConfig(req?.config?.fileConfig); + const fileTokenLimit = req?.body?.fileTokenLimit ?? fileConfig.fileTokenLimit; + + if (!fileTokenLimit) { + // If no token limit, return undefined (no processing) + return undefined; + } + + let resultText = ''; + + for (const file of attachments) { + const source = file.source ?? FileSources.local; + if (source === FileSources.text && file.text) { + const { text: limitedText, wasTruncated } = await processTextWithTokenLimit({ + text: file.text, + tokenLimit: fileTokenLimit, + tokenCountFn, + }); + + if (wasTruncated) { + logger.debug( + `[extractFileContext] Text content truncated for file: ${file.filename} due to token limits`, + ); + } + + resultText += `${!resultText ? 'Attached document(s):\n```md' : '\n\n---\n\n'}# "${file.filename}"\n${limitedText}\n`; + } + } + + if (resultText) { + resultText += '\n```'; + return resultText; + } + + return undefined; +} diff --git a/packages/api/src/files/index.ts b/packages/api/src/files/index.ts index 3d1a3118e3..9111b8d5e3 100644 --- a/packages/api/src/files/index.ts +++ b/packages/api/src/files/index.ts @@ -1,4 +1,5 @@ export * from './audio'; +export * from './context'; export * from './encode'; export * from './mistral/crud'; export * from './ocr';