🔍 feat: Mistral OCR API / Upload Files as Text (#6274)

* refactor: move `loadAuthValues` to `~/services/Tools/credentials`

* feat: add createAxiosInstance function to configure axios with proxy support

* WIP: First pass mistral ocr

* refactor: replace getConvoFiles with getToolFiles for improved file retrieval logic

* refactor: improve document formatting in encodeAndFormat function

* refactor: remove unused resendFiles parameter from buildOptions function (this option comes from the agent config)

* fix: update getFiles call to include files with `text` property as well

* refactor: move file handling to `initializeAgentOptions`

* refactor: enhance addImageURLs method to handle OCR text and improve message formatting

* refactor: update message formatting to handle OCR text in various content types

* refactor: remove unused resendFiles property from compactAgentsSchema

* fix: add error handling for Mistral OCR document upload and logging

* refactor: integrate OCR capability into file upload options and configuration

* refactor: skip processing for text source files in delete request, as they are directly tied to database

* feat: add metadata field to ExtendedFile type and update PanelColumns and PanelTable components for localization and metadata handling

* fix: source icon styling

* wip: first pass, frontend file context agent resources

* refactor: add hover card with contextual information for File Context (OCR) in FileContext component

* feat: enhance file processing by integrating file retrieval for OCR resources in agent initialization

* feat: implement OCR config; fix: agent resource deletion for ocr files

* feat: enhance agent initialization by adding OCR capability check in resource priming

* ci: fix `~/config` module mock

* ci: add OCR property expectation in AppService tests

* refactor: simplify OCR config loading by removing environment variable extraction, to be done when OCR is actually performed

* ci: add unit test to ensure environment variable references are not parsed in OCR config

* refactor: disable base64 image inclusion in OCR request

* refactor: enhance OCR configuration handling by validating environment variables and providing defaults

* refactor: use file stream from disk for mistral ocr api
This commit is contained in:
Danny Avila 2025-03-10 17:23:46 -04:00 committed by GitHub
parent 9db00edfc4
commit ded3cd8876
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
48 changed files with 1621 additions and 131 deletions

View file

@ -28,8 +28,8 @@ const { addResourceFileId, deleteResourceFileId } = require('~/server/controller
const { addAgentResourceFile, removeAgentResourceFiles } = require('~/models/Agent');
const { getOpenAIClient } = require('~/server/controllers/assistants/helpers');
const { createFile, updateFileUsage, deleteFiles } = require('~/models/File');
const { loadAuthValues } = require('~/server/services/Tools/credentials');
const { getEndpointsConfig } = require('~/server/services/Config');
const { loadAuthValues } = require('~/app/clients/tools/util');
const { LB_QueueAsyncCall } = require('~/server/utils/queue');
const { getStrategyFunctions } = require('./strategies');
const { determineFileType } = require('~/server/utils');
@ -162,7 +162,6 @@ const processDeleteRequest = async ({ req, files }) => {
for (const file of files) {
const source = file.source ?? FileSources.local;
if (req.body.agent_id && req.body.tool_resource) {
agentFiles.push({
tool_resource: req.body.tool_resource,
@ -170,6 +169,11 @@ const processDeleteRequest = async ({ req, files }) => {
});
}
if (source === FileSources.text) {
resolvedFileIds.push(file.file_id);
continue;
}
if (checkOpenAIStorage(source) && !client[source]) {
await initializeClients();
}
@ -521,6 +525,52 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
if (!isFileSearchEnabled) {
throw new Error('File search is not enabled for Agents');
}
} else if (tool_resource === EToolResources.ocr) {
const isOCREnabled = await checkCapability(req, AgentCapabilities.ocr);
if (!isOCREnabled) {
throw new Error('OCR capability is not enabled for Agents');
}
const { handleFileUpload } = getStrategyFunctions(
req.app.locals?.ocr?.strategy ?? FileSources.mistral_ocr,
);
const { file_id, temp_file_id } = metadata;
const {
text,
bytes,
// TODO: OCR images support?
images,
filename,
filepath: ocrFileURL,
} = await handleFileUpload({ req, file, file_id, entity_id: agent_id });
const fileInfo = removeNullishValues({
text,
bytes,
file_id,
temp_file_id,
user: req.user.id,
type: file.mimetype,
filepath: ocrFileURL,
source: FileSources.text,
filename: filename ?? file.originalname,
model: messageAttachment ? undefined : req.body.model,
context: messageAttachment ? FileContext.message_attachment : FileContext.agents,
});
if (!messageAttachment && tool_resource) {
await addAgentResourceFile({
req,
file_id,
agent_id,
tool_resource,
});
}
const result = await createFile(fileInfo, true);
return res
.status(200)
.json({ message: 'Agent file uploaded and processed successfully', ...result });
}
const source =