👁️ feat: Azure Mistral OCR Strategy (#7888)

* 👁️ feat: Add Azure Mistral OCR strategy and endpoint integration

This commit introduces a new OCR strategy named 'azure_mistral_ocr', allowing the use of a Mistral OCR endpoint deployed on Azure. The configuration, schemas, and file upload strategies have been updated to support this integration, enabling seamless OCR processing via Azure-hosted Mistral services.

* 🗑️ chore: Clean up .gitignore by removing commented-out uncommon directory name

* chore: remove unused vars

* refactor: Move createAxiosInstance to packages/api/utils and update imports

- Removed the createAxiosInstance function from the config module and relocated it to a new utils module for better organization.
- Updated import paths in relevant files to reflect the new location of createAxiosInstance.
- Added tests for createAxiosInstance to ensure proper functionality and proxy configuration handling.

* chore: move axios helpers to packages/api

- Added logAxiosError function to @librechat/api for centralized error logging.
- Updated imports across various files to use the new logAxiosError function.
- Removed the old axios.js utility file as it is no longer needed.

* chore: Update Jest moduleNameMapper for improved path resolution

- Added a new mapping for '~/' to resolve module paths in Jest configuration, enhancing import handling for the project.

* feat: Implement Mistral OCR API integration in TS

* chore: Update MistralOCR tests based on new imports

* fix: Enhance MistralOCR configuration handling and tests

- Introduced helper functions for resolving configuration values from environment variables or hardcoded settings.
- Updated the uploadMistralOCR and uploadAzureMistralOCR functions to utilize the new configuration resolution logic.
- Improved test cases to ensure correct behavior when mixing environment variables and hardcoded values.
- Mocked file upload and signed URL responses in tests to validate functionality without external dependencies.

* feat: Enhance MistralOCR functionality with improved configuration and error handling

- Introduced helper functions for loading authentication configuration and resolving values from environment variables.
- Updated uploadMistralOCR and uploadAzureMistralOCR functions to utilize the new configuration logic.
- Added utility functions for processing OCR results and creating error messages.
- Improved document type determination and result aggregation for better OCR processing.

* refactor: Reorganize OCR type imports in Mistral CRUD file

- Moved OCRResult, OCRResultPage, and OCRImage imports to a more logical grouping for better readability and maintainability.

* feat: Add file exports to API and create files index

* chore: Update OCR types for enhanced structure and clarity

- Redesigned OCRImage interface to include mandatory fields and improved naming conventions.
- Added PageDimensions interface for better representation of page metrics.
- Updated OCRResultPage to include dimensions and mandatory images array.
- Refined OCRResult to include document annotation and usage information.

* refactor: use TS counterpart of uploadOCR methods

* ci: Update MistralOCR tests to reflect new OCR result structure

* chore: Bump version of @librechat/api to 1.2.3 in package.json and package-lock.json

* chore: Update CONFIG_VERSION to 1.2.8

* chore: remove unused sendEvent function from config module (now imported from '@librechat/api')

* chore: remove MistralOCR service files and tests (now in '@librechat/api')

* ci: update logger import in ModelService tests to use @librechat/data-schemas

---------

Co-authored-by: arthurolivierfortin <arthurolivier.fortin@gmail.com>
This commit is contained in:
Danny Avila 2025-06-13 15:14:57 -04:00 committed by GitHub
parent 46ff008b07
commit 5f2d1c5dc9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
37 changed files with 2245 additions and 1235 deletions

View file

@ -0,0 +1 @@
export * from './mistral/crud';

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,424 @@
import * as fs from 'fs';
import * as path from 'path';
import FormData from 'form-data';
import { logger } from '@librechat/data-schemas';
import {
FileSources,
envVarRegex,
extractEnvVariable,
extractVariableName,
} from 'librechat-data-provider';
import type { TCustomConfig } from 'librechat-data-provider';
import type { Request as ServerRequest } from 'express';
import type { AxiosError } from 'axios';
import type {
MistralFileUploadResponse,
MistralSignedUrlResponse,
MistralOCRUploadResult,
MistralOCRError,
OCRResultPage,
OCRResult,
OCRImage,
} from '~/types';
import { logAxiosError, createAxiosInstance } from '~/utils/axios';
const axios = createAxiosInstance();
const DEFAULT_MISTRAL_BASE_URL = 'https://api.mistral.ai/v1';
const DEFAULT_MISTRAL_MODEL = 'mistral-ocr-latest';
/** Helper type for auth configuration */
interface AuthConfig {
apiKey: string;
baseURL: string;
}
/** Helper type for OCR request context */
interface OCRContext {
req: Pick<ServerRequest, 'user' | 'app'> & {
user?: { id: string };
app: {
locals?: {
ocr?: TCustomConfig['ocr'];
};
};
};
file: Express.Multer.File;
loadAuthValues: (params: {
userId: string;
authFields: string[];
optional?: Set<string>;
}) => Promise<Record<string, string | undefined>>;
}
/**
* Uploads a document to Mistral API using file streaming to avoid loading the entire file into memory
* @param params Upload parameters
* @param params.filePath The path to the file on disk
* @param params.fileName Optional filename to use (defaults to the name from filePath)
* @param params.apiKey Mistral API key
* @param params.baseURL Mistral API base URL
* @returns The response from Mistral API
*/
export async function uploadDocumentToMistral({
apiKey,
filePath,
baseURL = DEFAULT_MISTRAL_BASE_URL,
fileName = '',
}: {
apiKey: string;
filePath: string;
baseURL?: string;
fileName?: string;
}): Promise<MistralFileUploadResponse> {
const form = new FormData();
form.append('purpose', 'ocr');
const actualFileName = fileName || path.basename(filePath);
const fileStream = fs.createReadStream(filePath);
form.append('file', fileStream, { filename: actualFileName });
return axios
.post(`${baseURL}/files`, form, {
headers: {
Authorization: `Bearer ${apiKey}`,
...form.getHeaders(),
},
maxBodyLength: Infinity,
maxContentLength: Infinity,
})
.then((res) => res.data)
.catch((error) => {
throw error;
});
}
export async function getSignedUrl({
apiKey,
fileId,
expiry = 24,
baseURL = DEFAULT_MISTRAL_BASE_URL,
}: {
apiKey: string;
fileId: string;
expiry?: number;
baseURL?: string;
}): Promise<MistralSignedUrlResponse> {
return axios
.get(`${baseURL}/files/${fileId}/url?expiry=${expiry}`, {
headers: {
Authorization: `Bearer ${apiKey}`,
},
})
.then((res) => res.data)
.catch((error) => {
logger.error('Error fetching signed URL:', error.message);
throw error;
});
}
/**
* @param {Object} params
* @param {string} params.apiKey
* @param {string} params.url - The document or image URL
* @param {string} [params.documentType='document_url'] - 'document_url' or 'image_url'
* @param {string} [params.model]
* @param {string} [params.baseURL]
* @returns {Promise<OCRResult>}
*/
export async function performOCR({
url,
apiKey,
model = DEFAULT_MISTRAL_MODEL,
baseURL = DEFAULT_MISTRAL_BASE_URL,
documentType = 'document_url',
}: {
url: string;
apiKey: string;
model?: string;
baseURL?: string;
documentType?: 'document_url' | 'image_url';
}): Promise<OCRResult> {
const documentKey = documentType === 'image_url' ? 'image_url' : 'document_url';
return axios
.post(
`${baseURL}/ocr`,
{
model,
image_limit: 0,
include_image_base64: false,
document: {
type: documentType,
[documentKey]: url,
},
},
{
headers: {
'Content-Type': 'application/json',
Authorization: `Bearer ${apiKey}`,
},
},
)
.then((res) => res.data)
.catch((error) => {
logger.error('Error performing OCR:', error.message);
throw error;
});
}
/**
* Determines if a value needs to be loaded from environment
*/
function needsEnvLoad(value: string): boolean {
return envVarRegex.test(value) || !value.trim();
}
/**
* Gets the environment variable name for a config value
*/
function getEnvVarName(configValue: string, defaultName: string): string {
if (!envVarRegex.test(configValue)) {
return defaultName;
}
return extractVariableName(configValue) || defaultName;
}
/**
* Resolves a configuration value from either hardcoded or environment
*/
async function resolveConfigValue(
configValue: string,
defaultEnvName: string,
authValues: Record<string, string | undefined>,
defaultValue?: string,
): Promise<string> {
// If it's a hardcoded value (not env var and not empty), use it directly
if (!needsEnvLoad(configValue)) {
return configValue;
}
// Otherwise, get from auth values
const envVarName = getEnvVarName(configValue, defaultEnvName);
return authValues[envVarName] || defaultValue || '';
}
/**
* Loads authentication configuration from OCR config
*/
async function loadAuthConfig(context: OCRContext): Promise<AuthConfig> {
const ocrConfig = context.req.app.locals?.ocr;
const apiKeyConfig = ocrConfig?.apiKey || '';
const baseURLConfig = ocrConfig?.baseURL || '';
// If both are hardcoded, return them directly
if (!needsEnvLoad(apiKeyConfig) && !needsEnvLoad(baseURLConfig)) {
return {
apiKey: apiKeyConfig,
baseURL: baseURLConfig,
};
}
// Build auth fields array
const authFields: string[] = [];
if (needsEnvLoad(baseURLConfig)) {
authFields.push(getEnvVarName(baseURLConfig, 'OCR_BASEURL'));
}
if (needsEnvLoad(apiKeyConfig)) {
authFields.push(getEnvVarName(apiKeyConfig, 'OCR_API_KEY'));
}
// Load auth values
const authValues = await context.loadAuthValues({
userId: context.req.user?.id || '',
authFields,
optional: new Set(['OCR_BASEURL']),
});
// Resolve each value
const apiKey = await resolveConfigValue(apiKeyConfig, 'OCR_API_KEY', authValues);
const baseURL = await resolveConfigValue(
baseURLConfig,
'OCR_BASEURL',
authValues,
DEFAULT_MISTRAL_BASE_URL,
);
return { apiKey, baseURL };
}
/**
* Gets the model configuration
*/
function getModelConfig(ocrConfig: TCustomConfig['ocr']): string {
const modelConfig = ocrConfig?.mistralModel || '';
if (!modelConfig.trim()) {
return DEFAULT_MISTRAL_MODEL;
}
if (envVarRegex.test(modelConfig)) {
return extractEnvVariable(modelConfig) || DEFAULT_MISTRAL_MODEL;
}
return modelConfig.trim();
}
/**
* Determines document type based on file
*/
function getDocumentType(file: Express.Multer.File): 'image_url' | 'document_url' {
const mimetype = (file.mimetype || '').toLowerCase();
const originalname = file.originalname || '';
const isImage =
mimetype.startsWith('image') || /\.(png|jpe?g|gif|bmp|webp|tiff?)$/i.test(originalname);
return isImage ? 'image_url' : 'document_url';
}
/**
* Processes OCR result pages into aggregated text and images
*/
function processOCRResult(ocrResult: OCRResult): { text: string; images: string[] } {
let aggregatedText = '';
const images: string[] = [];
ocrResult.pages.forEach((page: OCRResultPage, index: number) => {
if (ocrResult.pages.length > 1) {
aggregatedText += `# PAGE ${index + 1}\n`;
}
aggregatedText += page.markdown + '\n\n';
if (!page.images || page.images.length === 0) {
return;
}
page.images.forEach((image: OCRImage) => {
if (image.image_base64) {
images.push(image.image_base64);
}
});
});
return { text: aggregatedText, images };
}
/**
* Creates an error message for OCR operations
*/
function createOCRError(error: unknown, baseMessage: string): Error {
const axiosError = error as AxiosError<MistralOCRError>;
const detail = axiosError?.response?.data?.detail;
const message = detail || baseMessage;
const responseMessage = axiosError?.response?.data?.message;
const errorLog = logAxiosError({ error: axiosError, message });
const fullMessage = responseMessage ? `${errorLog} - ${responseMessage}` : errorLog;
return new Error(fullMessage);
}
/**
* Uploads a file to the Mistral OCR API and processes the OCR result.
*
* @param params - The params object.
* @param params.req - The request object from Express. It should have a `user` property with an `id`
* representing the user
* @param params.file - The file object, which is part of the request. The file object should
* have a `mimetype` property that tells us the file type
* @param params.loadAuthValues - Function to load authentication values
* @returns - The result object containing the processed `text` and `images` (not currently used),
* along with the `filename` and `bytes` properties.
*/
export const uploadMistralOCR = async (context: OCRContext): Promise<MistralOCRUploadResult> => {
try {
const { apiKey, baseURL } = await loadAuthConfig(context);
const model = getModelConfig(context.req.app.locals?.ocr);
// Upload file
const mistralFile = await uploadDocumentToMistral({
filePath: context.file.path,
fileName: context.file.originalname,
apiKey,
baseURL,
});
// Get signed URL
const signedUrlResponse = await getSignedUrl({
apiKey,
baseURL,
fileId: mistralFile.id,
});
// Perform OCR
const documentType = getDocumentType(context.file);
const ocrResult = await performOCR({
apiKey,
baseURL,
model,
url: signedUrlResponse.url,
documentType,
});
// Process result
const { text, images } = processOCRResult(ocrResult);
return {
filename: context.file.originalname,
bytes: text.length * 4,
filepath: FileSources.mistral_ocr,
text,
images,
};
} catch (error) {
throw createOCRError(error, 'Error uploading document to Mistral OCR API');
}
};
/**
* Use Azure Mistral OCR API to processe the OCR result.
*
* @param params - The params object.
* @param params.req - The request object from Express. It should have a `user` property with an `id`
* representing the user
* @param params.file - The file object, which is part of the request. The file object should
* have a `mimetype` property that tells us the file type
* @param params.loadAuthValues - Function to load authentication values
* @returns - The result object containing the processed `text` and `images` (not currently used),
* along with the `filename` and `bytes` properties.
*/
export const uploadAzureMistralOCR = async (
context: OCRContext,
): Promise<MistralOCRUploadResult> => {
try {
const { apiKey, baseURL } = await loadAuthConfig(context);
const model = getModelConfig(context.req.app.locals?.ocr);
// Read file as base64
const buffer = fs.readFileSync(context.file.path);
const base64 = buffer.toString('base64');
// Perform OCR directly with base64
const documentType = getDocumentType(context.file);
const ocrResult = await performOCR({
apiKey,
baseURL,
model,
url: `data:image/jpeg;base64,${base64}`,
documentType,
});
// Process result
const { text, images } = processOCRResult(ocrResult);
return {
filename: context.file.originalname,
bytes: text.length * 4,
filepath: FileSources.azure_mistral_ocr,
text,
images,
};
} catch (error) {
throw createOCRError(error, 'Error uploading document to Azure Mistral OCR API');
}
};