👁️ feat: Azure Mistral OCR Strategy (#7888)

* 👁️ feat: Add Azure Mistral OCR strategy and endpoint integration

This commit introduces a new OCR strategy named 'azure_mistral_ocr', allowing the use of a Mistral OCR endpoint deployed on Azure. The configuration, schemas, and file upload strategies have been updated to support this integration, enabling seamless OCR processing via Azure-hosted Mistral services.

* 🗑️ chore: Clean up .gitignore by removing commented-out uncommon directory name

* chore: remove unused vars

* refactor: Move createAxiosInstance to packages/api/utils and update imports

- Removed the createAxiosInstance function from the config module and relocated it to a new utils module for better organization.
- Updated import paths in relevant files to reflect the new location of createAxiosInstance.
- Added tests for createAxiosInstance to ensure proper functionality and proxy configuration handling.

* chore: move axios helpers to packages/api

- Added logAxiosError function to @librechat/api for centralized error logging.
- Updated imports across various files to use the new logAxiosError function.
- Removed the old axios.js utility file as it is no longer needed.

* chore: Update Jest moduleNameMapper for improved path resolution

- Added a new mapping for '~/' to resolve module paths in Jest configuration, enhancing import handling for the project.

* feat: Implement Mistral OCR API integration in TS

* chore: Update MistralOCR tests based on new imports

* fix: Enhance MistralOCR configuration handling and tests

- Introduced helper functions for resolving configuration values from environment variables or hardcoded settings.
- Updated the uploadMistralOCR and uploadAzureMistralOCR functions to utilize the new configuration resolution logic.
- Improved test cases to ensure correct behavior when mixing environment variables and hardcoded values.
- Mocked file upload and signed URL responses in tests to validate functionality without external dependencies.

* feat: Enhance MistralOCR functionality with improved configuration and error handling

- Introduced helper functions for loading authentication configuration and resolving values from environment variables.
- Updated uploadMistralOCR and uploadAzureMistralOCR functions to utilize the new configuration logic.
- Added utility functions for processing OCR results and creating error messages.
- Improved document type determination and result aggregation for better OCR processing.

* refactor: Reorganize OCR type imports in Mistral CRUD file

- Moved OCRResult, OCRResultPage, and OCRImage imports to a more logical grouping for better readability and maintainability.

* feat: Add file exports to API and create files index

* chore: Update OCR types for enhanced structure and clarity

- Redesigned OCRImage interface to include mandatory fields and improved naming conventions.
- Added PageDimensions interface for better representation of page metrics.
- Updated OCRResultPage to include dimensions and mandatory images array.
- Refined OCRResult to include document annotation and usage information.

* refactor: use TS counterpart of uploadOCR methods

* ci: Update MistralOCR tests to reflect new OCR result structure

* chore: Bump version of @librechat/api to 1.2.3 in package.json and package-lock.json

* chore: Update CONFIG_VERSION to 1.2.8

* chore: remove unused sendEvent function from config module (now imported from '@librechat/api')

* chore: remove MistralOCR service files and tests (now in '@librechat/api')

* ci: update logger import in ModelService tests to use @librechat/data-schemas

---------

Co-authored-by: arthurolivierfortin <arthurolivier.fortin@gmail.com>
This commit is contained in:
Danny Avila 2025-06-13 15:14:57 -04:00 committed by GitHub
parent 46ff008b07
commit 5f2d1c5dc9
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
37 changed files with 2245 additions and 1235 deletions

View file

@ -5,6 +5,7 @@ export default {
testResultsProcessor: 'jest-junit',
moduleNameMapper: {
'^@src/(.*)$': '<rootDir>/src/$1',
'~/(.*)': '<rootDir>/src/$1',
},
// coverageThreshold: {
// global: {

View file

@ -1,6 +1,6 @@
{
"name": "@librechat/api",
"version": "1.2.2",
"version": "1.2.3",
"type": "commonjs",
"description": "MCP services for LibreChat",
"main": "dist/index.js",
@ -51,6 +51,7 @@
"@types/diff": "^6.0.0",
"@types/express": "^5.0.0",
"@types/jest": "^29.5.2",
"@types/multer": "^1.4.13",
"@types/node": "^20.3.0",
"@types/react": "^18.2.18",
"@types/winston": "^2.4.4",
@ -70,14 +71,15 @@
"peerDependencies": {
"@librechat/agents": "^2.4.37",
"@librechat/data-schemas": "*",
"librechat-data-provider": "*",
"@modelcontextprotocol/sdk": "^1.11.2",
"axios": "^1.8.2",
"diff": "^7.0.0",
"eventsource": "^3.0.2",
"express": "^4.21.2",
"node-fetch": "2.7.0",
"keyv": "^5.3.2",
"zod": "^3.22.4",
"tiktoken": "^1.0.15"
"librechat-data-provider": "*",
"node-fetch": "2.7.0",
"tiktoken": "^1.0.15",
"zod": "^3.22.4"
}
}

View file

@ -1,5 +1,6 @@
// rollup.config.js
import { readFileSync } from 'fs';
import json from '@rollup/plugin-json';
import terser from '@rollup/plugin-terser';
import replace from '@rollup/plugin-replace';
import commonjs from '@rollup/plugin-commonjs';
@ -29,6 +30,7 @@ const plugins = [
inlineSourceMap: true,
}),
terser(),
json(),
];
const cjsBuild = {

View file

@ -0,0 +1 @@
export * from './mistral/crud';

File diff suppressed because it is too large Load diff

View file

@ -0,0 +1,424 @@
import * as fs from 'fs';
import * as path from 'path';
import FormData from 'form-data';
import { logger } from '@librechat/data-schemas';
import {
FileSources,
envVarRegex,
extractEnvVariable,
extractVariableName,
} from 'librechat-data-provider';
import type { TCustomConfig } from 'librechat-data-provider';
import type { Request as ServerRequest } from 'express';
import type { AxiosError } from 'axios';
import type {
MistralFileUploadResponse,
MistralSignedUrlResponse,
MistralOCRUploadResult,
MistralOCRError,
OCRResultPage,
OCRResult,
OCRImage,
} from '~/types';
import { logAxiosError, createAxiosInstance } from '~/utils/axios';
/** Shared Axios instance used for every request in this module, built by `createAxiosInstance`. */
const axios = createAxiosInstance();
/** Base URL used when neither the caller nor the resolved OCR config provides one. */
const DEFAULT_MISTRAL_BASE_URL = 'https://api.mistral.ai/v1';
/** Model name used when the OCR config leaves `mistralModel` blank or unresolved. */
const DEFAULT_MISTRAL_MODEL = 'mistral-ocr-latest';
/** Resolved credentials for an OCR call, as returned by `loadAuthConfig`. */
interface AuthConfig {
/** Key sent as the `Authorization: Bearer …` header on OCR requests. */
apiKey: string;
/** Root URL that endpoint paths such as `/files` and `/ocr` are appended to. */
baseURL: string;
}
/**
 * Everything an OCR upload needs from its caller: the request (for the user id and
 * the app-level OCR config), the uploaded file, and a credential loader.
 */
interface OCRContext {
/** Express request narrowed to `user` and `app`; `app.locals.ocr` holds the OCR config read by this module. */
req: Pick<ServerRequest, 'user' | 'app'> & {
user?: { id: string };
app: {
locals?: {
ocr?: TCustomConfig['ocr'];
};
};
};
/** Multer file descriptor; this module reads its `path`, `originalname` and `mimetype`. */
file: Express.Multer.File;
/**
 * Looks up the values for the given `authFields` on behalf of `userId`.
 * `optional` names fields that are allowed to be absent.
 * NOTE(review): the behaviour for a missing non-optional field is defined by the
 * injected implementation, which is not visible here.
 */
loadAuthValues: (params: {
userId: string;
authFields: string[];
optional?: Set<string>;
}) => Promise<Record<string, string | undefined>>;
}
/**
 * Sends a file on disk to the Mistral `/files` endpoint with purpose `ocr`.
 * The file is attached as a read stream, so it is never buffered in full by this function.
 *
 * @param params - Upload parameters.
 * @param params.apiKey - Mistral API key, sent as a Bearer token.
 * @param params.filePath - Path of the file on disk.
 * @param params.baseURL - API base URL; defaults to `DEFAULT_MISTRAL_BASE_URL`.
 * @param params.fileName - Name reported to the API; defaults to the basename of `filePath`.
 * @returns The response body of the upload request.
 * @throws Whatever the underlying request rejects with, unchanged.
 */
export async function uploadDocumentToMistral({
  apiKey,
  filePath,
  baseURL = DEFAULT_MISTRAL_BASE_URL,
  fileName = '',
}: {
  apiKey: string;
  filePath: string;
  baseURL?: string;
  fileName?: string;
}): Promise<MistralFileUploadResponse> {
  const uploadName = fileName || path.basename(filePath);

  const body = new FormData();
  body.append('purpose', 'ocr');
  body.append('file', fs.createReadStream(filePath), { filename: uploadName });

  const response = await axios.post(`${baseURL}/files`, body, {
    headers: {
      Authorization: `Bearer ${apiKey}`,
      ...body.getHeaders(),
    },
    // Lift Axios' size caps so large documents are not rejected client-side.
    maxBodyLength: Infinity,
    maxContentLength: Infinity,
  });
  return response.data;
}
/**
 * Requests a signed URL for a previously uploaded file.
 *
 * @param params - Request parameters.
 * @param params.apiKey - Mistral API key, sent as a Bearer token.
 * @param params.fileId - Id of the uploaded file.
 * @param params.expiry - Value of the `expiry` query parameter; defaults to 24.
 * @param params.baseURL - API base URL; defaults to `DEFAULT_MISTRAL_BASE_URL`.
 * @returns The response body of the signed-URL request.
 * @throws The original request error, after its message has been logged.
 */
export async function getSignedUrl({
  apiKey,
  fileId,
  expiry = 24,
  baseURL = DEFAULT_MISTRAL_BASE_URL,
}: {
  apiKey: string;
  fileId: string;
  expiry?: number;
  baseURL?: string;
}): Promise<MistralSignedUrlResponse> {
  const endpoint = `${baseURL}/files/${fileId}/url?expiry=${expiry}`;
  try {
    const response = await axios.get(endpoint, {
      headers: {
        Authorization: `Bearer ${apiKey}`,
      },
    });
    return response.data;
  } catch (error) {
    logger.error('Error fetching signed URL:', (error as Error).message);
    throw error;
  }
}
/**
 * Posts an OCR request for a document or image URL to the `/ocr` endpoint.
 * Images are not requested back: `image_limit` is 0 and `include_image_base64` is false.
 *
 * @param params - Request parameters.
 * @param params.url - The document or image URL to process.
 * @param params.apiKey - API key, sent as a Bearer token.
 * @param params.model - OCR model name; defaults to `DEFAULT_MISTRAL_MODEL`.
 * @param params.baseURL - API base URL; defaults to `DEFAULT_MISTRAL_BASE_URL`.
 * @param params.documentType - `'document_url'` (default) or `'image_url'`; also selects
 * which key of the `document` payload carries the URL.
 * @returns The response body of the OCR request.
 * @throws The original request error, after its message has been logged.
 */
export async function performOCR({
  url,
  apiKey,
  model = DEFAULT_MISTRAL_MODEL,
  baseURL = DEFAULT_MISTRAL_BASE_URL,
  documentType = 'document_url',
}: {
  url: string;
  apiKey: string;
  model?: string;
  baseURL?: string;
  documentType?: 'document_url' | 'image_url';
}): Promise<OCRResult> {
  // Anything other than 'image_url' is sent under the 'document_url' key.
  const urlKey = documentType === 'image_url' ? 'image_url' : 'document_url';
  const payload = {
    model,
    image_limit: 0,
    include_image_base64: false,
    document: {
      type: documentType,
      [urlKey]: url,
    },
  };
  const requestConfig = {
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${apiKey}`,
    },
  };

  try {
    const response = await axios.post(`${baseURL}/ocr`, payload, requestConfig);
    return response.data;
  } catch (error) {
    logger.error('Error performing OCR:', (error as Error).message);
    throw error;
  }
}
/**
 * Tells whether a config value must be looked up rather than used as-is:
 * either it matches `envVarRegex` or it is blank.
 */
function needsEnvLoad(value: string): boolean {
  if (envVarRegex.test(value)) {
    return true;
  }
  return value.trim().length === 0;
}
/**
 * Picks the variable name to load for a config value: the name extracted from the
 * value when it matches `envVarRegex`, otherwise (or when extraction yields nothing)
 * `defaultName`.
 */
function getEnvVarName(configValue: string, defaultName: string): string {
  const referencesVariable = envVarRegex.test(configValue);
  return referencesVariable ? extractVariableName(configValue) || defaultName : defaultName;
}
/**
 * Turns a raw config value into its final value.
 *
 * @param configValue - Raw value from the OCR config (literal, variable reference, or blank).
 * @param defaultEnvName - Variable name to use when `configValue` does not name one.
 * @param authValues - Previously loaded values, keyed by variable name.
 * @param defaultValue - Fallback when the loaded value is missing or empty.
 * @returns The literal `configValue`, else the loaded value, else `defaultValue`, else `''`.
 */
async function resolveConfigValue(
  configValue: string,
  defaultEnvName: string,
  authValues: Record<string, string | undefined>,
  defaultValue?: string,
): Promise<string> {
  if (needsEnvLoad(configValue)) {
    // Blank or variable reference: look it up among the loaded auth values.
    const lookupKey = getEnvVarName(configValue, defaultEnvName);
    const loaded = authValues[lookupKey];
    if (loaded) {
      return loaded;
    }
    return defaultValue ? defaultValue : '';
  }
  // A literal value is used exactly as configured.
  return configValue;
}
/**
 * Resolves the API key and base URL for an OCR call from `req.app.locals.ocr`.
 *
 * Each of `apiKey` / `baseURL` may be a literal, a variable reference matching
 * `envVarRegex`, or blank. Literals are used directly; the rest are fetched through
 * `context.loadAuthValues` under the referenced name (or `OCR_API_KEY` / `OCR_BASEURL`
 * when blank).
 *
 * The base URL is always optional because it falls back to `DEFAULT_MISTRAL_BASE_URL`.
 * The variable name actually requested for it is therefore added to the `optional`
 * set, so a custom-named reference such as `${MY_OCR_URL}` is treated the same as the
 * default `OCR_BASEURL`.
 *
 * @param context - OCR context providing the request and the `loadAuthValues` loader.
 * @returns The resolved `apiKey` and `baseURL`.
 */
async function loadAuthConfig(context: OCRContext): Promise<AuthConfig> {
  const ocrConfig = context.req.app.locals?.ocr;
  const apiKeyConfig = ocrConfig?.apiKey || '';
  const baseURLConfig = ocrConfig?.baseURL || '';

  const loadApiKey = needsEnvLoad(apiKeyConfig);
  const loadBaseURL = needsEnvLoad(baseURLConfig);

  // Both values are literals: nothing to load.
  if (!loadApiKey && !loadBaseURL) {
    return {
      apiKey: apiKeyConfig,
      baseURL: baseURLConfig,
    };
  }

  const authFields: string[] = [];
  // 'OCR_BASEURL' is kept unconditionally for backward compatibility with callers
  // that inspect the set; the actually requested name is added below.
  const optional = new Set<string>(['OCR_BASEURL']);

  if (loadBaseURL) {
    const baseURLVarName = getEnvVarName(baseURLConfig, 'OCR_BASEURL');
    authFields.push(baseURLVarName);
    optional.add(baseURLVarName);
  }
  if (loadApiKey) {
    authFields.push(getEnvVarName(apiKeyConfig, 'OCR_API_KEY'));
  }

  const authValues = await context.loadAuthValues({
    userId: context.req.user?.id || '',
    authFields,
    optional,
  });

  const apiKey = await resolveConfigValue(apiKeyConfig, 'OCR_API_KEY', authValues);
  const baseURL = await resolveConfigValue(
    baseURLConfig,
    'OCR_BASEURL',
    authValues,
    DEFAULT_MISTRAL_BASE_URL,
  );
  return { apiKey, baseURL };
}
/**
 * Resolves the OCR model name from the config's `mistralModel`.
 *
 * @param ocrConfig - The OCR section of the custom config (may be undefined).
 * @returns `DEFAULT_MISTRAL_MODEL` when blank; the extracted variable value (or the
 * default when that is empty) when the value matches `envVarRegex`; otherwise the
 * trimmed literal.
 */
function getModelConfig(ocrConfig: TCustomConfig['ocr']): string {
  const configured = ocrConfig?.mistralModel || '';
  const literal = configured.trim();
  if (literal === '') {
    return DEFAULT_MISTRAL_MODEL;
  }
  if (!envVarRegex.test(configured)) {
    return literal;
  }
  return extractEnvVariable(configured) || DEFAULT_MISTRAL_MODEL;
}
/**
 * Classifies an uploaded file for the OCR request.
 *
 * @param file - The uploaded file; only `mimetype` and `originalname` are inspected.
 * @returns `'image_url'` when the lower-cased MIME type starts with `image` or the
 * original name ends in a known image extension; `'document_url'` otherwise.
 */
function getDocumentType(file: Express.Multer.File): 'image_url' | 'document_url' {
  const mime = (file.mimetype || '').toLowerCase();
  if (mime.startsWith('image')) {
    return 'image_url';
  }
  const name = file.originalname || '';
  if (/\.(png|jpe?g|gif|bmp|webp|tiff?)$/i.test(name)) {
    return 'image_url';
  }
  return 'document_url';
}
/**
 * Flattens an OCR result into one text blob plus a list of base64 images.
 *
 * Each page contributes its markdown followed by a blank line. When the result has
 * more than one page, every page is preceded by a `# PAGE <n>` heading (1-based).
 * Images without `image_base64` are ignored.
 *
 * @param ocrResult - The OCR response body.
 * @returns The aggregated `text` and the collected `images`.
 */
function processOCRResult(ocrResult: OCRResult): { text: string; images: string[] } {
  const images: string[] = [];
  const hasMultiplePages = ocrResult.pages.length > 1;

  const sections = ocrResult.pages.map((page: OCRResultPage, pageIndex: number) => {
    (page.images || []).forEach((image: OCRImage) => {
      if (image.image_base64) {
        images.push(image.image_base64);
      }
    });
    const heading = hasMultiplePages ? `# PAGE ${pageIndex + 1}\n` : '';
    return heading + page.markdown + '\n\n';
  });

  return { text: sections.join(''), images };
}
/**
 * Converts a caught failure into an `Error` with a descriptive message, logging it
 * through `logAxiosError` on the way.
 *
 * @param error - The caught value; treated as an Axios error carrying a `MistralOCRError` body.
 * @param baseMessage - Message used when the response body has no `detail`.
 * @returns An `Error` whose message is the logged message, suffixed with ` - <message>`
 * when the response body carries a `message`.
 */
function createOCRError(error: unknown, baseMessage: string): Error {
  const axiosError = error as AxiosError<MistralOCRError>;
  const body = axiosError?.response?.data;
  const logged = logAxiosError({ error: axiosError, message: body?.detail || baseMessage });
  const serverMessage = body?.message;
  if (serverMessage) {
    return new Error(`${logged} - ${serverMessage}`);
  }
  return new Error(logged);
}
/**
 * Runs the full Mistral OCR flow for an uploaded file: upload the file, obtain a
 * signed URL for it, run OCR on that URL, then aggregate the pages.
 *
 * @param context - The OCR context.
 * @param context.req - Express request; `user.id` identifies the user and
 * `app.locals.ocr` supplies the OCR config.
 * @param context.file - The uploaded file; its `path`, `originalname` and `mimetype` are used.
 * @param context.loadAuthValues - Function used to load authentication values.
 * @returns The aggregated `text` and `images`, together with `filename`, an estimated
 * `bytes` figure (text length × 4) and `filepath` set to `FileSources.mistral_ocr`.
 * @throws An `Error` built by `createOCRError` when any step fails.
 */
export const uploadMistralOCR = async (context: OCRContext): Promise<MistralOCRUploadResult> => {
  try {
    const { apiKey, baseURL } = await loadAuthConfig(context);
    const { file } = context;
    const model = getModelConfig(context.req.app.locals?.ocr);

    // Step 1: push the file to the files endpoint.
    const uploaded = await uploadDocumentToMistral({
      filePath: file.path,
      fileName: file.originalname,
      apiKey,
      baseURL,
    });

    // Step 2: exchange the file id for a signed URL.
    const signed = await getSignedUrl({ apiKey, baseURL, fileId: uploaded.id });

    // Step 3: run OCR against the signed URL.
    const ocrResult = await performOCR({
      apiKey,
      baseURL,
      model,
      url: signed.url,
      documentType: getDocumentType(file),
    });

    // Step 4: collapse the pages into a single result.
    const aggregated = processOCRResult(ocrResult);
    return {
      filename: file.originalname,
      bytes: aggregated.text.length * 4,
      filepath: FileSources.mistral_ocr,
      text: aggregated.text,
      images: aggregated.images,
    };
  } catch (error) {
    throw createOCRError(error, 'Error uploading document to Mistral OCR API');
  }
};
/**
 * Uses the Azure Mistral OCR API to process a file and aggregate the OCR result.
 *
 * Unlike `uploadMistralOCR`, no file is uploaded first: the file is read from disk
 * and embedded in the request as a base64 `data:` URL.
 *
 * @param context - The OCR context.
 * @param context.req - Express request; `user.id` identifies the user and
 * `app.locals.ocr` supplies the OCR config.
 * @param context.file - The uploaded file; its `path`, `originalname` and `mimetype` are used.
 * @param context.loadAuthValues - Function used to load authentication values.
 * @returns The aggregated `text` and `images`, together with `filename`, an estimated
 * `bytes` figure (text length × 4) and `filepath` set to `FileSources.azure_mistral_ocr`.
 * @throws An `Error` built by `createOCRError` when any step fails.
 */
export const uploadAzureMistralOCR = async (
  context: OCRContext,
): Promise<MistralOCRUploadResult> => {
  try {
    const { apiKey, baseURL } = await loadAuthConfig(context);
    const model = getModelConfig(context.req.app.locals?.ocr);

    // Read file as base64.
    // NOTE(review): this is a synchronous read inside an async handler; consider
    // `fs.promises.readFile` if large uploads are expected.
    const buffer = fs.readFileSync(context.file.path);
    const base64 = buffer.toString('base64');

    // Label the payload with the file's own MIME type so that PDFs and other
    // documents are not mis-declared as JPEG images. 'image/jpeg' (the previously
    // hard-coded value) remains the fallback when no MIME type is available.
    const mimeType = context.file.mimetype || 'image/jpeg';

    // Perform OCR directly with the base64 data URL.
    const documentType = getDocumentType(context.file);
    const ocrResult = await performOCR({
      apiKey,
      baseURL,
      model,
      url: `data:${mimeType};base64,${base64}`,
      documentType,
    });

    // Process result
    const { text, images } = processOCRResult(ocrResult);
    return {
      filename: context.file.originalname,
      bytes: text.length * 4,
      filepath: FileSources.azure_mistral_ocr,
      text,
      images,
    };
  } catch (error) {
    throw createOCRError(error, 'Error uploading document to Azure Mistral OCR API');
  }
};

View file

@ -9,6 +9,8 @@ export * from './flow/manager';
export * from './agents';
/* Endpoints */
export * from './endpoints';
/* Files */
export * from './files';
/* types */
export type * from './mcp/types';
export type * from './flow/types';

View file

@ -88,7 +88,7 @@ export class MCPConnection extends EventEmitter {
this.client = new Client(
{
name: '@librechat/api-client',
version: '1.2.2',
version: '1.2.3',
},
{
capabilities: {},

View file

@ -1,4 +1,5 @@
export * from './azure';
export * from './events';
export * from './mistral';
export * from './openai';
export * from './run';

View file

@ -0,0 +1,82 @@
/**
* Mistral OCR API Types
* Based on https://docs.mistral.ai/api/#tag/ocr/operation/ocr_v1_ocr_post
*/
/**
 * Response body of the file upload request (`POST {baseURL}/files`), as returned by
 * `uploadDocumentToMistral`.
 */
export interface MistralFileUploadResponse {
/** File identifier; `uploadMistralOCR` passes it to `getSignedUrl` as `fileId`. */
id: string;
object: string;
/** NOTE(review): presumably the stored file size in bytes — confirm against the Mistral API docs. */
bytes: number;
/** NOTE(review): presumably a creation timestamp; unit not shown here — confirm. */
created_at: number;
filename: string;
/** NOTE(review): presumably echoes the `purpose` form field (`'ocr'` for uploads made here) — confirm. */
purpose: string;
}
/** Response body of the signed-URL request, as returned by `getSignedUrl`. */
export interface MistralSignedUrlResponse {
/** Signed URL; `uploadMistralOCR` passes it to `performOCR` as the document/image URL. */
url: string;
/** NOTE(review): presumably the expiry timestamp of `url`; unit not shown here — confirm. */
expires_at: number;
}
/** An image entry attached to an OCR result page. */
export interface OCRImage {
id: string;
/* NOTE(review): the four fields below look like a bounding box for the image on its page; units/origin are not shown here — confirm against the Mistral API docs. */
top_left_x: number;
top_left_y: number;
bottom_right_x: number;
bottom_right_y: number;
/** Base64 image payload; `processOCRResult` collects it only when it is non-empty. */
image_base64: string;
image_annotation?: string;
}
/**
 * Page metrics attached to each `OCRResultPage`.
 * NOTE(review): units of `height`/`width` are not shown here — confirm against the Mistral API docs.
 */
export interface PageDimensions {
dpi: number;
height: number;
width: number;
}
/** One page of an OCR result. */
export interface OCRResultPage {
/** NOTE(review): page index as reported by the API; `processOCRResult` numbers pages by array position instead. */
index: number;
/** Page content as markdown; concatenated into the aggregated text by `processOCRResult`. */
markdown: string;
/** Images found on the page; `processOCRResult` tolerates this being missing or empty at runtime. */
images: OCRImage[];
dimensions: PageDimensions;
}
/** Usage figures attached to an `OCRResult` (`usage_info`). */
export interface OCRUsageInfo {
pages_processed: number;
doc_size_bytes: number;
}
/** Response body of the OCR request, as returned by `performOCR`. */
export interface OCRResult {
/** Per-page results; iterated in order by `processOCRResult`. */
pages: OCRResultPage[];
model: string;
document_annotation?: string | null;
usage_info: OCRUsageInfo;
}
/**
 * Shape of an OCR request body. It mirrors the payload built in `performOCR`:
 * `model`, `image_limit`, `include_image_base64`, and a `document` object.
 */
export interface MistralOCRRequest {
model: string;
image_limit?: number;
include_image_base64?: boolean;
document: {
/** Selects which of the two URL fields below is populated. */
type: 'document_url' | 'image_url';
document_url?: string;
image_url?: string;
};
}
/** Error response body of an OCR-related request; every field is optional. */
export interface MistralOCRError {
/** When present, `createOCRError` uses it in place of its base message. */
detail?: string;
/** When present, `createOCRError` appends it to the final error message. */
message?: string;
/** Nested error object; not read by `createOCRError`. */
error?: {
message?: string;
type?: string;
code?: string;
};
}
/** Result returned by `uploadMistralOCR` and `uploadAzureMistralOCR`. */
export interface MistralOCRUploadResult {
/** Original name of the uploaded file. */
filename: string;
/** Size estimate; both upload functions set it to `text.length * 4`. */
bytes: number;
/** The OCR source marker (`FileSources.mistral_ocr` or `FileSources.azure_mistral_ocr`), not a disk path. */
filepath: string;
/** Aggregated markdown text of all pages. */
text: string;
/** Base64 images collected from the pages. */
images: string[];
}

View file

@ -0,0 +1,131 @@
import axios from 'axios';
import { createAxiosInstance } from './axios';
/*
 * Replaces the `axios` module with a hand-written stub.
 * Note: `create` uses `mockReturnValue` with a single object literal, so every call to
 * `axios.create()` returns the SAME instance object; state written to it (such as
 * `defaults.proxy`) is shared by all callers until it is explicitly reset.
 */
jest.mock('axios', () => ({
interceptors: {
request: { use: jest.fn(), eject: jest.fn() },
response: { use: jest.fn(), eject: jest.fn() },
},
// Instance stub returned by `axios.create()`; `defaults.proxy` starts out as null.
create: jest.fn().mockReturnValue({
defaults: {
proxy: null,
},
get: jest.fn().mockResolvedValue({ data: {} }),
post: jest.fn().mockResolvedValue({ data: {} }),
put: jest.fn().mockResolvedValue({ data: {} }),
delete: jest.fn().mockResolvedValue({ data: {} }),
}),
// Module-level HTTP verbs, each resolving to an empty `data` object.
get: jest.fn().mockResolvedValue({ data: {} }),
post: jest.fn().mockResolvedValue({ data: {} }),
put: jest.fn().mockResolvedValue({ data: {} }),
delete: jest.fn().mockResolvedValue({ data: {} }),
// Helper that clears the recorded calls of the module-level mocks; relies on `this` being the stub itself.
reset: jest.fn().mockImplementation(function (this: {
get: jest.Mock;
post: jest.Mock;
put: jest.Mock;
delete: jest.Mock;
create: jest.Mock;
}) {
this.get.mockClear();
this.post.mockClear();
this.put.mockClear();
this.delete.mockClear();
this.create.mockClear();
}),
}));
/**
 * Tests for `createAxiosInstance`: proxy configuration derived from `process.env.proxy`.
 */
describe('createAxiosInstance', () => {
  const originalEnv = process.env;

  beforeEach(() => {
    // The mocked `axios.create` returns one shared object for every call, so a proxy
    // written by a previous test would still be present here. Reset it so that each
    // test starts from `proxy: null` regardless of execution order.
    const sharedInstance = (axios.create as jest.Mock)();
    sharedInstance.defaults.proxy = null;

    // Reset mocks (this also discards the `create` call made just above)
    jest.clearAllMocks();
    // Create a clean copy of process.env
    process.env = { ...originalEnv };
    // Default: no proxy
    delete process.env.proxy;
  });

  afterAll(() => {
    // Restore original process.env
    process.env = originalEnv;
  });

  test('creates an axios instance without proxy when no proxy env is set', () => {
    const instance = createAxiosInstance();
    expect(axios.create).toHaveBeenCalledTimes(1);
    expect(instance.defaults.proxy).toBeNull();
  });

  test('configures proxy correctly with hostname and protocol', () => {
    process.env.proxy = 'http://example.com';
    const instance = createAxiosInstance();
    expect(axios.create).toHaveBeenCalledTimes(1);
    expect(instance.defaults.proxy).toEqual({
      host: 'example.com',
      protocol: 'http',
    });
  });

  test('configures proxy correctly with hostname, protocol and port', () => {
    process.env.proxy = 'https://proxy.example.com:8080';
    const instance = createAxiosInstance();
    expect(axios.create).toHaveBeenCalledTimes(1);
    expect(instance.defaults.proxy).toEqual({
      host: 'proxy.example.com',
      protocol: 'https',
      port: 8080,
    });
  });

  test('handles proxy URLs with authentication', () => {
    process.env.proxy = 'http://user:pass@proxy.example.com:3128';
    const instance = createAxiosInstance();
    expect(axios.create).toHaveBeenCalledTimes(1);
    expect(instance.defaults.proxy).toEqual({
      host: 'proxy.example.com',
      protocol: 'http',
      port: 3128,
      // Note: The current implementation doesn't handle auth - if needed, add this functionality
    });
  });

  test('throws error when proxy URL is invalid', () => {
    process.env.proxy = 'invalid-url';
    expect(() => createAxiosInstance()).toThrow('Invalid proxy URL');
    // The instance is created before the proxy URL is parsed.
    expect(axios.create).toHaveBeenCalledTimes(1);
  });

  // If you want to test the actual URL parsing more thoroughly
  test('handles edge case proxy URLs correctly', () => {
    // IPv6 address
    process.env.proxy = 'http://[::1]:8080';
    let instance = createAxiosInstance();
    expect(instance.defaults.proxy).toEqual({
      host: '::1',
      protocol: 'http',
      port: 8080,
    });

    // URL with path (which should be ignored for proxy config)
    process.env.proxy = 'http://proxy.example.com:8080/some/path';
    instance = createAxiosInstance();
    expect(instance.defaults.proxy).toEqual({
      host: 'proxy.example.com',
      protocol: 'http',
      port: 8080,
    });
  });
});

View file

@ -0,0 +1,77 @@
import axios from 'axios';
import { logger } from '@librechat/data-schemas';
import type { AxiosInstance, AxiosProxyConfig, AxiosError } from 'axios';
/**
 * Logs an Axios failure with context appropriate to how far the request got, and
 * returns the message that was logged.
 *
 * Cases, in order: a response with a status was received; a request was made but no
 * response arrived; the error message indicates a failed read of `status`; anything else.
 * If logging itself throws, that secondary failure is logged and reported instead.
 *
 * @param options - The options object.
 * @param options.message - Caller-supplied prefix for the log message.
 * @param options.error - The Axios error to describe.
 * @returns The final log message.
 */
export const logAxiosError = ({ message, error }: { message: string; error: AxiosError }) => {
  let logMessage = message;
  try {
    const stack = error.stack || 'No stack trace available';
    let details: Record<string, unknown>;

    if (error.response?.status) {
      const { status, headers, data } = error.response;
      logMessage = `${message} The server responded with status ${status}: ${error.message}`;
      details = { status, headers, data, stack };
    } else if (error.request) {
      const { method, url } = error.config || {};
      logMessage = `${message} No response received for ${method ? method.toUpperCase() : ''} ${url || ''}: ${error.message}`;
      details = { requestInfo: { method, url }, stack };
    } else if (error?.message?.includes("Cannot read properties of undefined (reading 'status')")) {
      logMessage = `${message} It appears the request timed out or was unsuccessful: ${error.message}`;
      details = { stack };
    } else {
      logMessage = `${message} An error occurred while setting up the request: ${error.message}`;
      details = { stack };
    }

    logger.error(logMessage, details);
  } catch (err: unknown) {
    const failure = err as Error;
    logMessage = `Error in logAxiosError: ${failure.message}`;
    logger.error(logMessage, { stack: failure.stack || 'No stack trace available' });
  }
  return logMessage;
};
/**
 * Builds an Axios instance and, when `process.env.proxy` is set, configures the
 * instance's default proxy from that URL (host, protocol, and port if present).
 * Brackets around an IPv6 host are removed; credentials and path in the URL are not used.
 *
 * @returns The configured Axios instance.
 * @throws `Error('Invalid proxy URL: …')` when the proxy value cannot be applied.
 */
export function createAxiosInstance(): AxiosInstance {
  const instance = axios.create();
  const proxyUrl = process.env.proxy;
  if (!proxyUrl) {
    return instance;
  }

  try {
    const { hostname, protocol, port } = new URL(proxyUrl);
    const proxyConfig: Partial<AxiosProxyConfig> = {
      host: hostname.replace(/^\[|\]$/g, ''),
      protocol: protocol.replace(':', ''),
    };
    if (port) {
      proxyConfig.port = parseInt(port, 10);
    }
    instance.defaults.proxy = proxyConfig as AxiosProxyConfig;
  } catch (error) {
    console.error('Error parsing proxy URL:', error);
    throw new Error(`Invalid proxy URL: ${proxyUrl}`);
  }
  return instance;
}

View file

@ -1,3 +1,4 @@
export * from './axios';
export * from './azure';
export * from './common';
export * from './events';