👁️ feat: Azure Mistral OCR Strategy (#7888)

* 👁️ feat: Add Azure Mistral OCR strategy and endpoint integration This commit introduces a new OCR strategy named 'azure_mistral_ocr', allowing the use of a Mistral OCR endpoint deployed on Azure. The configuration, schemas, and file upload strategies have been updated to support this integration, enabling seamless OCR processing via Azure-hosted Mistral services. * 🗑️ chore: Clean up .gitignore by removing commented-out uncommon directory name * chore: remove unused vars * refactor: Move createAxiosInstance to packages/api/utils and update imports - Removed the createAxiosInstance function from the config module and relocated it to a new utils module for better organization. - Updated import paths in relevant files to reflect the new location of createAxiosInstance. - Added tests for createAxiosInstance to ensure proper functionality and proxy configuration handling. * chore: move axios helpers to packages/api - Added logAxiosError function to @librechat/api for centralized error logging. - Updated imports across various files to use the new logAxiosError function. - Removed the old axios.js utility file as it is no longer needed. * chore: Update Jest moduleNameMapper for improved path resolution - Added a new mapping for '~/' to resolve module paths in Jest configuration, enhancing import handling for the project. * feat: Implement Mistral OCR API integration in TS * chore: Update MistralOCR tests based on new imports * fix: Enhance MistralOCR configuration handling and tests - Introduced helper functions for resolving configuration values from environment variables or hardcoded settings. - Updated the uploadMistralOCR and uploadAzureMistralOCR functions to utilize the new configuration resolution logic. - Improved test cases to ensure correct behavior when mixing environment variables and hardcoded values. - Mocked file upload and signed URL responses in tests to validate functionality without external dependencies. 
* feat: Enhance MistralOCR functionality with improved configuration and error handling - Introduced helper functions for loading authentication configuration and resolving values from environment variables. - Updated uploadMistralOCR and uploadAzureMistralOCR functions to utilize the new configuration logic. - Added utility functions for processing OCR results and creating error messages. - Improved document type determination and result aggregation for better OCR processing. * refactor: Reorganize OCR type imports in Mistral CRUD file - Moved OCRResult, OCRResultPage, and OCRImage imports to a more logical grouping for better readability and maintainability. * feat: Add file exports to API and create files index * chore: Update OCR types for enhanced structure and clarity - Redesigned OCRImage interface to include mandatory fields and improved naming conventions. - Added PageDimensions interface for better representation of page metrics. - Updated OCRResultPage to include dimensions and mandatory images array. - Refined OCRResult to include document annotation and usage information. * refactor: use TS counterpart of uploadOCR methods * ci: Update MistralOCR tests to reflect new OCR result structure * chore: Bump version of @librechat/api to 1.2.3 in package.json and package-lock.json * chore: Update CONFIG_VERSION to 1.2.8 * chore: remove unused sendEvent function from config module (now imported from '@librechat/api') * chore: remove MistralOCR service files and tests (now in '@librechat/api') * ci: update logger import in ModelService tests to use @librechat/data-schemas --------- Co-authored-by: arthurolivierfortin <arthurolivier.fortin@gmail.com>
2026-02-13 21:14:24 +01:00 · 2025-06-13 15:14:57 -04:00 · 2025-06-13 15:14:57 -04:00 · 5f2d1c5dc9
commit 5f2d1c5dc9
parent 46ff008b07
37 changed files with 2245 additions and 1235 deletions
--- a/packages/api/src/files/index.ts
+++ b/packages/api/src/files/index.ts
@ -0,0 +1 @@
+export * from './mistral/crud';
--- a/packages/api/src/files/mistral/crud.spec.ts
+++ b/packages/api/src/files/mistral/crud.spec.ts
--- a/packages/api/src/files/mistral/crud.ts
+++ b/packages/api/src/files/mistral/crud.ts
@ -0,0 +1,424 @@
+import * as fs from 'fs';
+import * as path from 'path';
+import FormData from 'form-data';
+import { logger } from '@librechat/data-schemas';
+import {
+  FileSources,
+  envVarRegex,
+  extractEnvVariable,
+  extractVariableName,
+} from 'librechat-data-provider';
+import type { TCustomConfig } from 'librechat-data-provider';
+import type { Request as ServerRequest } from 'express';
+import type { AxiosError } from 'axios';
+import type {
+  MistralFileUploadResponse,
+  MistralSignedUrlResponse,
+  MistralOCRUploadResult,
+  MistralOCRError,
+  OCRResultPage,
+  OCRResult,
+  OCRImage,
+} from '~/types';
+import { logAxiosError, createAxiosInstance } from '~/utils/axios';
+
+/** HTTP client shared by every request in this module, built by the project's axios factory. */
+const axios = createAxiosInstance();
+/** Base URL used when neither the caller nor the OCR config supplies one. */
+const DEFAULT_MISTRAL_BASE_URL = 'https://api.mistral.ai/v1';
+/** OCR model name used when neither the caller nor the OCR config supplies one. */
+const DEFAULT_MISTRAL_MODEL = 'mistral-ocr-latest';
+
+/** Resolved credentials and endpoint used for the OCR requests made by this module. */
+interface AuthConfig {
+  /** Key sent as the `Authorization: Bearer` token. */
+  apiKey: string;
+  /** Root URL that request paths such as `/files` and `/ocr` are appended to. */
+  baseURL: string;
+}
+
+/**
+ * Everything an OCR upload needs from its caller: the request (for the user id and the
+ * OCR section of the app config), the uploaded file, and a loader for auth values.
+ */
+interface OCRContext {
+  /** Narrowed Express request; only `user` and `app` are read by this module. */
+  req: Pick<ServerRequest, 'user' | 'app'> & {
+    /** Authenticated user; its `id` is forwarded to `loadAuthValues` (empty string when absent). */
+    user?: { id: string };
+    app: {
+      locals?: {
+        /** OCR section of the custom config; `apiKey`, `baseURL` and `mistralModel` are read from it. */
+        ocr?: TCustomConfig['ocr'];
+      };
+    };
+  };
+  /** Uploaded file; `path`, `originalname` and `mimetype` are read by this module. */
+  file: Express.Multer.File;
+  /**
+   * Loads the values for the named auth fields on behalf of a user.
+   * NOTE(review): how `optional` fields are treated when missing is decided by the
+   * implementation supplied by the caller, not by this module.
+   */
+  loadAuthValues: (params: {
+    userId: string;
+    authFields: string[];
+    optional?: Set<string>;
+  }) => Promise<Record<string, string | undefined>>;
+}
+
+/**
+ * Sends a file from disk to the `/files` endpoint as a multipart form with purpose `ocr`.
+ * The file is attached as a read stream rather than being buffered in memory first.
+ *
+ * @param params - Upload parameters.
+ * @param params.apiKey - Key sent as the bearer token.
+ * @param params.filePath - Path of the file on disk.
+ * @param params.baseURL - API root; defaults to the public Mistral endpoint.
+ * @param params.fileName - Name reported for the file; falls back to the basename of `filePath`.
+ * @returns The response body of the upload request.
+ * @throws Whatever the HTTP client rejects with; the error is propagated unchanged.
+ */
+export async function uploadDocumentToMistral({
+  apiKey,
+  filePath,
+  baseURL = DEFAULT_MISTRAL_BASE_URL,
+  fileName = '',
+}: {
+  apiKey: string;
+  filePath: string;
+  baseURL?: string;
+  fileName?: string;
+}): Promise<MistralFileUploadResponse> {
+  const payload = new FormData();
+  payload.append('purpose', 'ocr');
+  const uploadName = fileName || path.basename(filePath);
+  payload.append('file', fs.createReadStream(filePath), { filename: uploadName });
+
+  // Body/content limits are lifted so the upload size is not capped by the HTTP client.
+  const response = await axios.post(`${baseURL}/files`, payload, {
+    headers: {
+      Authorization: `Bearer ${apiKey}`,
+      ...payload.getHeaders(),
+    },
+    maxBodyLength: Infinity,
+    maxContentLength: Infinity,
+  });
+  return response.data;
+}
+
+/**
+ * Requests a signed URL for a previously uploaded file.
+ *
+ * @param params - Request parameters.
+ * @param params.apiKey - Key sent as the bearer token.
+ * @param params.fileId - Identifier of the uploaded file.
+ * @param params.expiry - Value forwarded as the `expiry` query parameter; defaults to 24
+ *   (presumably hours, per the Mistral API — confirm).
+ * @param params.baseURL - API root; defaults to the public Mistral endpoint.
+ * @returns The response body of the signed-URL request.
+ * @throws The original request error, after its message has been logged.
+ */
+export async function getSignedUrl({
+  apiKey,
+  fileId,
+  expiry = 24,
+  baseURL = DEFAULT_MISTRAL_BASE_URL,
+}: {
+  apiKey: string;
+  fileId: string;
+  expiry?: number;
+  baseURL?: string;
+}): Promise<MistralSignedUrlResponse> {
+  const endpoint = `${baseURL}/files/${fileId}/url?expiry=${expiry}`;
+  try {
+    const response = await axios.get(endpoint, {
+      headers: {
+        Authorization: `Bearer ${apiKey}`,
+      },
+    });
+    return response.data;
+  } catch (error) {
+    logger.error('Error fetching signed URL:', (error as Error).message);
+    throw error;
+  }
+}
+
+/**
+ * Posts an OCR request for a document or image URL to the `/ocr` endpoint.
+ * Images are not requested back: the body always sends `image_limit: 0` and
+ * `include_image_base64: false`.
+ *
+ * @param params - Request parameters.
+ * @param params.url - The document or image URL to process.
+ * @param params.apiKey - Key sent as the bearer token.
+ * @param params.model - OCR model name; defaults to the module's default model.
+ * @param params.baseURL - API root; defaults to the public Mistral endpoint.
+ * @param params.documentType - `'document_url'` (default) or `'image_url'`; also selects the
+ *   property name under which `url` is sent.
+ * @returns The response body of the OCR request.
+ * @throws The original request error, after its message has been logged.
+ */
+export async function performOCR({
+  url,
+  apiKey,
+  model = DEFAULT_MISTRAL_MODEL,
+  baseURL = DEFAULT_MISTRAL_BASE_URL,
+  documentType = 'document_url',
+}: {
+  url: string;
+  apiKey: string;
+  model?: string;
+  baseURL?: string;
+  documentType?: 'document_url' | 'image_url';
+}): Promise<OCRResult> {
+  // The URL goes under a property named after the document type.
+  const urlField = documentType === 'image_url' ? 'image_url' : 'document_url';
+  const requestBody = {
+    model,
+    image_limit: 0,
+    include_image_base64: false,
+    document: {
+      type: documentType,
+      [urlField]: url,
+    },
+  };
+  const requestOptions = {
+    headers: {
+      'Content-Type': 'application/json',
+      Authorization: `Bearer ${apiKey}`,
+    },
+  };
+
+  try {
+    const response = await axios.post(`${baseURL}/ocr`, requestBody, requestOptions);
+    return response.data;
+  } catch (error) {
+    logger.error('Error performing OCR:', (error as Error).message);
+    throw error;
+  }
+}
+
+/**
+ * Tells whether a config value must be looked up rather than used literally:
+ * true when it matches the env-variable pattern or is blank.
+ */
+function needsEnvLoad(value: string): boolean {
+  if (envVarRegex.test(value)) {
+    return true;
+  }
+  return value.trim().length === 0;
+}
+
+/**
+ * Picks the variable name to look up for a config value: the name extracted from an
+ * env-variable reference, or `defaultName` when the value is not such a reference or
+ * no name can be extracted.
+ */
+function getEnvVarName(configValue: string, defaultName: string): string {
+  const isEnvReference = envVarRegex.test(configValue);
+  return isEnvReference ? extractVariableName(configValue) || defaultName : defaultName;
+}
+
+/**
+ * Produces the final value for one config entry.
+ *
+ * @param configValue - Raw value from the OCR config.
+ * @param defaultEnvName - Variable name to look up when `configValue` names none.
+ * @param authValues - Previously loaded auth values, keyed by variable name.
+ * @param defaultValue - Fallback used when the lookup yields nothing.
+ * @returns `configValue` itself when it is a literal; otherwise the loaded value,
+ *   then `defaultValue`, then the empty string.
+ */
+async function resolveConfigValue(
+  configValue: string,
+  defaultEnvName: string,
+  authValues: Record<string, string | undefined>,
+  defaultValue?: string,
+): Promise<string> {
+  if (needsEnvLoad(configValue)) {
+    const lookupKey = getEnvVarName(configValue, defaultEnvName);
+    const loadedValue = authValues[lookupKey];
+    return loadedValue || defaultValue || '';
+  }
+
+  // Literal (non-blank, non-reference) values are used exactly as configured.
+  return configValue;
+}
+
+/**
+ * Resolves the API key and base URL for an OCR request from the OCR config.
+ * Literal config values are used as-is; blank values and env-variable references are
+ * resolved through `context.loadAuthValues`. The base URL falls back to the default
+ * Mistral endpoint when the lookup yields nothing.
+ *
+ * @param context - OCR request context supplying the config, the user and the loader.
+ * @returns The resolved `apiKey` and `baseURL`.
+ */
+async function loadAuthConfig(context: OCRContext): Promise<AuthConfig> {
+  const ocrSettings = context.req.app.locals?.ocr;
+  const rawApiKey = ocrSettings?.apiKey || '';
+  const rawBaseURL = ocrSettings?.baseURL || '';
+
+  const apiKeyFromEnv = needsEnvLoad(rawApiKey);
+  const baseURLFromEnv = needsEnvLoad(rawBaseURL);
+
+  // Nothing to look up: both values are literals.
+  if (!apiKeyFromEnv && !baseURLFromEnv) {
+    return { apiKey: rawApiKey, baseURL: rawBaseURL };
+  }
+
+  // Only the values that need a lookup are requested, base URL first.
+  const authFields: string[] = [];
+  if (baseURLFromEnv) {
+    authFields.push(getEnvVarName(rawBaseURL, 'OCR_BASEURL'));
+  }
+  if (apiKeyFromEnv) {
+    authFields.push(getEnvVarName(rawApiKey, 'OCR_API_KEY'));
+  }
+
+  // NOTE(review): only the literal name 'OCR_BASEURL' is marked optional; a base URL
+  // configured through a differently named variable is not in this set — confirm intended.
+  const authValues = await context.loadAuthValues({
+    userId: context.req.user?.id || '',
+    authFields,
+    optional: new Set(['OCR_BASEURL']),
+  });
+
+  const apiKey = await resolveConfigValue(rawApiKey, 'OCR_API_KEY', authValues);
+  const baseURL = await resolveConfigValue(
+    rawBaseURL,
+    'OCR_BASEURL',
+    authValues,
+    DEFAULT_MISTRAL_BASE_URL,
+  );
+
+  return { apiKey, baseURL };
+}
+
+/**
+ * Chooses the OCR model name from the OCR config.
+ *
+ * @param ocrConfig - OCR section of the custom config; may be undefined.
+ * @returns The default model when `mistralModel` is blank, the extracted env value (or the
+ *   default when that is empty) when it is an env-variable reference, and otherwise the
+ *   trimmed configured value.
+ */
+function getModelConfig(ocrConfig: TCustomConfig['ocr']): string {
+  const configured = ocrConfig?.mistralModel || '';
+  const trimmed = configured.trim();
+
+  if (trimmed === '') {
+    return DEFAULT_MISTRAL_MODEL;
+  }
+
+  return envVarRegex.test(configured)
+    ? extractEnvVariable(configured) || DEFAULT_MISTRAL_MODEL
+    : trimmed;
+}
+
+/**
+ * Classifies an uploaded file for the OCR request.
+ *
+ * @param file - Uploaded file; `mimetype` and `originalname` are inspected.
+ * @returns `'image_url'` when the mimetype starts with `image` (case-insensitive) or the
+ *   original name ends in a known image extension; `'document_url'` otherwise.
+ */
+function getDocumentType(file: Express.Multer.File): 'image_url' | 'document_url' {
+  const mime = (file.mimetype || '').toLowerCase();
+  if (mime.startsWith('image')) {
+    return 'image_url';
+  }
+
+  const hasImageExtension = /\.(png|jpe?g|gif|bmp|webp|tiff?)$/i.test(file.originalname || '');
+  return hasImageExtension ? 'image_url' : 'document_url';
+}
+
+/**
+ * Flattens an OCR result into a single text blob plus a list of base64 images.
+ * Each page contributes its markdown followed by a blank line; when the result has more
+ * than one page, every page is preceded by a `# PAGE <n>` heading (1-based).
+ *
+ * @param ocrResult - OCR response containing the `pages` array.
+ * @returns The aggregated `text` and every non-empty `image_base64` found on the pages.
+ */
+function processOCRResult(ocrResult: OCRResult): { text: string; images: string[] } {
+  const { pages } = ocrResult;
+  const collectedImages: string[] = [];
+  let text = '';
+
+  for (const [pageIndex, page] of pages.entries()) {
+    if (pages.length > 1) {
+      text += `# PAGE ${pageIndex + 1}\n`;
+    }
+    text += page.markdown + '\n\n';
+
+    // Pages without images simply contribute nothing to the image list.
+    for (const image of page.images || []) {
+      if (image.image_base64) {
+        collectedImages.push(image.image_base64);
+      }
+    }
+  }
+
+  return { text, images: collectedImages };
+}
+
+/**
+ * Builds the `Error` thrown by the OCR upload functions.
+ * The failure is passed to `logAxiosError` with the response's `detail` (or `baseMessage`
+ * when there is none); the string it returns becomes the error message, with the
+ * response's `message` appended after ` - ` when present.
+ *
+ * @param error - The caught value, treated as an axios error.
+ * @param baseMessage - Message used when the response carries no `detail`.
+ * @returns A new `Error` carrying the composed message.
+ */
+function createOCRError(error: unknown, baseMessage: string): Error {
+  const axiosError = error as AxiosError<MistralOCRError>;
+  const responseBody = axiosError?.response?.data;
+  const headline = responseBody?.detail || baseMessage;
+  const responseMessage = responseBody?.message;
+
+  const logged = logAxiosError({ error: axiosError, message: headline });
+  if (responseMessage) {
+    return new Error(`${logged} - ${responseMessage}`);
+  }
+  return new Error(logged);
+}
+
+/**
+ * Runs the full Mistral OCR flow for an uploaded file: upload the file, obtain a signed
+ * URL for it, request OCR on that URL, and aggregate the resulting pages.
+ *
+ * @param context - OCR request context.
+ * @param context.req - Request supplying the user id and the OCR section of the app config.
+ * @param context.file - Uploaded file; its `path`, `originalname` and `mimetype` are used.
+ * @param context.loadAuthValues - Function to load authentication values.
+ * @returns The aggregated `text` and `images`, the original `filename`, `bytes` computed as
+ *   four times the text length, and `filepath` set to the Mistral OCR file source.
+ * @throws An `Error` built by `createOCRError` for any failure in the flow.
+ */
+export const uploadMistralOCR = async (context: OCRContext): Promise<MistralOCRUploadResult> => {
+  const { file, req } = context;
+  try {
+    const auth = await loadAuthConfig(context);
+    const model = getModelConfig(req.app.locals?.ocr);
+
+    // Step 1: push the file to the files endpoint.
+    const uploaded = await uploadDocumentToMistral({
+      filePath: file.path,
+      fileName: file.originalname,
+      apiKey: auth.apiKey,
+      baseURL: auth.baseURL,
+    });
+
+    // Step 2: exchange the file id for a URL the OCR endpoint can fetch.
+    const signed = await getSignedUrl({
+      apiKey: auth.apiKey,
+      baseURL: auth.baseURL,
+      fileId: uploaded.id,
+    });
+
+    // Step 3: run OCR against the signed URL.
+    const ocrResult = await performOCR({
+      apiKey: auth.apiKey,
+      baseURL: auth.baseURL,
+      model,
+      url: signed.url,
+      documentType: getDocumentType(file),
+    });
+
+    // Step 4: collapse the pages into one text blob and an image list.
+    const aggregated = processOCRResult(ocrResult);
+
+    return {
+      filename: file.originalname,
+      bytes: aggregated.text.length * 4,
+      filepath: FileSources.mistral_ocr,
+      text: aggregated.text,
+      images: aggregated.images,
+    };
+  } catch (error) {
+    throw createOCRError(error, 'Error uploading document to Mistral OCR API');
+  }
+};
+
+/**
+ * Uses the Azure-hosted Mistral OCR API to process a file. Unlike `uploadMistralOCR`, no
+ * file upload or signed URL is involved: the file is read from disk and sent inline as a
+ * base64 data URI.
+ *
+ * @param context - OCR request context.
+ * @param context.req - Request supplying the user id and the OCR section of the app config.
+ * @param context.file - Uploaded file; its `path`, `originalname` and `mimetype` are used.
+ * @param context.loadAuthValues - Function to load authentication values.
+ * @returns The aggregated `text` and `images`, the original `filename`, `bytes` computed as
+ *   four times the text length, and `filepath` set to the Azure Mistral OCR file source.
+ * @throws An `Error` built by `createOCRError` for any failure in the flow.
+ */
+export const uploadAzureMistralOCR = async (
+  context: OCRContext,
+): Promise<MistralOCRUploadResult> => {
+  try {
+    const { apiKey, baseURL } = await loadAuthConfig(context);
+    const model = getModelConfig(context.req.app.locals?.ocr);
+
+    // Read file as base64
+    const buffer = fs.readFileSync(context.file.path);
+    const base64 = buffer.toString('base64');
+
+    // Label the payload with the file's real media type so PDFs and other documents are
+    // not declared as JPEG; 'image/jpeg' is kept only as the fallback when none is known.
+    const mediaType = context.file.mimetype || 'image/jpeg';
+
+    // Perform OCR directly with base64
+    const documentType = getDocumentType(context.file);
+    const ocrResult = await performOCR({
+      apiKey,
+      baseURL,
+      model,
+      url: `data:${mediaType};base64,${base64}`,
+      documentType,
+    });
+
+    // Process result
+    const { text, images } = processOCRResult(ocrResult);
+
+    return {
+      filename: context.file.originalname,
+      bytes: text.length * 4,
+      filepath: FileSources.azure_mistral_ocr,
+      text,
+      images,
+    };
+  } catch (error) {
+    throw createOCRError(error, 'Error uploading document to Azure Mistral OCR API');
+  }
+};