mirror of
https://github.com/danny-avila/LibreChat.git
synced 2025-12-19 09:50:15 +01:00
👁️ feat: Azure Mistral OCR Strategy (#7888)
* 👁️ feat: Add Azure Mistral OCR strategy and endpoint integration This commit introduces a new OCR strategy named 'azure_mistral_ocr', allowing the use of a Mistral OCR endpoint deployed on Azure. The configuration, schemas, and file upload strategies have been updated to support this integration, enabling seamless OCR processing via Azure-hosted Mistral services. * 🗑️ chore: Clean up .gitignore by removing commented-out uncommon directory name * chore: remove unused vars * refactor: Move createAxiosInstance to packages/api/utils and update imports - Removed the createAxiosInstance function from the config module and relocated it to a new utils module for better organization. - Updated import paths in relevant files to reflect the new location of createAxiosInstance. - Added tests for createAxiosInstance to ensure proper functionality and proxy configuration handling. * chore: move axios helpers to packages/api - Added logAxiosError function to @librechat/api for centralized error logging. - Updated imports across various files to use the new logAxiosError function. - Removed the old axios.js utility file as it is no longer needed. * chore: Update Jest moduleNameMapper for improved path resolution - Added a new mapping for '~/' to resolve module paths in Jest configuration, enhancing import handling for the project. * feat: Implement Mistral OCR API integration in TS * chore: Update MistralOCR tests based on new imports * fix: Enhance MistralOCR configuration handling and tests - Introduced helper functions for resolving configuration values from environment variables or hardcoded settings. - Updated the uploadMistralOCR and uploadAzureMistralOCR functions to utilize the new configuration resolution logic. - Improved test cases to ensure correct behavior when mixing environment variables and hardcoded values. - Mocked file upload and signed URL responses in tests to validate functionality without external dependencies. 
* feat: Enhance MistralOCR functionality with improved configuration and error handling - Introduced helper functions for loading authentication configuration and resolving values from environment variables. - Updated uploadMistralOCR and uploadAzureMistralOCR functions to utilize the new configuration logic. - Added utility functions for processing OCR results and creating error messages. - Improved document type determination and result aggregation for better OCR processing. * refactor: Reorganize OCR type imports in Mistral CRUD file - Moved OCRResult, OCRResultPage, and OCRImage imports to a more logical grouping for better readability and maintainability. * feat: Add file exports to API and create files index * chore: Update OCR types for enhanced structure and clarity - Redesigned OCRImage interface to include mandatory fields and improved naming conventions. - Added PageDimensions interface for better representation of page metrics. - Updated OCRResultPage to include dimensions and mandatory images array. - Refined OCRResult to include document annotation and usage information. * refactor: use TS counterpart of uploadOCR methods * ci: Update MistralOCR tests to reflect new OCR result structure * chore: Bump version of @librechat/api to 1.2.3 in package.json and package-lock.json * chore: Update CONFIG_VERSION to 1.2.8 * chore: remove unused sendEvent function from config module (now imported from '@librechat/api') * chore: remove MistralOCR service files and tests (now in '@librechat/api') * ci: update logger import in ModelService tests to use @librechat/data-schemas --------- Co-authored-by: arthurolivierfortin <arthurolivier.fortin@gmail.com>
This commit is contained in:
parent
46ff008b07
commit
5f2d1c5dc9
37 changed files with 2245 additions and 1235 deletions
1
packages/api/src/files/index.ts
Normal file
1
packages/api/src/files/index.ts
Normal file
|
|
@ -0,0 +1 @@
|
|||
export * from './mistral/crud';
|
||||
1570
packages/api/src/files/mistral/crud.spec.ts
Normal file
1570
packages/api/src/files/mistral/crud.spec.ts
Normal file
File diff suppressed because it is too large
Load diff
424
packages/api/src/files/mistral/crud.ts
Normal file
424
packages/api/src/files/mistral/crud.ts
Normal file
|
|
@ -0,0 +1,424 @@
|
|||
import * as fs from 'fs';
|
||||
import * as path from 'path';
|
||||
import FormData from 'form-data';
|
||||
import { logger } from '@librechat/data-schemas';
|
||||
import {
|
||||
FileSources,
|
||||
envVarRegex,
|
||||
extractEnvVariable,
|
||||
extractVariableName,
|
||||
} from 'librechat-data-provider';
|
||||
import type { TCustomConfig } from 'librechat-data-provider';
|
||||
import type { Request as ServerRequest } from 'express';
|
||||
import type { AxiosError } from 'axios';
|
||||
import type {
|
||||
MistralFileUploadResponse,
|
||||
MistralSignedUrlResponse,
|
||||
MistralOCRUploadResult,
|
||||
MistralOCRError,
|
||||
OCRResultPage,
|
||||
OCRResult,
|
||||
OCRImage,
|
||||
} from '~/types';
|
||||
import { logAxiosError, createAxiosInstance } from '~/utils/axios';
|
||||
|
||||
/** HTTP client shared by every request in this module; created once at load time. */
const axios = createAxiosInstance();

/** Base URL used when neither the OCR config nor the loaded auth values supply one. */
const DEFAULT_MISTRAL_BASE_URL = 'https://api.mistral.ai/v1';

/** OCR model used when the OCR config does not name one. */
const DEFAULT_MISTRAL_MODEL = 'mistral-ocr-latest';
|
||||
|
||||
/** Credential and endpoint pair resolved for a single OCR request. */
interface AuthConfig {
  /** Key sent to the API as an `Authorization: Bearer` token. */
  apiKey: string;
  /** Root URL that the `/files` and `/ocr` paths are appended to. */
  baseURL: string;
}
|
||||
|
||||
/** Inputs shared by the OCR upload handlers: the request, the uploaded file, and a credential loader. */
interface OCRContext {
  /**
   * Express request narrowed to the two fields read in this module:
   * `user.id` (passed to `loadAuthValues`) and `app.locals.ocr` (the OCR config).
   */
  req: Pick<ServerRequest, 'user' | 'app'> & {
    user?: { id: string };
    app: {
      locals?: {
        ocr?: TCustomConfig['ocr'];
      };
    };
  };
  /** Uploaded file; its `path`, `originalname`, and `mimetype` are read here. */
  file: Express.Multer.File;
  /**
   * Looks up the named auth fields for a user and resolves to a map keyed by field name.
   * NOTE(review): `optional` presumably marks fields that may be absent without an error —
   * confirm against the implementation supplied by the caller.
   */
  loadAuthValues: (params: {
    userId: string;
    authFields: string[];
    optional?: Set<string>;
  }) => Promise<Record<string, string | undefined>>;
}
|
||||
|
||||
/**
 * Uploads a document to the Mistral files endpoint, streaming it from disk
 * instead of loading the entire file into memory.
 *
 * @param params - Upload parameters.
 * @param params.apiKey - Mistral API key, sent as a bearer token.
 * @param params.filePath - Path to the file on disk.
 * @param params.baseURL - Mistral API base URL; defaults to `DEFAULT_MISTRAL_BASE_URL`.
 * @param params.fileName - Filename reported to the API; when empty, the basename of `filePath` is used.
 * @returns The body of the `POST {baseURL}/files` response.
 * @throws Rethrows, unchanged, whatever the HTTP client rejects with.
 */
export async function uploadDocumentToMistral({
  apiKey,
  filePath,
  baseURL = DEFAULT_MISTRAL_BASE_URL,
  fileName = '',
}: {
  apiKey: string;
  filePath: string;
  baseURL?: string;
  fileName?: string;
}): Promise<MistralFileUploadResponse> {
  const form = new FormData();
  form.append('purpose', 'ocr');
  const actualFileName = fileName || path.basename(filePath);
  const fileStream = fs.createReadStream(filePath);
  form.append('file', fileStream, { filename: actualFileName });

  try {
    const res = await axios.post(`${baseURL}/files`, form, {
      headers: {
        Authorization: `Bearer ${apiKey}`,
        ...form.getHeaders(),
      },
      // Uploads can be arbitrarily large; lift the client-side size caps.
      maxBodyLength: Infinity,
      maxContentLength: Infinity,
    });
    return res.data;
  } finally {
    // A read stream only closes itself once it has ended or errored. If the request
    // fails before the body is consumed, the descriptor would otherwise stay open.
    // Destroying an already-finished stream is a no-op.
    fileStream.destroy();
  }
}
|
||||
|
||||
/**
 * Requests a signed URL for a file previously uploaded to the files endpoint.
 *
 * @param params - Request parameters.
 * @param params.apiKey - API key, sent as a bearer token.
 * @param params.fileId - Identifier of the uploaded file.
 * @param params.expiry - Value sent as the `expiry` query parameter; defaults to 24
 *   (presumably hours — confirm against the API reference).
 * @param params.baseURL - API base URL; defaults to `DEFAULT_MISTRAL_BASE_URL`.
 * @returns The body of the `GET {baseURL}/files/{fileId}/url` response.
 * @throws Rethrows the request error after logging it.
 */
export async function getSignedUrl({
  apiKey,
  fileId,
  expiry = 24,
  baseURL = DEFAULT_MISTRAL_BASE_URL,
}: {
  apiKey: string;
  fileId: string;
  expiry?: number;
  baseURL?: string;
}): Promise<MistralSignedUrlResponse> {
  // Encode the id so characters such as `/`, `?` or `#` cannot change the request path.
  // Plain identifiers (letters, digits, `-`, `_`) pass through unchanged.
  const endpoint = `${baseURL}/files/${encodeURIComponent(fileId)}/url?expiry=${expiry}`;

  try {
    const res = await axios.get(endpoint, {
      headers: {
        Authorization: `Bearer ${apiKey}`,
      },
    });
    return res.data;
  } catch (error) {
    // Rejections are not guaranteed to be Error instances; only read `.message` when it exists.
    logger.error('Error fetching signed URL:', error instanceof Error ? error.message : error);
    throw error;
  }
}
|
||||
|
||||
/**
 * Submits a document or image URL to the OCR endpoint and returns the response body.
 *
 * @param params - Request parameters.
 * @param params.url - The document or image URL to process.
 * @param params.apiKey - API key, sent as a bearer token.
 * @param params.model - OCR model name; defaults to `DEFAULT_MISTRAL_MODEL`.
 * @param params.baseURL - API base URL; defaults to `DEFAULT_MISTRAL_BASE_URL`.
 * @param params.documentType - `'document_url'` (default) or `'image_url'`. Sent as the
 *   document `type` and also selects the key under which `url` is placed.
 * @returns The body of the `POST {baseURL}/ocr` response.
 * @throws Rethrows the request error after logging its message.
 */
export async function performOCR({
  url,
  apiKey,
  model = DEFAULT_MISTRAL_MODEL,
  baseURL = DEFAULT_MISTRAL_BASE_URL,
  documentType = 'document_url',
}: {
  url: string;
  apiKey: string;
  model?: string;
  baseURL?: string;
  documentType?: 'document_url' | 'image_url';
}): Promise<OCRResult> {
  // Anything other than 'image_url' is sent under the 'document_url' key.
  const urlField = documentType === 'image_url' ? 'image_url' : 'document_url';

  const payload = {
    model,
    // Images are not requested back from the API.
    image_limit: 0,
    include_image_base64: false,
    document: {
      type: documentType,
      [urlField]: url,
    },
  };
  const requestConfig = {
    headers: {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${apiKey}`,
    },
  };

  const pending = axios.post(`${baseURL}/ocr`, payload, requestConfig);
  try {
    const response = await pending;
    return response.data;
  } catch (error) {
    logger.error('Error performing OCR:', (error as Error).message);
    throw error;
  }
}
|
||||
|
||||
/**
 * Reports whether a config value must be looked up rather than used verbatim:
 * either it matches `envVarRegex`, or it is empty / whitespace-only.
 */
function needsEnvLoad(value: string): boolean {
  if (envVarRegex.test(value)) {
    return true;
  }
  return value.trim().length === 0;
}
|
||||
|
||||
/**
 * Picks the variable name to look up for a config value.
 *
 * @param configValue - Raw value from the OCR config.
 * @param defaultName - Name to use when `configValue` does not match `envVarRegex`,
 *   or when no variable name can be extracted from it.
 * @returns The extracted variable name, or `defaultName`.
 */
function getEnvVarName(configValue: string, defaultName: string): string {
  const isEnvReference = envVarRegex.test(configValue);
  return isEnvReference ? (extractVariableName(configValue) || defaultName) : defaultName;
}
|
||||
|
||||
/**
 * Resolves one config value: literal values are returned as-is, everything else
 * is looked up in the already-loaded auth values.
 *
 * @param configValue - Raw value from the OCR config.
 * @param defaultEnvName - Variable name to look up when `configValue` names none.
 * @param authValues - Map of loaded values keyed by variable name.
 * @param defaultValue - Fallback when the lookup yields nothing.
 * @returns The resolved value, or an empty string when nothing is available.
 */
async function resolveConfigValue(
  configValue: string,
  defaultEnvName: string,
  authValues: Record<string, string | undefined>,
  defaultValue?: string,
): Promise<string> {
  const isLiteral = !needsEnvLoad(configValue);
  if (isLiteral) {
    return configValue;
  }

  const lookupKey = getEnvVarName(configValue, defaultEnvName);
  const loaded = authValues[lookupKey];
  return loaded || defaultValue || '';
}
|
||||
|
||||
/**
 * Resolves the API key and base URL for an OCR request from `req.app.locals.ocr`.
 *
 * Values that are literal (neither empty nor matching `envVarRegex`) are used as-is.
 * Any other value is fetched through `context.loadAuthValues`, using the variable name
 * it references or the defaults `OCR_API_KEY` / `OCR_BASEURL`.
 *
 * @param context - OCR request context.
 * @returns The resolved `apiKey` and `baseURL`. The base URL falls back to
 *   `DEFAULT_MISTRAL_BASE_URL` when it has to be loaded and no value is found.
 */
async function loadAuthConfig(context: OCRContext): Promise<AuthConfig> {
  const ocrConfig = context.req.app.locals?.ocr;
  const apiKeyConfig = ocrConfig?.apiKey || '';
  const baseURLConfig = ocrConfig?.baseURL || '';

  const mustLoadApiKey = needsEnvLoad(apiKeyConfig);
  const mustLoadBaseURL = needsEnvLoad(baseURLConfig);

  // Both values are literal: nothing to load.
  if (!mustLoadApiKey && !mustLoadBaseURL) {
    return {
      apiKey: apiKeyConfig,
      baseURL: baseURLConfig,
    };
  }

  const baseURLEnvName = getEnvVarName(baseURLConfig, 'OCR_BASEURL');

  // Only request the fields that actually need loading.
  const authFields: string[] = [];
  if (mustLoadBaseURL) {
    authFields.push(baseURLEnvName);
  }
  if (mustLoadApiKey) {
    authFields.push(getEnvVarName(apiKeyConfig, 'OCR_API_KEY'));
  }

  const authValues = await context.loadAuthValues({
    userId: context.req.user?.id || '',
    authFields,
    // The base URL has a built-in default, so it is optional under whichever variable
    // name the config references, not only under the default name.
    // NOTE(review): assumes `loadAuthValues` rejects when a non-optional field is
    // missing — confirm against the implementation supplied by the caller.
    optional: new Set(['OCR_BASEURL', baseURLEnvName]),
  });

  const apiKey = await resolveConfigValue(apiKeyConfig, 'OCR_API_KEY', authValues);
  const baseURL = await resolveConfigValue(
    baseURLConfig,
    'OCR_BASEURL',
    authValues,
    DEFAULT_MISTRAL_BASE_URL,
  );

  return { apiKey, baseURL };
}
|
||||
|
||||
/**
 * Chooses the OCR model name from the OCR config.
 *
 * @param ocrConfig - The `ocr` section of the custom config, if any.
 * @returns `DEFAULT_MISTRAL_MODEL` when `mistralModel` is missing or blank; the value
 *   produced by `extractEnvVariable` (or the default when that is empty) when it
 *   matches `envVarRegex`; otherwise the trimmed configured value.
 */
function getModelConfig(ocrConfig: TCustomConfig['ocr']): string {
  const configured = ocrConfig?.mistralModel || '';
  const trimmed = configured.trim();

  if (trimmed === '') {
    return DEFAULT_MISTRAL_MODEL;
  }

  if (!envVarRegex.test(configured)) {
    return trimmed;
  }

  return extractEnvVariable(configured) || DEFAULT_MISTRAL_MODEL;
}
|
||||
|
||||
/**
 * Classifies an uploaded file as an image or a document for the OCR request.
 *
 * @param file - The uploaded file; only `mimetype` and `originalname` are read.
 * @returns `'image_url'` when the lower-cased MIME type starts with `image` or the
 *   original filename ends in a known image extension; otherwise `'document_url'`.
 */
function getDocumentType(file: Express.Multer.File): 'image_url' | 'document_url' {
  const hasImageMime = (file.mimetype || '').toLowerCase().startsWith('image');
  if (hasImageMime) {
    return 'image_url';
  }

  // Fall back to the extension when the MIME type is missing or generic.
  const hasImageExtension = /\.(png|jpe?g|gif|bmp|webp|tiff?)$/i.test(file.originalname || '');
  if (hasImageExtension) {
    return 'image_url';
  }

  return 'document_url';
}
|
||||
|
||||
/**
 * Flattens an OCR response into a single text blob plus a list of base64 images.
 *
 * Each page contributes its markdown followed by a blank line. When the result has
 * more than one page, every page is preceded by a `# PAGE <n>` heading (1-based).
 *
 * The response body is not validated before it reaches this function, so a missing
 * `pages` array is treated as "no pages", and a page without `markdown` contributes
 * an empty string rather than the literal text `undefined`.
 *
 * @param ocrResult - Parsed OCR response.
 * @returns The aggregated `text` and every non-empty `image_base64` value, in page order.
 */
function processOCRResult(ocrResult: OCRResult): { text: string; images: string[] } {
  const pages: OCRResultPage[] = Array.isArray(ocrResult?.pages) ? ocrResult.pages : [];
  const isMultiPage = pages.length > 1;

  const images: string[] = [];
  let text = '';

  for (const [index, page] of pages.entries()) {
    if (isMultiPage) {
      text += `# PAGE ${index + 1}\n`;
    }

    text += `${page?.markdown ?? ''}\n\n`;

    for (const image of page?.images ?? []) {
      if (image?.image_base64) {
        images.push(image.image_base64);
      }
    }
  }

  return { text, images };
}
|
||||
|
||||
/**
 * Builds the `Error` thrown by the OCR upload handlers.
 *
 * The error is passed to `logAxiosError` together with the response's `detail` field
 * (or `baseMessage` when there is none). The value `logAxiosError` returns becomes the
 * error text, suffixed with ` - <message>` when the response body carries a `message`.
 *
 * @param error - The caught value; treated as an axios error without further checks.
 * @param baseMessage - Message used when the response has no `detail`.
 * @returns A new `Error` carrying the combined message.
 */
function createOCRError(error: unknown, baseMessage: string): Error {
  const axiosError = error as AxiosError<MistralOCRError>;
  const payload = axiosError?.response?.data;
  const detail = payload?.detail;
  const responseMessage = payload?.message;

  const errorLog = logAxiosError({ error: axiosError, message: detail || baseMessage });

  if (!responseMessage) {
    return new Error(errorLog);
  }
  return new Error(`${errorLog} - ${responseMessage}`);
}
|
||||
|
||||
/**
 * Runs a file through the Mistral OCR API: upload the file, obtain a signed URL for it,
 * request OCR on that URL, then aggregate the pages.
 *
 * @param context - The OCR context.
 * @param context.req - Express request; `user.id` and `app.locals.ocr` are read.
 * @param context.file - The uploaded file; `path`, `originalname`, and `mimetype` are read.
 * @param context.loadAuthValues - Function used to load authentication values.
 * @returns The aggregated `text` and `images`, the original `filename`, a `bytes` figure
 *   computed as four times the text length, and `filepath` set to `FileSources.mistral_ocr`.
 * @throws An `Error` built by `createOCRError` when any step fails.
 */
export const uploadMistralOCR = async (context: OCRContext): Promise<MistralOCRUploadResult> => {
  try {
    const auth = await loadAuthConfig(context);
    const model = getModelConfig(context.req.app.locals?.ocr);

    // Step 1: push the file to the files endpoint.
    const uploaded = await uploadDocumentToMistral({
      filePath: context.file.path,
      fileName: context.file.originalname,
      apiKey: auth.apiKey,
      baseURL: auth.baseURL,
    });

    // Step 2: exchange the file id for a URL the OCR endpoint can read.
    const signed = await getSignedUrl({
      apiKey: auth.apiKey,
      baseURL: auth.baseURL,
      fileId: uploaded.id,
    });

    // Step 3: run OCR against the signed URL.
    const documentType = getDocumentType(context.file);
    const ocrResult = await performOCR({
      apiKey: auth.apiKey,
      baseURL: auth.baseURL,
      model,
      url: signed.url,
      documentType,
    });

    // Step 4: collapse the pages into one result.
    const aggregated = processOCRResult(ocrResult);

    return {
      filename: context.file.originalname,
      bytes: aggregated.text.length * 4,
      filepath: FileSources.mistral_ocr,
      text: aggregated.text,
      images: aggregated.images,
    };
  } catch (error) {
    throw createOCRError(error, 'Error uploading document to Mistral OCR API');
  }
};
|
||||
|
||||
/**
 * Uses the Azure-hosted Mistral OCR API to process a file.
 *
 * Unlike `uploadMistralOCR`, nothing is uploaded beforehand: the file is read from disk
 * and sent inline to the OCR endpoint as a base64 data URI.
 *
 * @param context - The OCR context.
 * @param context.req - Express request; `user.id` and `app.locals.ocr` are read.
 * @param context.file - The uploaded file; `path`, `originalname`, and `mimetype` are read.
 * @param context.loadAuthValues - Function used to load authentication values.
 * @returns The aggregated `text` and `images`, the original `filename`, a `bytes` figure
 *   computed as four times the text length, and `filepath` set to
 *   `FileSources.azure_mistral_ocr`.
 * @throws An `Error` built by `createOCRError` when any step fails.
 */
export const uploadAzureMistralOCR = async (
  context: OCRContext,
): Promise<MistralOCRUploadResult> => {
  try {
    const { apiKey, baseURL } = await loadAuthConfig(context);
    const model = getModelConfig(context.req.app.locals?.ocr);

    // Read asynchronously so a large upload does not block the event loop.
    const base64 = await fs.promises.readFile(context.file.path, { encoding: 'base64' });

    // Label the payload with the file's real MIME type (for example `application/pdf`)
    // instead of always claiming JPEG; keep `image/jpeg` as the fallback when unknown.
    const mimeType = context.file.mimetype || 'image/jpeg';

    // Perform OCR directly on the inline data URI.
    const documentType = getDocumentType(context.file);
    const ocrResult = await performOCR({
      apiKey,
      baseURL,
      model,
      url: `data:${mimeType};base64,${base64}`,
      documentType,
    });

    const { text, images } = processOCRResult(ocrResult);

    return {
      filename: context.file.originalname,
      bytes: text.length * 4,
      filepath: FileSources.azure_mistral_ocr,
      text,
      images,
    };
  } catch (error) {
    throw createOCRError(error, 'Error uploading document to Azure Mistral OCR API');
  }
};
|
||||
Loading…
Add table
Add a link
Reference in a new issue