Merge 187adb0491 into 386900fb4f

2025-09-22 06:00:56 +02:00 · 2025-09-21 18:33:06 +05:30 · 2025-09-21 18:33:06 +05:30 · cdfcb8af3a
commit cdfcb8af3a
parent 386900fb4f 187adb0491
5 changed files with 214 additions and 1 deletions
--- a/api/server/services/Files/OpenAIVisionOCR/crud.js
+++ b/api/server/services/Files/OpenAIVisionOCR/crud.js
@ -0,0 +1,184 @@
+const fs = require('fs');
+const path = require('path');
+const { FileSources, envVarRegex, extractEnvVariable } = require('librechat-data-provider');
+
+const { createAxiosInstance, logAxiosError } = require('@librechat/api');
+const { logger } = require('~/config');
+
+const axios = createAxiosInstance();
+
+/**
+ * Uses GPT-4o Vision to extract text from an image or document
+ *
+ * @param {Object} params Upload parameters
+ * @param {string} params.filePath The path to the file on disk
+ * @param {string} params.apiKey OpenAI API key
+ * @param {string} [params.baseURL=https://api.openai.com/v1] OpenAI API base URL
+ * @param {string} [params.model=gpt-4o] Vision model to use for OCR
+ * @returns {Promise<Object>} The response from OpenAI API
+ */
+async function performVisionOCR({
+  filePath,
+  apiKey,
+  baseURL = 'https://api.openai.com/v1',
+  model = 'gpt-4o',
+}) {
+  try {
+    // Read file to base64
+    const fileBuffer = fs.readFileSync(filePath);
+    const base64Image = fileBuffer.toString('base64');
+
+    // Determine the correct MIME type based on file extension
+    const ext = path.extname(filePath).toLowerCase();
+    let mimeType;
+
+    // Only allow supported image types for vision API
+    // PDF files need special handling that we're not implementing yet
+    switch (ext) {
+      case '.jpg':
+      case '.jpeg':
+        mimeType = 'image/jpeg';
+        break;
+      case '.png':
+        mimeType = 'image/png';
+        break;
+      case '.gif':
+        mimeType = 'image/gif';
+        break;
+      case '.webp':
+        mimeType = 'image/webp';
+        break;
+      default:
+        // Default to PNG for unknown types, but log a warning
+
+        mimeType = 'image/png';
+    }
+
+    const response = await axios.post(
+      `${baseURL}/chat/completions`,
+      {
+        model,
+        messages: [
+          {
+            role: 'system',
+            content:
+              'You are a precise OCR service. Extract text from images exactly as it appears, preserving layout when important. For tables, use markdown table format. For structured content with headings, use markdown headings. Focus on accuracy of text content rather than perfect formatting. Keep all original text including numbers, special characters, and punctuation.',
+          },
+          {
+            role: 'user',
+            content: [
+              {
+                type: 'text',
+                text: 'Extract all text from this image/document:',
+              },
+              {
+                type: 'image_url',
+                image_url: {
+                  url: `data:${mimeType};base64,${base64Image}`,
+                  detail: 'high',
+                },
+              },
+            ],
+          },
+        ],
+        max_tokens: 2048,
+      },
+      {
+        headers: {
+          'Content-Type': 'application/json',
+          Authorization: `Bearer ${apiKey}`,
+        },
+      },
+    );
+
+    return response.data;
+  } catch (error) {
+    logger.error(`[OpenAI Vision OCR] Error in performVisionOCR: ${error.message}`, error);
+    // Provide more specific error messages based on error type
+    if (error.response) {
+      const status = error.response.status;
+      if (status === 401) {
+        throw new Error('Authentication error: Invalid API key');
+      } else if (status === 429) {
+        throw new Error('Rate limit exceeded: Too many requests to OpenAI API');
+      } else if (status === 500) {
+        throw new Error('OpenAI server error: Try again later');
+      }
+    }
+    throw error;
+  }
+}
+
+/**
+ * Uploads a file for OCR processing using OpenAI Vision
+ *
+ * @param {Object} params - The params object.
+ * @param {ServerRequest} params.req - The request object
+ * @param {Express.Multer.File} params.file - The file object
+ * @returns {Promise<{ filepath: string, bytes: number, text: string }>} - The OCR result
+ */
+const uploadOpenAIVisionOCR = async ({ req, file }) => {
+  try {
+    /** @type {TCustomConfig['ocr']} */
+    const ocrConfig = req.config?.ocr;
+    const modelConfig = ocrConfig.visionModel || '';
+    const isModelEnvVar = envVarRegex.test(modelConfig);
+    let apiKey, baseURL, model;
+
+    // Always use server environment variables for OCR
+    apiKey = process.env.OPENAI_API_KEY;
+    baseURL = process.env.OPENAI_API_HOST || 'https://api.openai.com/v1';
+
+    if (!apiKey) {
+      logger.error('[OpenAI Vision OCR Debug] Missing OPENAI_API_KEY in server environment');
+      throw new Error('OpenAI API key not configured on server');
+    }
+
+    logger.info('[OpenAI Vision OCR Debug] Using server configuration', {
+      hasApiKey: !!apiKey,
+      baseURL,
+    });
+
+    model = isModelEnvVar ? extractEnvVariable(modelConfig) : modelConfig.trim() || 'gpt-4o';
+
+    logger.info(`[OpenAI Vision OCR] Using model: ${model}`);
+
+    // Check if file is a supported image type for vision API
+    const ext = path.extname(file.path).toLowerCase();
+    const supportedImageExts = new Set(['.jpg', '.jpeg', '.png', '.gif', '.webp']);
+
+    if (!supportedImageExts.has(ext)) {
+      const error = `Unsupported file type for Vision OCR: ${ext}. Only image types ${Array.from(supportedImageExts).join(', ')} are supported.`;
+      logger.error(`[OpenAI Vision OCR] ${error}`);
+      throw new Error(error);
+    }
+
+    const ocrResult = await performVisionOCR({
+      filePath: file.path,
+      apiKey,
+      baseURL,
+      model,
+    });
+
+    // Extract text from OpenAI response
+    const extractedText = ocrResult.choices[0].message.content;
+
+    return {
+      filename: file.originalname,
+      bytes: extractedText.length * 4,
+      filepath: FileSources.openai_vision_ocr,
+      text: extractedText,
+      // Note: we don't extract images since this is a text extraction service
+      images: [],
+    };
+  } catch (error) {
+    logger.error(`[OpenAI Vision OCR] Error in uploadOpenAIVisionOCR: ${error.message}`, error);
+    const message = 'Error performing OCR with OpenAI Vision API';
+    throw new Error(logAxiosError({ error, message }));
+  }
+};
+
+module.exports = {
+  performVisionOCR,
+  uploadOpenAIVisionOCR,
+};
--- a/api/server/services/Files/strategies.js
+++ b/api/server/services/Files/strategies.js
@ -48,7 +48,9 @@ const {
  prepareAzureImageURL,
  processAzureAvatar,
 } = require('./Azure');
-const { uploadOpenAIFile, deleteOpenAIFile, getOpenAIFileStream } = require('./OpenAI');
+
+const { uploadOpenAIVisionOCR } = require('./OpenAIVisionOCR/crud');
+const { uploadOpenAIFile, deleteOpenAIFile, getOpenAIFileStream } = require('./OpenAI/index');
 const { getCodeOutputDownloadStream, uploadCodeEnvFile } = require('./Code');
 const { uploadVectors, deleteVectors } = require('./VectorDB');

@ -226,6 +228,26 @@ const azureMistralOCRStrategy = () => ({
  handleFileUpload: uploadAzureMistralOCR,
 });

+const openAIVisionOCRStrategy = () => ({
+  /** @type {typeof saveFileFromURL | null} */
+  saveURL: null,
+  /** @type {typeof getLocalFileURL | null} */
+  getFileURL: null,
+  /** @type {typeof saveLocalBuffer | null} */
+  saveBuffer: null,
+  /** @type {typeof processLocalAvatar | null} */
+  processAvatar: null,
+  /** @type {typeof uploadLocalImage | null} */
+  handleImageUpload: null,
+  /** @type {typeof prepareImagesLocal | null} */
+  prepareImagePayload: null,
+  /** @type {typeof deleteLocalFile | null} */
+  deleteFile: null,
+  /** @type {typeof getLocalFileStream | null} */
+  getDownloadStream: null,
+  handleFileUpload: uploadOpenAIVisionOCR,
+});
+
 const vertexMistralOCRStrategy = () => ({
  /** @type {typeof saveFileFromURL | null} */
  saveURL: null,
@ -270,6 +292,8 @@ const getStrategyFunctions = (fileSource) => {
    return azureMistralOCRStrategy();
  } else if (fileSource === FileSources.vertexai_mistral_ocr) {
    return vertexMistralOCRStrategy();
+  } else if (fileSource === FileSources.openai_vision_ocr) {
+    return openAIVisionOCRStrategy();
  } else if (fileSource === FileSources.text) {
    return localStrategy(); // Text files use local strategy
  } else {
--- a/packages/data-provider/src/config.ts
+++ b/packages/data-provider/src/config.ts
@ -671,6 +671,7 @@ export type TStartupConfig = {

 export enum OCRStrategy {
  MISTRAL_OCR = 'mistral_ocr',
+  OPENAI_VISION = 'openai_vision_ocr',
  CUSTOM_OCR = 'custom_ocr',
  AZURE_MISTRAL_OCR = 'azure_mistral_ocr',
  VERTEXAI_MISTRAL_OCR = 'vertexai_mistral_ocr',
@ -756,6 +757,7 @@ export type TWebSearchConfig = z.infer<typeof webSearchSchema>;

 export const ocrSchema = z.object({
  mistralModel: z.string().optional(),
+  visionModel: z.string().optional(),
  apiKey: z.string().optional().default('${OCR_API_KEY}'),
  baseURL: z.string().optional().default('${OCR_BASEURL}'),
  strategy: z.nativeEnum(OCRStrategy).default(OCRStrategy.MISTRAL_OCR),
--- a/packages/data-provider/src/ocr.ts
+++ b/packages/data-provider/src/ocr.ts
@ -5,10 +5,12 @@ export function loadOCRConfig(config: TCustomConfig['ocr']): TCustomConfig['ocr'
  const baseURL = config?.baseURL ?? '';
  const apiKey = config?.apiKey ?? '';
  const mistralModel = config?.mistralModel ?? '';
+  const visionModel = config?.visionModel ?? '';
  return {
    apiKey,
    baseURL,
    mistralModel,
+    visionModel,
    strategy: config?.strategy ?? OCRStrategy.MISTRAL_OCR,
  };
 }
--- a/packages/data-provider/src/types/files.ts
+++ b/packages/data-provider/src/types/files.ts
@ -10,6 +10,7 @@ export enum FileSources {
  vectordb = 'vectordb',
  execute_code = 'execute_code',
  mistral_ocr = 'mistral_ocr',
+  openai_vision_ocr = 'openai_vision_ocr',
  azure_mistral_ocr = 'azure_mistral_ocr',
  vertexai_mistral_ocr = 'vertexai_mistral_ocr',
  text = 'text',