📝 fix: Mistral OCR Image Support and Azure Agent Titles (#6901)

* fix: azure title model * refactor: typing for uploadMistralOCR * fix: update conversation ID handling in useSSE for better state management, only use PENDING_CONVO for new conversations * fix: streamline conversation ID handling in useSSE for simplicity, only needs state update to prevent draft from applying * fix: update performOCR and tests to support document and image URLs with appropriate types
2025-12-17 17:00:15 +01:00 · 2025-04-15 18:03:56 -04:00 · 2025-04-15 18:03:56 -04:00 · d32f34e5d7
commit d32f34e5d7
parent 650e9b4f6c
5 changed files with 185 additions and 15 deletions
--- a/api/server/services/Files/MistralOCR/crud.js
+++ b/api/server/services/Files/MistralOCR/crud.js
@ -69,16 +69,20 @@ async function getSignedUrl({
 /**
 * @param {Object} params
 * @param {string} params.apiKey
- * @param {string} params.documentUrl
+ * @param {string} params.url - The document or image URL
+ * @param {string} [params.documentType='document_url'] - 'document_url' or 'image_url'
+ * @param {string} [params.model]
 * @param {string} [params.baseURL]
 * @returns {Promise<OCRResult>}
 */
 async function performOCR({
  apiKey,
-  documentUrl,
+  url,
+  documentType = 'document_url',
  model = 'mistral-ocr-latest',
  baseURL = 'https://api.mistral.ai/v1',
 }) {
+  const documentKey = documentType === 'image_url' ? 'image_url' : 'document_url';
  return axios
    .post(
      `${baseURL}/ocr`,
@ -86,8 +90,8 @@ async function performOCR({
        model,
        include_image_base64: false,
        document: {
-          type: 'document_url',
-          document_url: documentUrl,
+          type: documentType,
+          [documentKey]: url,
        },
      },
      {
@ -109,6 +113,19 @@ function extractVariableName(str) {
  return match ? match[1] : null;
 }

+/**
+ * Uploads a file to the Mistral OCR API and processes the OCR result.
+ *
+ * @param {Object} params - The params object.
+ * @param {ServerRequest} params.req - The request object from Express. It should have a `user` property with an `id`
+ *                       representing the user
+ * @param {Express.Multer.File} params.file - The file object, which is part of the request. The file object should
+ *                                     have a `mimetype` property that tells us the file type
+ * @param {string} params.file_id - The file ID.
+ * @param {string} [params.entity_id] - The entity ID, not used here but passed for consistency.
+ * @returns {Promise<{ filepath: string, bytes: number }>} - The result object containing the processed `text` and `images` (not currently used),
+ *                       along with the `filename` and `bytes` properties.
+ */
 const uploadMistralOCR = async ({ req, file, file_id, entity_id }) => {
  try {
    /** @type {TCustomConfig['ocr']} */
@ -160,11 +177,18 @@ const uploadMistralOCR = async ({ req, file, file_id, entity_id }) => {
      fileId: mistralFile.id,
    });

+    const mimetype = (file.mimetype || '').toLowerCase();
+    const originalname = file.originalname || '';
+    const isImage =
+      mimetype.startsWith('image') || /\.(png|jpe?g|gif|bmp|webp|tiff?)$/i.test(originalname);
+    const documentType = isImage ? 'image_url' : 'document_url';
+
    const ocrResult = await performOCR({
      apiKey,
      baseURL,
      model,
-      documentUrl: signedUrlResponse.url,
+      url: signedUrlResponse.url,
+      documentType,
    });

    let aggregatedText = '';