📄 feat: Local Text Extraction for PDF, DOCX, and XLS/XLSX (#11900)

* feat: Added "document parser" OCR strategy

  The document parser uses libraries to parse the text out of known document types. This lets LibreChat handle some complex document types without having to use a secondary service (like Mistral or standing up a RAG API server).

  To enable the document parser, set the OCR strategy to "document_parser" in librechat.yaml.

  We now support:
  - PDFs using pdfjs
  - DOCX using mammoth
  - XLS/XLSX using SheetJS

  (The associated packages were also added to the project.)

* fix: applied Copilot code review suggestions
  - Properly calculate length of text based on UTF-8.
  - Avoid issues with loading / blocking PDF parsing.

* fix: improved docs on parseDocument()

* chore: move to packages/api for TS support

* refactor: make document processing the default ocr strategy
  - Introduced support for additional document types in the OCR strategy, including PDF, DOCX, and XLS/XLSX.
  - Updated the file upload handling to dynamically select the appropriate parsing strategy based on the file type.
  - Refactored the document parsing functions to use asynchronous imports for improved performance and maintainability.

* test: add unit tests for processAgentFileUpload functionality
  - Introduced a new test suite for the processAgentFileUpload function in process.spec.js.
  - Implemented various test cases to validate OCR strategy selection based on file types, including PDF, DOCX, XLSX, and XLS.
  - Mocked dependencies to ensure isolated testing of file upload handling and strategy selection logic.
  - Enhanced coverage for scenarios involving OCR capability checks and default strategy fallbacks.

* chore: update pdfjs-dist version and enhance document parsing tests
  - Bumped pdfjs-dist dependency to version 5.4.624 in both api and packages/api.
  - Refactored document parsing tests to use 'originalname' instead of 'filename' for file objects.
  - Added a new test case for parsing XLS files to improve coverage of document types supported by the parser.
  - Introduced a sample XLS file for testing purposes.

* feat: enforce text size limit and improve OCR fallback handling in processAgentFileUpload
  - Added a check to ensure extracted text does not exceed the 15MB storage limit, throwing an error if it does.
  - Refactored the OCR handling logic to improve fallback behavior when the configured OCR fails, ensuring a more robust document processing flow.
  - Enhanced unit tests to cover scenarios for oversized text and fallback mechanisms, ensuring proper error handling and functionality.

* fix: correct OCR URL construction in performOCR function
  - Updated the OCR URL construction to ensure it correctly appends '/ocr' to the base URL if not already present, improving the reliability of the OCR request.

---------

Co-authored-by: Dan Lew <daniel@mightyacorn.com>
2026-02-25 11:54:08 +01:00 · 2026-02-22 14:22:45 -05:00 · 2026-02-22 14:22:45 -05:00 · 7ce898d6a0
commit 7ce898d6a0
parent 7692fa837e
16 changed files with 1012 additions and 25 deletions
--- a/packages/api/package.json
+++ b/packages/api/package.json
@ -67,12 +67,15 @@
    "jest": "^30.2.0",
    "jest-junit": "^16.0.0",
    "librechat-data-provider": "*",
+    "mammoth": "^1.11.0",
    "mongodb": "^6.14.2",
+    "pdfjs-dist": "^5.4.624",
    "rimraf": "^6.1.2",
    "rollup": "^4.22.4",
    "rollup-plugin-peer-deps-external": "^2.2.4",
    "ts-node": "^10.9.2",
-    "typescript": "^5.0.4"
+    "typescript": "^5.0.4",
+    "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz"
  },
  "publishConfig": {
    "registry": "https://registry.npmjs.org/"
@ -106,10 +109,12 @@
    "keyv": "^5.3.2",
    "keyv-file": "^5.1.2",
    "librechat-data-provider": "*",
+    "mammoth": "^1.11.0",
    "mathjs": "^15.1.0",
    "memorystore": "^1.6.7",
    "mongoose": "^8.12.1",
    "node-fetch": "2.7.0",
+    "pdfjs-dist": "^5.4.624",
    "rate-limit-redis": "^4.2.0",
    "tiktoken": "^1.0.15",
    "undici": "^7.18.2",
--- a/packages/api/src/files/documents/crud.spec.ts
+++ b/packages/api/src/files/documents/crud.spec.ts
@ -0,0 +1,80 @@
+import path from 'path';
+import { parseDocument } from './crud';
+
+describe('Document Parser', () => {
+  const DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
+  const XLSX_MIME = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet';
+  const XLS_MIME = 'application/vnd.ms-excel';
+
+  /**
+   * Builds the minimal Multer file stub that parseDocument() reads: the original
+   * name, the on-disk path of a fixture stored next to this spec, and the MIME type.
+   */
+  const fixture = (name: string, mimetype: string): Express.Multer.File =>
+    ({
+      originalname: name,
+      path: path.join(__dirname, name),
+      mimetype,
+    }) as Express.Multer.File;
+
+  test('parseDocument() parses text from docx', async () => {
+    const parsed = await parseDocument({ file: fixture('sample.docx', DOCX_MIME) });
+
+    expect(parsed).toEqual({
+      bytes: 29,
+      filename: 'sample.docx',
+      filepath: 'document_parser',
+      images: [],
+      text: 'This is a sample DOCX file.\n\n',
+    });
+  });
+
+  test('parseDocument() parses text from xlsx', async () => {
+    const parsed = await parseDocument({ file: fixture('sample.xlsx', XLSX_MIME) });
+
+    expect(parsed).toEqual({
+      bytes: 66,
+      filename: 'sample.xlsx',
+      filepath: 'document_parser',
+      images: [],
+      text: 'Sheet One:\nData,on,first,sheet\nSecond Sheet:\nData,On\nSecond,Sheet\n',
+    });
+  });
+
+  test('parseDocument() parses text from xls', async () => {
+    const parsed = await parseDocument({ file: fixture('sample.xls', XLS_MIME) });
+
+    expect(parsed).toEqual({
+      bytes: 31,
+      filename: 'sample.xls',
+      filepath: 'document_parser',
+      images: [],
+      text: 'Sheet One:\nData,on,first,sheet\n',
+    });
+  });
+
+  test('parseDocument() throws error for unhandled document type', async () => {
+    // The MIME type is rejected before the (nonexistent) path is ever opened.
+    const unsupported = fixture('nonexistent.file', 'application/invalid');
+
+    await expect(parseDocument({ file: unsupported })).rejects.toThrow(
+      'Unsupported file type in document parser: application/invalid',
+    );
+  });
+
+  test('parseDocument() throws error for empty document', async () => {
+    const empty = fixture('empty.docx', DOCX_MIME);
+
+    await expect(parseDocument({ file: empty })).rejects.toThrow('No text found in document');
+  });
+});
--- a/packages/api/src/files/documents/crud.ts
+++ b/packages/api/src/files/documents/crud.ts
@ -0,0 +1,87 @@
+import * as fs from 'fs';
+import { FileSources } from 'librechat-data-provider';
+import type { TextItem } from 'pdfjs-dist/types/src/display/api';
+import type { MistralOCRUploadResult } from '~/types';
+
+/**
+ * Extracts the text of an uploaded document using a local parser chosen by MIME type.
+ *
+ * Supported types: PDF, DOCX, and XLS/XLSX.
+ *
+ * @param params.file - Uploaded file; `mimetype` selects the parser, `path` is handed to it,
+ *   and `originalname` is echoed back as `filename`.
+ * @returns The extracted text, its UTF-8 byte length, and an empty `images` list.
+ * @throws Error if the MIME type is not supported, if the selected parser fails, or if the
+ *   extracted text is empty or whitespace-only.
+ */
+export async function parseDocument({
+  file,
+}: {
+  file: Express.Multer.File;
+}): Promise<MistralOCRUploadResult> {
+  const { mimetype } = file;
+
+  let extractText: (upload: Express.Multer.File) => Promise<string>;
+  if (mimetype === 'application/pdf') {
+    extractText = pdfToText;
+  } else if (
+    mimetype === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
+  ) {
+    extractText = wordDocToText;
+  } else if (
+    mimetype === 'application/vnd.ms-excel' ||
+    mimetype === 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'
+  ) {
+    extractText = excelSheetToText;
+  } else {
+    throw new Error(`Unsupported file type in document parser: ${file.mimetype}`);
+  }
+
+  const text = await extractText(file);
+
+  // Whitespace-only output counts as "nothing extracted".
+  const hasContent = Boolean(text?.trim());
+  if (!hasContent) {
+    throw new Error('No text found in document');
+  }
+
+  // Byte length is measured on the UTF-8 encoding, not on the JS string length.
+  const bytes = Buffer.byteLength(text, 'utf8');
+
+  return {
+    filename: file.originalname,
+    bytes,
+    filepath: FileSources.document_parser,
+    text,
+    images: [],
+  };
+}
+
+/**
+ * Parses a PDF and returns the text inside.
+ *
+ * The text items of each page are joined with single spaces, and every page is
+ * terminated with a newline.
+ *
+ * The pdf.js loading task is always destroyed before returning (on success and on
+ * failure) so the parsed document and its worker resources are released instead of
+ * lingering after each upload.
+ *
+ * @param file - Uploaded file; only `path` is read.
+ * @returns The concatenated text of all pages.
+ * @throws If the file cannot be read or pdf.js fails to load or parse it.
+ */
+async function pdfToText(file: Express.Multer.File): Promise<string> {
+  // Imported inline so that Jest can test other routes without failing due to loading ESM
+  const { getDocument } = await import('pdfjs-dist/legacy/build/pdf.mjs');
+
+  const data = new Uint8Array(await fs.promises.readFile(file.path));
+  const loadingTask = getDocument({ data });
+
+  try {
+    const pdf = await loadingTask.promise;
+
+    let fullText = '';
+    for (let pageNumber = 1; pageNumber <= pdf.numPages; pageNumber++) {
+      const page = await pdf.getPage(pageNumber);
+      const textContent = await page.getTextContent();
+      const pageText = textContent.items
+        // Marked-content entries carry a `type` field and no `str`; keep only real text items.
+        .filter((item): item is TextItem => !('type' in item))
+        .map((item) => item.str)
+        .join(' ');
+      fullText += pageText + '\n';
+    }
+
+    return fullText;
+  } finally {
+    // Destroying the loading task also tears down the loaded document.
+    await loadingTask.destroy();
+  }
+}
+
+/**
+ * Parses a Word (DOCX) document and returns the text inside.
+ *
+ * Delegates to mammoth's raw-text extraction, reading the file from `file.path`.
+ * mammoth is imported lazily, on first use.
+ */
+async function wordDocToText(file: Express.Multer.File): Promise<string> {
+  const { extractRawText } = await import('mammoth');
+  const { value } = await extractRawText({ path: file.path });
+  return value;
+}
+
+/**
+ * Parses an Excel workbook (XLS/XLSX) and returns the text inside.
+ *
+ * Each worksheet is emitted as `<sheet name>:` on its own line, followed by the
+ * sheet rendered as CSV and a trailing newline.
+ *
+ * The file is read with `fs.promises.readFile` and handed to SheetJS as a buffer
+ * rather than through `xlsx.readFile()`. This keeps the disk read off the event
+ * loop, and avoids depending on SheetJS having its own `fs` binding, which its
+ * ESM build only has after an explicit `set_fs()` call.
+ *
+ * @param file - Uploaded file; only `path` is read.
+ * @returns The concatenated text of all worksheets, in workbook order.
+ * @throws If the file cannot be read or SheetJS cannot parse it.
+ */
+async function excelSheetToText(file: Express.Multer.File): Promise<string> {
+  const { read, utils } = await import('xlsx');
+  const data = await fs.promises.readFile(file.path);
+  const workbook = read(data, { type: 'buffer' });
+
+  let text = '';
+  for (const sheetName of workbook.SheetNames) {
+    const worksheet = workbook.Sheets[sheetName];
+    const worksheetAsCsvString = utils.sheet_to_csv(worksheet);
+    text += `${sheetName}:\n${worksheetAsCsvString}\n`;
+  }
+
+  return text;
+}
--- a/packages/api/src/files/documents/empty.docx
+++ b/packages/api/src/files/documents/empty.docx
--- a/packages/api/src/files/documents/sample.docx
+++ b/packages/api/src/files/documents/sample.docx
--- a/packages/api/src/files/documents/sample.xls
+++ b/packages/api/src/files/documents/sample.xls
--- a/packages/api/src/files/documents/sample.xlsx
+++ b/packages/api/src/files/documents/sample.xlsx
--- a/packages/api/src/files/index.ts
+++ b/packages/api/src/files/index.ts
@ -1,5 +1,6 @@
 export * from './audio';
 export * from './context';
+export * from './documents/crud';
 export * from './encode';
 export * from './filter';
 export * from './mistral/crud';
--- a/packages/api/src/files/mistral/crud.ts
+++ b/packages/api/src/files/mistral/crud.ts
@ -165,9 +165,11 @@ export async function performOCR({
    config.httpsAgent = new HttpsProxyAgent(process.env.PROXY);
  }

+  const ocrURL = baseURL.endsWith('/ocr') ? baseURL : `${baseURL}/ocr`;
+
  return axios
    .post(
-      `${baseURL}/ocr`,
+      ocrURL,
      {
        model,
        image_limit: 0,