🧯 fix: Add Pre-Parse File Size Guard to Document Parser (#12275)

Prevent memory exhaustion DoS by rejecting documents exceeding 15MB before reading them into memory, closing the gap between the 512MB upload limit and unbounded in-memory parsing.
2026-03-17 13:16:34 +01:00 · 2026-03-17 02:36:18 -04:00 · 2026-03-17 02:36:18 -04:00 · 68435cdcd0
commit 68435cdcd0
parent 0c378811f1
2 changed files with 60 additions and 15 deletions
--- a/packages/api/src/files/documents/crud.spec.ts
+++ b/packages/api/src/files/documents/crud.spec.ts
@ -122,6 +122,30 @@ describe('Document Parser', () => {
    await expect(parseDocument({ file })).rejects.toThrow('No text found in document');
  });

+  test('parseDocument() rejects files exceeding the pre-parse size limit', async () => {
+    const file = {
+      originalname: 'oversized.docx',
+      path: path.join(__dirname, 'sample.docx'),
+      mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+      size: 16 * 1024 * 1024, // above the parser limit; the guard reads this reported size
+    } as Express.Multer.File;
+
+    await expect(parseDocument({ file })).rejects.toThrow(
+      /exceeds the 15MB document parser limit \(16MB\)/,
+    );
+  });
+
+  test('parseDocument() allows files exactly at the size limit boundary', async () => {
+    const file = {
+      originalname: 'sample.docx',
+      path: path.join(__dirname, 'sample.docx'),
+      mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
+      size: 15 * 1024 * 1024, // equal to the limit; the guard compares with strict `>`
+    } as Express.Multer.File;
+
+    await expect(parseDocument({ file })).resolves.toBeDefined();
+  });
+
  test('parseDocument() parses empty xlsx with only sheet name', async () => {
    const file = {
      originalname: 'empty.xlsx',
--- a/packages/api/src/files/documents/crud.ts
+++ b/packages/api/src/files/documents/crud.ts
@ -1,35 +1,39 @@
 import * as fs from 'fs';
-import { excelMimeTypes, FileSources } from 'librechat-data-provider';
+import { megabyte, excelMimeTypes, FileSources } from 'librechat-data-provider';
 import type { TextItem } from 'pdfjs-dist/types/src/display/api';
 import type { MistralOCRUploadResult } from '~/types';

+type FileParseFn = (file: Express.Multer.File) => Promise<string>; // shared parser signature
+
+const DOCUMENT_PARSER_MAX_FILE_SIZE = 15 * megabyte; // checked before any parser is invoked
+
 /**
 * Parses an uploaded document and extracts its text content and metadata.
 * Handled types must stay in sync with `documentParserMimeTypes` from data-provider.
 *
- * @throws {Error} if `file.mimetype` is not handled or no text is found.
+ * @throws {Error} if `file.mimetype` is not handled, file exceeds size limit, or no text is found.
 */
 export async function parseDocument({
  file,
 }: {
  file: Express.Multer.File;
 }): Promise<MistralOCRUploadResult> {
-  let text: string;
-  if (file.mimetype === 'application/pdf') {
-    text = await pdfToText(file);
-  } else if (
-    file.mimetype === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
-  ) {
-    text = await wordDocToText(file);
-  } else if (
-    excelMimeTypes.test(file.mimetype) ||
-    file.mimetype === 'application/vnd.oasis.opendocument.spreadsheet'
-  ) {
-    text = await excelSheetToText(file);
-  } else {
+  const parseFn = getParserForMimeType(file.mimetype);
+  if (!parseFn) {
    throw new Error(`Unsupported file type in document parser: ${file.mimetype}`);
  }

+  const fileSize = file.size ?? (file.path != null ? (await fs.promises.stat(file.path)).size : 0);
+  if (fileSize > DOCUMENT_PARSER_MAX_FILE_SIZE) {
+    const limitMB = DOCUMENT_PARSER_MAX_FILE_SIZE / megabyte;
+    const sizeMB = Math.ceil(fileSize / megabyte);
+    throw new Error(
+      `File "${file.originalname}" exceeds the ${limitMB}MB document parser limit (${sizeMB}MB).`,
+    );
+  }
+
+  const text = await parseFn(file);
+
  if (!text?.trim()) {
    throw new Error('No text found in document');
  }
@ -43,6 +47,23 @@ export async function parseDocument({
  };
 }

+/** Selects the parser function for a MIME type without invoking it; `undefined` if unsupported. */
+function getParserForMimeType(mimetype: string): FileParseFn | undefined {
+  if (mimetype === 'application/pdf') {
+    return pdfToText;
+  }
+  if (mimetype === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') {
+    return wordDocToText;
+  }
+  if (
+    excelMimeTypes.test(mimetype) || // matcher imported from librechat-data-provider
+    mimetype === 'application/vnd.oasis.opendocument.spreadsheet'
+  ) {
+    return excelSheetToText; // same parser serves Excel and OpenDocument spreadsheets
+  }
+  return undefined;
+}
+
 /** Parses PDF, returns text inside. */
 async function pdfToText(file: Express.Multer.File): Promise<string> {
  // Imported inline so that Jest can test other routes without failing due to loading ESM