From 68435cdcd06da5b694a01ccf719b022e7ae8cd2e Mon Sep 17 00:00:00 2001 From: Danny Avila Date: Tue, 17 Mar 2026 02:36:18 -0400 Subject: [PATCH] =?UTF-8?q?=F0=9F=A7=AF=20fix:=20Add=20Pre-Parse=20File=20?= =?UTF-8?q?Size=20Guard=20to=20Document=20Parser=20(#12275)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit Prevent memory exhaustion DoS by rejecting documents exceeding 15MB before reading them into memory, closing the gap between the 512MB upload limit and unbounded in-memory parsing. --- packages/api/src/files/documents/crud.spec.ts | 24 +++++++++ packages/api/src/files/documents/crud.ts | 51 +++++++++++++------ 2 files changed, 60 insertions(+), 15 deletions(-) diff --git a/packages/api/src/files/documents/crud.spec.ts b/packages/api/src/files/documents/crud.spec.ts index f22693718a..f8b255dd5e 100644 --- a/packages/api/src/files/documents/crud.spec.ts +++ b/packages/api/src/files/documents/crud.spec.ts @@ -122,6 +122,30 @@ describe('Document Parser', () => { await expect(parseDocument({ file })).rejects.toThrow('No text found in document'); }); + test('parseDocument() rejects files exceeding the pre-parse size limit', async () => { + const file = { + originalname: 'oversized.docx', + path: path.join(__dirname, 'sample.docx'), + mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + size: 16 * 1024 * 1024, + } as Express.Multer.File; + + await expect(parseDocument({ file })).rejects.toThrow( + /exceeds the 15MB document parser limit \(16MB\)/, + ); + }); + + test('parseDocument() allows files exactly at the size limit boundary', async () => { + const file = { + originalname: 'sample.docx', + path: path.join(__dirname, 'sample.docx'), + mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + size: 15 * 1024 * 1024, + } as Express.Multer.File; + + await expect(parseDocument({ file })).resolves.toBeDefined(); + }); + test('parseDocument() parses empty xlsx 
with only sheet name', async () => { const file = { originalname: 'empty.xlsx', diff --git a/packages/api/src/files/documents/crud.ts b/packages/api/src/files/documents/crud.ts index ab16534b45..61c1956542 100644 --- a/packages/api/src/files/documents/crud.ts +++ b/packages/api/src/files/documents/crud.ts @@ -1,35 +1,39 @@ import * as fs from 'fs'; -import { excelMimeTypes, FileSources } from 'librechat-data-provider'; +import { megabyte, excelMimeTypes, FileSources } from 'librechat-data-provider'; import type { TextItem } from 'pdfjs-dist/types/src/display/api'; import type { MistralOCRUploadResult } from '~/types'; +type FileParseFn = (file: Express.Multer.File) => Promise<string>; + +const DOCUMENT_PARSER_MAX_FILE_SIZE = 15 * megabyte; + /** * Parses an uploaded document and extracts its text content and metadata. * Handled types must stay in sync with `documentParserMimeTypes` from data-provider. * - * @throws {Error} if `file.mimetype` is not handled or no text is found. + * @throws {Error} if `file.mimetype` is not handled, file exceeds size limit, or no text is found. */ export async function parseDocument({ file, }: { file: Express.Multer.File; }): Promise<MistralOCRUploadResult> { - let text: string; - if (file.mimetype === 'application/pdf') { - text = await pdfToText(file); - } else if ( - file.mimetype === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' - ) { - text = await wordDocToText(file); - } else if ( - excelMimeTypes.test(file.mimetype) || - file.mimetype === 'application/vnd.oasis.opendocument.spreadsheet' - ) { - text = await excelSheetToText(file); - } else { + const parseFn = getParserForMimeType(file.mimetype); + if (!parseFn) { throw new Error(`Unsupported file type in document parser: ${file.mimetype}`); } + const fileSize = file.size ?? (file.path != null ? 
(await fs.promises.stat(file.path)).size : 0); + if (fileSize > DOCUMENT_PARSER_MAX_FILE_SIZE) { + const limitMB = DOCUMENT_PARSER_MAX_FILE_SIZE / megabyte; + const sizeMB = Math.ceil(fileSize / megabyte); + throw new Error( + `File "${file.originalname}" exceeds the ${limitMB}MB document parser limit (${sizeMB}MB).`, + ); + } + + const text = await parseFn(file); + if (!text?.trim()) { throw new Error('No text found in document'); } @@ -43,6 +47,23 @@ export async function parseDocument({ }; } +/** Maps a MIME type to its document parser function, or `undefined` if unsupported. */ +function getParserForMimeType(mimetype: string): FileParseFn | undefined { + if (mimetype === 'application/pdf') { + return pdfToText; + } + if (mimetype === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document') { + return wordDocToText; + } + if ( + excelMimeTypes.test(mimetype) || + mimetype === 'application/vnd.oasis.opendocument.spreadsheet' + ) { + return excelSheetToText; + } + return undefined; +} + /** Parses PDF, returns text inside. */ async function pdfToText(file: Express.Multer.File): Promise<string> { // Imported inline so that Jest can test other routes without failing due to loading ESM