🧯 fix: Add Pre-Parse File Size Guard to Document Parser (#12275)

Prevent memory exhaustion DoS by rejecting documents exceeding 15MB
before reading them into memory, closing the gap between the 512MB
upload limit and unbounded in-memory parsing.
This commit is contained in:
Danny Avila 2026-03-17 02:36:18 -04:00 committed by GitHub
parent 0c378811f1
commit 68435cdcd0
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
2 changed files with 60 additions and 15 deletions

View file

@ -122,6 +122,30 @@ describe('Document Parser', () => {
await expect(parseDocument({ file })).rejects.toThrow('No text found in document');
});
// The parser's size guard reads the `size` reported on the upload object first, so declaring
// 16MB here is enough to trigger the rejection regardless of the bytes on disk at `path`.
test('parseDocument() rejects files exceeding the pre-parse size limit', async () => {
  const declaredSize = 16 * 1024 * 1024;
  const oversizedUpload = {
    originalname: 'oversized.docx',
    path: path.join(__dirname, 'sample.docx'),
    mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    size: declaredSize,
  } as Express.Multer.File;

  const parsing = parseDocument({ file: oversizedUpload });

  // The message must name both the limit and the rounded-up size of the offending file.
  await expect(parsing).rejects.toThrow(/exceeds the 15MB document parser limit \(16MB\)/);
});
// Boundary case: an upload whose reported size is exactly 15 * 1024 * 1024 bytes is still
// expected to parse, because the guard rejects only sizes strictly greater than its limit.
test('parseDocument() allows files exactly at the size limit boundary', async () => {
  const declaredSize = 15 * 1024 * 1024;
  const boundaryUpload = {
    originalname: 'sample.docx',
    path: path.join(__dirname, 'sample.docx'),
    mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    size: declaredSize,
  } as Express.Multer.File;

  const parsing = parseDocument({ file: boundaryUpload });

  await expect(parsing).resolves.toBeDefined();
});
test('parseDocument() parses empty xlsx with only sheet name', async () => {
const file = {
originalname: 'empty.xlsx',

View file

@ -1,35 +1,39 @@
import * as fs from 'fs';
import { excelMimeTypes, FileSources } from 'librechat-data-provider';
import { megabyte, excelMimeTypes, FileSources } from 'librechat-data-provider';
import type { TextItem } from 'pdfjs-dist/types/src/display/api';
import type { MistralOCRUploadResult } from '~/types';
type FileParseFn = (file: Express.Multer.File) => Promise<string>;
const DOCUMENT_PARSER_MAX_FILE_SIZE = 15 * megabyte;
/**
* Parses an uploaded document and extracts its text content and metadata.
* Handled types must stay in sync with `documentParserMimeTypes` from data-provider.
*
* @throws {Error} if `file.mimetype` is not handled or no text is found.
* @throws {Error} if `file.mimetype` is not handled, file exceeds size limit, or no text is found.
*/
export async function parseDocument({
file,
}: {
file: Express.Multer.File;
}): Promise<MistralOCRUploadResult> {
let text: string;
if (file.mimetype === 'application/pdf') {
text = await pdfToText(file);
} else if (
file.mimetype === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
) {
text = await wordDocToText(file);
} else if (
excelMimeTypes.test(file.mimetype) ||
file.mimetype === 'application/vnd.oasis.opendocument.spreadsheet'
) {
text = await excelSheetToText(file);
} else {
const parseFn = getParserForMimeType(file.mimetype);
if (!parseFn) {
throw new Error(`Unsupported file type in document parser: ${file.mimetype}`);
}
const fileSize = file.size ?? (file.path != null ? (await fs.promises.stat(file.path)).size : 0);
if (fileSize > DOCUMENT_PARSER_MAX_FILE_SIZE) {
const limitMB = DOCUMENT_PARSER_MAX_FILE_SIZE / megabyte;
const sizeMB = Math.ceil(fileSize / megabyte);
throw new Error(
`File "${file.originalname}" exceeds the ${limitMB}MB document parser limit (${sizeMB}MB).`,
);
}
const text = await parseFn(file);
if (!text?.trim()) {
throw new Error('No text found in document');
}
@ -43,6 +47,23 @@ export async function parseDocument({
};
}
/**
 * Looks up the parser function responsible for a given MIME type.
 *
 * PDF and Word (`.docx`) types are matched by exact string; spreadsheet types are matched
 * either by the `excelMimeTypes` pattern or by the OpenDocument spreadsheet MIME type.
 *
 * @param mimetype - MIME type reported for the uploaded file.
 * @returns The matching parser, or `undefined` when the type is not supported.
 */
function getParserForMimeType(mimetype: string): FileParseFn | undefined {
  switch (mimetype) {
    case 'application/pdf':
      return pdfToText;
    case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
      return wordDocToText;
    default:
      break;
  }

  // Spreadsheet check runs only after the exact-match types have been ruled out.
  const isSpreadsheet =
    excelMimeTypes.test(mimetype) ||
    mimetype === 'application/vnd.oasis.opendocument.spreadsheet';

  return isSpreadsheet ? excelSheetToText : undefined;
}
/** Parses PDF, returns text inside. */
async function pdfToText(file: Express.Multer.File): Promise<string> {
// Imported inline so that Jest can test other routes without failing due to loading ESM