mirror of
https://github.com/danny-avila/LibreChat.git
synced 2026-03-17 13:16:34 +01:00
🧯 fix: Add Pre-Parse File Size Guard to Document Parser (#12275)
Prevent memory exhaustion DoS by rejecting documents exceeding 15MB before reading them into memory, closing the gap between the 512MB upload limit and unbounded in-memory parsing.
This commit is contained in:
parent
0c378811f1
commit
68435cdcd0
2 changed files with 60 additions and 15 deletions
|
|
@ -122,6 +122,30 @@ describe('Document Parser', () => {
|
|||
await expect(parseDocument({ file })).rejects.toThrow('No text found in document');
|
||||
});
|
||||
|
||||
// The `size` field is set to 16MB, above the 15MB limit named in the expected error message.
test('parseDocument() rejects files exceeding the pre-parse size limit', async () => {
  const declaredSize = 16 * 1024 * 1024;
  const docxPath = path.join(__dirname, 'sample.docx');
  const file = {
    originalname: 'oversized.docx',
    path: docxPath,
    mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    size: declaredSize,
  } as Express.Multer.File;

  const pending = parseDocument({ file });

  await expect(pending).rejects.toThrow(/exceeds the 15MB document parser limit \(16MB\)/);
});
|
||||
|
||||
// A `size` of exactly 15MB must still be accepted: the call is expected to resolve.
test('parseDocument() allows files exactly at the size limit boundary', async () => {
  const boundarySize = 15 * 1024 * 1024;
  const docxPath = path.join(__dirname, 'sample.docx');
  const file = {
    originalname: 'sample.docx',
    path: docxPath,
    mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    size: boundarySize,
  } as Express.Multer.File;

  const pending = parseDocument({ file });

  await expect(pending).resolves.toBeDefined();
});
|
||||
|
||||
test('parseDocument() parses empty xlsx with only sheet name', async () => {
|
||||
const file = {
|
||||
originalname: 'empty.xlsx',
|
||||
|
|
|
|||
|
|
@ -1,35 +1,39 @@
|
|||
import * as fs from 'fs';
|
||||
import { excelMimeTypes, FileSources } from 'librechat-data-provider';
|
||||
import { megabyte, excelMimeTypes, FileSources } from 'librechat-data-provider';
|
||||
import type { TextItem } from 'pdfjs-dist/types/src/display/api';
|
||||
import type { MistralOCRUploadResult } from '~/types';
|
||||
|
||||
/** Signature of a document parser: takes an uploaded Multer file and resolves to a string. */
type FileParseFn = (file: Express.Multer.File) => Promise<string>;
|
||||
|
||||
/**
 * Largest file size (15 × `megabyte`) that `parseDocument` will hand to a parser.
 * Files above this size are rejected with an error before the parser is invoked.
 */
const DOCUMENT_PARSER_MAX_FILE_SIZE = 15 * megabyte;
|
||||
|
||||
/**
|
||||
* Parses an uploaded document and extracts its text content and metadata.
|
||||
* Handled types must stay in sync with `documentParserMimeTypes` from data-provider.
|
||||
*
|
||||
* @throws {Error} if `file.mimetype` is not handled or no text is found.
|
||||
* @throws {Error} if `file.mimetype` is not handled, file exceeds size limit, or no text is found.
|
||||
*/
|
||||
export async function parseDocument({
|
||||
file,
|
||||
}: {
|
||||
file: Express.Multer.File;
|
||||
}): Promise<MistralOCRUploadResult> {
|
||||
let text: string;
|
||||
if (file.mimetype === 'application/pdf') {
|
||||
text = await pdfToText(file);
|
||||
} else if (
|
||||
file.mimetype === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
|
||||
) {
|
||||
text = await wordDocToText(file);
|
||||
} else if (
|
||||
excelMimeTypes.test(file.mimetype) ||
|
||||
file.mimetype === 'application/vnd.oasis.opendocument.spreadsheet'
|
||||
) {
|
||||
text = await excelSheetToText(file);
|
||||
} else {
|
||||
const parseFn = getParserForMimeType(file.mimetype);
|
||||
if (!parseFn) {
|
||||
throw new Error(`Unsupported file type in document parser: ${file.mimetype}`);
|
||||
}
|
||||
|
||||
const fileSize = file.size ?? (file.path != null ? (await fs.promises.stat(file.path)).size : 0);
|
||||
if (fileSize > DOCUMENT_PARSER_MAX_FILE_SIZE) {
|
||||
const limitMB = DOCUMENT_PARSER_MAX_FILE_SIZE / megabyte;
|
||||
const sizeMB = Math.ceil(fileSize / megabyte);
|
||||
throw new Error(
|
||||
`File "${file.originalname}" exceeds the ${limitMB}MB document parser limit (${sizeMB}MB).`,
|
||||
);
|
||||
}
|
||||
|
||||
const text = await parseFn(file);
|
||||
|
||||
if (!text?.trim()) {
|
||||
throw new Error('No text found in document');
|
||||
}
|
||||
|
|
@ -43,6 +47,23 @@ export async function parseDocument({
|
|||
};
|
||||
}
|
||||
|
||||
/**
 * Selects the document parser function for a MIME type.
 *
 * @param mimetype - MIME type to look up.
 * @returns The matching parse function, or `undefined` when the type is unsupported.
 */
function getParserForMimeType(mimetype: string): FileParseFn | undefined {
  switch (mimetype) {
    case 'application/pdf':
      return pdfToText;
    case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
      return wordDocToText;
    default: {
      // Spreadsheets: anything matched by `excelMimeTypes`, plus OpenDocument spreadsheets.
      const isSpreadsheet =
        excelMimeTypes.test(mimetype) ||
        mimetype === 'application/vnd.oasis.opendocument.spreadsheet';
      return isSpreadsheet ? excelSheetToText : undefined;
    }
  }
}
|
||||
|
||||
/** Parses PDF, returns text inside. */
|
||||
async function pdfToText(file: Express.Multer.File): Promise<string> {
|
||||
// Imported inline so that Jest can test other routes without failing due to loading ESM
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue