diff --git a/api/package.json b/api/package.json index 2255679dae..4416acd1d8 100644 --- a/api/package.json +++ b/api/package.json @@ -113,6 +113,7 @@ "winston": "^3.11.0", "winston-daily-rotate-file": "^5.0.0", "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz", + "yauzl": "^3.2.1", "zod": "^3.22.4" }, "devDependencies": { diff --git a/package-lock.json b/package-lock.json index 35aacac9c2..5d8264f602 100644 --- a/package-lock.json +++ b/package-lock.json @@ -128,6 +128,7 @@ "winston": "^3.11.0", "winston-daily-rotate-file": "^5.0.0", "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz", + "yauzl": "^3.2.1", "zod": "^3.22.4" }, "devDependencies": { @@ -21269,6 +21270,16 @@ "integrity": "sha512-I4q9QU9MQv4oEOz4tAHJtNz1cwuLxn2F3xcc2iV5WdqLPpUnj30aUuxt1mAxYTG+oe8CZMV/+6rU4S4gRDzqtQ==", "dev": true }, + "node_modules/@types/yauzl": { + "version": "2.10.3", + "resolved": "https://registry.npmjs.org/@types/yauzl/-/yauzl-2.10.3.tgz", + "integrity": "sha512-oJoftv0LSuaDZE3Le4DbKX+KS9G36NzOeSap90UIK0yMA/NhKJhqlSGtNDORNRaIbQfzjXDrQa0ytJ6mNRGz/Q==", + "dev": true, + "license": "MIT", + "dependencies": { + "@types/node": "*" + } + }, "node_modules/@typescript-eslint/eslint-plugin": { "version": "8.24.0", "resolved": "https://registry.npmjs.org/@typescript-eslint/eslint-plugin/-/eslint-plugin-8.24.0.tgz", @@ -23080,7 +23091,6 @@ "version": "0.2.13", "resolved": "https://registry.npmjs.org/buffer-crc32/-/buffer-crc32-0.2.13.tgz", "integrity": "sha512-VO9Ht/+p3SN7SKWqcrgEzjGbRSJYTx+Q1pTQC0wrWqHx0vpJraQ6GtHx8tvcg1rlK1byhU5gccxgOgj7B0TDkQ==", - "dev": true, "license": "MIT", "engines": { "node": "*" @@ -35393,7 +35403,6 @@ "version": "1.2.0", "resolved": "https://registry.npmjs.org/pend/-/pend-1.2.0.tgz", "integrity": "sha512-F3asv42UuXchdzt+xXqfW1OGlVBe+mxa2mqI0pg5yAHZPvFmY3Y6drSf/GQ1A86WgWEN9Kzh/WrgKa6iGcHXLg==", - "dev": true, "license": "MIT" }, "node_modules/picocolors": { @@ -43740,7 +43749,6 @@ "version": "3.2.1", "resolved": "https://registry.npmjs.org/yauzl/-/yauzl-3.2.1.tgz", "integrity": "sha512-k1isifdbpNSFEHFJ1ZY4YDewv0IH9FR61lDetaRMD3j2ae3bIXGV+7c+LHCqtQGofSd8PIyV4X6+dHMAnSr60A==", - "dev": true, "license": "MIT", "dependencies": { "buffer-crc32": "~0.2.3", @@ -43802,9 +43810,6 @@ "name": "@librechat/api", "version": "1.7.26", "license": "ISC", - "dependencies": { - "jszip": "^3.10.1" - }, "devDependencies": { "@babel/preset-env": "^7.21.5", "@babel/preset-react": "^7.18.6", @@ -43825,8 +43830,10 @@ "@types/node-fetch": "^2.6.13", "@types/react": "^18.2.18", "@types/winston": "^2.4.4", + "@types/yauzl": "^2.10.3", "jest": "^30.2.0", "jest-junit": "^16.0.0", + "jszip": "^3.10.1", "librechat-data-provider": "*", "mammoth": "^1.11.0", "mongodb": "^6.14.2", @@ -43836,7 +43843,8 @@ "rollup-plugin-peer-deps-external": "^2.2.4", "ts-node": "^10.9.2", "typescript": "^5.0.4", - "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz" + "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz", + "yauzl": "^3.2.1" }, "peerDependencies": { "@anthropic-ai/vertex-sdk": "^0.14.3", @@ -43876,6 +43884,7 @@ "pdfjs-dist": "^5.4.624", "rate-limit-redis": "^4.2.0", "undici": "^7.24.1", + "yauzl": "^3.2.1", "zod": "^3.22.4" } }, diff --git a/packages/api/package.json b/packages/api/package.json index 57675ee371..3a3b3caef6 100644 --- a/packages/api/package.json +++ b/packages/api/package.json @@ -64,7 +64,9 @@ "@types/node-fetch": "^2.6.13", "@types/react": "^18.2.18", "@types/winston": "^2.4.4", + "@types/yauzl": "^2.10.3", "jest": "^30.2.0", + "jszip": "^3.10.1", "jest-junit": "^16.0.0", "librechat-data-provider": "*", "mammoth": "^1.11.0", @@ -75,7 +77,8 @@ "rollup-plugin-peer-deps-external": "^2.2.4", "ts-node": "^10.9.2", "typescript": "^5.0.4", - "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz" + "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz", + "yauzl": "^3.2.1" }, "publishConfig": { "registry": "https://registry.npmjs.org/" @@ -118,9 +121,7 @@ "pdfjs-dist": "^5.4.624", "rate-limit-redis": "^4.2.0", "undici": "^7.24.1", + "yauzl": "^3.2.1", "zod": "^3.22.4" - }, - "dependencies": { - "jszip": "^3.10.1" } } diff --git a/packages/api/src/files/documents/crud.spec.ts b/packages/api/src/files/documents/crud.spec.ts index a1c317279c..2a5086869f 100644 --- a/packages/api/src/files/documents/crud.spec.ts +++ b/packages/api/src/files/documents/crud.spec.ts @@ -104,7 +104,7 @@ describe('Document Parser', () => { await expect(parseDocument({ file })).rejects.toThrow('No text found in document'); }); - test('parseDocument() throws for odt whose decompressed content exceeds the size limit', async () => { + test('parseDocument() aborts decompression when content.xml exceeds the size limit', async () => { const zip = new JSZip(); zip.file('mimetype', 'application/vnd.oasis.opendocument.text', { compression: 'STORE' }); zip.file('content.xml', 'x'.repeat(51 * 1024 * 1024), { compression: 'DEFLATE' }); @@ -118,7 +118,7 @@ describe('Document Parser', () => { path: tmpPath, mimetype: 'application/vnd.oasis.opendocument.text', } as Express.Multer.File; - await expect(parseDocument({ file })).rejects.toThrow(/exceeds the 50MB limit/); + await expect(parseDocument({ file })).rejects.toThrow(/exceeds the 50MB decompressed limit/); } finally { await fs.promises.unlink(tmpPath); } diff --git a/packages/api/src/files/documents/crud.ts b/packages/api/src/files/documents/crud.ts index e255323f77..20d547cf26 100644 --- a/packages/api/src/files/documents/crud.ts +++ b/packages/api/src/files/documents/crud.ts @@ -1,5 +1,5 @@ import * as fs from 'fs'; -import JSZip from 'jszip'; +import yauzl from 'yauzl'; import { megabyte, excelMimeTypes, FileSources } from 'librechat-data-provider'; import type { TextItem } from 'pdfjs-dist/types/src/display/api'; import type { MistralOCRUploadResult } from '~/types'; @@ -124,28 +124,7 @@ async function excelSheetToText(file: Express.Multer.File): Promise { * text boxes, and annotations are stripped without replacement. */ async function odtToText(file: Express.Multer.File): Promise { - const data = await fs.promises.readFile(file.path); - const zip = await JSZip.loadAsync(data); - - let totalUncompressed = 0; - zip.forEach((_, entry) => { - const raw = entry as JSZip.JSZipObject & { _data?: { uncompressedSize?: number } }; - // _data.uncompressedSize is populated from the ZIP central directory at parse time - // by jszip (private internal, jszip@3.x). If the field is absent the guard fails - // open (adds 0); this is an accepted limitation of the approach. - totalUncompressed += raw._data?.uncompressedSize ?? 0; - }); - if (totalUncompressed > ODT_MAX_DECOMPRESSED_SIZE) { - throw new Error( - `ODT file decompressed content (${Math.ceil(totalUncompressed / megabyte)}MB) exceeds the ${ODT_MAX_DECOMPRESSED_SIZE / megabyte}MB limit`, - ); - } - - const contentFile = zip.file('content.xml'); - if (!contentFile) { - throw new Error('ODT file is missing content.xml'); - } - const xml = await contentFile.async('string'); + const xml = await extractOdtContentXml(file.path); const bodyMatch = xml.match(/]*>([\s\S]*?)<\/office:body>/); if (!bodyMatch) { return ''; @@ -168,3 +147,85 @@ async function odtToText(file: Express.Multer.File): Promise { .replace(/\n{3,}/g, '\n\n') .trim(); } + +/** + * Streams content.xml out of an ODT ZIP archive using yauzl, counting real + * decompressed bytes and aborting mid-inflate if the cap is exceeded. + * Unlike JSZip metadata checks, this cannot be bypassed by falsifying + * the ZIP central directory's uncompressedSize fields. + * + * The zipfile is closed on all exit paths (success, size cap, missing entry, + * error) to prevent file descriptor leaks. + */ +function extractOdtContentXml(filePath: string): Promise { + return new Promise((resolve, reject) => { + yauzl.open(filePath, { lazyEntries: true }, (err, zipfile) => { + if (err) { + return reject(err); + } + if (!zipfile) { + return reject(new Error('Failed to open ODT file')); + } + + let settled = false; + const finish = (error: Error | null, result?: string) => { + if (settled) { + return; + } + settled = true; + zipfile.close(); + if (error) { + reject(error); + } else { + resolve(result as string); + } + }; + + let found = false; + zipfile.readEntry(); + + zipfile.on('entry', (entry: yauzl.Entry) => { + if (entry.fileName !== 'content.xml') { + zipfile.readEntry(); + return; + } + found = true; + zipfile.openReadStream(entry, (streamErr, readStream) => { + if (streamErr) { + return finish(streamErr); + } + if (!readStream) { + return finish(new Error('Failed to open content.xml stream')); + } + + let totalBytes = 0; + const chunks: Buffer[] = []; + + readStream.on('data', (chunk: Buffer) => { + totalBytes += chunk.byteLength; + if (totalBytes > ODT_MAX_DECOMPRESSED_SIZE) { + readStream.destroy( + new Error( + `ODT content.xml exceeds the ${ODT_MAX_DECOMPRESSED_SIZE / megabyte}MB decompressed limit`, + ), + ); + return; + } + chunks.push(chunk); + }); + + readStream.on('end', () => finish(null, Buffer.concat(chunks).toString('utf8'))); + readStream.on('error', (readErr: Error) => finish(readErr)); + }); + }); + + zipfile.on('end', () => { + if (!found) { + finish(new Error('ODT file is missing content.xml')); + } + }); + + zipfile.on('error', (zipErr: Error) => finish(zipErr)); + }); + }); +}