mirror of
https://github.com/danny-avila/LibreChat.git
synced 2026-03-22 15:46:33 +01:00
💣 fix: Harden against falsified ZIP metadata in ODT parsing (#12320)
Some checks are pending
Docker Dev Branch Images Build / build (Dockerfile, lc-dev, node) (push) Waiting to run
Docker Dev Branch Images Build / build (Dockerfile.multi, lc-dev-api, api-build) (push) Waiting to run
Publish `@librechat/client` to NPM / build-and-publish (push) Waiting to run
Docker Dev Images Build / build (Dockerfile, librechat-dev, node) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile.multi, librechat-dev-api, api-build) (push) Waiting to run
Sync Locize Translations & Create Translation PR / Sync Translation Keys with Locize (push) Waiting to run
Sync Locize Translations & Create Translation PR / Create Translation PR on Version Published (push) Blocked by required conditions
Some checks are pending
Docker Dev Branch Images Build / build (Dockerfile, lc-dev, node) (push) Waiting to run
Docker Dev Branch Images Build / build (Dockerfile.multi, lc-dev-api, api-build) (push) Waiting to run
Publish `@librechat/client` to NPM / build-and-publish (push) Waiting to run
Docker Dev Images Build / build (Dockerfile, librechat-dev, node) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile.multi, librechat-dev-api, api-build) (push) Waiting to run
Sync Locize Translations & Create Translation PR / Sync Translation Keys with Locize (push) Waiting to run
Sync Locize Translations & Create Translation PR / Create Translation PR on Version Published (push) Blocked by required conditions
* security: replace JSZip metadata guard with yauzl streaming decompression The ODT decompressed-size guard was checking JSZip's private _data.uncompressedSize fields, which are populated from the ZIP central directory — attacker-controlled metadata. A crafted ODT with falsified uncompressedSize values bypassed the 50MB cap entirely, allowing content.xml decompression to exhaust Node.js heap memory (DoS). Replace JSZip with yauzl for ODT extraction. The new extractOdtContentXml function uses yauzl's streaming API: it lazily iterates ZIP entries, opens a decompression stream for content.xml, and counts real bytes as they arrive from the inflate stream. The stream is destroyed the moment the byte count crosses ODT_MAX_DECOMPRESSED_SIZE, aborting the inflate before the full payload is materialised in memory. - Remove jszip from direct dependencies (still transitive via mammoth) - Add yauzl + @types/yauzl - Update zip-bomb test to verify streaming abort with DEFLATE payload * fix: close file descriptor leaks and declare jszip test dependency - Use a shared `finish()` helper in extractOdtContentXml that calls zipfile.close() on every exit path (success, size cap, missing entry, openReadStream errors, zipfile errors). Without this, any error path leaked one OS file descriptor permanently — uploading many malformed ODTs could exhaust the process FD limit (a distinct DoS vector). - Add jszip to devDependencies so the zip-bomb test has an explicit dependency rather than relying on mammoth's transitive jszip. - Update JSDoc to document that all exit paths close the zipfile. * fix: move yauzl from dependencies to peerDependencies Matches the established pattern for runtime parser libraries in packages/api: mammoth, pdfjs-dist, and xlsx are all peerDependencies (provided by the consuming /api workspace) with devDependencies for testing. yauzl was incorrectly placed in dependencies. 
* fix: add yauzl to /api dependencies to satisfy peer dep packages/api declares yauzl as a peerDependency; /api is the consuming workspace that must provide it at runtime, matching the pattern used for mammoth, pdfjs-dist, and xlsx.
This commit is contained in:
parent
ecd6d76bc8
commit
e442984364
5 changed files with 108 additions and 36 deletions
|
|
@ -104,7 +104,7 @@ describe('Document Parser', () => {
|
|||
await expect(parseDocument({ file })).rejects.toThrow('No text found in document');
|
||||
});
|
||||
|
||||
test('parseDocument() throws for odt whose decompressed content exceeds the size limit', async () => {
|
||||
test('parseDocument() aborts decompression when content.xml exceeds the size limit', async () => {
|
||||
const zip = new JSZip();
|
||||
zip.file('mimetype', 'application/vnd.oasis.opendocument.text', { compression: 'STORE' });
|
||||
zip.file('content.xml', 'x'.repeat(51 * 1024 * 1024), { compression: 'DEFLATE' });
|
||||
|
|
@ -118,7 +118,7 @@ describe('Document Parser', () => {
|
|||
path: tmpPath,
|
||||
mimetype: 'application/vnd.oasis.opendocument.text',
|
||||
} as Express.Multer.File;
|
||||
await expect(parseDocument({ file })).rejects.toThrow(/exceeds the 50MB limit/);
|
||||
await expect(parseDocument({ file })).rejects.toThrow(/exceeds the 50MB decompressed limit/);
|
||||
} finally {
|
||||
await fs.promises.unlink(tmpPath);
|
||||
}
|
||||
|
|
|
|||
|
|
@ -1,5 +1,5 @@
|
|||
import * as fs from 'fs';
|
||||
import JSZip from 'jszip';
|
||||
import yauzl from 'yauzl';
|
||||
import { megabyte, excelMimeTypes, FileSources } from 'librechat-data-provider';
|
||||
import type { TextItem } from 'pdfjs-dist/types/src/display/api';
|
||||
import type { MistralOCRUploadResult } from '~/types';
|
||||
|
|
@ -124,28 +124,7 @@ async function excelSheetToText(file: Express.Multer.File): Promise<string> {
|
|||
* text boxes, and annotations are stripped without replacement.
|
||||
*/
|
||||
async function odtToText(file: Express.Multer.File): Promise<string> {
|
||||
const data = await fs.promises.readFile(file.path);
|
||||
const zip = await JSZip.loadAsync(data);
|
||||
|
||||
let totalUncompressed = 0;
|
||||
zip.forEach((_, entry) => {
|
||||
const raw = entry as JSZip.JSZipObject & { _data?: { uncompressedSize?: number } };
|
||||
// _data.uncompressedSize is populated from the ZIP central directory at parse time
|
||||
// by jszip (private internal, jszip@3.x). If the field is absent the guard fails
|
||||
// open (adds 0); this is an accepted limitation of the approach.
|
||||
totalUncompressed += raw._data?.uncompressedSize ?? 0;
|
||||
});
|
||||
if (totalUncompressed > ODT_MAX_DECOMPRESSED_SIZE) {
|
||||
throw new Error(
|
||||
`ODT file decompressed content (${Math.ceil(totalUncompressed / megabyte)}MB) exceeds the ${ODT_MAX_DECOMPRESSED_SIZE / megabyte}MB limit`,
|
||||
);
|
||||
}
|
||||
|
||||
const contentFile = zip.file('content.xml');
|
||||
if (!contentFile) {
|
||||
throw new Error('ODT file is missing content.xml');
|
||||
}
|
||||
const xml = await contentFile.async('string');
|
||||
const xml = await extractOdtContentXml(file.path);
|
||||
const bodyMatch = xml.match(/<office:body[^>]*>([\s\S]*?)<\/office:body>/);
|
||||
if (!bodyMatch) {
|
||||
return '';
|
||||
|
|
@ -168,3 +147,85 @@ async function odtToText(file: Express.Multer.File): Promise<string> {
|
|||
.replace(/\n{3,}/g, '\n\n')
|
||||
.trim();
|
||||
}
|
||||
|
||||
/**
 * Streams content.xml out of an ODT ZIP archive using yauzl, counting real
 * decompressed bytes and aborting mid-inflate if the cap is exceeded.
 * Unlike JSZip metadata checks, this cannot be bypassed by falsifying
 * the ZIP central directory's uncompressedSize fields.
 *
 * The zipfile is closed on all exit paths (success, size cap, missing entry,
 * error) to prevent file descriptor leaks.
 *
 * @param filePath - Path of the ODT file on disk to read.
 * @returns Promise resolving to the UTF-8 decoded text of content.xml;
 *   rejects if the archive cannot be opened, content.xml is absent, a read
 *   stream fails, or the decompressed size crosses ODT_MAX_DECOMPRESSED_SIZE.
 */
function extractOdtContentXml(filePath: string): Promise<string> {
  return new Promise((resolve, reject) => {
    yauzl.open(filePath, { lazyEntries: true }, (err, zipfile) => {
      if (err) {
        return reject(err);
      }
      if (!zipfile) {
        // yauzl's callback types allow an undefined zipfile; guard explicitly.
        return reject(new Error('Failed to open ODT file'));
      }

      let settled = false;
      // Single settlement point: every exit path (success, size cap, missing
      // entry, stream error, zipfile error) funnels through here so the
      // zipfile is closed exactly once and the promise settles exactly once.
      const finish = (error: Error | null, result?: string) => {
        if (settled) {
          return;
        }
        settled = true;
        zipfile.close();
        if (error) {
          reject(error);
        } else {
          resolve(result as string);
        }
      };

      let found = false;
      // With lazyEntries: true, entries are only delivered on explicit
      // readEntry() calls — kick off iteration here.
      zipfile.readEntry();

      zipfile.on('entry', (entry: yauzl.Entry) => {
        if (entry.fileName !== 'content.xml') {
          // Not the entry we want; pull the next one.
          zipfile.readEntry();
          return;
        }
        found = true;
        zipfile.openReadStream(entry, (streamErr, readStream) => {
          if (streamErr) {
            return finish(streamErr);
          }
          if (!readStream) {
            return finish(new Error('Failed to open content.xml stream'));
          }

          let totalBytes = 0;
          const chunks: Buffer[] = [];

          readStream.on('data', (chunk: Buffer) => {
            // Count actual inflated bytes as they arrive — not the
            // attacker-controllable central-directory metadata.
            totalBytes += chunk.byteLength;
            if (totalBytes > ODT_MAX_DECOMPRESSED_SIZE) {
              // destroy(err) aborts the inflate mid-stream before the full
              // payload is materialised; the error surfaces via the stream's
              // 'error' handler below, which settles through finish().
              readStream.destroy(
                new Error(
                  `ODT content.xml exceeds the ${ODT_MAX_DECOMPRESSED_SIZE / megabyte}MB decompressed limit`,
                ),
              );
              return;
            }
            chunks.push(chunk);
          });

          readStream.on('end', () => finish(null, Buffer.concat(chunks).toString('utf8')));
          readStream.on('error', (readErr: Error) => finish(readErr));
        });
      });

      // 'end' fires only after all entries were iterated; reaching it without
      // having found content.xml means the archive lacks the entry.
      zipfile.on('end', () => {
        if (!found) {
          finish(new Error('ODT file is missing content.xml'));
        }
      });

      zipfile.on('error', (zipErr: Error) => finish(zipErr));
    });
  });
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue