diff --git a/packages/api/src/files/documents/crud.spec.ts b/packages/api/src/files/documents/crud.spec.ts index a360b7f760..f22693718a 100644 --- a/packages/api/src/files/documents/crud.spec.ts +++ b/packages/api/src/files/documents/crud.spec.ts @@ -121,4 +121,28 @@ describe('Document Parser', () => { await expect(parseDocument({ file })).rejects.toThrow('No text found in document'); }); + + test('parseDocument() parses empty xlsx with only sheet name', async () => { + const file = { + originalname: 'empty.xlsx', + path: path.join(__dirname, 'empty.xlsx'), + mimetype: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + } as Express.Multer.File; + + const document = await parseDocument({ file }); + + expect(document).toEqual({ + bytes: 8, + filename: 'empty.xlsx', + filepath: 'document_parser', + images: [], + text: 'Empty:\n\n', + }); + }); + + test('xlsx exports read and utils as named imports', async () => { + const { read, utils } = await import('xlsx'); + expect(typeof read).toBe('function'); + expect(typeof utils?.sheet_to_csv).toBe('function'); + }); }); diff --git a/packages/api/src/files/documents/crud.ts b/packages/api/src/files/documents/crud.ts index 94a563bc96..ab16534b45 100644 --- a/packages/api/src/files/documents/crud.ts +++ b/packages/api/src/files/documents/crud.ts @@ -68,14 +68,17 @@ async function pdfToText(file: Express.Multer.File): Promise<string> { /** Parses Word document, returns text inside. */ async function wordDocToText(file: Express.Multer.File): Promise<string> { const { extractRawText } = await import('mammoth'); - const rawText = await extractRawText({ path: file.path }); + const rawText = await extractRawText({ buffer: await fs.promises.readFile(file.path) }); return rawText.value; } /** Parses Excel sheet, returns text inside. 
*/ async function excelSheetToText(file: Express.Multer.File): Promise<string> { - const { readFile, utils } = await import('xlsx'); - const workbook = readFile(file.path); + // xlsx CDN build (0.20.x) does not bind fs internally when dynamically imported; + // readFile() fails with "Cannot access file". read() takes a pre-loaded Buffer instead. + const { read, utils } = await import('xlsx'); + const data = await fs.promises.readFile(file.path); + const workbook = read(data, { type: 'buffer' }); let text = ''; for (const sheetName of workbook.SheetNames) { diff --git a/packages/api/src/files/documents/empty.xlsx b/packages/api/src/files/documents/empty.xlsx new file mode 100644 index 0000000000..6e54514f24 Binary files /dev/null and b/packages/api/src/files/documents/empty.xlsx differ