import path from 'path'; import { parseDocument } from './crud'; describe('Document Parser', () => { test('parseDocument() parses text from docx', async () => { const file = { originalname: 'sample.docx', path: path.join(__dirname, 'sample.docx'), mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', } as Express.Multer.File; const document = await parseDocument({ file }); expect(document).toEqual({ bytes: 29, filename: 'sample.docx', filepath: 'document_parser', images: [], text: 'This is a sample DOCX file.\n\n', }); }); test('parseDocument() parses text from xlsx', async () => { const file = { originalname: 'sample.xlsx', path: path.join(__dirname, 'sample.xlsx'), mimetype: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', } as Express.Multer.File; const document = await parseDocument({ file }); expect(document).toEqual({ bytes: 66, filename: 'sample.xlsx', filepath: 'document_parser', images: [], text: 'Sheet One:\nData,on,first,sheet\nSecond Sheet:\nData,On\nSecond,Sheet\n', }); }); test('parseDocument() parses text from xls', async () => { const file = { originalname: 'sample.xls', path: path.join(__dirname, 'sample.xls'), mimetype: 'application/vnd.ms-excel', } as Express.Multer.File; const document = await parseDocument({ file }); expect(document).toEqual({ bytes: 31, filename: 'sample.xls', filepath: 'document_parser', images: [], text: 'Sheet One:\nData,on,first,sheet\n', }); }); test('parseDocument() parses text from ods', async () => { const file = { originalname: 'sample.ods', path: path.join(__dirname, 'sample.ods'), mimetype: 'application/vnd.oasis.opendocument.spreadsheet', } as Express.Multer.File; const document = await parseDocument({ file }); expect(document).toEqual({ bytes: 66, filename: 'sample.ods', filepath: 'document_parser', images: [], text: 'Sheet One:\nData,on,first,sheet\nSecond Sheet:\nData,On\nSecond,Sheet\n', }); }); test.each([ 'application/msexcel', 'application/x-msexcel', 'application/x-ms-excel', 'application/x-excel', 'application/x-dos_ms_excel', 'application/xls', 'application/x-xls', ])('parseDocument() parses xls with variant MIME type: %s', async (mimetype) => { const file = { originalname: 'sample.xls', path: path.join(__dirname, 'sample.xls'), mimetype, } as Express.Multer.File; const document = await parseDocument({ file }); expect(document).toEqual({ bytes: 31, filename: 'sample.xls', filepath: 'document_parser', images: [], text: 'Sheet One:\nData,on,first,sheet\n', }); }); test('parseDocument() throws error for unhandled document type', async () => { const file = { originalname: 'nonexistent.file', path: path.join(__dirname, 'nonexistent.file'), mimetype: 'application/invalid', } as Express.Multer.File; await expect(parseDocument({ file })).rejects.toThrow( 'Unsupported file type in document parser: application/invalid', ); }); test('parseDocument() throws error for empty document', async () => { const file = { originalname: 'empty.docx', path: path.join(__dirname, 'empty.docx'), mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', } as Express.Multer.File; await expect(parseDocument({ file })).rejects.toThrow('No text found in document'); }); test('parseDocument() rejects files exceeding the pre-parse size limit', async () => { const file = { originalname: 'oversized.docx', path: path.join(__dirname, 'sample.docx'), mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', size: 16 * 1024 * 1024, } as Express.Multer.File; await expect(parseDocument({ file })).rejects.toThrow( /exceeds the 15MB document parser limit \(16MB\)/, ); }); test('parseDocument() allows files exactly at the size limit boundary', async () => { const file = { originalname: 'sample.docx', path: path.join(__dirname, 'sample.docx'), mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', size: 15 * 1024 * 1024, } as Express.Multer.File; await expect(parseDocument({ file })).resolves.toBeDefined(); }); test('parseDocument() parses empty xlsx with only sheet name', async () => { const file = { originalname: 'empty.xlsx', path: path.join(__dirname, 'empty.xlsx'), mimetype: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', } as Express.Multer.File; const document = await parseDocument({ file }); expect(document).toEqual({ bytes: 8, filename: 'empty.xlsx', filepath: 'document_parser', images: [], text: 'Empty:\n\n', }); }); test('xlsx exports read and utils as named imports', async () => { const { read, utils } = await import('xlsx'); expect(typeof read).toBe('function'); expect(typeof utils?.sheet_to_csv).toBe('function'); }); });