mirror of
https://github.com/danny-avila/LibreChat.git
synced 2026-03-07 08:40:19 +01:00
🧮 fix: XLSX/XLS Upload-as-Text via Buffer-Based SheetJS Parsing (#12098)
* 🔧 fix: Update Excel sheet parsing to use fs.promises.readFile and correct import for xlsx
  - Modified the excelSheetToText function to read the file using fs.promises.readFile instead of passing the file path directly to the xlsx library.
  - Updated the import statement for the xlsx library to use `read` instead of `readFile`, ensuring Excel sheets are parsed correctly.
* 🔧 fix: Update document parsing methods to use buffer for file reading
  - Modified the wordDocToText function to read the file as a buffer using fs.promises.readFile, ensuring compatibility with the mammoth library.
  - Updated the excelSheetToText function to read the Excel file as a buffer, addressing issues with the xlsx library's handling of dynamic imports and file access.
* feat: Add tests for empty xlsx document parsing and validate xlsx imports
  - Introduced a new test case to verify that the `parseDocument` function correctly handles an empty xlsx file with only a sheet name, ensuring it returns the expected document structure.
  - Added a test to confirm that the `xlsx` library exports `read` and `utils` as named imports, validating the library integration.
  - Included a new empty xlsx file to support the test cases.
This commit is contained in:
parent
5209f1dc9e
commit
3b84cc048a
3 changed files with 30 additions and 3 deletions
|
|
@ -121,4 +121,28 @@ describe('Document Parser', () => {
|
|||
|
||||
await expect(parseDocument({ file })).rejects.toThrow('No text found in document');
|
||||
});
|
||||
|
||||
test('parseDocument() parses empty xlsx with only sheet name', async () => {
|
||||
const file = {
|
||||
originalname: 'empty.xlsx',
|
||||
path: path.join(__dirname, 'empty.xlsx'),
|
||||
mimetype: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
|
||||
} as Express.Multer.File;
|
||||
|
||||
const document = await parseDocument({ file });
|
||||
|
||||
expect(document).toEqual({
|
||||
bytes: 8,
|
||||
filename: 'empty.xlsx',
|
||||
filepath: 'document_parser',
|
||||
images: [],
|
||||
text: 'Empty:\n\n',
|
||||
});
|
||||
});
|
||||
|
||||
test('xlsx exports read and utils as named imports', async () => {
|
||||
const { read, utils } = await import('xlsx');
|
||||
expect(typeof read).toBe('function');
|
||||
expect(typeof utils?.sheet_to_csv).toBe('function');
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@ -68,14 +68,17 @@ async function pdfToText(file: Express.Multer.File): Promise<string> {
|
|||
/** Parses Word document, returns text inside. */
|
||||
async function wordDocToText(file: Express.Multer.File): Promise<string> {
|
||||
const { extractRawText } = await import('mammoth');
|
||||
const rawText = await extractRawText({ path: file.path });
|
||||
const rawText = await extractRawText({ buffer: await fs.promises.readFile(file.path) });
|
||||
return rawText.value;
|
||||
}
|
||||
|
||||
/** Parses Excel sheet, returns text inside. */
|
||||
async function excelSheetToText(file: Express.Multer.File): Promise<string> {
|
||||
const { readFile, utils } = await import('xlsx');
|
||||
const workbook = readFile(file.path);
|
||||
// xlsx CDN build (0.20.x) does not bind fs internally when dynamically imported;
|
||||
// readFile() fails with "Cannot access file". read() takes a pre-loaded Buffer instead.
|
||||
const { read, utils } = await import('xlsx');
|
||||
const data = await fs.promises.readFile(file.path);
|
||||
const workbook = read(data, { type: 'buffer' });
|
||||
|
||||
let text = '';
|
||||
for (const sheetName of workbook.SheetNames) {
|
||||
|
|
|
|||
BIN
packages/api/src/files/documents/empty.xlsx
Normal file
BIN
packages/api/src/files/documents/empty.xlsx
Normal file
Binary file not shown.
Loading…
Add table
Add a link
Reference in a new issue