From 3b84cc048a5853dee7355d2286c99b6677b903f5 Mon Sep 17 00:00:00 2001 From: Danny Avila Date: Fri, 6 Mar 2026 00:21:55 -0500 Subject: [PATCH] =?UTF-8?q?=F0=9F=A7=AE=20fix:=20XLSX/XLS=20Upload-as-Text?= =?UTF-8?q?=20via=20Buffer-Based=20SheetJS=20Parsing=20(#12098)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * 🔧 fix: Update Excel sheet parsing to use fs.promises.readFile and correct import for xlsx - Modified the excelSheetToText function to read the file using fs.promises.readFile instead of directly accessing the file path. - Updated the import statement for the xlsx library to use the correct read method, ensuring proper functionality in parsing Excel sheets. * 🔧 fix: Update document parsing methods to use buffer for file reading - Modified the wordDocToText function to read the file as a buffer using fs.promises.readFile, ensuring compatibility with the mammoth library. - Updated the excelSheetToText function to read the Excel file as a buffer, addressing issues with the xlsx library's handling of dynamic imports and file access. * feat: Add tests for empty xlsx document parsing and validate xlsx imports - Introduced a new test case to verify that the `parseDocument` function correctly handles an empty xlsx file with only a sheet name, ensuring it returns the expected document structure. - Added a test to confirm that the `xlsx` library exports `read` and `utils` as named imports, validating the functionality of the library integration. - Included a new empty xlsx file to support the test cases. --- packages/api/src/files/documents/crud.spec.ts | 24 ++++++++++++++++++ packages/api/src/files/documents/crud.ts | 9 ++++--- packages/api/src/files/documents/empty.xlsx | Bin 0 -> 15767 bytes 3 files changed, 30 insertions(+), 3 deletions(-) create mode 100644 packages/api/src/files/documents/empty.xlsx diff --git a/packages/api/src/files/documents/crud.spec.ts b/packages/api/src/files/documents/crud.spec.ts index a360b7f760..f22693718a 100644 --- a/packages/api/src/files/documents/crud.spec.ts +++ b/packages/api/src/files/documents/crud.spec.ts @@ -121,4 +121,28 @@ describe('Document Parser', () => { await expect(parseDocument({ file })).rejects.toThrow('No text found in document'); }); + + test('parseDocument() parses empty xlsx with only sheet name', async () => { + const file = { + originalname: 'empty.xlsx', + path: path.join(__dirname, 'empty.xlsx'), + mimetype: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + } as Express.Multer.File; + + const document = await parseDocument({ file }); + + expect(document).toEqual({ + bytes: 8, + filename: 'empty.xlsx', + filepath: 'document_parser', + images: [], + text: 'Empty:\n\n', + }); + }); + + test('xlsx exports read and utils as named imports', async () => { + const { read, utils } = await import('xlsx'); + expect(typeof read).toBe('function'); + expect(typeof utils?.sheet_to_csv).toBe('function'); + }); }); diff --git a/packages/api/src/files/documents/crud.ts b/packages/api/src/files/documents/crud.ts index 94a563bc96..ab16534b45 100644 --- a/packages/api/src/files/documents/crud.ts +++ b/packages/api/src/files/documents/crud.ts @@ -68,14 +68,17 @@ async function pdfToText(file: Express.Multer.File): Promise { /** Parses Word document, returns text inside. */ async function wordDocToText(file: Express.Multer.File): Promise { const { extractRawText } = await import('mammoth'); - const rawText = await extractRawText({ path: file.path }); + const rawText = await extractRawText({ buffer: await fs.promises.readFile(file.path) }); return rawText.value; } /** Parses Excel sheet, returns text inside. */ async function excelSheetToText(file: Express.Multer.File): Promise { - const { readFile, utils } = await import('xlsx'); - const workbook = readFile(file.path); + // xlsx CDN build (0.20.x) does not bind fs internally when dynamically imported; + // readFile() fails with "Cannot access file". read() takes a pre-loaded Buffer instead. + const { read, utils } = await import('xlsx'); + const data = await fs.promises.readFile(file.path); + const workbook = read(data, { type: 'buffer' }); let text = ''; for (const sheetName of workbook.SheetNames) { diff --git a/packages/api/src/files/documents/empty.xlsx b/packages/api/src/files/documents/empty.xlsx new file mode 100644 index 0000000000000000000000000000000000000000..6e54514f24e3227df81e13fbe30673fc1c91f5d0 GIT binary patch literal 15767 zcmeHO-HRkw6`z{+OsPB8hZqxI2#||8O zj9M1%!=1J#gV6Ha#aexAyrxOl@J!2XFV-GvY>dvQuaMBj$(yoAok%sj;##P5tdui zT;afkFR_7Y4OaZpLQH(!h8?(3NDYP^?2z-w0tQ?|%$vJ)x`e%O)ef{hVZ(MU*Yx%q z@&NX1i!j2A#o9Rkkp=51k{u1e&EakEUyK(cn7b3)XV7k@oz^#|=FYCA^cld8`r+2s z*H_o;DLn=dhJlePUh*`jHfHN93C$dky6vaEIzBx z#Jm)_H>$}9`YZ0#(%yOX`i-kE-?;j_8_zy_8czy9=T?JqC=?ClqRSPhTRmUZ;zub+G4wI5Ui!hVvM9XE0fI*r;Qiez1zvYLK{v80dOqKA3V@rQXRmlcr2x)TUpWakx^BC=^ProD z+ZJN4s_#|d7F4_5^$|4_qb$gNwId5`ZrQ?(kb<}}(&(MHD`my#9<{83TsJKv@ItQ@ zX^&dkim)n0(y*EZL&}iOSq}8ls~R&@U69S@Wo^Z?t46&h_ll5k2*JsUa_7{=8qzL^ z@qW>bgi}SbA&~9l6mvmDohl}GdV!ILTMr{>x-D%_TQ_AGR^eO7B>zL1P@v50!x{J(3@^h!PfoKE7b>?mZZy*F@LdlJJUt>wiB&zaOD%|ML-gkT0KpT+TuI-sWg$kQP)Vu zge|d&m8^jiwUAVk`%_j?JZd$NjxeQCQK2~}%{uEYF`>Tk(eRXw98a35)F8COnP9UR zY(Ga7azY>xEGwz(O}3l&E!)-h7i)9V6Vo-#5dLDVg_SOD9Un>yDHjxW8=Im=6ez1b z)EC)Fvk}i#TU8yK7Xuuo;KqIsMr$JMC>`>vSXH)5Jt_Vtrl-i*PGw(?cP8;T1WB#{XwS zq3HlS2$Lphz`*8iAT;un#hMpH9S>2+?^wo0fO(%|2do7I715AuO@=9YU-gPd$pH9TQ@U%CQ^zitVP{lXin*;0(p!n%04- zb2N#=0K=8-ulO8$P_s=rO5WcJku04$l@I)<6Bp&o7*r{jAp1?^pR&l(Dkd%ppV5;! zGCvS{k!49!iB}XRzqz@%s}r(tlnFRD*Q$usH{yxuggI!jf5d1^^BEueKpOpr`OgXr zTe2C`gxc`75nDAJKPdTWuqKe%WeDO4e9ZR?%Yt=ST)X;3%MbXqs)*-AIKIqDyMS0CDQ1lvh{kyh0 zoeF(Cu^;$b0b5toGkfu^9jXaeSskrm2-0@9WlOWngJk`YNdnw%?iXujZ>gIwRM)p&d_KluZ;07gS|Fz=a6=7C3*nQ>>Fa#7 zYIwF6Xsp9{kxZxL3St#U1@UHe3kMf=Pc<{1GECK=9F(j<^}rYGI#4`F86YpBhNT6} zLP+)94Ls1~M)xGb#hQ!a79N6vI|Lf@R}(8rWZw?gB#NlZO;`pZuJBsz5!W7#@W>)tpe?XR!+f~e@O_WYq|wP@vWk# z9*BOPnxcKnWVM4<+?nuwyLYkcG$js@sqQD%E%%Cka)T)hC1onEe7H!+IGRccH@lXN zG^=75plKdtg5fDX5=~ojLYnphp_pniYosq1YuWv#G_9^PmqKk>d!Cxo7HiqPf|_F8 zXF;J9gkF1BKlkn1Mn>rODHVh&>x@nwk5Y1{7LJfloGcoZtGG-Yhr3iyrUkNvV-^nk z90hS+whv{c_gYeqMlUkOrtcHjHB<+|aIH3ux1_lq1YUqw`A2dCSlmuT0Lvkj#E{)- z;S)I?^gIR#WaVHb^?mK>k!P;mJ~BeTyOjF!B}pgOT0-U2ly+bTaQBkloaCr_%s^~W zRdITGGOC}|1Iy^B!Yef)9hm(EPDiC>_7J=+Lj(wowWTwfwvddK=crsj#Jkel*yA6$ zu(rIlUDFWLce=ihg|sv;r58RbOO zB*UdM3#pt-P8$WExw(vZOTXA@lkjqDsb50n)SULzB5lIiRsjDp&f;keG1 zW+{%QNhFPJaNY5v9-<@VvlMs9Gv+PaaKSC*2nkkNGF+1OgIDf;_X5g`>Gu(YNL63_ z9;qpwIC>51;CkiM4s`>Q&ZwVyRaVEE z7NIiZ(%e!}(tUW@brB+y59-nr6Ha&;TI&1yC$Imr4uhlLM`3iSeB;5Kd}7YUM4k)# z=F}sOyHfV2zT8_zj*YpN%I}wU$cG=?S0mTSj z6j^Lo5em^SwYEe+drc`|%rVR{IarQibxn*kq7GD{2b{;Rf5QC)qdgKleDED{>cVh!;x2?#p9#SC77 z*Q75I2IB3x=9m+X$^+zEW9;!`KoCK1%n>MkU3Y2G3=sOBp_=~QP`-$MF^Z0=c>z(J zKiC-q{+jxDVn7FKu`+Z7!+UU$CE52dnd%`l9ls>nC_G*l_ z)vCmx_G%32RAU^B(wl0W{ed|2rW)spUp?()*46Z5!TD;^{q&@BC#FTHw-+=+=i9G;O}6O zO62|qe-&G%Q~ty-Bl0)&2RdaVE)yspN@WE8r7?lbD9fnjt>g^#na?q5>S*~7Sj$w( zJDM4lFMolll$p83a%0lv4aT&S1QIH`UUn*=Hlo88|y$tbBWBp6NGIlwa p&#=$dbJ&S9meI=@Xomj#iOPI*>-+~lL_lZo?