diff --git a/api/server/services/Files/process.js b/api/server/services/Files/process.js index d69be6a00c..d01128927a 100644 --- a/api/server/services/Files/process.js +++ b/api/server/services/Files/process.js @@ -16,6 +16,7 @@ const { removeNullishValues, isAssistantsEndpoint, getEndpointFileConfig, + documentParserMimeTypes, } = require('librechat-data-provider'); const { EnvVar } = require('@librechat/agents'); const { logger } = require('@librechat/data-schemas'); @@ -559,19 +560,12 @@ const processAgentFileUpload = async ({ req, res, metadata }) => { const fileConfig = mergeFileConfig(appConfig.fileConfig); - const documentParserMimeTypes = [ - 'application/pdf', - 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', - 'application/vnd.ms-excel', - 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', - ]; - const shouldUseConfiguredOCR = appConfig?.ocr != null && fileConfig.checkType(file.mimetype, fileConfig.ocr?.supportedMimeTypes || []); const shouldUseDocumentParser = - !shouldUseConfiguredOCR && documentParserMimeTypes.includes(file.mimetype); + !shouldUseConfiguredOCR && documentParserMimeTypes.some((regex) => regex.test(file.mimetype)); const shouldUseOCR = shouldUseConfiguredOCR || shouldUseDocumentParser; diff --git a/api/server/services/Files/process.spec.js b/api/server/services/Files/process.spec.js index 2938391ff2..7737255a52 100644 --- a/api/server/services/Files/process.spec.js +++ b/api/server/services/Files/process.spec.js @@ -83,6 +83,10 @@ const PDF_MIME = 'application/pdf'; const DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'; const XLSX_MIME = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'; const XLS_MIME = 'application/vnd.ms-excel'; +const ODS_MIME = 'application/vnd.oasis.opendocument.spreadsheet'; +const ODT_MIME = 'application/vnd.oasis.opendocument.text'; +const ODP_MIME = 'application/vnd.oasis.opendocument.presentation'; +const 
ODG_MIME = 'application/vnd.oasis.opendocument.graphics'; const makeReq = ({ mimetype = PDF_MIME, ocrConfig = null } = {}) => ({ user: { id: 'user-123' }, @@ -138,6 +142,9 @@ describe('processAgentFileUpload', () => { ['DOCX', DOCX_MIME], ['XLSX', XLSX_MIME], ['XLS', XLS_MIME], + ['ODS', ODS_MIME], + ['Excel variant (msexcel)', 'application/msexcel'], + ['Excel variant (x-msexcel)', 'application/x-msexcel'], ])('uses document_parser automatically for %s when no OCR is configured', async (_, mime) => { mergeFileConfig.mockReturnValue(makeFileConfig()); const req = makeReq({ mimetype: mime, ocrConfig: null }); @@ -229,6 +236,23 @@ describe('processAgentFileUpload', () => { expect(getStrategyFunctions).not.toHaveBeenCalled(); }); + test.each([ + ['ODT', ODT_MIME], + ['ODP', ODP_MIME], + ['ODG', ODG_MIME], + ])('routes %s through configured OCR when OCR supports the type', async (_, mime) => { + mergeFileConfig.mockReturnValue(makeFileConfig({ ocrSupportedMimeTypes: [mime] })); + const req = makeReq({ + mimetype: mime, + ocrConfig: { strategy: FileSources.mistral_ocr }, + }); + + await processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }); + + expect(checkCapability).toHaveBeenCalledWith(expect.anything(), AgentCapabilities.ocr); + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.mistral_ocr); + }); + test('throws instead of falling back to parseText when document_parser fails for a document MIME type', async () => { getStrategyFunctions.mockReturnValue({ handleFileUpload: jest.fn().mockRejectedValue(new Error('No text found in document')), diff --git a/packages/api/src/files/documents/crud.spec.ts b/packages/api/src/files/documents/crud.spec.ts index 3b9e1636ef..a360b7f760 100644 --- a/packages/api/src/files/documents/crud.spec.ts +++ b/packages/api/src/files/documents/crud.spec.ts @@ -56,6 +56,50 @@ describe('Document Parser', () => { }); }); + test('parseDocument() parses text from ods', async () => { + const file = { + originalname: 
'sample.ods', + path: path.join(__dirname, 'sample.ods'), + mimetype: 'application/vnd.oasis.opendocument.spreadsheet', + } as Express.Multer.File; + + const document = await parseDocument({ file }); + + expect(document).toEqual({ + bytes: 66, + filename: 'sample.ods', + filepath: 'document_parser', + images: [], + text: 'Sheet One:\nData,on,first,sheet\nSecond Sheet:\nData,On\nSecond,Sheet\n', + }); + }); + + test.each([ + 'application/msexcel', + 'application/x-msexcel', + 'application/x-ms-excel', + 'application/x-excel', + 'application/x-dos_ms_excel', + 'application/xls', + 'application/x-xls', + ])('parseDocument() parses xls with variant MIME type: %s', async (mimetype) => { + const file = { + originalname: 'sample.xls', + path: path.join(__dirname, 'sample.xls'), + mimetype, + } as Express.Multer.File; + + const document = await parseDocument({ file }); + + expect(document).toEqual({ + bytes: 31, + filename: 'sample.xls', + filepath: 'document_parser', + images: [], + text: 'Sheet One:\nData,on,first,sheet\n', + }); + }); + test('parseDocument() throws error for unhandled document type', async () => { const file = { originalname: 'nonexistent.file', diff --git a/packages/api/src/files/documents/crud.ts b/packages/api/src/files/documents/crud.ts index f2d45644d4..94a563bc96 100644 --- a/packages/api/src/files/documents/crud.ts +++ b/packages/api/src/files/documents/crud.ts @@ -1,12 +1,13 @@ import * as fs from 'fs'; -import { FileSources } from 'librechat-data-provider'; +import { excelMimeTypes, FileSources } from 'librechat-data-provider'; import type { TextItem } from 'pdfjs-dist/types/src/display/api'; import type { MistralOCRUploadResult } from '~/types'; /** * Parses an uploaded document and extracts its text content and metadata. + * Handled types must stay in sync with `documentParserMimeTypes` from data-provider. * - * Throws an Error if it fails to parse or no text is found. + * @throws {Error} if `file.mimetype` is not handled or no text is found. 
*/ export async function parseDocument({ file, @@ -14,19 +15,19 @@ export async function parseDocument({ file: Express.Multer.File; }): Promise<MistralOCRUploadResult> { let text: string; - switch (file.mimetype) { - case 'application/pdf': - text = await pdfToText(file); - break; - case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': - text = await wordDocToText(file); - break; - case 'application/vnd.ms-excel': - case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': - text = await excelSheetToText(file); - break; - default: - throw new Error(`Unsupported file type in document parser: ${file.mimetype}`); + if (file.mimetype === 'application/pdf') { + text = await pdfToText(file); + } else if ( + file.mimetype === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document' + ) { + text = await wordDocToText(file); + } else if ( + excelMimeTypes.test(file.mimetype) || + file.mimetype === 'application/vnd.oasis.opendocument.spreadsheet' + ) { + text = await excelSheetToText(file); + } else { + throw new Error(`Unsupported file type in document parser: ${file.mimetype}`); } if (!text?.trim()) { diff --git a/packages/api/src/files/documents/sample.ods b/packages/api/src/files/documents/sample.ods new file mode 100644 index 0000000000..81e333dc2e Binary files /dev/null and b/packages/api/src/files/documents/sample.ods differ diff --git a/packages/data-provider/src/file-config.spec.ts b/packages/data-provider/src/file-config.spec.ts index 4b9c866061..018b4dbfcf 100644 --- a/packages/data-provider/src/file-config.spec.ts +++ b/packages/data-provider/src/file-config.spec.ts @@ -3,9 +3,122 @@ import { fileConfig as baseFileConfig, getEndpointFileConfig, mergeFileConfig, + applicationMimeTypes, + defaultOCRMimeTypes, + documentParserMimeTypes, + supportedMimeTypes, } from './file-config'; import { EModelEndpoint } from './schemas'; +describe('applicationMimeTypes', () => { + const odfTypes = [ + 'application/vnd.oasis.opendocument.text', 
+ 'application/vnd.oasis.opendocument.spreadsheet', + 'application/vnd.oasis.opendocument.presentation', + 'application/vnd.oasis.opendocument.graphics', + ]; + + it.each(odfTypes)('matches ODF type: %s', (mimeType) => { + expect(applicationMimeTypes.test(mimeType)).toBe(true); + }); + + const existingTypes = [ + 'application/pdf', + 'application/json', + 'application/csv', + 'application/msword', + 'application/xml', + 'application/zip', + 'application/epub+zip', + 'application/x-tar', + 'application/x-sh', + 'application/typescript', + 'application/sql', + 'application/yaml', + 'application/x-parquet', + 'application/vnd.apache.parquet', + 'application/vnd.coffeescript', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/vnd.openxmlformats-officedocument.presentationml.presentation', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + ]; + + it.each(existingTypes)('matches existing type: %s', (mimeType) => { + expect(applicationMimeTypes.test(mimeType)).toBe(true); + }); + + const invalidTypes = [ + 'application/vnd.oasis.opendocument.text-template', + 'application/vnd.oasis.opendocument.texts', + 'application/vnd.oasis.opendocument.chart', + 'application/vnd.oasis.opendocument.formula', + 'application/vnd.oasis.opendocument.image', + 'application/vnd.oasis.opendocument.text-master', + 'text/plain', + 'image/png', + ]; + + it.each(invalidTypes)('does not match invalid type: %s', (mimeType) => { + expect(applicationMimeTypes.test(mimeType)).toBe(false); + }); +}); + +describe('defaultOCRMimeTypes', () => { + const checkOCRType = (mimeType: string): boolean => + defaultOCRMimeTypes.some((regex) => regex.test(mimeType)); + + it.each([ + 'application/vnd.oasis.opendocument.text', + 'application/vnd.oasis.opendocument.spreadsheet', + 'application/vnd.oasis.opendocument.presentation', + 'application/vnd.oasis.opendocument.graphics', + ])('matches ODF type for OCR: %s', (mimeType) => { + 
expect(checkOCRType(mimeType)).toBe(true); + }); +}); + +describe('supportedMimeTypes', () => { + const checkSupported = (mimeType: string): boolean => + supportedMimeTypes.some((regex) => regex.test(mimeType)); + + it.each([ + 'application/vnd.oasis.opendocument.text', + 'application/vnd.oasis.opendocument.spreadsheet', + 'application/vnd.oasis.opendocument.presentation', + 'application/vnd.oasis.opendocument.graphics', + ])('ODF type flows through supportedMimeTypes: %s', (mimeType) => { + expect(checkSupported(mimeType)).toBe(true); + }); +}); + +describe('documentParserMimeTypes', () => { + const check = (mimeType: string): boolean => + documentParserMimeTypes.some((regex) => regex.test(mimeType)); + + it.each([ + 'application/pdf', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + 'application/vnd.ms-excel', + 'application/msexcel', + 'application/x-msexcel', + 'application/x-ms-excel', + 'application/vnd.oasis.opendocument.spreadsheet', + ])('matches natively parseable type: %s', (mimeType) => { + expect(check(mimeType)).toBe(true); + }); + + it.each([ + 'application/vnd.oasis.opendocument.text', + 'application/vnd.oasis.opendocument.presentation', + 'application/vnd.oasis.opendocument.graphics', + 'text/plain', + 'image/png', + ])('does not match OCR-only or unsupported type: %s', (mimeType) => { + expect(check(mimeType)).toBe(false); + }); +}); + describe('getEndpointFileConfig', () => { describe('custom endpoint lookup', () => { it('should find custom endpoint by direct lookup', () => { diff --git a/packages/data-provider/src/file-config.ts b/packages/data-provider/src/file-config.ts index 5a117eb760..033c868a80 100644 --- a/packages/data-provider/src/file-config.ts +++ b/packages/data-provider/src/file-config.ts @@ -61,6 +61,10 @@ export const fullMimeTypesList = [ 'application/xml', 'application/zip', 'application/x-parquet', + 
'application/vnd.oasis.opendocument.text', + 'application/vnd.oasis.opendocument.spreadsheet', + 'application/vnd.oasis.opendocument.presentation', + 'application/vnd.oasis.opendocument.graphics', 'image/svg', 'image/svg+xml', // Video formats @@ -179,7 +183,7 @@ export const textMimeTypes = /^(text\/(x-c|x-csharp|tab-separated-values|x-c\+\+|x-h|x-java|html|markdown|x-php|x-python|x-script\.python|x-ruby|x-tex|plain|css|vtt|javascript|csv|xml))$/; export const applicationMimeTypes = - /^(application\/(epub\+zip|csv|json|msword|pdf|x-tar|x-sh|typescript|sql|yaml|x-parquet|vnd\.apache\.parquet|vnd\.coffeescript|vnd\.openxmlformats-officedocument\.(wordprocessingml\.document|presentationml\.presentation|spreadsheetml\.sheet)|xml|zip))$/; + /^(application\/(epub\+zip|csv|json|msword|pdf|x-tar|x-sh|typescript|sql|yaml|x-parquet|vnd\.apache\.parquet|vnd\.coffeescript|vnd\.openxmlformats-officedocument\.(wordprocessingml\.document|presentationml\.presentation|spreadsheetml\.sheet)|vnd\.oasis\.opendocument\.(text|spreadsheet|presentation|graphics)|xml|zip))$/; export const imageMimeTypes = /^image\/(jpeg|gif|png|webp|heic|heif)$/; @@ -190,10 +194,20 @@ export const videoMimeTypes = /^video\/(mp4|avi|mov|wmv|flv|webm|mkv|m4v|3gp|ogv export const defaultOCRMimeTypes = [ imageMimeTypes, + excelMimeTypes, /^application\/pdf$/, - /^application\/vnd\.openxmlformats-officedocument\.(wordprocessingml\.document|presentationml\.presentation|spreadsheetml\.sheet)$/, - /^application\/vnd\.ms-(word|powerpoint|excel)$/, + /^application\/vnd\.openxmlformats-officedocument\.(wordprocessingml\.document|presentationml\.presentation)$/, + /^application\/vnd\.ms-(word|powerpoint)$/, /^application\/epub\+zip$/, + /^application\/vnd\.oasis\.opendocument\.(text|spreadsheet|presentation|graphics)$/, +]; + +/** MIME types handled by the built-in document parser (pdf, docx, excel variants, ods) */ +export const documentParserMimeTypes = [ + excelMimeTypes, + /^application\/pdf$/, + 
/^application\/vnd\.openxmlformats-officedocument\.wordprocessingml\.document$/, + /^application\/vnd\.oasis\.opendocument\.spreadsheet$/, ]; export const defaultTextMimeTypes = [/^[\w.-]+\/[\w.-]+$/]; @@ -331,6 +345,10 @@ export const codeTypeMapping: { [key: string]: string } = { tcl: 'text/plain', // .tcl - Tcl source awk: 'text/plain', // .awk - AWK script sed: 'text/plain', // .sed - Sed script + odt: 'application/vnd.oasis.opendocument.text', // .odt - OpenDocument Text + ods: 'application/vnd.oasis.opendocument.spreadsheet', // .ods - OpenDocument Spreadsheet + odp: 'application/vnd.oasis.opendocument.presentation', // .odp - OpenDocument Presentation + odg: 'application/vnd.oasis.opendocument.graphics', // .odg - OpenDocument Graphics }; /** Maps image extensions to MIME types for formats browsers may not recognize */