🧩 feat: OpenDocument Format File Upload and Native ODS Parsing (#11959)

* feat: Add support for OpenDocument MIME types in file configuration

Updated the applicationMimeTypes regex to include support for OASIS OpenDocument formats, enhancing the file type recognition capabilities of the data provider.

* feat: Add document processing with OpenDocument support

Added support for OpenDocument Spreadsheet (ODS) MIME type in the file processing service and updated the document parser to handle ODS files. Included tests to verify correct parsing of ODS documents and updated file configuration to recognize OpenDocument formats.

* refactor: Enhance document processing to support additional Excel MIME types

Updated the document processing logic to utilize a regex for matching Excel MIME types, improving flexibility in handling various Excel file formats. Added tests to ensure correct parsing of new MIME types, including multiple Excel variants and OpenDocument formats. Adjusted file configuration to include these MIME types for better recognition in the file processing service.

* feat: Add support for additional OpenDocument MIME types in file processing

Enhanced the document processing service to support ODT, ODP, and ODG MIME types. Updated tests to verify correct routing through the OCR strategy for these new formats. Adjusted documentation to reflect changes in handled MIME types for improved clarity.
This commit is contained in:
Danny Avila 2026-02-26 14:39:49 -05:00 committed by GitHub
parent 3a079b980a
commit 046e92217f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
7 changed files with 220 additions and 26 deletions

View file

@ -16,6 +16,7 @@ const {
removeNullishValues,
isAssistantsEndpoint,
getEndpointFileConfig,
documentParserMimeTypes,
} = require('librechat-data-provider');
const { EnvVar } = require('@librechat/agents');
const { logger } = require('@librechat/data-schemas');
@ -559,19 +560,12 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
const fileConfig = mergeFileConfig(appConfig.fileConfig);
const documentParserMimeTypes = [
'application/pdf',
'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
'application/vnd.ms-excel',
'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
];
const shouldUseConfiguredOCR =
appConfig?.ocr != null &&
fileConfig.checkType(file.mimetype, fileConfig.ocr?.supportedMimeTypes || []);
const shouldUseDocumentParser =
!shouldUseConfiguredOCR && documentParserMimeTypes.includes(file.mimetype);
!shouldUseConfiguredOCR && documentParserMimeTypes.some((regex) => regex.test(file.mimetype));
const shouldUseOCR = shouldUseConfiguredOCR || shouldUseDocumentParser;

View file

@ -83,6 +83,10 @@ const PDF_MIME = 'application/pdf';
const DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document';
const XLSX_MIME = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet';
const XLS_MIME = 'application/vnd.ms-excel';
const ODS_MIME = 'application/vnd.oasis.opendocument.spreadsheet';
const ODT_MIME = 'application/vnd.oasis.opendocument.text';
const ODP_MIME = 'application/vnd.oasis.opendocument.presentation';
const ODG_MIME = 'application/vnd.oasis.opendocument.graphics';
const makeReq = ({ mimetype = PDF_MIME, ocrConfig = null } = {}) => ({
user: { id: 'user-123' },
@ -138,6 +142,9 @@ describe('processAgentFileUpload', () => {
['DOCX', DOCX_MIME],
['XLSX', XLSX_MIME],
['XLS', XLS_MIME],
['ODS', ODS_MIME],
['Excel variant (msexcel)', 'application/msexcel'],
['Excel variant (x-msexcel)', 'application/x-msexcel'],
])('uses document_parser automatically for %s when no OCR is configured', async (_, mime) => {
mergeFileConfig.mockReturnValue(makeFileConfig());
const req = makeReq({ mimetype: mime, ocrConfig: null });
@ -229,6 +236,23 @@ describe('processAgentFileUpload', () => {
expect(getStrategyFunctions).not.toHaveBeenCalled();
});
// When OCR is configured and its supported list contains the uploaded MIME type,
// the upload must check the OCR capability and resolve the configured OCR strategy.
test.each([
  ['ODT', ODT_MIME],
  ['ODP', ODP_MIME],
  ['ODG', ODG_MIME],
])(
  'routes %s through configured OCR when OCR supports the type',
  async (_label, mimetype) => {
    const strategy = FileSources.mistral_ocr;

    // Advertise the MIME type under test as OCR-supported in the merged file config.
    const mergedConfig = makeFileConfig({ ocrSupportedMimeTypes: [mimetype] });
    mergeFileConfig.mockReturnValue(mergedConfig);

    const request = makeReq({ mimetype, ocrConfig: { strategy } });
    await processAgentFileUpload({ req: request, res: mockRes, metadata: makeMetadata() });

    expect(checkCapability).toHaveBeenCalledWith(expect.anything(), AgentCapabilities.ocr);
    expect(getStrategyFunctions).toHaveBeenCalledWith(strategy);
  },
);
test('throws instead of falling back to parseText when document_parser fails for a document MIME type', async () => {
getStrategyFunctions.mockReturnValue({
handleFileUpload: jest.fn().mockRejectedValue(new Error('No text found in document')),

View file

@ -56,6 +56,50 @@ describe('Document Parser', () => {
});
});
// Parses the bundled `sample.ods` fixture and checks the full result object,
// including the per-sheet text layout ("<sheet name>:" followed by comma-joined rows).
test('parseDocument() parses text from ods', async () => {
  const fixtureName = 'sample.ods';
  const upload = {
    originalname: fixtureName,
    path: path.join(__dirname, fixtureName),
    mimetype: 'application/vnd.oasis.opendocument.spreadsheet',
  } as Express.Multer.File;

  const expected = {
    bytes: 66,
    filename: 'sample.ods',
    filepath: 'document_parser',
    images: [],
    text: 'Sheet One:\nData,on,first,sheet\nSecond Sheet:\nData,On\nSecond,Sheet\n',
  };

  const parsed = await parseDocument({ file: upload });

  expect(parsed).toEqual(expected);
});
// The same `sample.xls` fixture is submitted under each alternative Excel MIME
// string; every variant must yield the identical parsed result.
test.each([
  'application/msexcel',
  'application/x-msexcel',
  'application/x-ms-excel',
  'application/x-excel',
  'application/x-dos_ms_excel',
  'application/xls',
  'application/x-xls',
])('parseDocument() parses xls with variant MIME type: %s', async (variantMime) => {
  const fixtureName = 'sample.xls';
  const upload = {
    originalname: fixtureName,
    path: path.join(__dirname, fixtureName),
    mimetype: variantMime,
  } as Express.Multer.File;

  const expected = {
    bytes: 31,
    filename: 'sample.xls',
    filepath: 'document_parser',
    images: [],
    text: 'Sheet One:\nData,on,first,sheet\n',
  };

  const parsed = await parseDocument({ file: upload });

  expect(parsed).toEqual(expected);
});
test('parseDocument() throws error for unhandled document type', async () => {
const file = {
originalname: 'nonexistent.file',

View file

@ -1,12 +1,13 @@
import * as fs from 'fs';
import { FileSources } from 'librechat-data-provider';
import { excelMimeTypes, FileSources } from 'librechat-data-provider';
import type { TextItem } from 'pdfjs-dist/types/src/display/api';
import type { MistralOCRUploadResult } from '~/types';
/**
* Parses an uploaded document and extracts its text content and metadata.
* Handled types must stay in sync with `documentParserMimeTypes` from data-provider.
*
* Throws an Error if it fails to parse or no text is found.
* @throws {Error} if `file.mimetype` is not handled or no text is found.
*/
export async function parseDocument({
file,
@ -14,19 +15,19 @@ export async function parseDocument({
file: Express.Multer.File;
}): Promise<MistralOCRUploadResult> {
let text: string;
switch (file.mimetype) {
case 'application/pdf':
text = await pdfToText(file);
break;
case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document':
text = await wordDocToText(file);
break;
case 'application/vnd.ms-excel':
case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet':
text = await excelSheetToText(file);
break;
default:
throw new Error(`Unsupported file type in document parser: ${file.mimetype}`);
if (file.mimetype === 'application/pdf') {
text = await pdfToText(file);
} else if (
file.mimetype === 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'
) {
text = await wordDocToText(file);
} else if (
excelMimeTypes.test(file.mimetype) ||
file.mimetype === 'application/vnd.oasis.opendocument.spreadsheet'
) {
text = await excelSheetToText(file);
} else {
throw new Error(`Unsupported file type in document parser: ${file.mimetype}`);
}
if (!text?.trim()) {

Binary file not shown.

View file

@ -3,9 +3,122 @@ import {
fileConfig as baseFileConfig,
getEndpointFileConfig,
mergeFileConfig,
applicationMimeTypes,
defaultOCRMimeTypes,
documentParserMimeTypes,
supportedMimeTypes,
} from './file-config';
import { EModelEndpoint } from './schemas';
// Exercises the `applicationMimeTypes` pattern with MIME strings it must accept
// (the four ODF types plus the previously supported types) and ones it must reject.
describe('applicationMimeTypes', () => {
  const isApplicationMime = (candidate: string): boolean => applicationMimeTypes.test(candidate);

  // OpenDocument types expected to match.
  const odfTypes = [
    'application/vnd.oasis.opendocument.text',
    'application/vnd.oasis.opendocument.spreadsheet',
    'application/vnd.oasis.opendocument.presentation',
    'application/vnd.oasis.opendocument.graphics',
  ];

  // Non-ODF application types that must still match.
  const existingTypes = [
    'application/pdf',
    'application/json',
    'application/csv',
    'application/msword',
    'application/xml',
    'application/zip',
    'application/epub+zip',
    'application/x-tar',
    'application/x-sh',
    'application/typescript',
    'application/sql',
    'application/yaml',
    'application/x-parquet',
    'application/vnd.apache.parquet',
    'application/vnd.coffeescript',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'application/vnd.openxmlformats-officedocument.presentationml.presentation',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
  ];

  // Other ODF subtypes, a near-miss with a trailing "s", and non-application types.
  const invalidTypes = [
    'application/vnd.oasis.opendocument.text-template',
    'application/vnd.oasis.opendocument.texts',
    'application/vnd.oasis.opendocument.chart',
    'application/vnd.oasis.opendocument.formula',
    'application/vnd.oasis.opendocument.image',
    'application/vnd.oasis.opendocument.text-master',
    'text/plain',
    'image/png',
  ];

  it.each(odfTypes)('matches ODF type: %s', (candidate) => {
    expect(isApplicationMime(candidate)).toBe(true);
  });

  it.each(existingTypes)('matches existing type: %s', (candidate) => {
    expect(isApplicationMime(candidate)).toBe(true);
  });

  it.each(invalidTypes)('does not match invalid type: %s', (candidate) => {
    expect(isApplicationMime(candidate)).toBe(false);
  });
});
// Confirms that each of the four ODF MIME types is matched by at least one
// pattern in `defaultOCRMimeTypes`.
describe('defaultOCRMimeTypes', () => {
  /** True when any pattern in `defaultOCRMimeTypes` matches `mimeType`. */
  function checkOCRType(mimeType: string): boolean {
    for (const pattern of defaultOCRMimeTypes) {
      if (pattern.test(mimeType)) {
        return true;
      }
    }
    return false;
  }

  const odfTypes = [
    'application/vnd.oasis.opendocument.text',
    'application/vnd.oasis.opendocument.spreadsheet',
    'application/vnd.oasis.opendocument.presentation',
    'application/vnd.oasis.opendocument.graphics',
  ];

  it.each(odfTypes)('matches ODF type for OCR: %s', (candidate) => {
    expect(checkOCRType(candidate)).toBe(true);
  });
});
// Confirms that each of the four ODF MIME types is matched by at least one
// pattern in `supportedMimeTypes`.
describe('supportedMimeTypes', () => {
  /** True when any pattern in `supportedMimeTypes` matches `mimeType`. */
  function checkSupported(mimeType: string): boolean {
    for (const pattern of supportedMimeTypes) {
      if (pattern.test(mimeType)) {
        return true;
      }
    }
    return false;
  }

  const odfTypes = [
    'application/vnd.oasis.opendocument.text',
    'application/vnd.oasis.opendocument.spreadsheet',
    'application/vnd.oasis.opendocument.presentation',
    'application/vnd.oasis.opendocument.graphics',
  ];

  it.each(odfTypes)('ODF type flows through supportedMimeTypes: %s', (candidate) => {
    expect(checkSupported(candidate)).toBe(true);
  });
});
// Pins which MIME types `documentParserMimeTypes` accepts (pdf, docx, xlsx,
// several Excel aliases, ods) and which it must reject (other ODF types and
// non-document types).
describe('documentParserMimeTypes', () => {
  /** True when any pattern in `documentParserMimeTypes` matches `mimeType`. */
  function check(mimeType: string): boolean {
    for (const pattern of documentParserMimeTypes) {
      if (pattern.test(mimeType)) {
        return true;
      }
    }
    return false;
  }

  const parseableTypes = [
    'application/pdf',
    'application/vnd.openxmlformats-officedocument.wordprocessingml.document',
    'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet',
    'application/vnd.ms-excel',
    'application/msexcel',
    'application/x-msexcel',
    'application/x-ms-excel',
    'application/vnd.oasis.opendocument.spreadsheet',
  ];

  const rejectedTypes = [
    'application/vnd.oasis.opendocument.text',
    'application/vnd.oasis.opendocument.presentation',
    'application/vnd.oasis.opendocument.graphics',
    'text/plain',
    'image/png',
  ];

  it.each(parseableTypes)('matches natively parseable type: %s', (candidate) => {
    expect(check(candidate)).toBe(true);
  });

  it.each(rejectedTypes)('does not match OCR-only or unsupported type: %s', (candidate) => {
    expect(check(candidate)).toBe(false);
  });
});
describe('getEndpointFileConfig', () => {
describe('custom endpoint lookup', () => {
it('should find custom endpoint by direct lookup', () => {

View file

@ -61,6 +61,10 @@ export const fullMimeTypesList = [
'application/xml',
'application/zip',
'application/x-parquet',
'application/vnd.oasis.opendocument.text',
'application/vnd.oasis.opendocument.spreadsheet',
'application/vnd.oasis.opendocument.presentation',
'application/vnd.oasis.opendocument.graphics',
'image/svg',
'image/svg+xml',
// Video formats
@ -179,7 +183,7 @@ export const textMimeTypes =
/^(text\/(x-c|x-csharp|tab-separated-values|x-c\+\+|x-h|x-java|html|markdown|x-php|x-python|x-script\.python|x-ruby|x-tex|plain|css|vtt|javascript|csv|xml))$/;
export const applicationMimeTypes =
/^(application\/(epub\+zip|csv|json|msword|pdf|x-tar|x-sh|typescript|sql|yaml|x-parquet|vnd\.apache\.parquet|vnd\.coffeescript|vnd\.openxmlformats-officedocument\.(wordprocessingml\.document|presentationml\.presentation|spreadsheetml\.sheet)|xml|zip))$/;
/^(application\/(epub\+zip|csv|json|msword|pdf|x-tar|x-sh|typescript|sql|yaml|x-parquet|vnd\.apache\.parquet|vnd\.coffeescript|vnd\.openxmlformats-officedocument\.(wordprocessingml\.document|presentationml\.presentation|spreadsheetml\.sheet)|vnd\.oasis\.opendocument\.(text|spreadsheet|presentation|graphics)|xml|zip))$/;
export const imageMimeTypes = /^image\/(jpeg|gif|png|webp|heic|heif)$/;
@ -190,10 +194,20 @@ export const videoMimeTypes = /^video\/(mp4|avi|mov|wmv|flv|webm|mkv|m4v|3gp|ogv
export const defaultOCRMimeTypes = [
imageMimeTypes,
excelMimeTypes,
/^application\/pdf$/,
/^application\/vnd\.openxmlformats-officedocument\.(wordprocessingml\.document|presentationml\.presentation|spreadsheetml\.sheet)$/,
/^application\/vnd\.ms-(word|powerpoint|excel)$/,
/^application\/vnd\.openxmlformats-officedocument\.(wordprocessingml\.document|presentationml\.presentation)$/,
/^application\/vnd\.ms-(word|powerpoint)$/,
/^application\/epub\+zip$/,
/^application\/vnd\.oasis\.opendocument\.(text|spreadsheet|presentation|graphics)$/,
];
/**
 * MIME types handled by the built-in document parser (pdf, docx, excel variants, ods).
 *
 * Each entry is a RegExp; a MIME type is considered parseable when at least one
 * entry matches it, e.g. `documentParserMimeTypes.some((regex) => regex.test(mimetype))`.
 * Keep this list in sync with the MIME types that `parseDocument()` handles.
 * The OpenDocument text, presentation and graphics types are not listed here.
 */
export const documentParserMimeTypes = [
// Shared Excel pattern declared outside this block.
excelMimeTypes,
// PDF
/^application\/pdf$/,
// Word (.docx)
/^application\/vnd\.openxmlformats-officedocument\.wordprocessingml\.document$/,
// OpenDocument Spreadsheet (.ods)
/^application\/vnd\.oasis\.opendocument\.spreadsheet$/,
];
export const defaultTextMimeTypes = [/^[\w.-]+\/[\w.-]+$/];
@ -331,6 +345,10 @@ export const codeTypeMapping: { [key: string]: string } = {
tcl: 'text/plain', // .tcl - Tcl source
awk: 'text/plain', // .awk - AWK script
sed: 'text/plain', // .sed - Sed script
odt: 'application/vnd.oasis.opendocument.text', // .odt - OpenDocument Text
ods: 'application/vnd.oasis.opendocument.spreadsheet', // .ods - OpenDocument Spreadsheet
odp: 'application/vnd.oasis.opendocument.presentation', // .odp - OpenDocument Presentation
odg: 'application/vnd.oasis.opendocument.graphics', // .odg - OpenDocument Graphics
};
/** Maps image extensions to MIME types for formats browsers may not recognize */