diff --git a/api/package.json b/api/package.json index 4542e25745..9951b6f01a 100644 --- a/api/package.json +++ b/api/package.json @@ -80,6 +80,7 @@ "klona": "^2.0.6", "librechat-data-provider": "*", "lodash": "^4.17.23", + "mammoth": "^1.11.0", "mathjs": "^15.1.0", "meilisearch": "^0.38.0", "memorystore": "^1.6.7", @@ -102,6 +103,7 @@ "passport-jwt": "^4.0.1", "passport-ldapauth": "^3.0.1", "passport-local": "^1.0.0", + "pdfjs-dist": "^5.4.624", "rate-limit-redis": "^4.2.0", "sharp": "^0.33.5", "tiktoken": "^1.0.15", @@ -110,6 +112,7 @@ "undici": "^7.18.2", "winston": "^3.11.0", "winston-daily-rotate-file": "^5.0.0", + "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz", "zod": "^3.22.4" }, "devDependencies": { diff --git a/api/server/services/Files/process.js b/api/server/services/Files/process.js index 30b47f2e52..d69be6a00c 100644 --- a/api/server/services/Files/process.js +++ b/api/server/services/Files/process.js @@ -523,6 +523,12 @@ const processAgentFileUpload = async ({ req, res, metadata }) => { * @return {Promise} */ const createTextFile = async ({ text, bytes, filepath, type = 'text/plain' }) => { + const textBytes = Buffer.byteLength(text, 'utf8'); + if (textBytes > 15 * megabyte) { + throw new Error( + `Extracted text from "${file.originalname}" exceeds the 15MB storage limit (${Math.round(textBytes / megabyte)}MB). Try a shorter document.`, + ); + } const fileInfo = removeNullishValues({ text, bytes, @@ -553,29 +559,59 @@ const processAgentFileUpload = async ({ req, res, metadata }) => { const fileConfig = mergeFileConfig(appConfig.fileConfig); - const shouldUseOCR = + const documentParserMimeTypes = [ + 'application/pdf', + 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + 'application/vnd.ms-excel', + 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + ]; + + const shouldUseConfiguredOCR = appConfig?.ocr != null && fileConfig.checkType(file.mimetype, fileConfig.ocr?.supportedMimeTypes || []); - if (shouldUseOCR && !(await checkCapability(req, AgentCapabilities.ocr))) { - throw new Error('OCR capability is not enabled for Agents'); - } else if (shouldUseOCR) { + const shouldUseDocumentParser = + !shouldUseConfiguredOCR && documentParserMimeTypes.includes(file.mimetype); + + const shouldUseOCR = shouldUseConfiguredOCR || shouldUseDocumentParser; + + const resolveDocumentText = async () => { + if (shouldUseConfiguredOCR) { + try { + const ocrStrategy = appConfig?.ocr?.strategy ?? FileSources.document_parser; + const { handleFileUpload } = getStrategyFunctions(ocrStrategy); + return await handleFileUpload({ req, file, loadAuthValues }); + } catch (err) { + logger.error( + `[processAgentFileUpload] Configured OCR failed for "${file.originalname}", falling back to document_parser:`, + err, + ); + } + } try { - const { handleFileUpload: uploadOCR } = getStrategyFunctions( - appConfig?.ocr?.strategy ?? FileSources.mistral_ocr, - ); - const { - text, - bytes, - filepath: ocrFileURL, - } = await uploadOCR({ req, file, loadAuthValues }); - return await createTextFile({ text, bytes, filepath: ocrFileURL }); - } catch (ocrError) { + const { handleFileUpload } = getStrategyFunctions(FileSources.document_parser); + return await handleFileUpload({ req, file, loadAuthValues }); + } catch (err) { logger.error( - `[processAgentFileUpload] OCR processing failed for file "${file.originalname}", falling back to text extraction:`, - ocrError, + `[processAgentFileUpload] Document parser failed for "${file.originalname}":`, + err, ); } + }; + + if (shouldUseConfiguredOCR && !(await checkCapability(req, AgentCapabilities.ocr))) { + throw new Error('OCR capability is not enabled for Agents'); + } + + if (shouldUseOCR) { + const ocrResult = await resolveDocumentText(); + if (ocrResult) { + const { text, bytes, filepath: ocrFileURL } = ocrResult; + return await createTextFile({ text, bytes, filepath: ocrFileURL }); + } + throw new Error( + `Unable to extract text from "${file.originalname}". The document may be image-based and requires an OCR service to process.`, + ); } const shouldUseSTT = fileConfig.checkType( diff --git a/api/server/services/Files/process.spec.js b/api/server/services/Files/process.spec.js new file mode 100644 index 0000000000..2938391ff2 --- /dev/null +++ b/api/server/services/Files/process.spec.js @@ -0,0 +1,323 @@ +jest.mock('uuid', () => ({ v4: jest.fn(() => 'mock-uuid') })); + +jest.mock('@librechat/data-schemas', () => ({ + logger: { warn: jest.fn(), debug: jest.fn(), error: jest.fn() }, +})); + +jest.mock('@librechat/agents', () => ({ + EnvVar: { CODE_API_KEY: 'CODE_API_KEY' }, +})); + +jest.mock('@librechat/api', () => ({ + sanitizeFilename: jest.fn((n) => n), + parseText: jest.fn().mockResolvedValue({ text: '', bytes: 0 }), + processAudioFile: jest.fn(), +})); + +jest.mock('librechat-data-provider', () => ({ + ...jest.requireActual('librechat-data-provider'), + mergeFileConfig: jest.fn(), +})); + +jest.mock('~/server/services/Files/images', () => ({ + convertImage: jest.fn(), + resizeAndConvert: jest.fn(), + resizeImageBuffer: jest.fn(), +})); + +jest.mock('~/server/controllers/assistants/v2', () => ({ + addResourceFileId: jest.fn(), + deleteResourceFileId: jest.fn(), +})); + +jest.mock('~/models/Agent', () => ({ + addAgentResourceFile: jest.fn().mockResolvedValue({}), + removeAgentResourceFiles: jest.fn(), +})); + +jest.mock('~/server/controllers/assistants/helpers', () => ({ + getOpenAIClient: jest.fn(), +})); + +jest.mock('~/server/services/Tools/credentials', () => ({ + loadAuthValues: jest.fn(), +})); + +jest.mock('~/models', () => ({ + createFile: jest.fn().mockResolvedValue({ file_id: 'created-file-id' }), + updateFileUsage: jest.fn(), + deleteFiles: jest.fn(), +})); + +jest.mock('~/server/utils/getFileStrategy', () => ({ + getFileStrategy: jest.fn().mockReturnValue('local'), +})); + +jest.mock('~/server/services/Config', () => ({ + checkCapability: jest.fn().mockResolvedValue(true), +})); + +jest.mock('~/server/utils/queue', () => ({ + LB_QueueAsyncCall: jest.fn(), +})); + +jest.mock('~/server/services/Files/strategies', () => ({ + getStrategyFunctions: jest.fn(), +})); + +jest.mock('~/server/utils', () => ({ + determineFileType: jest.fn(), +})); + +jest.mock('~/server/services/Files/Audio/STTService', () => ({ + STTService: { getInstance: jest.fn() }, +})); + +const { EToolResources, FileSources, AgentCapabilities } = require('librechat-data-provider'); +const { mergeFileConfig } = require('librechat-data-provider'); +const { checkCapability } = require('~/server/services/Config'); +const { getStrategyFunctions } = require('~/server/services/Files/strategies'); +const { processAgentFileUpload } = require('./process'); + +const PDF_MIME = 'application/pdf'; +const DOCX_MIME = 'application/vnd.openxmlformats-officedocument.wordprocessingml.document'; +const XLSX_MIME = 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet'; +const XLS_MIME = 'application/vnd.ms-excel'; + +const makeReq = ({ mimetype = PDF_MIME, ocrConfig = null } = {}) => ({ + user: { id: 'user-123' }, + file: { + path: '/tmp/upload.bin', + originalname: 'upload.bin', + filename: 'upload-uuid.bin', + mimetype, + }, + body: { model: 'gpt-4o' }, + config: { + fileConfig: {}, + fileStrategy: 'local', + ocr: ocrConfig, + }, +}); + +const makeMetadata = () => ({ + agent_id: 'agent-abc', + tool_resource: EToolResources.context, + file_id: 'file-uuid-123', +}); + +const mockRes = { + status: jest.fn().mockReturnThis(), + json: jest.fn().mockReturnValue({}), +}; + +const makeFileConfig = ({ ocrSupportedMimeTypes = [] } = {}) => ({ + checkType: (mime, types) => (types ?? []).includes(mime), + ocr: { supportedMimeTypes: ocrSupportedMimeTypes }, + stt: { supportedMimeTypes: [] }, + text: { supportedMimeTypes: [] }, +}); + +describe('processAgentFileUpload', () => { + beforeEach(() => { + jest.clearAllMocks(); + mockRes.status.mockReturnThis(); + mockRes.json.mockReturnValue({}); + checkCapability.mockResolvedValue(true); + getStrategyFunctions.mockReturnValue({ + handleFileUpload: jest + .fn() + .mockResolvedValue({ text: 'extracted text', bytes: 42, filepath: 'doc://result' }), + }); + mergeFileConfig.mockReturnValue(makeFileConfig()); + }); + + describe('OCR strategy selection', () => { + test.each([ + ['PDF', PDF_MIME], + ['DOCX', DOCX_MIME], + ['XLSX', XLSX_MIME], + ['XLS', XLS_MIME], + ])('uses document_parser automatically for %s when no OCR is configured', async (_, mime) => { + mergeFileConfig.mockReturnValue(makeFileConfig()); + const req = makeReq({ mimetype: mime, ocrConfig: null }); + + await processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }); + + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.document_parser); + }); + + test('does not check OCR capability when using automatic document_parser fallback', async () => { + const req = makeReq({ mimetype: PDF_MIME, ocrConfig: null }); + + await processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }); + + expect(checkCapability).not.toHaveBeenCalledWith(expect.anything(), AgentCapabilities.ocr); + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.document_parser); + }); + + test('uses the configured OCR strategy when OCR is set up for the file type', async () => { + mergeFileConfig.mockReturnValue(makeFileConfig({ ocrSupportedMimeTypes: [PDF_MIME] })); + const req = makeReq({ + mimetype: PDF_MIME, + ocrConfig: { strategy: FileSources.mistral_ocr }, + }); + + await processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }); + + expect(checkCapability).toHaveBeenCalledWith(expect.anything(), AgentCapabilities.ocr); + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.mistral_ocr); + }); + + test('uses document_parser as default when OCR is configured but no strategy is specified', async () => { + mergeFileConfig.mockReturnValue(makeFileConfig({ ocrSupportedMimeTypes: [PDF_MIME] })); + const req = makeReq({ + mimetype: PDF_MIME, + ocrConfig: { supportedMimeTypes: [PDF_MIME] }, + }); + + await processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }); + + expect(checkCapability).toHaveBeenCalledWith(expect.anything(), AgentCapabilities.ocr); + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.document_parser); + }); + + test('throws when configured OCR capability is not enabled for the agent', async () => { + mergeFileConfig.mockReturnValue(makeFileConfig({ ocrSupportedMimeTypes: [PDF_MIME] })); + checkCapability.mockResolvedValue(false); + const req = makeReq({ + mimetype: PDF_MIME, + ocrConfig: { strategy: FileSources.mistral_ocr }, + }); + + await expect( + processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }), + ).rejects.toThrow('OCR capability is not enabled for Agents'); + }); + + test('uses document_parser (no capability check) when OCR capability returns false but no OCR config', async () => { + checkCapability.mockResolvedValue(false); + const req = makeReq({ mimetype: PDF_MIME, ocrConfig: null }); + + await processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }); + + expect(checkCapability).not.toHaveBeenCalledWith(expect.anything(), AgentCapabilities.ocr); + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.document_parser); + }); + + test('uses document_parser when OCR is configured but the file type is not in OCR supported types', async () => { + mergeFileConfig.mockReturnValue(makeFileConfig({ ocrSupportedMimeTypes: [PDF_MIME] })); + const req = makeReq({ + mimetype: DOCX_MIME, + ocrConfig: { strategy: FileSources.mistral_ocr }, + }); + + await processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }); + + expect(checkCapability).not.toHaveBeenCalledWith(expect.anything(), AgentCapabilities.ocr); + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.document_parser); + expect(getStrategyFunctions).not.toHaveBeenCalledWith(FileSources.mistral_ocr); + }); + + test('does not invoke any OCR strategy for unsupported MIME types without OCR config', async () => { + const req = makeReq({ mimetype: 'text/plain', ocrConfig: null }); + + await expect( + processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }), + ).rejects.toThrow('File type text/plain is not supported for text parsing.'); + + expect(getStrategyFunctions).not.toHaveBeenCalled(); + }); + + test('throws instead of falling back to parseText when document_parser fails for a document MIME type', async () => { + getStrategyFunctions.mockReturnValue({ + handleFileUpload: jest.fn().mockRejectedValue(new Error('No text found in document')), + }); + const req = makeReq({ mimetype: PDF_MIME, ocrConfig: null }); + const { parseText } = require('@librechat/api'); + + await expect( + processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }), + ).rejects.toThrow(/image-based and requires an OCR service/); + + expect(parseText).not.toHaveBeenCalled(); + }); + + test('falls back to document_parser when configured OCR fails for a document MIME type', async () => { + mergeFileConfig.mockReturnValue(makeFileConfig({ ocrSupportedMimeTypes: [PDF_MIME] })); + const failingUpload = jest.fn().mockRejectedValue(new Error('OCR API returned 500')); + const fallbackUpload = jest + .fn() + .mockResolvedValue({ text: 'parsed text', bytes: 11, filepath: 'doc://result' }); + getStrategyFunctions + .mockReturnValueOnce({ handleFileUpload: failingUpload }) + .mockReturnValueOnce({ handleFileUpload: fallbackUpload }); + const req = makeReq({ + mimetype: PDF_MIME, + ocrConfig: { strategy: FileSources.mistral_ocr }, + }); + + await expect( + processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }), + ).resolves.not.toThrow(); + + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.mistral_ocr); + expect(getStrategyFunctions).toHaveBeenCalledWith(FileSources.document_parser); + }); + + test('throws when both configured OCR and document_parser fallback fail', async () => { + mergeFileConfig.mockReturnValue(makeFileConfig({ ocrSupportedMimeTypes: [PDF_MIME] })); + getStrategyFunctions.mockReturnValue({ + handleFileUpload: jest.fn().mockRejectedValue(new Error('failure')), + }); + const req = makeReq({ + mimetype: PDF_MIME, + ocrConfig: { strategy: FileSources.mistral_ocr }, + }); + const { parseText } = require('@librechat/api'); + + await expect( + processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }), + ).rejects.toThrow(/image-based and requires an OCR service/); + + expect(parseText).not.toHaveBeenCalled(); + }); + }); + + describe('text size guard', () => { + test('throws before writing to MongoDB when extracted text exceeds 15MB', async () => { + const oversizedText = 'x'.repeat(15 * 1024 * 1024 + 1); + getStrategyFunctions.mockReturnValue({ + handleFileUpload: jest.fn().mockResolvedValue({ + text: oversizedText, + bytes: Buffer.byteLength(oversizedText, 'utf8'), + filepath: 'doc://result', + }), + }); + const req = makeReq({ mimetype: PDF_MIME, ocrConfig: null }); + const { createFile } = require('~/models'); + + await expect( + processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }), + ).rejects.toThrow(/exceeds the 15MB storage limit/); + + expect(createFile).not.toHaveBeenCalled(); + }); + + test('succeeds when extracted text is within the 15MB limit', async () => { + const okText = 'x'.repeat(1024); + getStrategyFunctions.mockReturnValue({ + handleFileUpload: jest.fn().mockResolvedValue({ + text: okText, + bytes: Buffer.byteLength(okText, 'utf8'), + filepath: 'doc://result', + }), + }); + const req = makeReq({ mimetype: PDF_MIME, ocrConfig: null }); + + await expect( + processAgentFileUpload({ req, res: mockRes, metadata: makeMetadata() }), + ).resolves.not.toThrow(); + }); + }); +}); diff --git a/api/server/services/Files/strategies.js b/api/server/services/Files/strategies.js index 2ad526194b..25341b5715 100644 --- a/api/server/services/Files/strategies.js +++ b/api/server/services/Files/strategies.js @@ -1,5 +1,6 @@ const { FileSources } = require('librechat-data-provider'); const { + parseDocument, uploadMistralOCR, uploadAzureMistralOCR, uploadGoogleVertexMistralOCR, @@ -246,6 +247,26 @@ const vertexMistralOCRStrategy = () => ({ handleFileUpload: uploadGoogleVertexMistralOCR, }); +const documentParserStrategy = () => ({ + /** @type {typeof saveFileFromURL | null} */ + saveURL: null, + /** @type {typeof getLocalFileURL | null} */ + getFileURL: null, + /** @type {typeof saveLocalBuffer | null} */ + saveBuffer: null, + /** @type {typeof processLocalAvatar | null} */ + processAvatar: null, + /** @type {typeof uploadLocalImage | null} */ + handleImageUpload: null, + /** @type {typeof prepareImagesLocal | null} */ + prepareImagePayload: null, + /** @type {typeof deleteLocalFile | null} */ + deleteFile: null, + /** @type {typeof getLocalFileStream | null} */ + getDownloadStream: null, + handleFileUpload: parseDocument, +}); + // Strategy Selector const getStrategyFunctions = (fileSource) => { if (fileSource === FileSources.firebase) { @@ -270,6 +291,8 @@ const getStrategyFunctions = (fileSource) => { return azureMistralOCRStrategy(); } else if (fileSource === FileSources.vertexai_mistral_ocr) { return vertexMistralOCRStrategy(); + } else if (fileSource === FileSources.document_parser) { + return documentParserStrategy(); } else if (fileSource === FileSources.text) { return localStrategy(); // Text files use local strategy } else { diff --git a/package-lock.json b/package-lock.json index 4bca60d435..04f8251dd6 100644 --- a/package-lock.json +++ b/package-lock.json @@ -95,6 +95,7 @@ "klona": "^2.0.6", "librechat-data-provider": "*", "lodash": "^4.17.23", + "mammoth": "^1.11.0", "mathjs": "^15.1.0", "meilisearch": "^0.38.0", "memorystore": "^1.6.7", @@ -117,6 +118,7 @@ "passport-jwt": "^4.0.1", "passport-ldapauth": "^3.0.1", "passport-local": "^1.0.0", + "pdfjs-dist": "^5.4.530", "rate-limit-redis": "^4.2.0", "sharp": "^0.33.5", "tiktoken": "^1.0.15", @@ -125,6 +127,7 @@ "undici": "^7.18.2", "winston": "^3.11.0", "winston-daily-rotate-file": "^5.0.0", + "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz", "zod": "^3.22.4" }, "devDependencies": { @@ -11380,6 +11383,256 @@ "sparse-bitfield": "^3.0.3" } }, + "node_modules/@napi-rs/canvas": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas/-/canvas-0.1.88.tgz", + "integrity": "sha512-/p08f93LEbsL5mDZFQ3DBxcPv/I4QG9EDYRRq1WNlCOXVfAHBTHMSVMwxlqG/AtnSfUr9+vgfN7MKiyDo0+Weg==", + "license": "MIT", + "optional": true, + "workspaces": [ + "e2e/*" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + }, + "optionalDependencies": { + "@napi-rs/canvas-android-arm64": "0.1.88", + "@napi-rs/canvas-darwin-arm64": "0.1.88", + "@napi-rs/canvas-darwin-x64": "0.1.88", + "@napi-rs/canvas-linux-arm-gnueabihf": "0.1.88", + "@napi-rs/canvas-linux-arm64-gnu": "0.1.88", + "@napi-rs/canvas-linux-arm64-musl": "0.1.88", + "@napi-rs/canvas-linux-riscv64-gnu": "0.1.88", + "@napi-rs/canvas-linux-x64-gnu": "0.1.88", + "@napi-rs/canvas-linux-x64-musl": "0.1.88", + "@napi-rs/canvas-win32-arm64-msvc": "0.1.88", + "@napi-rs/canvas-win32-x64-msvc": "0.1.88" + } + }, + "node_modules/@napi-rs/canvas-android-arm64": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-android-arm64/-/canvas-android-arm64-0.1.88.tgz", + "integrity": "sha512-KEaClPnZuVxJ8smUWjV1wWFkByBO/D+vy4lN+Dm5DFH514oqwukxKGeck9xcKJhaWJGjfruGmYGiwRe//+/zQQ==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "android" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-darwin-arm64": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-arm64/-/canvas-darwin-arm64-0.1.88.tgz", + "integrity": "sha512-Xgywz0dDxOKSgx3eZnK85WgGMmGrQEW7ZLA/E7raZdlEE+xXCozobgqz2ZvYigpB6DJFYkqnwHjqCOTSDGlFdg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-darwin-x64": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-darwin-x64/-/canvas-darwin-x64-0.1.88.tgz", + "integrity": "sha512-Yz4wSCIQOUgNucgk+8NFtQxQxZV5NO8VKRl9ePKE6XoNyNVC8JDqtvhh3b3TPqKK8W5p2EQpAr1rjjm0mfBxdg==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "darwin" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-arm-gnueabihf": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm-gnueabihf/-/canvas-linux-arm-gnueabihf-0.1.88.tgz", + "integrity": "sha512-9gQM2SlTo76hYhxHi2XxWTAqpTOb+JtxMPEIr+H5nAhHhyEtNmTSDRtz93SP7mGd2G3Ojf2oF5tP9OdgtgXyKg==", + "cpu": [ + "arm" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-arm64-gnu": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-gnu/-/canvas-linux-arm64-gnu-0.1.88.tgz", + "integrity": "sha512-7qgaOBMXuVRk9Fzztzr3BchQKXDxGbY+nwsovD3I/Sx81e+sX0ReEDYHTItNb0Je4NHbAl7D0MKyd4SvUc04sg==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-arm64-musl": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-arm64-musl/-/canvas-linux-arm64-musl-0.1.88.tgz", + "integrity": "sha512-kYyNrUsHLkoGHBc77u4Unh067GrfiCUMbGHC2+OTxbeWfZkPt2o32UOQkhnSswKd9Fko/wSqqGkY956bIUzruA==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-riscv64-gnu": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-riscv64-gnu/-/canvas-linux-riscv64-gnu-0.1.88.tgz", + "integrity": "sha512-HVuH7QgzB0yavYdNZDRyAsn/ejoXB0hn8twwFnOqUbCCdkV+REna7RXjSR7+PdfW0qMQ2YYWsLvVBT5iL/mGpw==", + "cpu": [ + "riscv64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-x64-gnu": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-gnu/-/canvas-linux-x64-gnu-0.1.88.tgz", + "integrity": "sha512-hvcvKIcPEQrvvJtJnwD35B3qk6umFJ8dFIr8bSymfrSMem0EQsfn1ztys8ETIFndTwdNWJKWluvxztA41ivsEw==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-linux-x64-musl": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-linux-x64-musl/-/canvas-linux-x64-musl-0.1.88.tgz", + "integrity": "sha512-eSMpGYY2xnZSQ6UxYJ6plDboxq4KeJ4zT5HaVkUnbObNN6DlbJe0Mclh3wifAmquXfrlgTZt6zhHsUgz++AK6g==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "linux" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-win32-arm64-msvc": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-arm64-msvc/-/canvas-win32-arm64-msvc-0.1.88.tgz", + "integrity": "sha512-qcIFfEgHrchyYqRrxsCeTQgpJZ/GqHiqPcU/Fvw/ARVlQeDX1VyFH+X+0gCR2tca6UJrq96vnW+5o7buCq+erA==", + "cpu": [ + "arm64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, + "node_modules/@napi-rs/canvas-win32-x64-msvc": { + "version": "0.1.88", + "resolved": "https://registry.npmjs.org/@napi-rs/canvas-win32-x64-msvc/-/canvas-win32-x64-msvc-0.1.88.tgz", + "integrity": "sha512-ROVqbfS4QyZxYkqmaIBBpbz/BQvAR+05FXM5PAtTYVc0uyY8Y4BHJSMdGAaMf6TdIVRsQsiq+FG/dH9XhvWCFQ==", + "cpu": [ + "x64" + ], + "license": "MIT", + "optional": true, + "os": [ + "win32" + ], + "engines": { + "node": ">= 10" + }, + "funding": { + "type": "github", + "url": "https://github.com/sponsors/Brooooooklyn" + } + }, "node_modules/@napi-rs/wasm-runtime": { "version": "0.2.12", "resolved": "https://registry.npmjs.org/@napi-rs/wasm-runtime/-/wasm-runtime-0.2.12.tgz", @@ -21601,6 +21854,12 @@ "node": ">=8" } }, + "node_modules/bluebird": { + "version": "3.4.7", + "resolved": "https://registry.npmjs.org/bluebird/-/bluebird-3.4.7.tgz", + "integrity": "sha512-iD3898SR7sWVRHbiQv+sHUtHnMvC1o3nW5rAcqnq3uOn07DSAppZYUkIGslDz6gXC7HfunPe7YVBgoEJASPcHA==", + "license": "MIT" + }, "node_modules/bn.js": { "version": "5.2.2", "resolved": "https://registry.npmjs.org/bn.js/-/bn.js-5.2.2.tgz", @@ -22854,7 +23113,6 @@ "version": "1.0.3", "resolved": "https://registry.npmjs.org/core-util-is/-/core-util-is-1.0.3.tgz", "integrity": "sha512-ZQBvi1DcpJ4GDqanjucZ2Hj3wEO5pZDS89BWbkcrvdxksJorwUDDZamX9ldFkp9aw2lmBDLgkObEA4DWNJ9FYQ==", - "dev": true, "license": "MIT" }, "node_modules/cors": { @@ -24239,6 +24497,12 @@ "dev": true, "license": "MIT" }, + "node_modules/dingbat-to-unicode": { + "version": "1.0.1", + "resolved": "https://registry.npmjs.org/dingbat-to-unicode/-/dingbat-to-unicode-1.0.1.tgz", + "integrity": "sha512-98l0sW87ZT58pU4i61wa2OHwxbiYSbuxsCBozaVnYX2iCnr3bLM3fIes1/ej7h1YdOKuKt/MLs706TVnALA65w==", + "license": "BSD-2-Clause" + }, "node_modules/dlv": { "version": "1.1.3", "resolved": "https://registry.npmjs.org/dlv/-/dlv-1.1.3.tgz", @@ -24375,6 +24639,15 @@ "resolved": "https://registry.npmjs.org/downloadjs/-/downloadjs-1.4.7.tgz", "integrity": "sha512-LN1gO7+u9xjU5oEScGFKvXhYf7Y/empUIIEAGBs1LzUq/rg5duiDrkuH5A2lQGd5jfMOb9X9usDa2oVXwJ0U/Q==" }, + "node_modules/duck": { + "version": "0.1.12", + "resolved": "https://registry.npmjs.org/duck/-/duck-0.1.12.tgz", + "integrity": "sha512-wkctla1O6VfP89gQ+J/yDesM0S7B7XLXjKGzXxMDVFg7uEn706niAtyYovKbyq1oT9YwDcly721/iUWoc8MVRg==", + "license": "BSD", + "dependencies": { + "underscore": "^1.13.1" + } + }, "node_modules/dunder-proto": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/dunder-proto/-/dunder-proto-1.0.1.tgz", @@ -27588,6 +27861,12 @@ "integrity": "sha512-Ius2VYcGNk7T90CppJqcIkS5ooHUZyIQK+ClZfMfMNFEF9VSE73Fq+906u/CWu92x4gzZMWOwfFYckPObzdEbA==", "dev": true }, + "node_modules/immediate": { + "version": "3.0.6", + "resolved": "https://registry.npmjs.org/immediate/-/immediate-3.0.6.tgz", + "integrity": "sha512-XXOFtyqDjNDAQxVfYxuF7g9Il/IbWmmlQg2MYKOH8ExIT1qg6xc4zyS3HaEEATgs1btfzxq15ciUiY7gjSXRGQ==", + "license": "MIT" + }, "node_modules/import-cwd": { "version": "3.0.0", "resolved": "https://registry.npmjs.org/import-cwd/-/import-cwd-3.0.0.tgz", @@ -30052,6 +30331,45 @@ "node": ">=4.0" } }, + "node_modules/jszip": { + "version": "3.10.1", + "resolved": "https://registry.npmjs.org/jszip/-/jszip-3.10.1.tgz", + "integrity": "sha512-xXDvecyTpGLrqFrvkrUSoxxfJI5AH7U8zxxtVclpsUtMCq4JQ290LY8AW5c7Ggnr/Y/oK+bQMbqK2qmtk3pN4g==", + "license": "(MIT OR GPL-3.0-or-later)", + "dependencies": { + "lie": "~3.3.0", + "pako": "~1.0.2", + "readable-stream": "~2.3.6", + "setimmediate": "^1.0.5" + } + }, + "node_modules/jszip/node_modules/isarray": { + "version": "1.0.0", + "resolved": "https://registry.npmjs.org/isarray/-/isarray-1.0.0.tgz", + "integrity": "sha512-VLghIWNM6ELQzo7zwmcg0NmTVyWKYjvIeM83yjp0wRDTmUnrM678fQbcKBo6n2CJEF0szoG//ytg+TKla89ALQ==", + "license": "MIT" + }, + "node_modules/jszip/node_modules/readable-stream": { + "version": "2.3.8", + "resolved": "https://registry.npmjs.org/readable-stream/-/readable-stream-2.3.8.tgz", + "integrity": "sha512-8p0AUk4XODgIewSi0l8Epjs+EVnWiK7NoDIEGU0HhE7+ZyY8D1IMY7odu5lRrFXGg71L15KG8QrPmum45RTtdA==", + "license": "MIT", + "dependencies": { + "core-util-is": "~1.0.0", + "inherits": "~2.0.3", + "isarray": "~1.0.0", + "process-nextick-args": "~2.0.0", + "safe-buffer": "~5.1.1", + "string_decoder": "~1.1.1", + "util-deprecate": "~1.0.1" + } + }, + "node_modules/jszip/node_modules/safe-buffer": { + "version": "5.1.2", + "resolved": "https://registry.npmjs.org/safe-buffer/-/safe-buffer-5.1.2.tgz", + "integrity": "sha512-Gd2UZBJDkXlY7GbJxfsE8/nvKkUEU1G38c1siN6QP6a9PT9MmHB8GnpscSmMJSoF8LOIrt8ud/wPtojys4G6+g==", + "license": "MIT" + }, "node_modules/jwa": { "version": "2.0.1", "resolved": "https://registry.npmjs.org/jwa/-/jwa-2.0.1.tgz", @@ -30354,6 +30672,15 @@ "resolved": "packages/data-provider", "link": true }, + "node_modules/lie": { + "version": "3.3.0", + "resolved": "https://registry.npmjs.org/lie/-/lie-3.3.0.tgz", + "integrity": "sha512-UaiMJzeWRlEujzAuw5LokY1L5ecNQYZKfmyZ9L7wDHb/p5etKaxXhohBcrw0EYby+G/NA52vRSN4N39dxHAIwQ==", + "license": "MIT", + "dependencies": { + "immediate": "~3.0.5" + } + }, "node_modules/lilconfig": { "version": "2.1.0", "resolved": "https://registry.npmjs.org/lilconfig/-/lilconfig-2.1.0.tgz", @@ -30997,6 +31324,17 @@ "loose-envify": "cli.js" } }, + "node_modules/lop": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/lop/-/lop-0.4.2.tgz", + "integrity": "sha512-RefILVDQ4DKoRZsJ4Pj22TxE3omDO47yFpkIBoDKzkqPRISs5U1cnAdg/5583YPkWPaLIYHOKRMQSvjFsO26cw==", + "license": "BSD-2-Clause", + "dependencies": { + "duck": "^0.1.12", + "option": "~0.2.1", + "underscore": "^1.13.1" + } + }, "node_modules/lowlight": { "version": "2.9.0", "resolved": "https://registry.npmjs.org/lowlight/-/lowlight-2.9.0.tgz", @@ -31106,6 +31444,48 @@ "tmpl": "1.0.5" } }, + "node_modules/mammoth": { + "version": "1.11.0", + "resolved": "https://registry.npmjs.org/mammoth/-/mammoth-1.11.0.tgz", + "integrity": "sha512-BcEqqY/BOwIcI1iR5tqyVlqc3KIaMRa4egSoK83YAVrBf6+yqdAAbtUcFDCWX8Zef8/fgNZ6rl4VUv+vVX8ddQ==", + "license": "BSD-2-Clause", + "dependencies": { + "@xmldom/xmldom": "^0.8.6", + "argparse": "~1.0.3", + "base64-js": "^1.5.1", + "bluebird": "~3.4.0", + "dingbat-to-unicode": "^1.0.1", + "jszip": "^3.7.1", + "lop": "^0.4.2", + "path-is-absolute": "^1.0.0", + "underscore": "^1.13.1", + "xmlbuilder": "^10.0.0" + }, + "bin": { + "mammoth": "bin/mammoth" + }, + "engines": { + "node": ">=12.0.0" + } + }, + "node_modules/mammoth/node_modules/argparse": { + "version": "1.0.10", + "resolved": "https://registry.npmjs.org/argparse/-/argparse-1.0.10.tgz", + "integrity": "sha512-o5Roy6tNG4SL/FOkCAN6RzjiakZS25RLYFrcMttJqbdd8BWrnA+fGz57iN5Pb06pvBGvl5gQ0B48dJlslXvoTg==", + "license": "MIT", + "dependencies": { + "sprintf-js": "~1.0.2" + } + }, + "node_modules/mammoth/node_modules/xmlbuilder": { + "version": "10.1.1", + "resolved": "https://registry.npmjs.org/xmlbuilder/-/xmlbuilder-10.1.1.tgz", + "integrity": "sha512-OyzrcFLL/nb6fMGHbiRDuPup9ljBycsdCypwuyg5AAHvyWzGfChJpCXMG88AGTIMFhGZ9RccFN1e6lhg3hkwKg==", + "license": "MIT", + "engines": { + "node": ">=4.0" + } + }, "node_modules/markdown-table": { "version": "3.0.4", "resolved": "https://registry.npmjs.org/markdown-table/-/markdown-table-3.0.4.tgz", @@ -33039,6 +33419,13 @@ "integrity": "sha512-O5lz91xSOeoXP6DulyHfllpq+Eg00MWitZIbtPfoSEvqIHdl5gfcY6hYzDWnj0qD5tz52PI08u9qUvSVeUBeHw==", "dev": true }, + "node_modules/node-readable-to-web-readable-stream": { + "version": "0.4.2", + "resolved": "https://registry.npmjs.org/node-readable-to-web-readable-stream/-/node-readable-to-web-readable-stream-0.4.2.tgz", + "integrity": "sha512-/cMZNI34v//jUTrI+UIo4ieHAB5EZRY/+7OmXZgBxaWBMcW2tGdceIw06RFxWxrKZ5Jp3sI2i5TsRo+CBhtVLQ==", + "license": "MIT", + "optional": true + }, "node_modules/node-releases": { "version": "2.0.27", "resolved": "https://registry.npmjs.org/node-releases/-/node-releases-2.0.27.tgz", @@ -33522,6 +33909,12 @@ "integrity": "sha512-N4YtSYJqghVu4iek2ZUvcN/0aqH1kRDuNqzcycDxhOUpg7GdvLa2F3DgS6yBNhInhv2r/6I0Flkn7CqL8+nIcw==", "dev": true }, + "node_modules/option": { + "version": "0.2.4", + "resolved": "https://registry.npmjs.org/option/-/option-0.2.4.tgz", + "integrity": "sha512-pkEqbDyl8ou5cpq+VsnQbe/WlEy5qS7xPzMS1U55OCG9KPvwFD46zDbxQIj3egJSFc3D+XhYOPUzz49zQAVy7A==", + "license": "BSD-2-Clause" + }, "node_modules/optionator": { "version": "0.9.3", "resolved": "https://registry.npmjs.org/optionator/-/optionator-0.9.3.tgz", @@ -33674,7 +34067,6 @@ "version": "1.0.11", "resolved": "https://registry.npmjs.org/pako/-/pako-1.0.11.tgz", "integrity": "sha512-4hLB8Py4zZce5s4yd9XzopqwVv/yGNhV1Bl8NTmCq1763HeK2+EwVTv+leGeL13Dnh2wfbqowVPXCIO0z4taYw==", - "dev": true, "license": "(MIT AND Zlib)" }, "node_modules/parent-module": { @@ -33953,7 +34345,6 @@ "version": "1.0.1", "resolved": "https://registry.npmjs.org/path-is-absolute/-/path-is-absolute-1.0.1.tgz", "integrity": "sha512-AVbw3UJ2e9bq64vSaS9Am0fje1Pa8pbGqTTsmXfaIiMpnr5DlDhfJOuLj9Sf95ZPVDAUerDfEk88MPmPe7UCQg==", - "dev": true, "engines": { "node": ">=0.10.0" } @@ -34032,6 +34423,19 @@ "node": ">= 0.10" } }, + "node_modules/pdfjs-dist": { + "version": "5.4.624", + "resolved": "https://registry.npmjs.org/pdfjs-dist/-/pdfjs-dist-5.4.624.tgz", + "integrity": "sha512-sm6TxKTtWv1Oh6n3C6J6a8odejb5uO4A4zo/2dgkHuC0iu8ZMAXOezEODkVaoVp8nX1Xzr+0WxFJJmUr45hQzg==", + "license": "Apache-2.0", + "engines": { + "node": ">=20.16.0 || >=22.3.0" + }, + "optionalDependencies": { + "@napi-rs/canvas": "^0.1.88", + "node-readable-to-web-readable-stream": "^0.4.2" + } + }, "node_modules/peek-readable": { "version": "5.0.0", "resolved": "https://registry.npmjs.org/peek-readable/-/peek-readable-5.0.0.tgz", @@ -35713,7 +36117,6 @@ "version": "2.0.1", "resolved": "https://registry.npmjs.org/process-nextick-args/-/process-nextick-args-2.0.1.tgz", "integrity": "sha512-3ouUOpQhtgrbOa17J7+uxOTpITYWaGP7/AhoR3+A+/1e9skrzelGi/dXzEYyvbxubEF6Wn2ypscTKiKJFFn1ag==", - "dev": true, "license": "MIT" }, "node_modules/promise.series": { @@ -38176,7 +38579,6 @@ "version": "1.0.5", "resolved": "https://registry.npmjs.org/setimmediate/-/setimmediate-1.0.5.tgz", "integrity": "sha512-MATJdZp8sLqDl/68LfQmbP8zKPLQNV6BIZoIgrscFDQ+RsvK/BxeDQOgyxKKoh0y/8h3BqVFnCqQ/gd+reiIXA==", - "dev": true, "license": "MIT" }, "node_modules/setprototypeof": { @@ -38441,7 +38843,6 @@ "version": "1.0.3", "resolved": "https://registry.npmjs.org/sprintf-js/-/sprintf-js-1.0.3.tgz", "integrity": "sha512-D9cPgkvLlV3t3IzL0D0YLvGA9Ahk4PcvVwUbN0dSGr1aP0Nrt4AEnTUbuGvquEC0mA64Gqt1fzirlRs5ibXx8g==", - "dev": true, "license": "BSD-3-Clause" }, "node_modules/sse.js": { @@ -40249,6 +40650,12 @@ "integrity": "sha512-WxONCrssBM8TSPRqN5EmsjVrsv4A8X12J4ArBiiayv3DyyG3ZlIg6yysuuSYdZsVz3TKcTg2fd//Ujd4CHV1iA==", "dev": true }, + "node_modules/underscore": { + "version": "1.13.7", + "resolved": "https://registry.npmjs.org/underscore/-/underscore-1.13.7.tgz", + "integrity": "sha512-GMXzWtsc57XAtguZgaQViUOzs0KTkk8ojr3/xAxXLITqf/3EMwxC0inyETfDFjH/Krbhuep0HNbbjI9i/q3F3g==", + "license": "MIT" + }, "node_modules/undici": { "version": "7.20.0", "resolved": "https://registry.npmjs.org/undici/-/undici-7.20.0.tgz", @@ -41904,6 +42311,18 @@ } } }, + "node_modules/xlsx": { + "version": "0.20.3", + "resolved": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz", + "integrity": "sha512-oLDq3jw7AcLqKWH2AhCpVTZl8mf6X2YReP+Neh0SJUzV/BdZYjth94tG5toiMB1PPrYtxOCfaoUCkvtuH+3AJA==", + "license": "Apache-2.0", + "bin": { + "xlsx": "bin/xlsx.njs" + }, + "engines": { + "node": ">=0.8" + } + }, "node_modules/xml": { "version": "1.0.1", "resolved": "https://registry.npmjs.org/xml/-/xml-1.0.1.tgz", @@ -42171,12 +42590,15 @@ "jest": "^30.2.0", "jest-junit": "^16.0.0", "librechat-data-provider": "*", + "mammoth": "^1.11.0", "mongodb": "^6.14.2", + "pdfjs-dist": "^5.4.624", "rimraf": "^6.1.2", "rollup": "^4.22.4", "rollup-plugin-peer-deps-external": "^2.2.4", "ts-node": "^10.9.2", - "typescript": "^5.0.4" + "typescript": "^5.0.4", + "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz" }, "peerDependencies": { "@anthropic-ai/vertex-sdk": "^0.14.3", @@ -42207,13 +42629,16 @@ "keyv": "^5.3.2", "keyv-file": "^5.1.2", "librechat-data-provider": "*", + "mammoth": "^1.11.0", "mathjs": "^15.1.0", "memorystore": "^1.6.7", "mongoose": "^8.12.1", "node-fetch": "2.7.0", + "pdfjs-dist": "^5.4.530", "rate-limit-redis": "^4.2.0", "tiktoken": "^1.0.15", "undici": "^7.18.2", + "xlsx": "*", "zod": "^3.22.4" } }, diff --git a/packages/api/package.json b/packages/api/package.json index 67cb5df816..6df880e0bf 100644 --- a/packages/api/package.json +++ b/packages/api/package.json @@ -67,12 +67,15 @@ "jest": "^30.2.0", "jest-junit": "^16.0.0", "librechat-data-provider": "*", + "mammoth": "^1.11.0", "mongodb": "^6.14.2", + "pdfjs-dist": "^5.4.624", "rimraf": "^6.1.2", "rollup": "^4.22.4", "rollup-plugin-peer-deps-external": "^2.2.4", "ts-node": "^10.9.2", - "typescript": "^5.0.4" + "typescript": "^5.0.4", + "xlsx": "https://cdn.sheetjs.com/xlsx-0.20.3/xlsx-0.20.3.tgz" }, "publishConfig": { "registry": "https://registry.npmjs.org/" @@ -106,10 +109,12 @@ "keyv": "^5.3.2", "keyv-file": "^5.1.2", "librechat-data-provider": "*", + "mammoth": "^1.11.0", "mathjs": "^15.1.0", "memorystore": "^1.6.7", "mongoose": "^8.12.1", "node-fetch": "2.7.0", + "pdfjs-dist": "^5.4.624", "rate-limit-redis": "^4.2.0", "tiktoken": "^1.0.15", "undici": "^7.18.2", diff --git a/packages/api/src/files/documents/crud.spec.ts b/packages/api/src/files/documents/crud.spec.ts new file mode 100644 index 0000000000..3b9e1636ef --- /dev/null +++ b/packages/api/src/files/documents/crud.spec.ts @@ -0,0 +1,80 @@ +import path from 'path'; +import { parseDocument } from './crud'; + +describe('Document Parser', () => { + test('parseDocument() parses text from docx', async () => { + const file = { + originalname: 'sample.docx', + path: path.join(__dirname, 'sample.docx'), + mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + } as Express.Multer.File; + + const document = await parseDocument({ file }); + + expect(document).toEqual({ + bytes: 29, + filename: 'sample.docx', + filepath: 'document_parser', + images: [], + text: 'This is a sample DOCX file.\n\n', + }); + }); + + test('parseDocument() parses text from xlsx', async () => { + const file = { + originalname: 'sample.xlsx', + path: path.join(__dirname, 'sample.xlsx'), + mimetype: 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet', + } as Express.Multer.File; + + const document = await parseDocument({ file }); + + expect(document).toEqual({ + bytes: 66, + filename: 'sample.xlsx', + filepath: 'document_parser', + images: [], + text: 'Sheet One:\nData,on,first,sheet\nSecond Sheet:\nData,On\nSecond,Sheet\n', + }); + }); + + test('parseDocument() parses text from xls', async () => { + const file = { + originalname: 'sample.xls', + path: path.join(__dirname, 'sample.xls'), + mimetype: 'application/vnd.ms-excel', + } as Express.Multer.File; + + const document = await parseDocument({ file }); + + expect(document).toEqual({ + bytes: 31, + filename: 'sample.xls', + filepath: 'document_parser', + images: [], + text: 'Sheet One:\nData,on,first,sheet\n', + }); + }); + + test('parseDocument() throws error for unhandled document type', async () => { + const file = { + originalname: 'nonexistent.file', + path: path.join(__dirname, 'nonexistent.file'), + mimetype: 'application/invalid', + } as Express.Multer.File; + + await expect(parseDocument({ file })).rejects.toThrow( + 'Unsupported file type in document parser: application/invalid', + ); + }); + + test('parseDocument() throws error for empty document', async () => { + const file = { + originalname: 'empty.docx', + path: path.join(__dirname, 'empty.docx'), + mimetype: 'application/vnd.openxmlformats-officedocument.wordprocessingml.document', + } as Express.Multer.File; + + await expect(parseDocument({ file })).rejects.toThrow('No text found in document'); + }); +}); diff --git a/packages/api/src/files/documents/crud.ts b/packages/api/src/files/documents/crud.ts new file mode 100644 index 0000000000..f2d45644d4 --- /dev/null +++ b/packages/api/src/files/documents/crud.ts @@ -0,0 +1,87 @@ +import * as fs from 'fs'; +import { FileSources } from 'librechat-data-provider'; +import type { TextItem } from 'pdfjs-dist/types/src/display/api'; +import type { MistralOCRUploadResult } from '~/types'; + +/** + * Parses an uploaded document and extracts its text content and metadata. + * + * Throws an Error if it fails to parse or no text is found. + */ +export async function parseDocument({ + file, +}: { + file: Express.Multer.File; +}): Promise { + let text: string; + switch (file.mimetype) { + case 'application/pdf': + text = await pdfToText(file); + break; + case 'application/vnd.openxmlformats-officedocument.wordprocessingml.document': + text = await wordDocToText(file); + break; + case 'application/vnd.ms-excel': + case 'application/vnd.openxmlformats-officedocument.spreadsheetml.sheet': + text = await excelSheetToText(file); + break; + default: + throw new Error(`Unsupported file type in document parser: ${file.mimetype}`); + } + + if (!text?.trim()) { + throw new Error('No text found in document'); + } + + return { + filename: file.originalname, + bytes: Buffer.byteLength(text, 'utf8'), + filepath: FileSources.document_parser, + text, + images: [], + }; +} + +/** Parses PDF, returns text inside. */ +async function pdfToText(file: Express.Multer.File): Promise { + // Imported inline so that Jest can test other routes without failing due to loading ESM + const { getDocument } = await import('pdfjs-dist/legacy/build/pdf.mjs'); + + const data = new Uint8Array(await fs.promises.readFile(file.path)); + const pdf = await getDocument({ data }).promise; + + let fullText = ''; + for (let i = 1; i <= pdf.numPages; i++) { + const page = await pdf.getPage(i); + const textContent = await page.getTextContent(); + const pageText = textContent.items + .filter((item): item is TextItem => !('type' in item)) + .map((item) => item.str) + .join(' '); + fullText += pageText + '\n'; + } + + return fullText; +} + +/** Parses Word document, returns text inside. */ +async function wordDocToText(file: Express.Multer.File): Promise { + const { extractRawText } = await import('mammoth'); + const rawText = await extractRawText({ path: file.path }); + return rawText.value; +} + +/** Parses Excel sheet, returns text inside. */ +async function excelSheetToText(file: Express.Multer.File): Promise { + const { readFile, utils } = await import('xlsx'); + const workbook = readFile(file.path); + + let text = ''; + for (const sheetName of workbook.SheetNames) { + const worksheet = workbook.Sheets[sheetName]; + const worksheetAsCsvString = utils.sheet_to_csv(worksheet); + text += `${sheetName}:\n${worksheetAsCsvString}\n`; + } + + return text; +} diff --git a/packages/api/src/files/documents/empty.docx b/packages/api/src/files/documents/empty.docx new file mode 100644 index 0000000000..c089246167 Binary files /dev/null and b/packages/api/src/files/documents/empty.docx differ diff --git a/packages/api/src/files/documents/sample.docx b/packages/api/src/files/documents/sample.docx new file mode 100644 index 0000000000..c7e1c02b65 Binary files /dev/null and b/packages/api/src/files/documents/sample.docx differ diff --git a/packages/api/src/files/documents/sample.xls b/packages/api/src/files/documents/sample.xls new file mode 100644 index 0000000000..d5976b0816 Binary files /dev/null and b/packages/api/src/files/documents/sample.xls differ diff --git a/packages/api/src/files/documents/sample.xlsx b/packages/api/src/files/documents/sample.xlsx new file mode 100644 index 0000000000..2abb6961d1 Binary files /dev/null and b/packages/api/src/files/documents/sample.xlsx differ diff --git a/packages/api/src/files/index.ts b/packages/api/src/files/index.ts index 3aedc5ba9d..707f2ef7fb 100644 --- a/packages/api/src/files/index.ts +++ b/packages/api/src/files/index.ts @@ -1,5 +1,6 @@ export * from './audio'; export * from './context'; +export * from './documents/crud'; export * from './encode'; export * from './filter'; export * from './mistral/crud'; diff --git a/packages/api/src/files/mistral/crud.ts b/packages/api/src/files/mistral/crud.ts index fefe4a4675..c818fab8b8 100644 --- a/packages/api/src/files/mistral/crud.ts +++ b/packages/api/src/files/mistral/crud.ts @@ -165,9 +165,11 @@ export async function performOCR({ config.httpsAgent = new HttpsProxyAgent(process.env.PROXY); } + const ocrURL = baseURL.endsWith('/ocr') ? baseURL : `${baseURL}/ocr`; + return axios .post( - `${baseURL}/ocr`, + ocrURL, { model, image_limit: 0, diff --git a/packages/data-provider/src/config.ts b/packages/data-provider/src/config.ts index 82d477e54e..64fc99b0eb 100644 --- a/packages/data-provider/src/config.ts +++ b/packages/data-provider/src/config.ts @@ -820,6 +820,7 @@ export enum OCRStrategy { CUSTOM_OCR = 'custom_ocr', AZURE_MISTRAL_OCR = 'azure_mistral_ocr', VERTEXAI_MISTRAL_OCR = 'vertexai_mistral_ocr', + DOCUMENT_PARSER = 'document_parser', } export enum SearchCategories { diff --git a/packages/data-provider/src/types/files.ts b/packages/data-provider/src/types/files.ts index ec42520bc0..1eb8c200d6 100644 --- a/packages/data-provider/src/types/files.ts +++ b/packages/data-provider/src/types/files.ts @@ -13,6 +13,7 @@ export enum FileSources { azure_mistral_ocr = 'azure_mistral_ocr', vertexai_mistral_ocr = 'vertexai_mistral_ocr', text = 'text', + document_parser = 'document_parser', } export const checkOpenAIStorage = (source: string) =>