🔍 refactor: OCR Fully Optional with Defaults for "Upload as Text" (#9856)
Some checks are pending
Docker Dev Branch Images Build / build (Dockerfile, lc-dev, node) (push) Waiting to run
Docker Dev Branch Images Build / build (Dockerfile.multi, lc-dev-api, api-build) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile, librechat-dev, node) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile.multi, librechat-dev-api, api-build) (push) Waiting to run
Sync Locize Translations & Create Translation PR / Sync Translation Keys with Locize (push) Waiting to run
Sync Locize Translations & Create Translation PR / Create Translation PR on Version Published (push) Blocked by required conditions

* refactor: move `loadOCRConfig` from `packages/data-provider` to `packages/api` and return `undefined` when OCR is not explicitly configured

* fix: import `loadOCRConfig` from `@librechat/api`

* refactor: update defaultTextMimeTypes to support virtually all file types for text parsing

* fix: improve OCR capability check and error message for unsupported file types

* ci: remove unnecessary `ocr` expectation from the `AppService` test
This commit is contained in:
Danny Avila 2025-09-26 11:56:11 -04:00 committed by GitHub
parent 3d7eaf0fcc
commit 4b5b46604c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 15 additions and 20 deletions

View file

@ -1,16 +1,12 @@
const { FileSources, EModelEndpoint, getConfigDefaults } = require('librechat-data-provider');
const { const {
isEnabled, isEnabled,
loadOCRConfig,
loadMemoryConfig, loadMemoryConfig,
agentsConfigSetup, agentsConfigSetup,
loadWebSearchConfig, loadWebSearchConfig,
loadDefaultInterface, loadDefaultInterface,
} = require('@librechat/api'); } = require('@librechat/api');
const {
FileSources,
loadOCRConfig,
EModelEndpoint,
getConfigDefaults,
} = require('librechat-data-provider');
const { const {
checkWebSearchConfig, checkWebSearchConfig,
checkVariables, checkVariables,

View file

@ -142,7 +142,6 @@ describe('AppService', () => {
turnstileConfig: mockedTurnstileConfig, turnstileConfig: mockedTurnstileConfig,
modelSpecs: undefined, modelSpecs: undefined,
paths: expect.anything(), paths: expect.anything(),
ocr: expect.anything(),
imageOutputType: expect.any(String), imageOutputType: expect.any(String),
fileConfig: undefined, fileConfig: undefined,
secureImageLinks: undefined, secureImageLinks: undefined,

View file

@ -594,10 +594,9 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
const fileConfig = mergeFileConfig(appConfig.fileConfig); const fileConfig = mergeFileConfig(appConfig.fileConfig);
const shouldUseOCR = fileConfig.checkType( const shouldUseOCR =
file.mimetype, appConfig?.ocr != null &&
fileConfig.ocr?.supportedMimeTypes || [], fileConfig.checkType(file.mimetype, fileConfig.ocr?.supportedMimeTypes || []);
);
if (shouldUseOCR && !(await checkCapability(req, AgentCapabilities.ocr))) { if (shouldUseOCR && !(await checkCapability(req, AgentCapabilities.ocr))) {
throw new Error('OCR capability is not enabled for Agents'); throw new Error('OCR capability is not enabled for Agents');
@ -626,7 +625,7 @@ const processAgentFileUpload = async ({ req, res, metadata }) => {
); );
if (!shouldUseText) { if (!shouldUseText) {
throw new Error(`File type ${file.mimetype} is not supported for OCR or text parsing`); throw new Error(`File type ${file.mimetype} is not supported for text parsing.`);
} }
const { text, bytes } = await parseText({ req, file, file_id }); const { text, bytes } = await parseText({ req, file, file_id });

View file

@ -1,4 +1,5 @@
export * from './mistral/crud';
export * from './audio'; export * from './audio';
export * from './text'; export * from './mistral/crud';
export * from './ocr';
export * from './parse'; export * from './parse';
export * from './text';

View file

@ -303,7 +303,7 @@ async function loadAuthConfig(context: OCRContext): Promise<AuthConfig> {
/** /**
* Gets the model configuration * Gets the model configuration
*/ */
function getModelConfig(ocrConfig: TCustomConfig['ocr']): string { function getModelConfig(ocrConfig?: TCustomConfig['ocr']): string {
const modelConfig = ocrConfig?.mistralModel || ''; const modelConfig = ocrConfig?.mistralModel || '';
if (!modelConfig.trim()) { if (!modelConfig.trim()) {

View file

@ -1,7 +1,8 @@
import type { TCustomConfig } from '../src/config'; import { OCRStrategy } from 'librechat-data-provider';
import { OCRStrategy } from '../src/config'; import type { TCustomConfig } from 'librechat-data-provider';
export function loadOCRConfig(config: TCustomConfig['ocr']): TCustomConfig['ocr'] { export function loadOCRConfig(config?: TCustomConfig['ocr']): TCustomConfig['ocr'] | undefined {
if (!config) return;
const baseURL = config?.baseURL ?? ''; const baseURL = config?.baseURL ?? '';
const apiKey = config?.apiKey ?? ''; const apiKey = config?.apiKey ?? '';
const mistralModel = config?.mistralModel ?? ''; const mistralModel = config?.mistralModel ?? '';

View file

@ -133,7 +133,7 @@ export const defaultOCRMimeTypes = [
/^application\/epub\+zip$/, /^application\/epub\+zip$/,
]; ];
export const defaultTextMimeTypes = [textMimeTypes]; export const defaultTextMimeTypes = [/^[\w.-]+\/[\w.-]+$/];
export const defaultSTTMimeTypes = [audioMimeTypes]; export const defaultSTTMimeTypes = [audioMimeTypes];

View file

@ -9,7 +9,6 @@ export * from './messages';
export * from './artifacts'; export * from './artifacts';
/* schema helpers */ /* schema helpers */
export * from './parsers'; export * from './parsers';
export * from './ocr';
/* custom/dynamic configurations */ /* custom/dynamic configurations */
export * from './generate'; export * from './generate';
export * from './models'; export * from './models';