🔍 feat: Mistral OCR API / Upload Files as Text (#6274)

* refactor: move `loadAuthValues` to `~/services/Tools/credentials`

* feat: add createAxiosInstance function to configure axios with proxy support

* WIP: first pass at Mistral OCR

* refactor: replace getConvoFiles with getToolFiles for improved file retrieval logic

* refactor: improve document formatting in encodeAndFormat function

* refactor: remove unused resendFiles parameter from buildOptions function (this option comes from the agent config)

* fix: update getFiles call to include files with `text` property as well

* refactor: move file handling to `initializeAgentOptions`

* refactor: enhance addImageURLs method to handle OCR text and improve message formatting

* refactor: update message formatting to handle OCR text in various content types

* refactor: remove unused resendFiles property from compactAgentsSchema

* fix: add error handling for Mistral OCR document upload and logging

* refactor: integrate OCR capability into file upload options and configuration

* refactor: skip processing for text source files in delete request, as they are directly tied to the database

* feat: add metadata field to ExtendedFile type and update PanelColumns and PanelTable components for localization and metadata handling

* fix: source icon styling

* wip: first pass, frontend file context agent resources

* refactor: add hover card with contextual information for File Context (OCR) in FileContext component

* feat: enhance file processing by integrating file retrieval for OCR resources in agent initialization

* feat: implement OCR config; fix: agent resource deletion for OCR files

* feat: enhance agent initialization by adding OCR capability check in resource priming

* ci: fix `~/config` module mock

* ci: add OCR property expectation in AppService tests

* refactor: simplify OCR config loading by removing environment variable extraction, to be done when OCR is actually performed

* ci: add unit test to ensure environment variable references are not parsed in OCR config

* refactor: disable base64 image inclusion in OCR request

* refactor: enhance OCR configuration handling by validating environment variables and providing defaults

* refactor: use file stream from disk for Mistral OCR API
This commit is contained in:
Danny Avila 2025-03-10 17:23:46 -04:00 committed by GitHub
parent 9db00edfc4
commit ded3cd8876
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
48 changed files with 1621 additions and 131 deletions

View file

@ -5,6 +5,7 @@ import type { OptionWithIcon, ExtendedFile } from './types';
export type TAgentOption = OptionWithIcon &
Agent & {
knowledge_files?: Array<[string, ExtendedFile]>;
context_files?: Array<[string, ExtendedFile]>;
code_files?: Array<[string, ExtendedFile]>;
};

View file

@ -483,6 +483,7 @@ export interface ExtendedFile {
attached?: boolean;
embedded?: boolean;
tool_resource?: string;
metadata?: t.TFile['metadata'];
}
export type ContextType = { navVisible: boolean; setNavVisible: (visible: boolean) => void };

View file

@ -1,7 +1,7 @@
import * as Ariakit from '@ariakit/react';
import React, { useRef, useState, useMemo } from 'react';
import { FileSearch, ImageUpIcon, TerminalSquareIcon } from 'lucide-react';
import { EToolResources, EModelEndpoint } from 'librechat-data-provider';
import { FileSearch, ImageUpIcon, TerminalSquareIcon, FileType2Icon } from 'lucide-react';
import { FileUpload, TooltipAnchor, DropdownPopup } from '~/components/ui';
import { useGetEndpointsQuery } from '~/data-provider';
import { AttachmentIcon } from '~/components/svg';
@ -49,6 +49,17 @@ const AttachFile = ({ isRTL, disabled, handleFileChange }: AttachFileProps) => {
},
];
if (capabilities.includes(EToolResources.ocr)) {
items.push({
label: localize('com_ui_upload_ocr_text'),
onClick: () => {
setToolResource(EToolResources.ocr);
handleUploadClick();
},
icon: <FileType2Icon className="icon-md" />,
});
}
if (capabilities.includes(EToolResources.file_search)) {
items.push({
label: localize('com_ui_upload_file_search'),

View file

@ -1,6 +1,6 @@
import React, { useMemo } from 'react';
import { EModelEndpoint, EToolResources } from 'librechat-data-provider';
import { FileSearch, ImageUpIcon, TerminalSquareIcon } from 'lucide-react';
import { FileSearch, ImageUpIcon, FileType2Icon, TerminalSquareIcon } from 'lucide-react';
import OGDialogTemplate from '~/components/ui/OGDialogTemplate';
import { useGetEndpointsQuery } from '~/data-provider';
import useLocalize from '~/hooks/useLocalize';
@ -50,6 +50,12 @@ const DragDropModal = ({ onOptionSelect, setShowModal, files, isVisible }: DragD
value: EToolResources.execute_code,
icon: <TerminalSquareIcon className="icon-md" />,
});
} else if (capability === EToolResources.ocr) {
_options.push({
label: localize('com_ui_upload_ocr_text'),
value: EToolResources.ocr,
icon: <FileType2Icon className="icon-md" />,
});
}
}

View file

@ -19,7 +19,7 @@ const FilePreview = ({
};
className?: string;
}) => {
const radius = 55; // Radius of the SVG circle
const radius = 55;
const circumference = 2 * Math.PI * radius;
const progress = useProgress(
file?.['progress'] ?? 1,
@ -27,16 +27,15 @@ const FilePreview = ({
(file as ExtendedFile | undefined)?.size ?? 1,
);
// Calculate the offset based on the loading progress
const offset = circumference - progress * circumference;
const circleCSSProperties = {
transition: 'stroke-dashoffset 0.5s linear',
};
return (
<div className={cn('size-10 shrink-0 overflow-hidden rounded-xl', className)}>
<div className={cn('relative size-10 shrink-0 overflow-hidden rounded-xl', className)}>
<FileIcon file={file} fileType={fileType} />
<SourceIcon source={file?.source} />
<SourceIcon source={file?.source} isCodeFile={!!file?.['metadata']?.fileIdentifier} />
{progress < 1 && (
<ProgressCircle
circumference={circumference}

View file

@ -1,3 +1,4 @@
import { Terminal, Type, Database } from 'lucide-react';
import { EModelEndpoint, FileSources } from 'librechat-data-provider';
import { MinimalIcon } from '~/components/Endpoints';
import { cn } from '~/utils';
@ -6,9 +7,13 @@ const sourceToEndpoint = {
[FileSources.openai]: EModelEndpoint.openAI,
[FileSources.azure]: EModelEndpoint.azureOpenAI,
};
const sourceToClassname = {
[FileSources.openai]: 'bg-white/75 dark:bg-black/65',
[FileSources.azure]: 'azure-bg-color opacity-85',
[FileSources.execute_code]: 'bg-black text-white opacity-85',
[FileSources.text]: 'bg-blue-100 dark:bg-blue-900 opacity-85 text-white',
[FileSources.vectordb]: 'bg-yellow-100 dark:bg-yellow-900 opacity-85 text-white',
};
const defaultClassName =
@ -16,13 +21,41 @@ const defaultClassName =
export default function SourceIcon({
source,
isCodeFile,
className = defaultClassName,
}: {
source?: FileSources;
isCodeFile?: boolean;
className?: string;
}) {
if (source === FileSources.local || source === FileSources.firebase) {
return null;
if (isCodeFile === true) {
return (
<div className={cn(className, sourceToClassname[FileSources.execute_code] ?? '')}>
<span className="flex items-center justify-center">
<Terminal className="h-3 w-3" />
</span>
</div>
);
}
if (source === FileSources.text) {
return (
<div className={cn(className, sourceToClassname[source] ?? '')}>
<span className="flex items-center justify-center">
<Type className="h-3 w-3" />
</span>
</div>
);
}
if (source === FileSources.vectordb) {
return (
<div className={cn(className, sourceToClassname[source] ?? '')}>
<span className="flex items-center justify-center">
<Database className="h-3 w-3" />
</span>
</div>
);
}
const endpoint = sourceToEndpoint[source ?? ''];
@ -31,7 +64,7 @@ export default function SourceIcon({
return null;
}
return (
<button type="button" className={cn(className, sourceToClassname[source ?? ''] ?? '')}>
<div className={cn(className, sourceToClassname[source ?? ''] ?? '')}>
<span className="flex items-center justify-center">
<MinimalIcon
endpoint={endpoint}
@ -40,6 +73,6 @@ export default function SourceIcon({
iconClassName="h-3 w-3"
/>
</span>
</button>
</div>
);
}

View file

@ -23,6 +23,7 @@ import { processAgentOption } from '~/utils';
import AdminSettings from './AdminSettings';
import DeleteButton from './DeleteButton';
import AgentAvatar from './AgentAvatar';
import FileContext from './FileContext';
import { Spinner } from '~/components';
import FileSearch from './FileSearch';
import ShareAgent from './ShareAgent';
@ -82,6 +83,10 @@ export default function AgentConfig({
() => agentsConfig?.capabilities.includes(AgentCapabilities.artifacts) ?? false,
[agentsConfig],
);
const ocrEnabled = useMemo(
() => agentsConfig?.capabilities.includes(AgentCapabilities.ocr) ?? false,
[agentsConfig],
);
const fileSearchEnabled = useMemo(
() => agentsConfig?.capabilities.includes(AgentCapabilities.file_search) ?? false,
[agentsConfig],
@ -91,6 +96,26 @@ export default function AgentConfig({
[agentsConfig],
);
const context_files = useMemo(() => {
if (typeof agent === 'string') {
return [];
}
if (agent?.id !== agent_id) {
return [];
}
if (agent.context_files) {
return agent.context_files;
}
const _agent = processAgentOption({
agent,
fileMap,
});
return _agent.context_files ?? [];
}, [agent, agent_id, fileMap]);
const knowledge_files = useMemo(() => {
if (typeof agent === 'string') {
return [];
@ -334,7 +359,7 @@ export default function AgentConfig({
</div>
</button>
</div>
{(codeEnabled || fileSearchEnabled || artifactsEnabled) && (
{(codeEnabled || fileSearchEnabled || artifactsEnabled || ocrEnabled) && (
<div className="mb-4 flex w-full flex-col items-start gap-3">
<label className="text-token-text-primary block font-medium">
{localize('com_assistants_capabilities')}
@ -345,6 +370,8 @@ export default function AgentConfig({
{fileSearchEnabled && <FileSearch agent_id={agent_id} files={knowledge_files} />}
{/* Artifacts */}
{artifactsEnabled && <Artifacts />}
{/* File Context (OCR) */}
{ocrEnabled && <FileContext agent_id={agent_id} files={context_files} />}
</div>
)}
{/* Agent Tools & Actions */}

View file

@ -0,0 +1,128 @@
import { useState, useRef } from 'react';
import {
EModelEndpoint,
EToolResources,
mergeFileConfig,
fileConfig as defaultFileConfig,
} from 'librechat-data-provider';
import type { ExtendedFile } from '~/common';
import { useFileHandling, useLocalize, useLazyEffect } from '~/hooks';
import FileRow from '~/components/Chat/Input/Files/FileRow';
import { useGetFileConfig } from '~/data-provider';
import { HoverCard, HoverCardContent, HoverCardPortal, HoverCardTrigger } from '~/components/ui';
import { AttachmentIcon, CircleHelpIcon } from '~/components/svg';
import { useChatContext } from '~/Providers';
import { ESide } from '~/common';
/**
 * Flex-wrap container passed to `FileRow` as its `Wrapper`.
 *
 * Declared at module scope so the component reference stays the same across
 * renders. An inline arrow function would produce a new component type on
 * every render of `FileContext`; if `FileRow` renders `Wrapper` as an element
 * (presumably it does — confirm in FileRow), React would remount the wrapped
 * subtree each time.
 */
function FileContextWrapper({ children }: { children?: React.ReactNode }) {
  return <div className="flex flex-wrap gap-2">{children}</div>;
}

/**
 * "File Context (OCR)" section of the agent configuration panel.
 *
 * Renders a labelled header with a hover card explaining the feature, the list
 * of files currently attached under the `ocr` tool resource, and an upload
 * button backed by a hidden `<input type="file">`. Uploads go through
 * `useFileHandling` with the agents endpoint and carry `agent_id` plus
 * `tool_resource: EToolResources.ocr` as additional metadata.
 *
 * Returns `null` when the agents endpoint file config is marked `disabled`.
 *
 * @param agent_id - ID of the agent being edited. When falsy, the upload
 *   controls are disabled and an explanatory message is shown.
 * @param files - Optional `[file_id, file]` entries used to seed the local file map.
 */
export default function FileContext({
  agent_id,
  files: _files,
}: {
  agent_id: string;
  files?: [string, ExtendedFile][];
}) {
  const localize = useLocalize();
  const { setFilesLoading } = useChatContext();
  const fileInputRef = useRef<HTMLInputElement>(null);
  const [files, setFiles] = useState<Map<string, ExtendedFile>>(new Map());

  const { data: fileConfig = defaultFileConfig } = useGetFileConfig({
    select: (data) => mergeFileConfig(data),
  });

  const { handleFileChange } = useFileHandling({
    overrideEndpoint: EModelEndpoint.agents,
    additionalMetadata: { agent_id, tool_resource: EToolResources.ocr },
    fileSetter: setFiles,
  });

  // Sync the local map from the incoming prop whenever it changes.
  // NOTE(review): the third argument (750) is presumably a delay in ms — confirm in useLazyEffect.
  useLazyEffect(
    () => {
      if (_files) {
        setFiles(new Map(_files));
      }
    },
    [_files],
    750,
  );

  // All hooks are called above this point, so the early return below is safe.
  // Optional chaining guards against a merged config without an `agents` entry.
  const endpointFileConfig = fileConfig.endpoints[EModelEndpoint.agents];
  const isUploadDisabled = endpointFileConfig?.disabled ?? false;

  if (isUploadDisabled) {
    return null;
  }

  /** Opens the native file picker via the hidden input. */
  const handleButtonClick = () => {
    const input = fileInputRef.current;
    if (!input) {
      return;
    }
    // Clear the previous selection first; necessary to reset the input.
    input.value = '';
    input.click();
  };

  return (
    <div className="w-full">
      <HoverCard openDelay={50}>
        <div className="mb-2 flex items-center gap-2">
          <HoverCardTrigger asChild>
            <span className="flex items-center gap-2">
              <label className="text-token-text-primary block font-medium">
                {localize('com_agents_file_context')}
              </label>
              <CircleHelpIcon className="h-4 w-4 text-text-tertiary" />
            </span>
          </HoverCardTrigger>
          <HoverCardPortal>
            <HoverCardContent side={ESide.Top} className="w-80">
              <div className="space-y-2">
                <p className="text-sm text-text-secondary">
                  {localize('com_agents_file_context_info')}
                </p>
              </div>
            </HoverCardContent>
          </HoverCardPortal>
        </div>
      </HoverCard>
      <div className="flex flex-col gap-3">
        {/* File Context (OCR) Files */}
        <FileRow
          files={files}
          setFiles={setFiles}
          setFilesLoading={setFilesLoading}
          agent_id={agent_id}
          tool_resource={EToolResources.ocr}
          Wrapper={FileContextWrapper}
        />
        <div>
          <button
            type="button"
            disabled={!agent_id}
            className="btn btn-neutral border-token-border-light relative h-9 w-full rounded-lg font-medium"
            onClick={handleButtonClick}
          >
            <div className="flex w-full items-center justify-center gap-1">
              <AttachmentIcon className="text-token-text-primary h-4 w-4" />
              {/* Hidden input; it is triggered programmatically by the button above. */}
              <input
                multiple={true}
                type="file"
                style={{ display: 'none' }}
                tabIndex={-1}
                ref={fileInputRef}
                disabled={!agent_id}
                onChange={handleFileChange}
              />
              {localize('com_ui_upload_file_context')}
            </div>
          </button>
        </div>
        {/* Disabled Message */}
        {agent_id ? null : (
          <div className="text-xs text-text-secondary">
            {localize('com_agents_file_context_disabled')}
          </div>
        )}
      </div>
    </div>
  );
}

View file

@ -1,21 +1,23 @@
import { ArrowUpDown } from 'lucide-react';
import type { ColumnDef } from '@tanstack/react-table';
import type { TFile } from 'librechat-data-provider';
import useLocalize from '~/hooks/useLocalize';
import PanelFileCell from './PanelFileCell';
import { Button } from '~/components/ui';
import { formatDate } from '~/utils';
export const columns: ColumnDef<TFile>[] = [
export const columns: ColumnDef<TFile | undefined>[] = [
{
accessorKey: 'filename',
header: ({ column }) => {
const localize = useLocalize();
return (
<Button
variant="ghost"
className="hover:bg-surface-hover"
onClick={() => column.toggleSorting(column.getIsSorted() === 'asc')}
>
Name
{localize('com_ui_name')}
<ArrowUpDown className="ml-2 h-4 w-4" />
</Button>
);
@ -31,20 +33,21 @@ export const columns: ColumnDef<TFile>[] = [
size: '10%',
},
header: ({ column }) => {
const localize = useLocalize();
return (
<Button
variant="ghost"
className="hover:bg-surface-hover"
onClick={() => column.toggleSorting(column.getIsSorted() === 'asc')}
>
Date
{localize('com_ui_date')}
<ArrowUpDown className="ml-2 h-4 w-4" />
</Button>
);
},
cell: ({ row }) => (
<span className="flex justify-end text-xs">
{formatDate(row.original.updatedAt?.toString() ?? '')}
{formatDate(row.original?.updatedAt?.toString() ?? '')}
</span>
),
},

View file

@ -6,7 +6,6 @@ import { getFileType } from '~/utils';
export default function PanelFileCell({ row }: { row: Row<TFile | undefined> }) {
const file = row.original;
return (
<div className="flex w-full items-center gap-2">
{file?.type.startsWith('image') === true ? (

View file

@ -159,6 +159,7 @@ export default function DataTable<TData, TValue>({ columns, data }: DataTablePro
filename: fileData.filename,
source: fileData.source,
size: fileData.bytes,
metadata: fileData.metadata,
});
},
[addFile, fileMap, conversation, localize, showToast, fileConfig.endpoints],

View file

@ -63,8 +63,9 @@ export const useUploadFileMutation = (
const update = {};
const prevResources = agent.tool_resources ?? {};
const prevResource: t.ExecuteCodeResource | t.AgentFileSearchResource = agent
.tool_resources?.[tool_resource] ?? {
const prevResource: t.ExecuteCodeResource | t.AgentFileResource = agent.tool_resources?.[
tool_resource
] ?? {
file_ids: [],
};
if (!prevResource.file_ids) {

View file

@ -11,6 +11,9 @@
"com_agents_create_error": "There was an error creating your agent.",
"com_agents_description_placeholder": "Optional: Describe your Agent here",
"com_agents_enable_file_search": "Enable File Search",
"com_agents_file_context": "File Context (OCR)",
"com_agents_file_context_disabled": "Agent must be created before uploading files for File Context.",
"com_agents_file_context_info": "Files uploaded as \"Context\" are processed using OCR to extract text, which is then added to the Agent's instructions. Ideal for documents, images with text, or PDFs where you need the full text content of a file",
"com_agents_file_search_disabled": "Agent must be created before uploading files for File Search.",
"com_agents_file_search_info": "When enabled, the agent will be informed of the exact filenames listed below, allowing it to retrieve relevant context from these files.",
"com_agents_instructions_placeholder": "The system instructions that the agent uses",
@ -811,10 +814,12 @@
"com_ui_upload_code_files": "Upload for Code Interpreter",
"com_ui_upload_delay": "Uploading \"{{0}}\" is taking more time than anticipated. Please wait while the file finishes indexing for retrieval.",
"com_ui_upload_error": "There was an error uploading your file",
"com_ui_upload_file_context": "Upload File Context",
"com_ui_upload_file_search": "Upload for File Search",
"com_ui_upload_files": "Upload files",
"com_ui_upload_image": "Upload an image",
"com_ui_upload_image_input": "Upload Image",
"com_ui_upload_ocr_text": "Upload as Text",
"com_ui_upload_invalid": "Invalid file for upload. Must be an image not exceeding the limit",
"com_ui_upload_invalid_var": "Invalid file for upload. Must be an image not exceeding {{0}} MB",
"com_ui_upload_success": "Successfully uploaded file",
@ -835,4 +840,4 @@
"com_ui_zoom": "Zoom",
"com_user_message": "You",
"com_warning_resubmit_unsupported": "Resubmitting the AI message is not supported for this endpoint."
}
}

View file

@ -58,6 +58,9 @@ export const processAgentOption = ({
label: _agent?.name ?? '',
value: _agent?.id ?? '',
icon: isGlobal ? <EarthIcon className="icon-md text-green-400" /> : null,
context_files: _agent?.tool_resources?.ocr?.file_ids
? ([] as Array<[string, ExtendedFile]>)
: undefined,
knowledge_files: _agent?.tool_resources?.file_search?.file_ids
? ([] as Array<[string, ExtendedFile]>)
: undefined,
@ -83,7 +86,7 @@ export const processAgentOption = ({
const source =
tool_resource === EToolResources.file_search
? FileSources.vectordb
: file?.source ?? FileSources.local;
: (file?.source ?? FileSources.local);
if (file) {
list?.push([
@ -97,6 +100,7 @@ export const processAgentOption = ({
height: file.height,
size: file.bytes,
preview: file.filepath,
metadata: file.metadata,
progress: 1,
source,
},
@ -117,6 +121,16 @@ export const processAgentOption = ({
}
};
if (agent.context_files && _agent?.tool_resources?.ocr?.file_ids) {
_agent.tool_resources.ocr.file_ids.forEach((file_id) =>
handleFile({
file_id,
list: agent.context_files,
tool_resource: EToolResources.ocr,
}),
);
}
if (agent.knowledge_files && _agent?.tool_resources?.file_search?.file_ids) {
_agent.tool_resources.file_search.file_ids.forEach((file_id) =>
handleFile({