diff --git a/client/src/components/Chat/Input/Files/AttachFileMenu.tsx b/client/src/components/Chat/Input/Files/AttachFileMenu.tsx index f34303047a..6e57759e16 100644 --- a/client/src/components/Chat/Input/Files/AttachFileMenu.tsx +++ b/client/src/components/Chat/Input/Files/AttachFileMenu.tsx @@ -9,6 +9,7 @@ import { TerminalSquareIcon, } from 'lucide-react'; import { + Providers, EToolResources, EModelEndpoint, defaultAgentCapabilities, @@ -36,6 +37,8 @@ import { ephemeralAgentByConvoId } from '~/store'; import { MenuItemProps } from '~/common'; import { cn } from '~/utils'; +type FileUploadType = 'image' | 'document' | 'image_document' | 'image_document_video_audio'; + interface AttachFileMenuProps { agentId?: string | null; endpoint?: string | null; @@ -83,9 +86,7 @@ const AttachFileMenu = ({ ephemeralAgent, ); - const handleUploadClick = ( - fileType?: 'image' | 'document' | 'multimodal' | 'google_multimodal', - ) => { + const handleUploadClick = (fileType?: FileUploadType) => { if (!inputRef.current) { return; } @@ -94,9 +95,9 @@ const AttachFileMenu = ({ inputRef.current.accept = 'image/*'; } else if (fileType === 'document') { inputRef.current.accept = '.pdf,application/pdf'; - } else if (fileType === 'multimodal') { + } else if (fileType === 'image_document') { inputRef.current.accept = 'image/*,.pdf,application/pdf'; - } else if (fileType === 'google_multimodal') { + } else if (fileType === 'image_document_video_audio') { inputRef.current.accept = 'image/*,.pdf,application/pdf,video/*,audio/*'; } else { inputRef.current.accept = ''; @@ -106,12 +107,16 @@ const AttachFileMenu = ({ }; const dropdownItems = useMemo(() => { - const createMenuItems = ( - onAction: (fileType?: 'image' | 'document' | 'multimodal' | 'google_multimodal') => void, - ) => { + const createMenuItems = (onAction: (fileType?: FileUploadType) => void) => { const items: MenuItemProps[] = []; - const currentProvider = provider || endpoint; + let currentProvider = provider || endpoint; + + // This will be removed in a future PR to formally normalize Providers comparisons to be case insensitive + if (currentProvider?.toLowerCase() === Providers.OPENROUTER) { + currentProvider = Providers.OPENROUTER; + } + if ( isDocumentSupportedProvider(endpointType) || isDocumentSupportedProvider(currentProvider) @@ -120,9 +125,11 @@ const AttachFileMenu = ({ label: localize('com_ui_upload_provider'), onClick: () => { setToolResource(undefined); - onAction( - (provider || endpoint) === EModelEndpoint.google ? 'google_multimodal' : 'multimodal', - ); + let fileType: Exclude = 'image_document'; + if (currentProvider === Providers.GOOGLE || currentProvider === Providers.OPENROUTER) { + fileType = 'image_document_video_audio'; + } + onAction(fileType); }, icon: , }); diff --git a/client/src/components/Chat/Input/Files/DragDropModal.tsx b/client/src/components/Chat/Input/Files/DragDropModal.tsx index eb5f86d3b9..65647a2f22 100644 --- a/client/src/components/Chat/Input/Files/DragDropModal.tsx +++ b/client/src/components/Chat/Input/Files/DragDropModal.tsx @@ -2,6 +2,7 @@ import React, { useMemo } from 'react'; import { useRecoilValue } from 'recoil'; import { OGDialog, OGDialogTemplate } from '@librechat/client'; import { + Providers, inferMimeType, EToolResources, EModelEndpoint, @@ -55,15 +56,21 @@ const DragDropModal = ({ onOptionSelect, setShowModal, files, isVisible }: DragD const options = useMemo(() => { const _options: FileOption[] = []; - const currentProvider = provider || endpoint; + let currentProvider = provider || endpoint; + + // This will be removed in a future PR to formally normalize Providers comparisons to be case insensitive + if (currentProvider?.toLowerCase() === Providers.OPENROUTER) { + currentProvider = Providers.OPENROUTER; + } /** Helper to get inferred MIME type for a file */ const getFileType = (file: File) => inferMimeType(file.name, file.type); // Check if provider supports document upload if (isDocumentSupportedProvider(endpointType) || isDocumentSupportedProvider(currentProvider)) { - const isGoogleProvider = currentProvider === EModelEndpoint.google; - const validFileTypes = isGoogleProvider + const supportsImageDocVideoAudio = + currentProvider === EModelEndpoint.google || currentProvider === Providers.OPENROUTER; + const validFileTypes = supportsImageDocVideoAudio ? files.every((file) => { const type = getFileType(file); return ( diff --git a/client/src/components/Chat/Input/Files/__tests__/AttachFileMenu.spec.tsx b/client/src/components/Chat/Input/Files/__tests__/AttachFileMenu.spec.tsx index 36c4ee40e7..a9b7139737 100644 --- a/client/src/components/Chat/Input/Files/__tests__/AttachFileMenu.spec.tsx +++ b/client/src/components/Chat/Input/Files/__tests__/AttachFileMenu.spec.tsx @@ -512,7 +512,7 @@ describe('AttachFileMenu', () => { }); describe('Google Provider Special Case', () => { - it('should use google_multimodal file type for Google provider', () => { + it('should use image_document_video_audio file type for Google provider', () => { mockUseAgentToolPermissions.mockReturnValue({ fileSearchAllowedByAgent: false, codeAllowedByAgent: false, @@ -536,7 +536,7 @@ describe('AttachFileMenu', () => { // The file input should have been clicked (indirectly tested through the implementation) }); - it('should use multimodal file type for non-Google providers', () => { + it('should use image_document file type for non-Google providers', () => { mockUseAgentToolPermissions.mockReturnValue({ fileSearchAllowedByAgent: false, codeAllowedByAgent: false, @@ -555,7 +555,7 @@ describe('AttachFileMenu', () => { expect(uploadProviderButton).toBeInTheDocument(); fireEvent.click(uploadProviderButton); - // Implementation detail - multimodal type is used + // Implementation detail - image_document type is used }); }); diff --git a/packages/api/src/files/encode/audio.ts b/packages/api/src/files/encode/audio.ts index 6018df497d..d29163d868 100644 --- a/packages/api/src/files/encode/audio.ts +++ b/packages/api/src/files/encode/audio.ts @@ -79,6 +79,21 @@ export async function encodeAndFormatAudios( mimeType: file.type, data: content, }); + } else if (provider === Providers.OPENROUTER) { + // Extract format from filename extension (e.g., 'audio.mp3' -> 'mp3') + // OpenRouter expects format values like: wav, mp3, aiff, aac, ogg, flac, m4a, pcm16, pcm24 + // Note: MIME types don't always match (e.g., 'audio/mpeg' is mp3, not mpeg), so that is why we are using the file extension instead + const format = file.filename.split('.').pop()?.toLowerCase(); + if (!format) { + throw new Error(`Could not extract audio format from filename: ${file.filename}`); + } + result.audios.push({ + type: 'input_audio', + input_audio: { + data: content, + format, + }, + }); } result.files.push(metadata); diff --git a/packages/api/src/files/encode/video.ts b/packages/api/src/files/encode/video.ts index faace9eca1..b0d9bb8c2d 100644 --- a/packages/api/src/files/encode/video.ts +++ b/packages/api/src/files/encode/video.ts @@ -79,6 +79,13 @@ export async function encodeAndFormatVideos( mimeType: file.type, data: content, }); + } else if (provider === Providers.OPENROUTER) { + result.videos.push({ + type: 'video_url', + video_url: { + url: `data:${file.type};base64,${content}`, + }, + }); } result.files.push(metadata); diff --git a/packages/api/src/types/files.ts b/packages/api/src/types/files.ts index 7ee641aab1..6a403932da 100644 --- a/packages/api/src/types/files.ts +++ b/packages/api/src/types/files.ts @@ -29,12 +29,25 @@ export interface AudioProcessingResult { bytes: number; } +/** Google video block format */ +export interface GoogleVideoBlock { + type: 'media'; + mimeType: string; + data: string; +} + +/** OpenRouter video block format */ +export interface OpenRouterVideoBlock { + type: 'video_url'; + video_url: { + url: string; + }; +} + +export type VideoBlock = GoogleVideoBlock | OpenRouterVideoBlock; + export interface VideoResult { - videos: Array<{ - type: string; - mimeType: string; - data: string; - }>; + videos: VideoBlock[]; files: Array<{ file_id?: string; temp_file_id?: string; @@ -100,12 +113,26 @@ export interface DocumentResult { }>; } -export interface AudioResult { - audios: Array<{ - type: string; - mimeType: string; +/** Google audio block format */ +export interface GoogleAudioBlock { + type: 'media'; + mimeType: string; + data: string; +} + +/** OpenRouter audio block format */ +export interface OpenRouterAudioBlock { + type: 'input_audio'; + input_audio: { data: string; - }>; + format: string; + }; +} + +export type AudioBlock = GoogleAudioBlock | OpenRouterAudioBlock; + +export interface AudioResult { + audios: AudioBlock[]; files: Array<{ file_id?: string; temp_file_id?: string; diff --git a/packages/data-provider/src/types/agents.ts b/packages/data-provider/src/types/agents.ts index 7305d2f062..ac3f464019 100644 --- a/packages/data-provider/src/types/agents.ts +++ b/packages/data-provider/src/types/agents.ts @@ -33,11 +33,26 @@ export namespace Agents { image_url: string | { url: string; detail?: ImageDetail }; }; + export type MessageContentVideoUrl = { + type: ContentTypes.VIDEO_URL; + video_url: { url: string }; + }; + + export type MessageContentInputAudio = { + type: ContentTypes.INPUT_AUDIO; + input_audio: { + data: string; + format: string; + }; + }; + export type MessageContentComplex = | ReasoningContentText | AgentUpdate | MessageContentText | MessageContentImageUrl + | MessageContentVideoUrl + | MessageContentInputAudio // eslint-disable-next-line @typescript-eslint/no-explicit-any | (Record & { type?: ContentTypes | string }) // eslint-disable-next-line @typescript-eslint/no-explicit-any @@ -295,6 +310,8 @@ export namespace Agents { | ContentTypes.THINK | ContentTypes.TEXT | ContentTypes.IMAGE_URL + | ContentTypes.VIDEO_URL + | ContentTypes.INPUT_AUDIO | string; } diff --git a/packages/data-provider/src/types/assistants.ts b/packages/data-provider/src/types/assistants.ts index b0ed1f01c1..185df5fa9f 100644 --- a/packages/data-provider/src/types/assistants.ts +++ b/packages/data-provider/src/types/assistants.ts @@ -515,7 +515,9 @@ export type TMessageContentParts = } & ContentMetadata) | ({ type: ContentTypes.IMAGE_FILE; image_file: ImageFile & PartMetadata } & ContentMetadata) | (Agents.AgentUpdate & ContentMetadata) - | (Agents.MessageContentImageUrl & ContentMetadata); + | (Agents.MessageContentImageUrl & ContentMetadata) + | (Agents.MessageContentVideoUrl & ContentMetadata) + | (Agents.MessageContentInputAudio & ContentMetadata); export type StreamContentData = TMessageContentParts & { /** The index of the current content part */ diff --git a/packages/data-provider/src/types/runs.ts b/packages/data-provider/src/types/runs.ts index bba5126054..de61357b92 100644 --- a/packages/data-provider/src/types/runs.ts +++ b/packages/data-provider/src/types/runs.ts @@ -5,6 +5,8 @@ export enum ContentTypes { TOOL_CALL = 'tool_call', IMAGE_FILE = 'image_file', IMAGE_URL = 'image_url', + VIDEO_URL = 'video_url', + INPUT_AUDIO = 'input_audio', AGENT_UPDATE = 'agent_update', ERROR = 'error', }