From 4fe223eedd90c026ee1e8afdb395a691501ca5c5 Mon Sep 17 00:00:00 2001 From: papasaidfine <44102846+papasaidfine@users.noreply.github.com> Date: Thu, 25 Dec 2025 13:23:29 -0500 Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=9E=EF=B8=8F=20feat:=20OpenRouter=20Au?= =?UTF-8?q?dio/Video=20File=20Upload=20Support=20(#11070)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * Added video upload support for OpenRouter - Added VIDEO_URL content type to support video_url message format - Implemented OpenRouter video encoding using base64 data URLs - Extended encodeAndFormatVideos() to handle OpenRouter provider - Updated UI to accept video uploads for OpenRouter (mp4, webm, mpeg, mov) - Fixed case-sensitivity in provider detection for agents - Made isDocumentSupportedProvider() and isOpenAILikeProvider() case-insensitive Videos are now converted to data:video/mp4;base64,... format compatible with OpenRouter's API requirements per their documentation. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 * refactor: change multimodal and google_multimodal to the more transparent names image_document and image_document_video_audio (also, google_multimodal doesn't apply as much since we are adding support for video and audio uploads for OpenRouter) * fix: revert .toLowerCase change to isOpenAILikeProvider and isDocumentSupportedProvider, which broke upload-to-provider detection for OpenAI endpoints * wip: add audio support to OpenRouter * fix: file types are now properly parsed and sent, rather than destructured MIME types, for OpenRouter * refactor: Omit to Exclude for ESLint * feat: update DragDropModal for new OpenRouter support * fix: special-case OpenRouter for lowercase provider (currently getting issues with the provider coming in as 'OpenRouter' and our enum being 'openrouter') This will probably require a larger refactor later to handle case insensitivity for all providers, but that will have 
to be thoroughly tested in its own isolated PR --------- Co-authored-by: Claude Sonnet 4.5 Co-authored-by: Dustin Healy <54083382+dustinhealy@users.noreply.github.com> --- .../Chat/Input/Files/AttachFileMenu.tsx | 31 +++++++----- .../Chat/Input/Files/DragDropModal.tsx | 13 +++-- .../Files/__tests__/AttachFileMenu.spec.tsx | 6 +-- packages/api/src/files/encode/audio.ts | 15 ++++++ packages/api/src/files/encode/video.ts | 7 +++ packages/api/src/types/files.ts | 47 +++++++++++++++---- packages/data-provider/src/types/agents.ts | 17 +++++++ .../data-provider/src/types/assistants.ts | 4 +- packages/data-provider/src/types/runs.ts | 2 + 9 files changed, 113 insertions(+), 29 deletions(-) diff --git a/client/src/components/Chat/Input/Files/AttachFileMenu.tsx b/client/src/components/Chat/Input/Files/AttachFileMenu.tsx index f34303047a..6e57759e16 100644 --- a/client/src/components/Chat/Input/Files/AttachFileMenu.tsx +++ b/client/src/components/Chat/Input/Files/AttachFileMenu.tsx @@ -9,6 +9,7 @@ import { TerminalSquareIcon, } from 'lucide-react'; import { + Providers, EToolResources, EModelEndpoint, defaultAgentCapabilities, @@ -36,6 +37,8 @@ import { ephemeralAgentByConvoId } from '~/store'; import { MenuItemProps } from '~/common'; import { cn } from '~/utils'; +type FileUploadType = 'image' | 'document' | 'image_document' | 'image_document_video_audio'; + interface AttachFileMenuProps { agentId?: string | null; endpoint?: string | null; @@ -83,9 +86,7 @@ const AttachFileMenu = ({ ephemeralAgent, ); - const handleUploadClick = ( - fileType?: 'image' | 'document' | 'multimodal' | 'google_multimodal', - ) => { + const handleUploadClick = (fileType?: FileUploadType) => { if (!inputRef.current) { return; } @@ -94,9 +95,9 @@ const AttachFileMenu = ({ inputRef.current.accept = 'image/*'; } else if (fileType === 'document') { inputRef.current.accept = '.pdf,application/pdf'; - } else if (fileType === 'multimodal') { + } else if (fileType === 'image_document') { 
inputRef.current.accept = 'image/*,.pdf,application/pdf'; - } else if (fileType === 'google_multimodal') { + } else if (fileType === 'image_document_video_audio') { inputRef.current.accept = 'image/*,.pdf,application/pdf,video/*,audio/*'; } else { inputRef.current.accept = ''; @@ -106,12 +107,16 @@ const AttachFileMenu = ({ }; const dropdownItems = useMemo(() => { - const createMenuItems = ( - onAction: (fileType?: 'image' | 'document' | 'multimodal' | 'google_multimodal') => void, - ) => { + const createMenuItems = (onAction: (fileType?: FileUploadType) => void) => { const items: MenuItemProps[] = []; - const currentProvider = provider || endpoint; + let currentProvider = provider || endpoint; + + // This will be removed in a future PR to formally normalize Providers comparisons to be case insensitive + if (currentProvider?.toLowerCase() === Providers.OPENROUTER) { + currentProvider = Providers.OPENROUTER; + } + if ( isDocumentSupportedProvider(endpointType) || isDocumentSupportedProvider(currentProvider) @@ -120,9 +125,11 @@ const AttachFileMenu = ({ label: localize('com_ui_upload_provider'), onClick: () => { setToolResource(undefined); - onAction( - (provider || endpoint) === EModelEndpoint.google ? 
'google_multimodal' : 'multimodal', - ); + let fileType: Exclude = 'image_document'; + if (currentProvider === Providers.GOOGLE || currentProvider === Providers.OPENROUTER) { + fileType = 'image_document_video_audio'; + } + onAction(fileType); }, icon: , }); diff --git a/client/src/components/Chat/Input/Files/DragDropModal.tsx b/client/src/components/Chat/Input/Files/DragDropModal.tsx index eb5f86d3b9..65647a2f22 100644 --- a/client/src/components/Chat/Input/Files/DragDropModal.tsx +++ b/client/src/components/Chat/Input/Files/DragDropModal.tsx @@ -2,6 +2,7 @@ import React, { useMemo } from 'react'; import { useRecoilValue } from 'recoil'; import { OGDialog, OGDialogTemplate } from '@librechat/client'; import { + Providers, inferMimeType, EToolResources, EModelEndpoint, @@ -55,15 +56,21 @@ const DragDropModal = ({ onOptionSelect, setShowModal, files, isVisible }: DragD const options = useMemo(() => { const _options: FileOption[] = []; - const currentProvider = provider || endpoint; + let currentProvider = provider || endpoint; + + // This will be removed in a future PR to formally normalize Providers comparisons to be case insensitive + if (currentProvider?.toLowerCase() === Providers.OPENROUTER) { + currentProvider = Providers.OPENROUTER; + } /** Helper to get inferred MIME type for a file */ const getFileType = (file: File) => inferMimeType(file.name, file.type); // Check if provider supports document upload if (isDocumentSupportedProvider(endpointType) || isDocumentSupportedProvider(currentProvider)) { - const isGoogleProvider = currentProvider === EModelEndpoint.google; - const validFileTypes = isGoogleProvider + const supportsImageDocVideoAudio = + currentProvider === EModelEndpoint.google || currentProvider === Providers.OPENROUTER; + const validFileTypes = supportsImageDocVideoAudio ? 
files.every((file) => { const type = getFileType(file); return ( diff --git a/client/src/components/Chat/Input/Files/__tests__/AttachFileMenu.spec.tsx b/client/src/components/Chat/Input/Files/__tests__/AttachFileMenu.spec.tsx index 36c4ee40e7..a9b7139737 100644 --- a/client/src/components/Chat/Input/Files/__tests__/AttachFileMenu.spec.tsx +++ b/client/src/components/Chat/Input/Files/__tests__/AttachFileMenu.spec.tsx @@ -512,7 +512,7 @@ describe('AttachFileMenu', () => { }); describe('Google Provider Special Case', () => { - it('should use google_multimodal file type for Google provider', () => { + it('should use image_document_video_audio file type for Google provider', () => { mockUseAgentToolPermissions.mockReturnValue({ fileSearchAllowedByAgent: false, codeAllowedByAgent: false, @@ -536,7 +536,7 @@ describe('AttachFileMenu', () => { // The file input should have been clicked (indirectly tested through the implementation) }); - it('should use multimodal file type for non-Google providers', () => { + it('should use image_document file type for non-Google providers', () => { mockUseAgentToolPermissions.mockReturnValue({ fileSearchAllowedByAgent: false, codeAllowedByAgent: false, @@ -555,7 +555,7 @@ describe('AttachFileMenu', () => { expect(uploadProviderButton).toBeInTheDocument(); fireEvent.click(uploadProviderButton); - // Implementation detail - multimodal type is used + // Implementation detail - image_document type is used }); }); diff --git a/packages/api/src/files/encode/audio.ts b/packages/api/src/files/encode/audio.ts index 6018df497d..d29163d868 100644 --- a/packages/api/src/files/encode/audio.ts +++ b/packages/api/src/files/encode/audio.ts @@ -79,6 +79,21 @@ export async function encodeAndFormatAudios( mimeType: file.type, data: content, }); + } else if (provider === Providers.OPENROUTER) { + // Extract format from filename extension (e.g., 'audio.mp3' -> 'mp3') + // OpenRouter expects format values like: wav, mp3, aiff, aac, ogg, flac, m4a, pcm16, pcm24 
+ // Note: MIME types don't always match (e.g., 'audio/mpeg' is mp3, not mpeg), so that is why we are using the file extension instead + const format = file.filename.split('.').pop()?.toLowerCase(); + if (!format) { + throw new Error(`Could not extract audio format from filename: ${file.filename}`); + } + result.audios.push({ + type: 'input_audio', + input_audio: { + data: content, + format, + }, + }); } result.files.push(metadata); diff --git a/packages/api/src/files/encode/video.ts b/packages/api/src/files/encode/video.ts index faace9eca1..b0d9bb8c2d 100644 --- a/packages/api/src/files/encode/video.ts +++ b/packages/api/src/files/encode/video.ts @@ -79,6 +79,13 @@ export async function encodeAndFormatVideos( mimeType: file.type, data: content, }); + } else if (provider === Providers.OPENROUTER) { + result.videos.push({ + type: 'video_url', + video_url: { + url: `data:${file.type};base64,${content}`, + }, + }); } result.files.push(metadata); diff --git a/packages/api/src/types/files.ts b/packages/api/src/types/files.ts index 7ee641aab1..6a403932da 100644 --- a/packages/api/src/types/files.ts +++ b/packages/api/src/types/files.ts @@ -29,12 +29,25 @@ export interface AudioProcessingResult { bytes: number; } +/** Google video block format */ +export interface GoogleVideoBlock { + type: 'media'; + mimeType: string; + data: string; +} + +/** OpenRouter video block format */ +export interface OpenRouterVideoBlock { + type: 'video_url'; + video_url: { + url: string; + }; +} + +export type VideoBlock = GoogleVideoBlock | OpenRouterVideoBlock; + export interface VideoResult { - videos: Array<{ - type: string; - mimeType: string; - data: string; - }>; + videos: VideoBlock[]; files: Array<{ file_id?: string; temp_file_id?: string; @@ -100,12 +113,26 @@ export interface DocumentResult { }>; } -export interface AudioResult { - audios: Array<{ - type: string; - mimeType: string; +/** Google audio block format */ +export interface GoogleAudioBlock { + type: 'media'; + mimeType: 
string; + data: string; +} + +/** OpenRouter audio block format */ +export interface OpenRouterAudioBlock { + type: 'input_audio'; + input_audio: { data: string; - }>; + format: string; + }; +} + +export type AudioBlock = GoogleAudioBlock | OpenRouterAudioBlock; + +export interface AudioResult { + audios: AudioBlock[]; files: Array<{ file_id?: string; temp_file_id?: string; diff --git a/packages/data-provider/src/types/agents.ts b/packages/data-provider/src/types/agents.ts index 7305d2f062..ac3f464019 100644 --- a/packages/data-provider/src/types/agents.ts +++ b/packages/data-provider/src/types/agents.ts @@ -33,11 +33,26 @@ export namespace Agents { image_url: string | { url: string; detail?: ImageDetail }; }; + export type MessageContentVideoUrl = { + type: ContentTypes.VIDEO_URL; + video_url: { url: string }; + }; + + export type MessageContentInputAudio = { + type: ContentTypes.INPUT_AUDIO; + input_audio: { + data: string; + format: string; + }; + }; + export type MessageContentComplex = | ReasoningContentText | AgentUpdate | MessageContentText | MessageContentImageUrl + | MessageContentVideoUrl + | MessageContentInputAudio // eslint-disable-next-line @typescript-eslint/no-explicit-any | (Record & { type?: ContentTypes | string }) // eslint-disable-next-line @typescript-eslint/no-explicit-any @@ -295,6 +310,8 @@ export namespace Agents { | ContentTypes.THINK | ContentTypes.TEXT | ContentTypes.IMAGE_URL + | ContentTypes.VIDEO_URL + | ContentTypes.INPUT_AUDIO | string; } diff --git a/packages/data-provider/src/types/assistants.ts b/packages/data-provider/src/types/assistants.ts index b0ed1f01c1..185df5fa9f 100644 --- a/packages/data-provider/src/types/assistants.ts +++ b/packages/data-provider/src/types/assistants.ts @@ -515,7 +515,9 @@ export type TMessageContentParts = } & ContentMetadata) | ({ type: ContentTypes.IMAGE_FILE; image_file: ImageFile & PartMetadata } & ContentMetadata) | (Agents.AgentUpdate & ContentMetadata) - | (Agents.MessageContentImageUrl & 
ContentMetadata); + | (Agents.MessageContentImageUrl & ContentMetadata) + | (Agents.MessageContentVideoUrl & ContentMetadata) + | (Agents.MessageContentInputAudio & ContentMetadata); export type StreamContentData = TMessageContentParts & { /** The index of the current content part */ diff --git a/packages/data-provider/src/types/runs.ts b/packages/data-provider/src/types/runs.ts index bba5126054..de61357b92 100644 --- a/packages/data-provider/src/types/runs.ts +++ b/packages/data-provider/src/types/runs.ts @@ -5,6 +5,8 @@ export enum ContentTypes { TOOL_CALL = 'tool_call', IMAGE_FILE = 'image_file', IMAGE_URL = 'image_url', + VIDEO_URL = 'video_url', + INPUT_AUDIO = 'input_audio', AGENT_UPDATE = 'agent_update', ERROR = 'error', }