mirror of
https://github.com/danny-avila/LibreChat.git
synced 2025-12-27 13:48:51 +01:00
🎞️ feat: OpenRouter Audio/Video File Upload Support (#11070)
* Added video upload support for OpenRouter - Added VIDEO_URL content type to support video_url message format - Implemented OpenRouter video encoding using base64 data URLs - Extended encodeAndFormatVideos() to handle OpenRouter provider - Updated UI to accept video uploads for OpenRouter (mp4, webm, mpeg, mov) - Fixed case-sensitivity in provider detection for agents - Made isDocumentSupportedProvider() and isOpenAILikeProvider() case-insensitive. Videos are now converted to data:video/mp4;base64,... format compatible with OpenRouter's API requirements per their documentation. 🤖 Generated with [Claude Code](https://claude.com/claude-code) Co-Authored-By: Claude Sonnet 4.5 <noreply@anthropic.com> * refactor: rename multimodal and google_multimodal to the more transparent names image_document and image_document_video_audio (google_multimodal is also no longer accurate, since we are adding support for video and audio uploads for OpenRouter) * fix: revert .toLowerCase change to isOpenAILikeProvider and isDocumentSupportedProvider, which broke upload-to-provider detection for OpenAI endpoints * wip: add audio support to OpenRouter * fix: file types are now properly parsed and sent for OpenRouter, rather than destructured MIME types * refactor: Omit to Exclude for ESLint * feat: update DragDropModal for new OpenRouter support * fix: special-case OpenRouter for lowercase provider (currently getting issues with the provider coming in as 'OpenRouter' and our enum being 'openrouter'). This will probably require a larger refactor later to handle case insensitivity for all providers, but that will have to be thoroughly tested in its own isolated PR --------- Co-authored-by: Claude Sonnet 4.5 <noreply@anthropic.com> Co-authored-by: Dustin Healy <54083382+dustinhealy@users.noreply.github.com>
This commit is contained in:
parent
5caa008432
commit
4fe223eedd
9 changed files with 113 additions and 29 deletions
|
|
@ -9,6 +9,7 @@ import {
|
|||
TerminalSquareIcon,
|
||||
} from 'lucide-react';
|
||||
import {
|
||||
Providers,
|
||||
EToolResources,
|
||||
EModelEndpoint,
|
||||
defaultAgentCapabilities,
|
||||
|
|
@ -36,6 +37,8 @@ import { ephemeralAgentByConvoId } from '~/store';
|
|||
import { MenuItemProps } from '~/common';
|
||||
import { cn } from '~/utils';
|
||||
|
||||
type FileUploadType = 'image' | 'document' | 'image_document' | 'image_document_video_audio';
|
||||
|
||||
interface AttachFileMenuProps {
|
||||
agentId?: string | null;
|
||||
endpoint?: string | null;
|
||||
|
|
@ -83,9 +86,7 @@ const AttachFileMenu = ({
|
|||
ephemeralAgent,
|
||||
);
|
||||
|
||||
const handleUploadClick = (
|
||||
fileType?: 'image' | 'document' | 'multimodal' | 'google_multimodal',
|
||||
) => {
|
||||
const handleUploadClick = (fileType?: FileUploadType) => {
|
||||
if (!inputRef.current) {
|
||||
return;
|
||||
}
|
||||
|
|
@ -94,9 +95,9 @@ const AttachFileMenu = ({
|
|||
inputRef.current.accept = 'image/*';
|
||||
} else if (fileType === 'document') {
|
||||
inputRef.current.accept = '.pdf,application/pdf';
|
||||
} else if (fileType === 'multimodal') {
|
||||
} else if (fileType === 'image_document') {
|
||||
inputRef.current.accept = 'image/*,.pdf,application/pdf';
|
||||
} else if (fileType === 'google_multimodal') {
|
||||
} else if (fileType === 'image_document_video_audio') {
|
||||
inputRef.current.accept = 'image/*,.pdf,application/pdf,video/*,audio/*';
|
||||
} else {
|
||||
inputRef.current.accept = '';
|
||||
|
|
@ -106,12 +107,16 @@ const AttachFileMenu = ({
|
|||
};
|
||||
|
||||
const dropdownItems = useMemo(() => {
|
||||
const createMenuItems = (
|
||||
onAction: (fileType?: 'image' | 'document' | 'multimodal' | 'google_multimodal') => void,
|
||||
) => {
|
||||
const createMenuItems = (onAction: (fileType?: FileUploadType) => void) => {
|
||||
const items: MenuItemProps[] = [];
|
||||
|
||||
const currentProvider = provider || endpoint;
|
||||
let currentProvider = provider || endpoint;
|
||||
|
||||
// Temporary workaround: the provider may arrive as 'OpenRouter' while the enum value is lowercase, so normalize it here. Remove once a future PR makes all Providers comparisons case-insensitive
|
||||
if (currentProvider?.toLowerCase() === Providers.OPENROUTER) {
|
||||
currentProvider = Providers.OPENROUTER;
|
||||
}
|
||||
|
||||
if (
|
||||
isDocumentSupportedProvider(endpointType) ||
|
||||
isDocumentSupportedProvider(currentProvider)
|
||||
|
|
@ -120,9 +125,11 @@ const AttachFileMenu = ({
|
|||
label: localize('com_ui_upload_provider'),
|
||||
onClick: () => {
|
||||
setToolResource(undefined);
|
||||
onAction(
|
||||
(provider || endpoint) === EModelEndpoint.google ? 'google_multimodal' : 'multimodal',
|
||||
);
|
||||
let fileType: Exclude<FileUploadType, 'image' | 'document'> = 'image_document';
|
||||
if (currentProvider === Providers.GOOGLE || currentProvider === Providers.OPENROUTER) {
|
||||
fileType = 'image_document_video_audio';
|
||||
}
|
||||
onAction(fileType);
|
||||
},
|
||||
icon: <FileImageIcon className="icon-md" />,
|
||||
});
|
||||
|
|
|
|||
|
|
@ -2,6 +2,7 @@ import React, { useMemo } from 'react';
|
|||
import { useRecoilValue } from 'recoil';
|
||||
import { OGDialog, OGDialogTemplate } from '@librechat/client';
|
||||
import {
|
||||
Providers,
|
||||
inferMimeType,
|
||||
EToolResources,
|
||||
EModelEndpoint,
|
||||
|
|
@ -55,15 +56,21 @@ const DragDropModal = ({ onOptionSelect, setShowModal, files, isVisible }: DragD
|
|||
|
||||
const options = useMemo(() => {
|
||||
const _options: FileOption[] = [];
|
||||
const currentProvider = provider || endpoint;
|
||||
let currentProvider = provider || endpoint;
|
||||
|
||||
// Temporary workaround: the provider may arrive as 'OpenRouter' while the enum value is lowercase, so normalize it here. Remove once a future PR makes all Providers comparisons case-insensitive
|
||||
if (currentProvider?.toLowerCase() === Providers.OPENROUTER) {
|
||||
currentProvider = Providers.OPENROUTER;
|
||||
}
|
||||
|
||||
/** Helper to get inferred MIME type for a file */
|
||||
const getFileType = (file: File) => inferMimeType(file.name, file.type);
|
||||
|
||||
// Check if provider supports document upload
|
||||
if (isDocumentSupportedProvider(endpointType) || isDocumentSupportedProvider(currentProvider)) {
|
||||
const isGoogleProvider = currentProvider === EModelEndpoint.google;
|
||||
const validFileTypes = isGoogleProvider
|
||||
const supportsImageDocVideoAudio =
|
||||
currentProvider === EModelEndpoint.google || currentProvider === Providers.OPENROUTER;
|
||||
const validFileTypes = supportsImageDocVideoAudio
|
||||
? files.every((file) => {
|
||||
const type = getFileType(file);
|
||||
return (
|
||||
|
|
|
|||
|
|
@ -512,7 +512,7 @@ describe('AttachFileMenu', () => {
|
|||
});
|
||||
|
||||
describe('Google Provider Special Case', () => {
|
||||
it('should use google_multimodal file type for Google provider', () => {
|
||||
it('should use image_document_video_audio file type for Google provider', () => {
|
||||
mockUseAgentToolPermissions.mockReturnValue({
|
||||
fileSearchAllowedByAgent: false,
|
||||
codeAllowedByAgent: false,
|
||||
|
|
@ -536,7 +536,7 @@ describe('AttachFileMenu', () => {
|
|||
// The file input should have been clicked (indirectly tested through the implementation)
|
||||
});
|
||||
|
||||
it('should use multimodal file type for non-Google providers', () => {
|
||||
it('should use image_document file type for non-Google providers', () => {
|
||||
mockUseAgentToolPermissions.mockReturnValue({
|
||||
fileSearchAllowedByAgent: false,
|
||||
codeAllowedByAgent: false,
|
||||
|
|
@ -555,7 +555,7 @@ describe('AttachFileMenu', () => {
|
|||
expect(uploadProviderButton).toBeInTheDocument();
|
||||
fireEvent.click(uploadProviderButton);
|
||||
|
||||
// Implementation detail - multimodal type is used
|
||||
// Implementation detail - image_document type is used
|
||||
});
|
||||
});
|
||||
|
||||
|
|
|
|||
|
|
@ -79,6 +79,21 @@ export async function encodeAndFormatAudios(
|
|||
mimeType: file.type,
|
||||
data: content,
|
||||
});
|
||||
} else if (provider === Providers.OPENROUTER) {
|
||||
// Extract format from filename extension (e.g., 'audio.mp3' -> 'mp3')
|
||||
// OpenRouter expects format values like: wav, mp3, aiff, aac, ogg, flac, m4a, pcm16, pcm24
|
||||
// Note: MIME subtypes don't always match the expected format value (e.g., 'audio/mpeg' is mp3, not mpeg), so we use the file extension instead
|
||||
const format = file.filename.split('.').pop()?.toLowerCase();
|
||||
if (!format) {
|
||||
throw new Error(`Could not extract audio format from filename: ${file.filename}`);
|
||||
}
|
||||
result.audios.push({
|
||||
type: 'input_audio',
|
||||
input_audio: {
|
||||
data: content,
|
||||
format,
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
result.files.push(metadata);
|
||||
|
|
|
|||
|
|
@ -79,6 +79,13 @@ export async function encodeAndFormatVideos(
|
|||
mimeType: file.type,
|
||||
data: content,
|
||||
});
|
||||
} else if (provider === Providers.OPENROUTER) {
|
||||
result.videos.push({
|
||||
type: 'video_url',
|
||||
video_url: {
|
||||
url: `data:${file.type};base64,${content}`,
|
||||
},
|
||||
});
|
||||
}
|
||||
|
||||
result.files.push(metadata);
|
||||
|
|
|
|||
|
|
@ -29,12 +29,25 @@ export interface AudioProcessingResult {
|
|||
bytes: number;
|
||||
}
|
||||
|
||||
/** Google video block format */
|
||||
export interface GoogleVideoBlock {
|
||||
type: 'media';
|
||||
mimeType: string;
|
||||
data: string;
|
||||
}
|
||||
|
||||
/** OpenRouter video block format */
|
||||
export interface OpenRouterVideoBlock {
|
||||
type: 'video_url';
|
||||
video_url: {
|
||||
url: string;
|
||||
};
|
||||
}
|
||||
|
||||
export type VideoBlock = GoogleVideoBlock | OpenRouterVideoBlock;
|
||||
|
||||
export interface VideoResult {
|
||||
videos: Array<{
|
||||
type: string;
|
||||
mimeType: string;
|
||||
data: string;
|
||||
}>;
|
||||
videos: VideoBlock[];
|
||||
files: Array<{
|
||||
file_id?: string;
|
||||
temp_file_id?: string;
|
||||
|
|
@ -100,12 +113,26 @@ export interface DocumentResult {
|
|||
}>;
|
||||
}
|
||||
|
||||
export interface AudioResult {
|
||||
audios: Array<{
|
||||
type: string;
|
||||
mimeType: string;
|
||||
/** Google audio block format */
|
||||
export interface GoogleAudioBlock {
|
||||
type: 'media';
|
||||
mimeType: string;
|
||||
data: string;
|
||||
}
|
||||
|
||||
/** OpenRouter audio block format */
|
||||
export interface OpenRouterAudioBlock {
|
||||
type: 'input_audio';
|
||||
input_audio: {
|
||||
data: string;
|
||||
}>;
|
||||
format: string;
|
||||
};
|
||||
}
|
||||
|
||||
export type AudioBlock = GoogleAudioBlock | OpenRouterAudioBlock;
|
||||
|
||||
export interface AudioResult {
|
||||
audios: AudioBlock[];
|
||||
files: Array<{
|
||||
file_id?: string;
|
||||
temp_file_id?: string;
|
||||
|
|
|
|||
|
|
@ -33,11 +33,26 @@ export namespace Agents {
|
|||
image_url: string | { url: string; detail?: ImageDetail };
|
||||
};
|
||||
|
||||
export type MessageContentVideoUrl = {
|
||||
type: ContentTypes.VIDEO_URL;
|
||||
video_url: { url: string };
|
||||
};
|
||||
|
||||
export type MessageContentInputAudio = {
|
||||
type: ContentTypes.INPUT_AUDIO;
|
||||
input_audio: {
|
||||
data: string;
|
||||
format: string;
|
||||
};
|
||||
};
|
||||
|
||||
export type MessageContentComplex =
|
||||
| ReasoningContentText
|
||||
| AgentUpdate
|
||||
| MessageContentText
|
||||
| MessageContentImageUrl
|
||||
| MessageContentVideoUrl
|
||||
| MessageContentInputAudio
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
| (Record<string, any> & { type?: ContentTypes | string })
|
||||
// eslint-disable-next-line @typescript-eslint/no-explicit-any
|
||||
|
|
@ -295,6 +310,8 @@ export namespace Agents {
|
|||
| ContentTypes.THINK
|
||||
| ContentTypes.TEXT
|
||||
| ContentTypes.IMAGE_URL
|
||||
| ContentTypes.VIDEO_URL
|
||||
| ContentTypes.INPUT_AUDIO
|
||||
| string;
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -515,7 +515,9 @@ export type TMessageContentParts =
|
|||
} & ContentMetadata)
|
||||
| ({ type: ContentTypes.IMAGE_FILE; image_file: ImageFile & PartMetadata } & ContentMetadata)
|
||||
| (Agents.AgentUpdate & ContentMetadata)
|
||||
| (Agents.MessageContentImageUrl & ContentMetadata);
|
||||
| (Agents.MessageContentImageUrl & ContentMetadata)
|
||||
| (Agents.MessageContentVideoUrl & ContentMetadata)
|
||||
| (Agents.MessageContentInputAudio & ContentMetadata);
|
||||
|
||||
export type StreamContentData = TMessageContentParts & {
|
||||
/** The index of the current content part */
|
||||
|
|
|
|||
|
|
@ -5,6 +5,8 @@ export enum ContentTypes {
|
|||
TOOL_CALL = 'tool_call',
|
||||
IMAGE_FILE = 'image_file',
|
||||
IMAGE_URL = 'image_url',
|
||||
VIDEO_URL = 'video_url',
|
||||
INPUT_AUDIO = 'input_audio',
|
||||
AGENT_UPDATE = 'agent_update',
|
||||
ERROR = 'error',
|
||||
}
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue