diff --git a/api/server/controllers/agents/client.js b/api/server/controllers/agents/client.js
index ba385dff3a..4411438abb 100644
--- a/api/server/controllers/agents/client.js
+++ b/api/server/controllers/agents/client.js
@@ -42,8 +42,11 @@ const {
   setMemory,
 } = require('~/models');
 const { getMCPAuthMap, checkCapability, hasCustomUserVars } = require('~/server/services/Config');
 const { encodeAndFormatDocuments } = require('~/server/services/Files/documents/encode');
 const { addCacheControl, createContextHandlers } = require('~/app/clients/prompts');
+const { encodeAndFormatVideos } = require('~/server/services/Files/Video/encode');
+const { encodeAndFormatAudios } = require('~/server/services/Files/Audio/encode');
+const { getFiles } = require('~/models');
 const { initializeAgent } = require('~/server/services/Endpoints/agents/agent');
 const { spendTokens, spendStructuredTokens } = require('~/models/spendTokens');
 const { getFormattedMemories, deleteMemory, setMemory } = require('~/models');
@@ -244,13 +247,138 @@ class AgentClient extends BaseClient {
     return documentResult.files;
   }
 
+  async addVideos(message, attachments) {
+    const videoResult = await encodeAndFormatVideos(
+      this.options.req,
+      attachments,
+      this.options.agent.provider,
+    );
+    message.videos =
+      videoResult.videos && videoResult.videos.length ? videoResult.videos : undefined;
+    return videoResult.files;
+  }
+
+  async addAudios(message, attachments) {
+    const audioResult = await encodeAndFormatAudios(
+      this.options.req,
+      attachments,
+      this.options.agent.provider,
+    );
+    message.audios =
+      audioResult.audios && audioResult.audios.length ? audioResult.audios : undefined;
+    return audioResult.files;
+  }
+
+  /**
+   * Override addPreviousAttachments to handle all file types, not just images
+   * @param {TMessage[]} _messages
+   * @returns {Promise}
+   */
+  async addPreviousAttachments(_messages) {
+    if (!this.options.resendFiles) {
+      return _messages;
+    }
+
+    const seen = new Set();
+    const attachmentsProcessed =
+      this.options.attachments && !(this.options.attachments instanceof Promise);
+    if (attachmentsProcessed) {
+      for (const attachment of this.options.attachments) {
+        seen.add(attachment.file_id);
+      }
+    }
+
+    /**
+     *
+     * @param {TMessage} message
+     */
+    const processMessage = async (message) => {
+      if (!this.message_file_map) {
+        /** @type {Record */
+        this.message_file_map = {};
+      }
+
+      const fileIds = [];
+      for (const file of message.files) {
+        if (seen.has(file.file_id)) {
+          continue;
+        }
+        fileIds.push(file.file_id);
+        seen.add(file.file_id);
+      }
+
+      if (fileIds.length === 0) {
+        return message;
+      }
+
+      const files = await getFiles(
+        {
+          file_id: { $in: fileIds },
+        },
+        {},
+        {},
+      );
+
+      await this.processAttachments(message, files);
+
+      this.message_file_map[message.messageId] = files;
+      return message;
+    };
+
+    const promises = [];
+
+    for (const message of _messages) {
+      if (!message.files) {
+        promises.push(message);
+        continue;
+      }
+
+      promises.push(processMessage(message));
+    }
+
+    const messages = await Promise.all(promises);
+
+    this.checkVisionRequest(Object.values(this.message_file_map ?? {}).flat());
+    return messages;
+  }
+
   async processAttachments(message, attachments) {
-    const [imageFiles, documentFiles] = await Promise.all([
-      this.addImageURLs(message, attachments),
-      this.addDocuments(message, attachments),
+    const categorizedAttachments = {
+      images: [],
+      documents: [],
+      videos: [],
+      audios: [],
+    };
+
+    // Route each file by MIME family; optional chaining skips records with no `type` instead of throwing.
+    for (const file of attachments) {
+      if (file?.type?.startsWith('image/')) {
+        categorizedAttachments.images.push(file);
+      } else if (file?.type === 'application/pdf') {
+        categorizedAttachments.documents.push(file);
+      } else if (file?.type?.startsWith('video/')) {
+        categorizedAttachments.videos.push(file);
+      } else if (file?.type?.startsWith('audio/')) {
+        categorizedAttachments.audios.push(file);
+      }
+    }
+
+    const [imageFiles, documentFiles, videoFiles, audioFiles] = await Promise.all([
+      categorizedAttachments.images.length > 0
+        ? this.addImageURLs(message, categorizedAttachments.images)
+        : Promise.resolve([]),
+      categorizedAttachments.documents.length > 0
+        ? this.addDocuments(message, categorizedAttachments.documents)
+        : Promise.resolve([]),
+      categorizedAttachments.videos.length > 0
+        ? this.addVideos(message, categorizedAttachments.videos)
+        : Promise.resolve([]),
+      categorizedAttachments.audios.length > 0
+        ? this.addAudios(message, categorizedAttachments.audios)
+        : Promise.resolve([]),
     ]);
 
-    const allFiles = [...imageFiles, ...documentFiles];
+    const allFiles = [...imageFiles, ...documentFiles, ...videoFiles, ...audioFiles];
     const seenFileIds = new Set();
     const uniqueFiles = [];
 
@@ -322,14 +450,31 @@ class AgentClient extends BaseClient {
         assistantName: this.options?.modelLabel,
       });
 
+      const hasFiles =
+        (message.documents && message.documents.length > 0) ||
+        (message.videos && message.videos.length > 0) ||
+        (message.audios && message.audios.length > 0) ||
+        (message.image_urls && message.image_urls.length > 0);
+
       if (
-        message.documents &&
-        message.documents.length > 0 &&
+        hasFiles &&
         message.isCreatedByUser &&
         isDocumentSupportedEndpoint(this.options.agent.provider)
       ) {
         const contentParts = [];
-        contentParts.push(...message.documents);
+
+        if (message.documents && message.documents.length > 0) {
+          contentParts.push(...message.documents);
+        }
+
+        if (message.videos && message.videos.length > 0) {
+          contentParts.push(...message.videos);
+        }
+
+        if (message.audios && message.audios.length > 0) {
+          contentParts.push(...message.audios);
+        }
+
         if (message.image_urls && message.image_urls.length > 0) {
           contentParts.push(...message.image_urls);
         }
@@ -338,8 +483,11 @@ class AgentClient extends BaseClient {
           contentParts.push({ type: 'text', text: formattedMessage.content });
         } else {
           const textPart = formattedMessage.content.find((part) => part.type === 'text');
-          contentParts.push(textPart);
+          if (textPart) {
+            contentParts.push(textPart);
+          }
         }
+
         formattedMessage.content = contentParts;
       }
 
diff --git a/api/server/services/Files/Audio/encode.js b/api/server/services/Files/Audio/encode.js
new file mode 100644
index 0000000000..98d920c565
--- /dev/null
+++ b/api/server/services/Files/Audio/encode.js
@@ -0,0 +1,111 @@
+const { EModelEndpoint, isDocumentSupportedEndpoint } = require('librechat-data-provider');
+const { getStrategyFunctions } =
require('~/server/services/Files/strategies');
+const { validateAudio } = require('@librechat/api');
+const { streamToBuffer } = require('~/server/services/Files/documents/encode');
+
+/**
+ * Encodes and formats audio files for different endpoints
+ * @param {Express.Request} req - The request object
+ * @param {Array} files - Array of audio files
+ * @param {EModelEndpoint} endpoint - The endpoint to format for
+ * @returns {Promise<{ audios: Array, files: Array }>}
+ */
+async function encodeAndFormatAudios(req, files, endpoint) {
+  const promises = [];
+  const encodingMethods = {};
+  /** @type {{ audios: any[]; files: MongoFile[] }} */
+  const result = {
+    audios: [],
+    files: [],
+  };
+
+  for (const file of files) {
+    if (!file || !file.filepath) {
+      continue;
+    }
+
+    const source = file.source ?? 'local';
+    if (!encodingMethods[source]) {
+      encodingMethods[source] = getStrategyFunctions(source);
+    }
+
+    const fileMetadata = {
+      file_id: file.file_id || file._id,
+      temp_file_id: file.temp_file_id,
+      filepath: file.filepath,
+      source: file.source,
+      filename: file.filename,
+      type: file.type,
+    };
+
+    promises.push([file, fileMetadata]);
+  }
+
+  const results = await Promise.allSettled(
+    promises.map(async ([file, fileMetadata]) => {
+      if (!file || !fileMetadata) {
+        return { file: null, content: null, metadata: fileMetadata };
+      }
+
+      try {
+        const source = file.source ?? 'local';
+        const { getDownloadStream } = encodingMethods[source];
+
+        const stream = await getDownloadStream(req, file.filepath);
+        const buffer = await streamToBuffer(stream);
+        const audioContent = buffer.toString('base64');
+
+        return {
+          file,
+          content: audioContent,
+          metadata: fileMetadata,
+        };
+      } catch (error) {
+        console.error(`Error processing audio ${file.filename}:`, error);
+        return { file, content: null, metadata: fileMetadata };
+      }
+    }),
+  );
+
+  for (const settledResult of results) {
+    if (settledResult.status === 'rejected') {
+      console.error('Audio processing failed:', settledResult.reason);
+      continue;
+    }
+
+    const { file, content, metadata } = settledResult.value;
+
+    if (!content || !file) {
+      if (metadata) {
+        result.files.push(metadata);
+      }
+      continue;
+    }
+
+    if (file.type?.startsWith('audio/') && isDocumentSupportedEndpoint(endpoint)) {
+      const audioBuffer = Buffer.from(content, 'base64');
+
+      const validation = await validateAudio(audioBuffer, audioBuffer.length, endpoint);
+      if (!validation.isValid) {
+        throw new Error(`Audio validation failed: ${validation.error}`);
+      }
+
+      if (endpoint === EModelEndpoint.google) {
+        const audioPart = {
+          type: 'audio',
+          mimeType: file.type,
+          data: content,
+        };
+        result.audios.push(audioPart);
+      }
+
+      result.files.push(metadata);
+    }
+  }
+
+  return result;
+}
+
+module.exports = {
+  encodeAndFormatAudios,
+};
diff --git a/api/server/services/Files/Video/encode.js b/api/server/services/Files/Video/encode.js
new file mode 100644
index 0000000000..2959b08799
--- /dev/null
+++ b/api/server/services/Files/Video/encode.js
@@ -0,0 +1,111 @@
+const { EModelEndpoint, isDocumentSupportedEndpoint } = require('librechat-data-provider');
+const { getStrategyFunctions } = require('~/server/services/Files/strategies');
+const { validateVideo } = require('@librechat/api');
+const { streamToBuffer } = require('~/server/services/Files/documents/encode');
+
+/**
+ * Encodes and formats video files for different
endpoints
+ * @param {Express.Request} req - The request object
+ * @param {Array} files - Array of video files
+ * @param {EModelEndpoint} endpoint - The endpoint to format for
+ * @returns {Promise<{ videos: Array, files: Array }>}
+ */
+async function encodeAndFormatVideos(req, files, endpoint) {
+  const promises = [];
+  const encodingMethods = {};
+  /** @type {{ videos: any[]; files: MongoFile[] }} */
+  const result = {
+    videos: [],
+    files: [],
+  };
+
+  for (const file of files) {
+    if (!file || !file.filepath) {
+      continue;
+    }
+
+    const source = file.source ?? 'local';
+    if (!encodingMethods[source]) {
+      encodingMethods[source] = getStrategyFunctions(source);
+    }
+
+    const fileMetadata = {
+      file_id: file.file_id || file._id,
+      temp_file_id: file.temp_file_id,
+      filepath: file.filepath,
+      source: file.source,
+      filename: file.filename,
+      type: file.type,
+    };
+
+    promises.push([file, fileMetadata]);
+  }
+
+  const results = await Promise.allSettled(
+    promises.map(async ([file, fileMetadata]) => {
+      if (!file || !fileMetadata) {
+        return { file: null, content: null, metadata: fileMetadata };
+      }
+
+      try {
+        const source = file.source ?? 'local';
+        const { getDownloadStream } = encodingMethods[source];
+
+        const stream = await getDownloadStream(req, file.filepath);
+        const buffer = await streamToBuffer(stream);
+        const videoContent = buffer.toString('base64');
+
+        return {
+          file,
+          content: videoContent,
+          metadata: fileMetadata,
+        };
+      } catch (error) {
+        console.error(`Error processing video ${file.filename}:`, error);
+        return { file, content: null, metadata: fileMetadata };
+      }
+    }),
+  );
+
+  for (const settledResult of results) {
+    if (settledResult.status === 'rejected') {
+      console.error('Video processing failed:', settledResult.reason);
+      continue;
+    }
+
+    const { file, content, metadata } = settledResult.value;
+
+    if (!content || !file) {
+      if (metadata) {
+        result.files.push(metadata);
+      }
+      continue;
+    }
+
+    if (file.type?.startsWith('video/') && isDocumentSupportedEndpoint(endpoint)) {
+      const videoBuffer = Buffer.from(content, 'base64');
+
+      const validation = await validateVideo(videoBuffer, videoBuffer.length, endpoint);
+      if (!validation.isValid) {
+        throw new Error(`Video validation failed: ${validation.error}`);
+      }
+
+      if (endpoint === EModelEndpoint.google) {
+        const videoPart = {
+          type: 'video',
+          mimeType: file.type,
+          data: content,
+        };
+        result.videos.push(videoPart);
+      }
+
+      result.files.push(metadata);
+    }
+  }
+
+  return result;
+}
+
+module.exports = {
+  encodeAndFormatVideos,
+};
diff --git a/api/server/services/Files/documents/encode.js b/api/server/services/Files/documents/encode.js
index 4042238ea1..6970a8cc6a 100644
--- a/api/server/services/Files/documents/encode.js
+++ b/api/server/services/Files/documents/encode.js
@@ -159,6 +159,13 @@ async function encodeAndFormatDocuments(req, files, endpoint) {
           file_data: `data:application/pdf;base64,${content}`,
         };
         result.documents.push(documentPart);
+      } else if (endpoint === EModelEndpoint.google) {
+        const documentPart = {
+          type: 'document',
+          mimeType: 'application/pdf',
+          data: content,
+        };
+
result.documents.push(documentPart);
       }
 
       result.files.push(metadata);
@@ -170,4 +177,5 @@ async function encodeAndFormatDocuments(req, files, endpoint) {
 
 module.exports = {
   encodeAndFormatDocuments,
+  streamToBuffer,
 };
diff --git a/api/server/services/Files/documents/index.js b/api/server/services/Files/documents/index.js
index 6082509ba9..6cc1e42dcf 100644
--- a/api/server/services/Files/documents/index.js
+++ b/api/server/services/Files/documents/index.js
@@ -1,5 +1,6 @@
-const { encodeAndFormatDocuments } = require('./encode');
+const { encodeAndFormatDocuments, streamToBuffer } = require('./encode');
 
 module.exports = {
   encodeAndFormatDocuments,
+  streamToBuffer,
 };
diff --git a/client/src/components/Chat/Input/Files/AttachFileMenu.tsx b/client/src/components/Chat/Input/Files/AttachFileMenu.tsx
index bfd97c5021..89259fadf7 100644
--- a/client/src/components/Chat/Input/Files/AttachFileMenu.tsx
+++ b/client/src/components/Chat/Input/Files/AttachFileMenu.tsx
@@ -88,6 +88,8 @@ const AttachFileMenu = ({
       inputRef.current.accept = '.pdf,application/pdf';
     } else if (fileType === 'multimodal') {
       inputRef.current.accept = 'image/*,.pdf,application/pdf';
+    } else if (fileType === 'google_multimodal') {
+      inputRef.current.accept = 'image/*,.pdf,application/pdf,video/*,audio/*';
     } else {
       inputRef.current.accept = '';
     }
@@ -97,7 +99,7 @@ const AttachFileMenu = ({
 
   const dropdownItems = useMemo(() => {
     const createMenuItems = (
-      onAction: (fileType?: 'image' | 'document' | 'multimodal') => void,
+      onAction: (fileType?: 'image' | 'document' | 'multimodal' | 'google_multimodal') => void,
     ) => {
       const items: MenuItemProps[] = [];
 
@@ -108,7 +110,7 @@ const AttachFileMenu = ({
           label: localize('com_ui_upload_provider'),
           onClick: () => {
             setToolResource(EToolResources.direct_attach);
-            onAction('multimodal');
+            onAction(endpoint === EModelEndpoint.google ? 'google_multimodal' : 'multimodal');
           },
           icon: ,
         });
diff --git a/client/src/utils/files.ts b/client/src/utils/files.ts
index 496c6a1c0a..9dd9632e88 100644
--- a/client/src/utils/files.ts
+++ b/client/src/utils/files.ts
@@ -1,4 +1,11 @@
-import { SheetPaths, TextPaths, FilePaths, CodePaths } from '@librechat/client';
+import {
+  SheetPaths,
+  TextPaths,
+  FilePaths,
+  CodePaths,
+  AudioPaths,
+  VideoPaths,
+} from '@librechat/client';
 import {
   megabyte,
   QueryKeys,
@@ -38,6 +45,18 @@ const artifact = {
   title: 'Code',
 };
 
+const audioFile = {
+  paths: AudioPaths,
+  fill: '#FF6B35',
+  title: 'Audio',
+};
+
+const videoFile = {
+  paths: VideoPaths,
+  fill: '#8B5CF6',
+  title: 'Video',
+};
+
 export const fileTypes = {
   /* Category matches */
   file: {
@@ -47,6 +66,8 @@ export const fileTypes = {
   },
   text: textDocument,
   txt: textDocument,
+  audio: audioFile,
+  video: videoFile,
   // application:,
 
   /* Partial matches */
diff --git a/packages/api/src/files/validation.ts b/packages/api/src/files/validation.ts
index 3d6b8ed192..6de2ea320e 100644
--- a/packages/api/src/files/validation.ts
+++ b/packages/api/src/files/validation.ts
@@ -5,6 +5,16 @@ export interface PDFValidationResult {
   error?: string;
 }
 
+export interface VideoValidationResult {
+  isValid: boolean;
+  error?: string;
+}
+
+export interface AudioValidationResult {
+  isValid: boolean;
+  error?: string;
+}
+
 export async function validatePdf(
   pdfBuffer: Buffer,
   fileSize: number,
@@ -18,6 +28,10 @@ export async function validatePdf(
     return validateOpenAIPdf(fileSize);
   }
 
+  if (endpoint === EModelEndpoint.google) {
+    return validateGooglePdf(fileSize);
+  }
+
   return { isValid: true };
 }
 
@@ -96,3 +110,76 @@ async function validateOpenAIPdf(fileSize: number): Promise
 
   return { isValid: true };
 }
+
+async function validateGooglePdf(fileSize: number): Promise {
+  if (fileSize > 20 * 1024 * 1024) {
+    return {
+      isValid: false,
+      error: "PDF file size exceeds Google's 20MB limit",
+    };
+  }
+
+  return { isValid: true };
+}
+
+/**
+ * Validates video files for different endpoints
+ * @param videoBuffer - The video file as a buffer
+ * @param fileSize - The file size in bytes
+ * @param endpoint - The endpoint to validate for
+ * @returns Promise that resolves to validation result
+ */
+export async function validateVideo(
+  videoBuffer: Buffer,
+  fileSize: number,
+  endpoint: EModelEndpoint,
+): Promise {
+  if (endpoint === EModelEndpoint.google) {
+    if (fileSize > 20 * 1024 * 1024) {
+      return {
+        isValid: false,
+        error: `Video file size (${(fileSize / (1024 * 1024)).toFixed(1)}MB) exceeds Google's 20MB limit`,
+      };
+    }
+  }
+
+  if (!videoBuffer || videoBuffer.length < 10) {
+    return {
+      isValid: false,
+      error: 'Invalid video file: too small or corrupted',
+    };
+  }
+
+  return { isValid: true };
+}
+
+/**
+ * Validates audio files for different endpoints
+ * @param audioBuffer - The audio file as a buffer
+ * @param fileSize - The file size in bytes
+ * @param endpoint - The endpoint to validate for
+ * @returns Promise that resolves to validation result
+ */
+export async function validateAudio(
+  audioBuffer: Buffer,
+  fileSize: number,
+  endpoint: EModelEndpoint,
+): Promise {
+  if (endpoint === EModelEndpoint.google) {
+    if (fileSize > 20 * 1024 * 1024) {
+      return {
+        isValid: false,
+        error: `Audio file size (${(fileSize / (1024 * 1024)).toFixed(1)}MB) exceeds Google's 20MB limit`,
+      };
+    }
+  }
+
+  if (!audioBuffer || audioBuffer.length < 10) {
+    return {
+      isValid: false,
+      error: 'Invalid audio file: too small or corrupted',
+    };
+  }
+
+  return { isValid: true };
+}
diff --git a/packages/client/src/svgs/AudioPaths.tsx b/packages/client/src/svgs/AudioPaths.tsx
new file mode 100644
index 0000000000..874f54328d
--- /dev/null
+++ b/packages/client/src/svgs/AudioPaths.tsx
@@ -0,0 +1,41 @@
+export default function AudioPaths() {
+  return (
+    <>
+
+
+
+
+
+
+  );
+}
diff --git a/packages/client/src/svgs/VideoPaths.tsx b/packages/client/src/svgs/VideoPaths.tsx
new file mode 100644
index
0000000000..6876824e42
--- /dev/null
+++ b/packages/client/src/svgs/VideoPaths.tsx
@@ -0,0 +1,10 @@
+export default function VideoPaths() {
+  return (
+    <>
+      {/* Video container - rounded rectangle (not filled) */}
+
+      {/* Play button - centered and pointing right */}
+
+
+  );
+}
diff --git a/packages/client/src/svgs/index.ts b/packages/client/src/svgs/index.ts
index 13a5a1cc0a..d3f8c6e45b 100644
--- a/packages/client/src/svgs/index.ts
+++ b/packages/client/src/svgs/index.ts
@@ -65,9 +65,11 @@ export { default as PersonalizationIcon } from './PersonalizationIcon';
 export { default as MCPIcon } from './MCPIcon';
 export { default as VectorIcon } from './VectorIcon';
 export { default as SquirclePlusIcon } from './SquirclePlusIcon';
+export { default as AudioPaths } from './AudioPaths';
 export { default as CodePaths } from './CodePaths';
 export { default as FileIcon } from './FileIcon';
 export { default as FilePaths } from './FilePaths';
 export { default as SheetPaths } from './SheetPaths';
 export { default as TextPaths } from './TextPaths';
+export { default as VideoPaths } from './VideoPaths';
 export { default as SharePointIcon } from './SharePointIcon';
diff --git a/packages/data-provider/src/file-config.ts b/packages/data-provider/src/file-config.ts
index a4749f399d..2e5ea9c30f 100644
--- a/packages/data-provider/src/file-config.ts
+++ b/packages/data-provider/src/file-config.ts
@@ -57,6 +57,27 @@ export const fullMimeTypesList = [
   'application/zip',
   'image/svg',
   'image/svg+xml',
+  // Video formats
+  'video/mp4',
+  'video/avi',
+  'video/mov',
+  'video/wmv',
+  'video/flv',
+  'video/webm',
+  'video/mkv',
+  'video/m4v',
+  'video/3gp',
+  'video/ogv',
+  // Audio formats
+  'audio/mp3',
+  'audio/wav',
+  'audio/ogg',
+  'audio/m4a',
+  'audio/aac',
+  'audio/flac',
+  'audio/wma',
+  'audio/opus',
+  'audio/mpeg',
   ...excelFileTypes,
 ];
 
@@ -123,7 +144,11 @@ export const applicationMimeTypes =
 
 export const imageMimeTypes = /^image\/(jpeg|gif|png|webp|heic|heif)$/;
 export const audioMimeTypes =
-  /^audio\/(mp3|mpeg|mpeg3|wav|wave|x-wav|ogg|vorbis|mp4|x-m4a|flac|x-flac|webm)$/;
+  /^audio\/(mp3|mpeg|mpeg3|wav|wave|x-wav|ogg|vorbis|mp4|m4a|x-m4a|flac|x-flac|webm|aac|wma|x-ms-wma|opus)$/;
+
+/** Extension-style aliases plus the media types browsers commonly report (e.g. `video/quicktime` for .mov) */
+export const videoMimeTypes =
+  /^video\/(mp4|mpeg|avi|x-msvideo|mov|quicktime|wmv|x-ms-wmv|flv|x-flv|webm|mkv|x-matroska|m4v|x-m4v|3gp|3gpp|ogv|ogg)$/;
 
 export const defaultOCRMimeTypes = [
   imageMimeTypes,
@@ -142,8 +167,9 @@ export const supportedMimeTypes = [
   excelMimeTypes,
   applicationMimeTypes,
   imageMimeTypes,
+  videoMimeTypes,
   audioMimeTypes,
-  /** Supported by LC Code Interpreter PAI */
+  /** Supported by LC Code Interpreter API */
   /^image\/(svg|svg\+xml)$/,
 ];
 
diff --git a/packages/data-provider/src/schemas.ts b/packages/data-provider/src/schemas.ts
index f6093ec24a..1dd034de10 100644
--- a/packages/data-provider/src/schemas.ts
+++ b/packages/data-provider/src/schemas.ts
@@ -38,6 +38,7 @@ export const documentSupportedEndpoints = new Set([
   EModelEndpoint.anthropic,
   EModelEndpoint.openAI,
   EModelEndpoint.azureOpenAI,
+  EModelEndpoint.google,
 ]);
 
 export const isDocumentSupportedEndpoint = (endpoint: EModelEndpoint): boolean => {