🗂️ feat: Send Attachments Directly to Provider (Google) (#9100)

* feat: add validation for google PDFs and add google endpoint as a document supporting endpoint

* feat: add proper pdf formatting for google endpoints (requires PR #14 in agents)

* feat: add multimodal support for google endpoint attachments

* feat: add audio file svg

* fix: refactor attachments logic so multi-attachment messages work properly

* feat: add video file svg

* fix: allow follow-up questions about uploaded multimodal attachments

* fix: remove incorrect final message filtering that was breaking Attachment component rendering
This commit is contained in:
Dustin Healy 2025-08-18 05:39:50 -07:00 committed by Dustin Healy
parent b5aadf1302
commit aae47e7b3f
13 changed files with 581 additions and 15 deletions

View file

@ -42,8 +42,11 @@ const {
setMemory,
} = require('~/models');
const { getMCPAuthMap, checkCapability, hasCustomUserVars } = require('~/server/services/Config');
const { encodeAndFormatDocuments } = require('~/server/services/Files/documents/encode');
const { encodeAndFormatDocuments } = require('~/server/services/Files/Documents/encode');
const { addCacheControl, createContextHandlers } = require('~/app/clients/prompts');
const { encodeAndFormatVideos } = require('~/server/services/Files/Video/encode');
const { encodeAndFormatAudios } = require('~/server/services/Files/Audio/encode');
const { getFiles } = require('~/models');
const { initializeAgent } = require('~/server/services/Endpoints/agents/agent');
const { spendTokens, spendStructuredTokens } = require('~/models/spendTokens');
const { getFormattedMemories, deleteMemory, setMemory } = require('~/models');
@ -244,13 +247,137 @@ class AgentClient extends BaseClient {
return documentResult.files;
}
async addVideos(message, attachments) {
const videoResult = await encodeAndFormatVideos(
this.options.req,
attachments,
this.options.agent.provider,
);
message.videos =
videoResult.videos && videoResult.videos.length ? videoResult.videos : undefined;
return videoResult.files;
}
async addAudios(message, attachments) {
const audioResult = await encodeAndFormatAudios(
this.options.req,
attachments,
this.options.agent.provider,
);
message.audios =
audioResult.audios && audioResult.audios.length ? audioResult.audios : undefined;
return audioResult.files;
}
/**
* Override addPreviousAttachments to handle all file types, not just images
* @param {TMessage[]} _messages
* @returns {Promise<TMessage[]>}
*/
async addPreviousAttachments(_messages) {
if (!this.options.resendFiles) {
return _messages;
}
const seen = new Set();
const attachmentsProcessed =
this.options.attachments && !(this.options.attachments instanceof Promise);
if (attachmentsProcessed) {
for (const attachment of this.options.attachments) {
seen.add(attachment.file_id);
}
}
/**
*
* @param {TMessage} message
*/
const processMessage = async (message) => {
if (!this.message_file_map) {
/** @type {Record<string, MongoFile[]> */
this.message_file_map = {};
}
const fileIds = [];
for (const file of message.files) {
if (seen.has(file.file_id)) {
continue;
}
fileIds.push(file.file_id);
seen.add(file.file_id);
}
if (fileIds.length === 0) {
return message;
}
const files = await getFiles(
{
file_id: { $in: fileIds },
},
{},
{},
);
await this.processAttachments(message, files);
this.message_file_map[message.messageId] = files;
return message;
};
const promises = [];
for (const message of _messages) {
if (!message.files) {
promises.push(message);
continue;
}
promises.push(processMessage(message));
}
const messages = await Promise.all(promises);
this.checkVisionRequest(Object.values(this.message_file_map ?? {}).flat());
return messages;
}
async processAttachments(message, attachments) {
const [imageFiles, documentFiles] = await Promise.all([
this.addImageURLs(message, attachments),
this.addDocuments(message, attachments),
const categorizedAttachments = {
images: [],
documents: [],
videos: [],
audios: [],
};
for (const file of attachments) {
if (file.type.startsWith('image/')) {
categorizedAttachments.images.push(file);
} else if (file.type === 'application/pdf') {
categorizedAttachments.documents.push(file);
} else if (file.type.startsWith('video/')) {
categorizedAttachments.videos.push(file);
} else if (file.type.startsWith('audio/')) {
categorizedAttachments.audios.push(file);
}
}
const [imageFiles, documentFiles, videoFiles, audioFiles] = await Promise.all([
categorizedAttachments.images.length > 0
? this.addImageURLs(message, categorizedAttachments.images)
: Promise.resolve([]),
categorizedAttachments.documents.length > 0
? this.addDocuments(message, categorizedAttachments.documents)
: Promise.resolve([]),
categorizedAttachments.videos.length > 0
? this.addVideos(message, categorizedAttachments.videos)
: Promise.resolve([]),
categorizedAttachments.audios.length > 0
? this.addAudios(message, categorizedAttachments.audios)
: Promise.resolve([]),
]);
const allFiles = [...imageFiles, ...documentFiles];
const allFiles = [...imageFiles, ...documentFiles, ...videoFiles, ...audioFiles];
const seenFileIds = new Set();
const uniqueFiles = [];
@ -322,14 +449,31 @@ class AgentClient extends BaseClient {
assistantName: this.options?.modelLabel,
});
const hasFiles =
(message.documents && message.documents.length > 0) ||
(message.videos && message.videos.length > 0) ||
(message.audios && message.audios.length > 0) ||
(message.image_urls && message.image_urls.length > 0);
if (
message.documents &&
message.documents.length > 0 &&
hasFiles &&
message.isCreatedByUser &&
isDocumentSupportedEndpoint(this.options.agent.provider)
) {
const contentParts = [];
contentParts.push(...message.documents);
if (message.documents && message.documents.length > 0) {
contentParts.push(...message.documents);
}
if (message.videos && message.videos.length > 0) {
contentParts.push(...message.videos);
}
if (message.audios && message.audios.length > 0) {
contentParts.push(...message.audios);
}
if (message.image_urls && message.image_urls.length > 0) {
contentParts.push(...message.image_urls);
}
@ -338,8 +482,11 @@ class AgentClient extends BaseClient {
contentParts.push({ type: 'text', text: formattedMessage.content });
} else {
const textPart = formattedMessage.content.find((part) => part.type === 'text');
contentParts.push(textPart);
if (textPart) {
contentParts.push(textPart);
}
}
formattedMessage.content = contentParts;
}

View file

@ -0,0 +1,111 @@
const { EModelEndpoint, isDocumentSupportedEndpoint } = require('librechat-data-provider');
const { getStrategyFunctions } = require('~/server/services/Files/strategies');
const { validateAudio } = require('@librechat/api');
const { streamToBuffer } = require('~/server/services/Files/Documents/encode');
/**
 * Downloads, base64-encodes, and formats audio files for a provider endpoint.
 *
 * Entries that are missing or have no `filepath` are ignored. A file whose
 * download fails (or yields no content) is still listed in `files` via its
 * metadata but contributes no audio part. Files that are not `audio/*`, or
 * whose endpoint is not document-supported, are left out of the result.
 *
 * @param {Express.Request} req - The request object
 * @param {Array<MongoFile>} files - Array of audio files
 * @param {EModelEndpoint} endpoint - The endpoint to format for
 * @returns {Promise<{ audios: Array, files: Array<MongoFile> }>}
 * @throws {Error} When `validateAudio` reports a file as invalid for the endpoint.
 */
async function encodeAndFormatAudios(req, files, endpoint) {
  /** Strategy functions cached per file source. */
  const encodingMethods = {};
  /** @type {{ audios: any[]; files: MongoFile[] }} */
  const result = {
    audios: [],
    files: [],
  };

  /** `[file, metadata]` pairs for every file that can be downloaded. */
  const pending = [];
  for (const file of files) {
    if (!file || !file.filepath) {
      continue;
    }

    const source = file.source ?? 'local';
    if (!encodingMethods[source]) {
      encodingMethods[source] = getStrategyFunctions(source);
    }

    pending.push([
      file,
      {
        file_id: file.file_id || file._id,
        temp_file_id: file.temp_file_id,
        filepath: file.filepath,
        source: file.source,
        filename: file.filename,
        type: file.type,
      },
    ]);
  }

  const settled = await Promise.allSettled(
    pending.map(async ([file, metadata]) => {
      try {
        const { getDownloadStream } = encodingMethods[file.source ?? 'local'];
        const stream = await getDownloadStream(req, file.filepath);
        const buffer = await streamToBuffer(stream);
        // Keep the raw buffer next to its base64 form so validation does not
        // have to decode the string back into a second full-size copy.
        return { file, buffer, content: buffer.toString('base64'), metadata };
      } catch (error) {
        console.error(`Error processing audio ${file.filename}:`, error);
        return { file, buffer: null, content: null, metadata };
      }
    }),
  );

  for (const outcome of settled) {
    if (outcome.status === 'rejected') {
      console.error('Audio processing failed:', outcome.reason);
      continue;
    }

    const { file, buffer, content, metadata } = outcome.value;
    if (!content) {
      // Download failed or file was empty: report the file, skip the part.
      result.files.push(metadata);
      continue;
    }

    if (!(file.type.startsWith('audio/') && isDocumentSupportedEndpoint(endpoint))) {
      continue;
    }

    const validation = await validateAudio(buffer, buffer.length, endpoint);
    if (!validation.isValid) {
      throw new Error(`Audio validation failed: ${validation.error}`);
    }

    if (endpoint === EModelEndpoint.google) {
      result.audios.push({
        type: 'audio',
        mimeType: file.type,
        data: content,
      });
    }
    result.files.push(metadata);
  }

  return result;
}
module.exports = {
encodeAndFormatAudios,
};

View file

@ -0,0 +1,111 @@
const { EModelEndpoint, isDocumentSupportedEndpoint } = require('librechat-data-provider');
const { getStrategyFunctions } = require('~/server/services/Files/strategies');
const { validateVideo } = require('@librechat/api');
const { streamToBuffer } = require('~/server/services/Files/Documents/encode');
/**
 * Downloads, base64-encodes, and formats video files for a provider endpoint.
 *
 * Entries that are missing or have no `filepath` are ignored. A file whose
 * download fails (or yields no content) is still listed in `files` via its
 * metadata but contributes no video part. Files that are not `video/*`, or
 * whose endpoint is not document-supported, are left out of the result.
 *
 * @param {Express.Request} req - The request object
 * @param {Array<MongoFile>} files - Array of video files
 * @param {EModelEndpoint} endpoint - The endpoint to format for
 * @returns {Promise<{ videos: Array, files: Array<MongoFile> }>}
 * @throws {Error} When `validateVideo` reports a file as invalid for the endpoint.
 */
async function encodeAndFormatVideos(req, files, endpoint) {
  /** Strategy functions cached per file source. */
  const encodingMethods = {};
  /** @type {{ videos: any[]; files: MongoFile[] }} */
  const result = {
    videos: [],
    files: [],
  };

  /** `[file, metadata]` pairs for every file that can be downloaded. */
  const pending = [];
  for (const file of files) {
    if (!file || !file.filepath) {
      continue;
    }

    const source = file.source ?? 'local';
    if (!encodingMethods[source]) {
      encodingMethods[source] = getStrategyFunctions(source);
    }

    pending.push([
      file,
      {
        file_id: file.file_id || file._id,
        temp_file_id: file.temp_file_id,
        filepath: file.filepath,
        source: file.source,
        filename: file.filename,
        type: file.type,
      },
    ]);
  }

  const settled = await Promise.allSettled(
    pending.map(async ([file, metadata]) => {
      try {
        const { getDownloadStream } = encodingMethods[file.source ?? 'local'];
        const stream = await getDownloadStream(req, file.filepath);
        const buffer = await streamToBuffer(stream);
        // Keep the raw buffer next to its base64 form so validation does not
        // have to decode the string back into a second full-size copy.
        return { file, buffer, content: buffer.toString('base64'), metadata };
      } catch (error) {
        console.error(`Error processing video ${file.filename}:`, error);
        return { file, buffer: null, content: null, metadata };
      }
    }),
  );

  for (const outcome of settled) {
    if (outcome.status === 'rejected') {
      console.error('Video processing failed:', outcome.reason);
      continue;
    }

    const { file, buffer, content, metadata } = outcome.value;
    if (!content) {
      // Download failed or file was empty: report the file, skip the part.
      result.files.push(metadata);
      continue;
    }

    if (!(file.type.startsWith('video/') && isDocumentSupportedEndpoint(endpoint))) {
      continue;
    }

    const validation = await validateVideo(buffer, buffer.length, endpoint);
    if (!validation.isValid) {
      throw new Error(`Video validation failed: ${validation.error}`);
    }

    if (endpoint === EModelEndpoint.google) {
      result.videos.push({
        type: 'video',
        mimeType: file.type,
        data: content,
      });
    }
    result.files.push(metadata);
  }

  return result;
}
module.exports = {
encodeAndFormatVideos,
};

View file

@ -159,6 +159,13 @@ async function encodeAndFormatDocuments(req, files, endpoint) {
file_data: `data:application/pdf;base64,${content}`,
};
result.documents.push(documentPart);
} else if (endpoint === EModelEndpoint.google) {
const documentPart = {
type: 'document',
mimeType: 'application/pdf',
data: content,
};
result.documents.push(documentPart);
}
result.files.push(metadata);
@ -170,4 +177,5 @@ async function encodeAndFormatDocuments(req, files, endpoint) {
module.exports = {
encodeAndFormatDocuments,
streamToBuffer,
};

View file

@ -1,5 +1,6 @@
const { encodeAndFormatDocuments } = require('./encode');
const { encodeAndFormatDocuments, streamToBuffer } = require('./encode');
module.exports = {
encodeAndFormatDocuments,
streamToBuffer,
};

View file

@ -88,6 +88,8 @@ const AttachFileMenu = ({
inputRef.current.accept = '.pdf,application/pdf';
} else if (fileType === 'multimodal') {
inputRef.current.accept = 'image/*,.pdf,application/pdf';
} else if (fileType === 'google_multimodal') {
inputRef.current.accept = 'image/*,.pdf,application/pdf,video/*,audio/*';
} else {
inputRef.current.accept = '';
}
@ -97,7 +99,7 @@ const AttachFileMenu = ({
const dropdownItems = useMemo(() => {
const createMenuItems = (
onAction: (fileType?: 'image' | 'document' | 'multimodal') => void,
onAction: (fileType?: 'image' | 'document' | 'multimodal' | 'google_multimodal') => void,
) => {
const items: MenuItemProps[] = [];
@ -108,7 +110,7 @@ const AttachFileMenu = ({
label: localize('com_ui_upload_provider'),
onClick: () => {
setToolResource(EToolResources.direct_attach);
onAction('multimodal');
onAction(endpoint === EModelEndpoint.google ? 'google_multimodal' : 'multimodal');
},
icon: <FileImageIcon className="icon-md" />,
});

View file

@ -1,4 +1,11 @@
import { SheetPaths, TextPaths, FilePaths, CodePaths } from '@librechat/client';
import {
SheetPaths,
TextPaths,
FilePaths,
CodePaths,
AudioPaths,
VideoPaths,
} from '@librechat/client';
import {
megabyte,
QueryKeys,
@ -38,6 +45,18 @@ const artifact = {
title: 'Code',
};
/**
 * Descriptor for audio files: the `AudioPaths` component, a fill color, and a
 * title. Registered under the `audio` key of `fileTypes`.
 */
const audioFile = {
paths: AudioPaths,
fill: '#FF6B35',
title: 'Audio',
};
/**
 * Descriptor for video files: the `VideoPaths` component, a fill color, and a
 * title. Registered under the `video` key of `fileTypes`.
 */
const videoFile = {
paths: VideoPaths,
fill: '#8B5CF6',
title: 'Video',
};
export const fileTypes = {
/* Category matches */
file: {
@ -47,6 +66,8 @@ export const fileTypes = {
},
text: textDocument,
txt: textDocument,
audio: audioFile,
video: videoFile,
// application:,
/* Partial matches */

View file

@ -5,6 +5,16 @@ export interface PDFValidationResult {
error?: string;
}
/** Result returned by `validateVideo`. */
export interface VideoValidationResult {
/** `true` when the video passed every check. */
isValid: boolean;
/** Human-readable reason; `validateVideo` sets it whenever it returns `isValid: false`. */
error?: string;
}
/** Result returned by `validateAudio`. */
export interface AudioValidationResult {
/** `true` when the audio passed every check. */
isValid: boolean;
/** Human-readable reason; `validateAudio` sets it whenever it returns `isValid: false`. */
error?: string;
}
export async function validatePdf(
pdfBuffer: Buffer,
fileSize: number,
@ -18,6 +28,10 @@ export async function validatePdf(
return validateOpenAIPdf(fileSize);
}
if (endpoint === EModelEndpoint.google) {
return validateGooglePdf(fileSize);
}
return { isValid: true };
}
@ -96,3 +110,76 @@ async function validateOpenAIPdf(fileSize: number): Promise<PDFValidationResult>
return { isValid: true };
}
/**
 * Checks a PDF against the 20MB size cap applied for the Google endpoint.
 * Only the size is inspected; the PDF contents are not examined here.
 *
 * @param fileSize - The file size in bytes
 * @returns Promise that resolves to the validation result
 */
async function validateGooglePdf(fileSize: number): Promise<PDFValidationResult> {
  const maxBytes = 20 * 1024 * 1024;
  const exceedsLimit = fileSize > maxBytes;

  return exceedsLimit
    ? { isValid: false, error: "PDF file size exceeds Google's 20MB limit" }
    : { isValid: true };
}
/**
 * Validates video files for different endpoints.
 *
 * For the Google endpoint the file must not exceed 20MB. For every endpoint the
 * buffer must be present and at least 10 bytes long. The size check runs first,
 * so an oversized file is reported as oversized even if its buffer is bad.
 *
 * @param videoBuffer - The video file as a buffer
 * @param fileSize - The file size in bytes
 * @param endpoint - The endpoint to validate for
 * @returns Promise that resolves to validation result
 */
export async function validateVideo(
  videoBuffer: Buffer,
  fileSize: number,
  endpoint: EModelEndpoint,
): Promise<VideoValidationResult> {
  const bytesPerMb = 1024 * 1024;
  const googleLimitMb = 20;
  const minBytes = 10;

  if (endpoint === EModelEndpoint.google && fileSize > googleLimitMb * bytesPerMb) {
    // Round UP to one decimal so the reported size is always above the limit;
    // plain rounding could produce "20MB exceeds Google's 20MB limit".
    const sizeMb = Math.ceil((fileSize / bytesPerMb) * 10) / 10;
    return {
      isValid: false,
      error: `Video file size (${sizeMb}MB) exceeds Google's ${googleLimitMb}MB limit`,
    };
  }

  if (!videoBuffer || videoBuffer.length < minBytes) {
    return {
      isValid: false,
      error: 'Invalid video file: too small or corrupted',
    };
  }

  return { isValid: true };
}
/**
 * Validates audio files for different endpoints.
 *
 * For the Google endpoint the file must not exceed 20MB. For every endpoint the
 * buffer must be present and at least 10 bytes long. The size check runs first,
 * so an oversized file is reported as oversized even if its buffer is bad.
 *
 * @param audioBuffer - The audio file as a buffer
 * @param fileSize - The file size in bytes
 * @param endpoint - The endpoint to validate for
 * @returns Promise that resolves to validation result
 */
export async function validateAudio(
  audioBuffer: Buffer,
  fileSize: number,
  endpoint: EModelEndpoint,
): Promise<AudioValidationResult> {
  const bytesPerMb = 1024 * 1024;
  const googleLimitMb = 20;
  const minBytes = 10;

  if (endpoint === EModelEndpoint.google && fileSize > googleLimitMb * bytesPerMb) {
    // Round UP to one decimal so the reported size is always above the limit;
    // plain rounding could produce "20MB exceeds Google's 20MB limit".
    const sizeMb = Math.ceil((fileSize / bytesPerMb) * 10) / 10;
    return {
      isValid: false,
      error: `Audio file size (${sizeMb}MB) exceeds Google's ${googleLimitMb}MB limit`,
    };
  }

  if (!audioBuffer || audioBuffer.length < minBytes) {
    return {
      isValid: false,
      error: 'Invalid audio file: too small or corrupted',
    };
  }

  return { isValid: true };
}

View file

@ -0,0 +1,41 @@
/**
 * SVG child elements for the audio file icon: five vertical white strokes of
 * differing heights with rounded caps.
 */
export default function AudioPaths() {
  // Path data for each vertical bar, left to right.
  const bars = ['M8 15v6', 'M13 8v20', 'M18 10v16', 'M23 6v24', 'M28 12v12'];

  return (
    <>
      {bars.map((d) => (
        <path
          key={d}
          d={d}
          stroke="white"
          strokeWidth="2"
          strokeLinecap="round"
          strokeLinejoin="round"
        />
      ))}
    </>
  );
}

View file

@ -0,0 +1,10 @@
/**
 * SVG child elements for the video file icon: an outlined rounded rectangle
 * with a solid play triangle inside it.
 */
export default function VideoPaths() {
  // Outlined (unfilled) rounded rectangle standing in for the video frame.
  const frame = (
    <rect x="8" y="10" width="20" height="16" rx="3" stroke="white" strokeWidth="2" fill="none" />
  );
  // Solid play triangle pointing right.
  const playButton = <path d="M22 18l-6 4v-8L22 18z" fill="white" />;

  return (
    <>
      {frame}
      {playButton}
    </>
  );
}

View file

@ -65,9 +65,11 @@ export { default as PersonalizationIcon } from './PersonalizationIcon';
export { default as MCPIcon } from './MCPIcon';
export { default as VectorIcon } from './VectorIcon';
export { default as SquirclePlusIcon } from './SquirclePlusIcon';
export { default as AudioPaths } from './AudioPaths';
export { default as CodePaths } from './CodePaths';
export { default as FileIcon } from './FileIcon';
export { default as FilePaths } from './FilePaths';
export { default as SheetPaths } from './SheetPaths';
export { default as TextPaths } from './TextPaths';
export { default as VideoPaths } from './VideoPaths';
export { default as SharePointIcon } from './SharePointIcon';

View file

@ -57,6 +57,27 @@ export const fullMimeTypesList = [
'application/zip',
'image/svg',
'image/svg+xml',
// Video formats
'video/mp4',
'video/avi',
'video/mov',
'video/wmv',
'video/flv',
'video/webm',
'video/mkv',
'video/m4v',
'video/3gp',
'video/ogv',
// Audio formats
'audio/mp3',
'audio/wav',
'audio/ogg',
'audio/m4a',
'audio/aac',
'audio/flac',
'audio/wma',
'audio/opus',
'audio/mpeg',
...excelFileTypes,
];
@ -123,7 +144,9 @@ export const applicationMimeTypes =
export const imageMimeTypes = /^image\/(jpeg|gif|png|webp|heic|heif)$/;
export const audioMimeTypes =
/^audio\/(mp3|mpeg|mpeg3|wav|wave|x-wav|ogg|vorbis|mp4|x-m4a|flac|x-flac|webm)$/;
/^audio\/(mp3|mpeg|mpeg3|wav|wave|x-wav|ogg|vorbis|mp4|x-m4a|flac|x-flac|webm|aac|wma|opus)$/;
/**
 * Accepted video MIME types.
 *
 * Besides the short extension-like subtypes (`mov`, `avi`, `mkv`, ...), this
 * also accepts the registered / de-facto subtypes that browsers and OS file
 * pickers commonly report for the same containers (e.g. `video/quicktime` for
 * `.mov`, `video/x-msvideo` for `.avi`, `video/x-matroska` for `.mkv`).
 */
export const videoMimeTypes =
  /^video\/(mp4|mpeg|avi|x-msvideo|mov|quicktime|wmv|x-ms-wmv|flv|x-flv|webm|mkv|x-matroska|m4v|x-m4v|3gp|3gpp|ogv|ogg)$/;
export const defaultOCRMimeTypes = [
imageMimeTypes,
@ -142,8 +165,9 @@ export const supportedMimeTypes = [
excelMimeTypes,
applicationMimeTypes,
imageMimeTypes,
videoMimeTypes,
audioMimeTypes,
/** Supported by LC Code Interpreter PAI */
/** Supported by LC Code Interpreter API */
/^image\/(svg|svg\+xml)$/,
];

View file

@ -38,6 +38,7 @@ export const documentSupportedEndpoints = new Set<EModelEndpoint>([
EModelEndpoint.anthropic,
EModelEndpoint.openAI,
EModelEndpoint.azureOpenAI,
EModelEndpoint.google,
]);
export const isDocumentSupportedEndpoint = (endpoint: EModelEndpoint): boolean => {