📁 feat: Send Attachments Directly to Provider (OpenAI) (#9098)

* refactor: change references from direct upload to direct attach to better reflect functionality

Since we now send attachments directly to the provider using a base64 encoding strategy rather than the Files/File API, the "upload" nomenclature no longer makes sense. "direct_attach" better describes the different methods of sending attachments to providers anyway, even if we later introduce direct upload support.

* feat: add upload to provider option for openai (and agent) ui

* chore: move anthropic pdf validator over to packages/api

* feat: simple pdf validation according to openai docs

* feat: add provider agnostic validatePdf logic to start handling multiple endpoints

* feat: add handling for openai specific documentPart formatting

* refactor: move require statement to proper place at top of file

* chore: add in openAI endpoint for the rest of the document handling logic

* feat: add direct attach support for azureOpenAI endpoint and agents

* feat: add pdf validation for azureOpenAI endpoint

* refactor: unify all the endpoint checks with isDocumentSupportedEndpoint

* refactor: consolidate Upload to Provider vs Upload image logic for clarity

* refactor: remove anthropic from anthropic_multimodal fileType since we support multiple providers now
This commit is contained in:
Dustin Healy 2025-08-17 02:14:25 -07:00 committed by Dustin Healy
parent 89843262b2
commit b5aadf1302
10 changed files with 122 additions and 64 deletions

View file

@ -33,7 +33,16 @@ const {
AgentCapabilities,
bedrockInputSchema,
removeNullishValues,
isDocumentSupportedEndpoint,
} = require('librechat-data-provider');
const {
findPluginAuthsByKeys,
getFormattedMemories,
deleteMemory,
setMemory,
} = require('~/models');
const { getMCPAuthMap, checkCapability, hasCustomUserVars } = require('~/server/services/Config');
const { encodeAndFormatDocuments } = require('~/server/services/Files/documents/encode');
const { addCacheControl, createContextHandlers } = require('~/app/clients/prompts');
const { initializeAgent } = require('~/server/services/Endpoints/agents/agent');
const { spendTokens, spendStructuredTokens } = require('~/models/spendTokens');
@ -223,12 +232,11 @@ class AgentClient extends BaseClient {
}
async addDocuments(message, attachments) {
const documentResult =
await require('~/server/services/Files/documents').encodeAndFormatDocuments(
this.options.req,
attachments,
this.options.agent.provider,
);
const documentResult = await encodeAndFormatDocuments(
this.options.req,
attachments,
this.options.agent.provider,
);
message.documents =
documentResult.documents && documentResult.documents.length
? documentResult.documents
@ -318,7 +326,7 @@ class AgentClient extends BaseClient {
message.documents &&
message.documents.length > 0 &&
message.isCreatedByUser &&
this.options.agent.provider === EModelEndpoint.anthropic
isDocumentSupportedEndpoint(this.options.agent.provider)
) {
const contentParts = [];
contentParts.push(...message.documents);

View file

@ -1,6 +1,6 @@
const { EModelEndpoint } = require('librechat-data-provider');
const { EModelEndpoint, isDocumentSupportedEndpoint } = require('librechat-data-provider');
const { getStrategyFunctions } = require('~/server/services/Files/strategies');
const { validateAnthropicPdf } = require('../validation/pdfValidator');
const { validatePdf } = require('@librechat/api');
/**
* Converts a readable stream to a buffer.
@ -71,7 +71,7 @@ async function encodeAndFormatDocuments(req, files, endpoint) {
/** @type {FileSources} */
const source = file.source ?? 'local';
if (file.type !== 'application/pdf' || endpoint !== EModelEndpoint.anthropic) {
if (file.type !== 'application/pdf' || !isDocumentSupportedEndpoint(endpoint)) {
continue;
}
@ -132,26 +132,35 @@ async function encodeAndFormatDocuments(req, files, endpoint) {
continue;
}
if (file.type === 'application/pdf' && endpoint === EModelEndpoint.anthropic) {
if (file.type === 'application/pdf' && isDocumentSupportedEndpoint(endpoint)) {
const pdfBuffer = Buffer.from(content, 'base64');
const validation = await validateAnthropicPdf(pdfBuffer, pdfBuffer.length);
const validation = await validatePdf(pdfBuffer, pdfBuffer.length, endpoint);
if (!validation.isValid) {
throw new Error(`PDF validation failed: ${validation.error}`);
}
const documentPart = {
type: 'document',
source: {
type: 'base64',
media_type: 'application/pdf',
data: content,
},
cache_control: { type: 'ephemeral' },
citations: { enabled: true },
};
if (endpoint === EModelEndpoint.anthropic) {
const documentPart = {
type: 'document',
source: {
type: 'base64',
media_type: 'application/pdf',
data: content,
},
cache_control: { type: 'ephemeral' },
citations: { enabled: true },
};
result.documents.push(documentPart);
} else if (endpoint === EModelEndpoint.openAI) {
const documentPart = {
type: 'input_file',
filename: file.filename,
file_data: `data:application/pdf;base64,${content}`,
};
result.documents.push(documentPart);
}
result.documents.push(documentPart);
result.files.push(metadata);
}
}

View file

@ -21,7 +21,7 @@ export type TAgentCapabilities = {
[AgentCapabilities.execute_code]: boolean;
[AgentCapabilities.end_after_tools]?: boolean;
[AgentCapabilities.hide_sequential_outputs]?: boolean;
[AgentCapabilities.direct_upload]?: boolean;
[AgentCapabilities.direct_attach]?: boolean;
};
export type AgentForm = {

View file

@ -8,7 +8,12 @@ import {
FileType2Icon,
FileImageIcon,
} from 'lucide-react';
import { EToolResources, EModelEndpoint, defaultAgentCapabilities } from 'librechat-data-provider';
import {
EToolResources,
EModelEndpoint,
defaultAgentCapabilities,
isDocumentSupportedEndpoint,
} from 'librechat-data-provider';
import {
FileUpload,
TooltipAnchor,
@ -72,7 +77,7 @@ const AttachFileMenu = ({
* */
const capabilities = useAgentCapabilities(agentsConfig?.capabilities ?? defaultAgentCapabilities);
const handleUploadClick = (fileType?: 'image' | 'document' | 'anthropic_multimodal') => {
const handleUploadClick = (fileType?: 'image' | 'document' | 'multimodal') => {
if (!inputRef.current) {
return;
}
@ -81,7 +86,7 @@ const AttachFileMenu = ({
inputRef.current.accept = 'image/*';
} else if (fileType === 'document') {
inputRef.current.accept = '.pdf,application/pdf';
} else if (fileType === 'anthropic_multimodal') {
} else if (fileType === 'multimodal') {
inputRef.current.accept = 'image/*,.pdf,application/pdf';
} else {
inputRef.current.accept = '';
@ -92,15 +97,22 @@ const AttachFileMenu = ({
const dropdownItems = useMemo(() => {
const createMenuItems = (
onAction: (fileType?: 'image' | 'document' | 'anthropic_multimodal') => void,
onAction: (fileType?: 'image' | 'document' | 'multimodal') => void,
) => {
const items: MenuItemProps[] = [];
// this is temporary until i add direct upload support for the other providers and can make a more robust solution
const isAnthropicAgent = agent?.provider === 'anthropic';
const shouldShowDirectUpload = endpoint === EModelEndpoint.anthropic || isAnthropicAgent;
const shouldShowDirectAttach = isDocumentSupportedEndpoint(agent?.provider ?? endpoint);
if (!shouldShowDirectUpload) {
if (shouldShowDirectAttach) {
items.push({
label: localize('com_ui_upload_provider'),
onClick: () => {
setToolResource(EToolResources.direct_attach);
onAction('multimodal');
},
icon: <FileImageIcon className="icon-md" />,
});
} else {
items.push({
label: localize('com_ui_upload_image_input'),
onClick: () => {
@ -111,17 +123,6 @@ const AttachFileMenu = ({
});
}
if (shouldShowDirectUpload) {
items.push({
label: localize('com_ui_upload_provider'),
onClick: () => {
setToolResource(EToolResources.direct_upload);
onAction('anthropic_multimodal');
},
icon: <FileImageIcon className="icon-md" />,
});
}
if (capabilities.ocrEnabled) {
items.push({
label: localize('com_ui_upload_ocr_text'),

View file

@ -9,7 +9,7 @@ interface AgentCapabilitiesResult {
fileSearchEnabled: boolean;
webSearchEnabled: boolean;
codeEnabled: boolean;
directUploadEnabled: boolean;
directAttachEnabled: boolean;
}
export default function useAgentCapabilities(
@ -50,8 +50,8 @@ export default function useAgentCapabilities(
[capabilities],
);
const directUploadEnabled = useMemo(
() => capabilities?.includes(AgentCapabilities.direct_upload) ?? false,
const directAttachEnabled = useMemo(
() => capabilities?.includes(AgentCapabilities.direct_attach) ?? false,
[capabilities],
);
@ -63,6 +63,6 @@ export default function useAgentCapabilities(
artifactsEnabled,
webSearchEnabled,
fileSearchEnabled,
directUploadEnabled,
directAttachEnabled,
};
}

View file

@ -2,3 +2,4 @@ export * from './mistral/crud';
export * from './audio';
export * from './text';
export * from './parse';
export * from './validation';

View file

@ -1,13 +1,36 @@
const { logger } = require('~/config');
const { anthropicPdfSizeLimit } = require('librechat-data-provider');
import { anthropicPdfSizeLimit, EModelEndpoint } from 'librechat-data-provider';
/**
 * Outcome of a PDF validation check, as returned by `validatePdf` and the
 * provider-specific validators in this file.
 */
export interface PDFValidationResult {
/** `true` when the PDF passed the checks that were applied. */
isValid: boolean;
/** Description of why validation failed; omitted from successful results. */
error?: string;
}
/**
 * Validates a PDF against the rules of the provider identified by `endpoint`.
 *
 * Anthropic is delegated to `validateAnthropicPdf`; OpenAI and Azure OpenAI are
 * delegated to `validateOpenAIPdf`, which only looks at the file size. Every
 * other endpoint has no dedicated validator here and is reported as valid.
 *
 * @param pdfBuffer - The PDF file as a buffer
 * @param fileSize - The file size in bytes
 * @param endpoint - The endpoint the PDF is destined for
 * @returns Promise that resolves to the validation result
 */
export async function validatePdf(
  pdfBuffer: Buffer,
  fileSize: number,
  endpoint: EModelEndpoint,
): Promise<PDFValidationResult> {
  switch (endpoint) {
    case EModelEndpoint.anthropic:
      return validateAnthropicPdf(pdfBuffer, fileSize);
    case EModelEndpoint.openAI:
    case EModelEndpoint.azureOpenAI:
      return validateOpenAIPdf(fileSize);
    default:
      // No provider-specific rules are defined for this endpoint
      return { isValid: true };
  }
}
/**
* Validates if a PDF meets Anthropic's requirements
* @param {Buffer} pdfBuffer - The PDF file as a buffer
* @param {number} fileSize - The file size in bytes
* @returns {Promise<{isValid: boolean, error?: string}>}
* @param pdfBuffer - The PDF file as a buffer
* @param fileSize - The file size in bytes
* @returns Promise that resolves to validation result
*/
async function validateAnthropicPdf(pdfBuffer, fileSize) {
async function validateAnthropicPdf(
pdfBuffer: Buffer,
fileSize: number,
): Promise<PDFValidationResult> {
try {
if (fileSize > anthropicPdfSizeLimit) {
return {
@ -53,13 +76,9 @@ async function validateAnthropicPdf(pdfBuffer, fileSize) {
};
}
logger.debug(
`PDF validation passed: ${Math.round(fileSize / 1024)}KB, ~${estimatedPages} pages`,
);
return { isValid: true };
} catch (error) {
logger.error('PDF validation error:', error);
console.error('PDF validation error:', error);
return {
isValid: false,
error: 'Failed to validate PDF file',
@ -67,6 +86,13 @@ async function validateAnthropicPdf(pdfBuffer, fileSize) {
}
}
module.exports = {
validateAnthropicPdf,
};
/**
 * Checks a PDF's size against the 10MB cap this module applies to OpenAI requests.
 *
 * @param fileSize - The file size in bytes
 * @returns Promise that resolves to a failing result when the size is strictly
 * greater than the cap, and to a passing result otherwise
 */
async function validateOpenAIPdf(fileSize: number): Promise<PDFValidationResult> {
  const maxBytes = 10 * 1024 * 1024;
  const exceedsLimit = fileSize > maxBytes;

  if (!exceedsLimit) {
    return { isValid: true };
  }

  return {
    isValid: false,
    error: "PDF file size exceeds OpenAI's 10MB limit",
  };
}

View file

@ -175,7 +175,7 @@ export enum Capabilities {
export enum AgentCapabilities {
hide_sequential_outputs = 'hide_sequential_outputs',
end_after_tools = 'end_after_tools',
direct_upload = 'direct_upload',
direct_attach = 'direct_attach',
execute_code = 'execute_code',
file_search = 'file_search',
web_search = 'web_search',
@ -249,6 +249,7 @@ export const assistantEndpointSchema = baseEndpointSchema.merge(
export type TAssistantEndpoint = z.infer<typeof assistantEndpointSchema>;
export const defaultAgentCapabilities = [
AgentCapabilities.direct_attach,
AgentCapabilities.execute_code,
AgentCapabilities.file_search,
AgentCapabilities.web_search,
@ -257,7 +258,6 @@ export const defaultAgentCapabilities = [
AgentCapabilities.tools,
AgentCapabilities.chain,
AgentCapabilities.ocr,
AgentCapabilities.direct_upload,
];
export const agentsEndpointSchema = baseEndpointSchema

View file

@ -31,6 +31,19 @@ export enum EModelEndpoint {
gptPlugins = 'gptPlugins',
}
/**
 * Endpoints to which PDF documents can be attached directly.
 *
 * Membership is tested through `isDocumentSupportedEndpoint`; the value checked
 * may be either a chat endpoint or an agent's provider.
 */
export const documentSupportedEndpoints = new Set<EModelEndpoint>([
EModelEndpoint.anthropic,
EModelEndpoint.openAI,
EModelEndpoint.azureOpenAI,
]);
/**
 * Reports whether an endpoint (or agent provider) is listed in
 * `documentSupportedEndpoints`.
 *
 * The parameter accepts plain strings and nullish values so that callers holding
 * an optional or loosely-typed provider (e.g. `agent?.provider ?? endpoint`) can
 * pass it through without a cast. Anything that is not a listed endpoint,
 * including `null` and `undefined`, yields `false`.
 *
 * @param endpoint - Endpoint or provider identifier to look up
 * @returns `true` when the identifier is one of the document-supported endpoints
 */
export const isDocumentSupportedEndpoint = (
  endpoint: EModelEndpoint | string | null | undefined,
): boolean => {
  if (endpoint == null) {
    return false;
  }
  // Membership is a plain value lookup, so an arbitrary string simply misses
  return documentSupportedEndpoints.has(endpoint as EModelEndpoint);
};
export const paramEndpoints = new Set<EModelEndpoint | string>([
EModelEndpoint.agents,
EModelEndpoint.openAI,

View file

@ -27,7 +27,7 @@ export enum Tools {
export enum EToolResources {
code_interpreter = 'code_interpreter',
direct_upload = 'direct_upload',
direct_attach = 'direct_attach',
execute_code = 'execute_code',
file_search = 'file_search',
image_edit = 'image_edit',