From b5aadf1302fc6807a24066a3d189780050368a0a Mon Sep 17 00:00:00 2001 From: Dustin Healy <54083382+dustinhealy@users.noreply.github.com> Date: Sun, 17 Aug 2025 02:14:25 -0700 Subject: [PATCH] =?UTF-8?q?=F0=9F=93=81=20feat:=20Send=20Attachments=20Dir?= =?UTF-8?q?ectly=20to=20Provider=20(OpenAI)=20(#9098)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit * refactor: change references from direct upload to direct attach to better reflect functionality. Since we now use a base64 encoding strategy rather than the Files/File API to send attachments directly to the provider, the upload nomenclature no longer makes sense. direct_attach better describes the different methods of sending attachments to providers anyway, even if we later introduce direct upload support * feat: add upload to provider option for openai (and agent) ui * chore: move anthropic pdf validator over to packages/api * feat: simple pdf validation according to openai docs * feat: add provider agnostic validatePdf logic to start handling multiple endpoints * feat: add handling for openai specific documentPart formatting * refactor: move require statement to proper place at top of file * chore: add in openAI endpoint for the rest of the document handling logic * feat: add direct attach support for azureOpenAI endpoint and agents * feat: add pdf validation for azureOpenAI endpoint * refactor: unify all the endpoint checks with isDocumentSupportedEndpoint * refactor: consolidate Upload to Provider vs Upload image logic for clarity * refactor: remove anthropic from anthropic_multimodal fileType since we support multiple providers now --- api/server/controllers/agents/client.js | 22 +++++--- api/server/services/Files/documents/encode.js | 41 ++++++++------ client/src/common/agents-types.ts | 2 +- .../Chat/Input/Files/AttachFileMenu.tsx | 39 +++++++------- .../src/hooks/Agents/useAgentCapabilities.ts | 8 +-- packages/api/src/files/index.ts | 1 + 
.../api/src/files/validation.ts | 54 ++++++++++++++----- packages/data-provider/src/config.ts | 4 +- packages/data-provider/src/schemas.ts | 13 +++++ .../data-provider/src/types/assistants.ts | 2 +- 10 files changed, 122 insertions(+), 64 deletions(-) rename api/server/services/Files/validation/pdfValidator.js => packages/api/src/files/validation.ts (56%) diff --git a/api/server/controllers/agents/client.js b/api/server/controllers/agents/client.js index 4c63ddaa9f..ba385dff3a 100644 --- a/api/server/controllers/agents/client.js +++ b/api/server/controllers/agents/client.js @@ -33,7 +33,16 @@ const { AgentCapabilities, bedrockInputSchema, removeNullishValues, + isDocumentSupportedEndpoint, } = require('librechat-data-provider'); +const { + findPluginAuthsByKeys, + getFormattedMemories, + deleteMemory, + setMemory, +} = require('~/models'); +const { getMCPAuthMap, checkCapability, hasCustomUserVars } = require('~/server/services/Config'); +const { encodeAndFormatDocuments } = require('~/server/services/Files/documents/encode'); const { addCacheControl, createContextHandlers } = require('~/app/clients/prompts'); const { initializeAgent } = require('~/server/services/Endpoints/agents/agent'); const { spendTokens, spendStructuredTokens } = require('~/models/spendTokens'); @@ -223,12 +232,11 @@ class AgentClient extends BaseClient { } async addDocuments(message, attachments) { - const documentResult = - await require('~/server/services/Files/documents').encodeAndFormatDocuments( - this.options.req, - attachments, - this.options.agent.provider, - ); + const documentResult = await encodeAndFormatDocuments( + this.options.req, + attachments, + this.options.agent.provider, + ); message.documents = documentResult.documents && documentResult.documents.length ? 
documentResult.documents @@ -318,7 +326,7 @@ class AgentClient extends BaseClient { message.documents && message.documents.length > 0 && message.isCreatedByUser && - this.options.agent.provider === EModelEndpoint.anthropic + isDocumentSupportedEndpoint(this.options.agent.provider) ) { const contentParts = []; contentParts.push(...message.documents); diff --git a/api/server/services/Files/documents/encode.js b/api/server/services/Files/documents/encode.js index d5f66ec7a7..4042238ea1 100644 --- a/api/server/services/Files/documents/encode.js +++ b/api/server/services/Files/documents/encode.js @@ -1,6 +1,6 @@ -const { EModelEndpoint } = require('librechat-data-provider'); +const { EModelEndpoint, isDocumentSupportedEndpoint } = require('librechat-data-provider'); const { getStrategyFunctions } = require('~/server/services/Files/strategies'); -const { validateAnthropicPdf } = require('../validation/pdfValidator'); +const { validatePdf } = require('@librechat/api'); /** * Converts a readable stream to a buffer. @@ -71,7 +71,7 @@ async function encodeAndFormatDocuments(req, files, endpoint) { /** @type {FileSources} */ const source = file.source ?? 
'local'; - if (file.type !== 'application/pdf' || endpoint !== EModelEndpoint.anthropic) { + if (file.type !== 'application/pdf' || !isDocumentSupportedEndpoint(endpoint)) { continue; } @@ -132,26 +132,35 @@ async function encodeAndFormatDocuments(req, files, endpoint) { continue; } - if (file.type === 'application/pdf' && endpoint === EModelEndpoint.anthropic) { + if (file.type === 'application/pdf' && isDocumentSupportedEndpoint(endpoint)) { const pdfBuffer = Buffer.from(content, 'base64'); - const validation = await validateAnthropicPdf(pdfBuffer, pdfBuffer.length); + const validation = await validatePdf(pdfBuffer, pdfBuffer.length, endpoint); if (!validation.isValid) { throw new Error(`PDF validation failed: ${validation.error}`); } - const documentPart = { - type: 'document', - source: { - type: 'base64', - media_type: 'application/pdf', - data: content, - }, - cache_control: { type: 'ephemeral' }, - citations: { enabled: true }, - }; + if (endpoint === EModelEndpoint.anthropic) { + const documentPart = { + type: 'document', + source: { + type: 'base64', + media_type: 'application/pdf', + data: content, + }, + cache_control: { type: 'ephemeral' }, + citations: { enabled: true }, + }; + result.documents.push(documentPart); + } else if (endpoint === EModelEndpoint.openAI) { + const documentPart = { + type: 'input_file', + filename: file.filename, + file_data: `data:application/pdf;base64,${content}`, + }; + result.documents.push(documentPart); + } - result.documents.push(documentPart); result.files.push(metadata); } } diff --git a/client/src/common/agents-types.ts b/client/src/common/agents-types.ts index 2d18bfe572..f150b41eef 100644 --- a/client/src/common/agents-types.ts +++ b/client/src/common/agents-types.ts @@ -21,7 +21,7 @@ export type TAgentCapabilities = { [AgentCapabilities.execute_code]: boolean; [AgentCapabilities.end_after_tools]?: boolean; [AgentCapabilities.hide_sequential_outputs]?: boolean; - [AgentCapabilities.direct_upload]?: boolean; + 
[AgentCapabilities.direct_attach]?: boolean; }; export type AgentForm = { diff --git a/client/src/components/Chat/Input/Files/AttachFileMenu.tsx b/client/src/components/Chat/Input/Files/AttachFileMenu.tsx index 6a31535755..bfd97c5021 100644 --- a/client/src/components/Chat/Input/Files/AttachFileMenu.tsx +++ b/client/src/components/Chat/Input/Files/AttachFileMenu.tsx @@ -8,7 +8,12 @@ import { FileType2Icon, FileImageIcon, } from 'lucide-react'; -import { EToolResources, EModelEndpoint, defaultAgentCapabilities } from 'librechat-data-provider'; +import { + EToolResources, + EModelEndpoint, + defaultAgentCapabilities, + isDocumentSupportedEndpoint, +} from 'librechat-data-provider'; import { FileUpload, TooltipAnchor, @@ -72,7 +77,7 @@ const AttachFileMenu = ({ * */ const capabilities = useAgentCapabilities(agentsConfig?.capabilities ?? defaultAgentCapabilities); - const handleUploadClick = (fileType?: 'image' | 'document' | 'anthropic_multimodal') => { + const handleUploadClick = (fileType?: 'image' | 'document' | 'multimodal') => { if (!inputRef.current) { return; } @@ -81,7 +86,7 @@ const AttachFileMenu = ({ inputRef.current.accept = 'image/*'; } else if (fileType === 'document') { inputRef.current.accept = '.pdf,application/pdf'; - } else if (fileType === 'anthropic_multimodal') { + } else if (fileType === 'multimodal') { inputRef.current.accept = 'image/*,.pdf,application/pdf'; } else { inputRef.current.accept = ''; @@ -92,15 +97,22 @@ const AttachFileMenu = ({ const dropdownItems = useMemo(() => { const createMenuItems = ( - onAction: (fileType?: 'image' | 'document' | 'anthropic_multimodal') => void, + onAction: (fileType?: 'image' | 'document' | 'multimodal') => void, ) => { const items: MenuItemProps[] = []; - // this is temporary until i add direct upload support for the other providers and can make a more robust solution - const isAnthropicAgent = agent?.provider === 'anthropic'; - const shouldShowDirectUpload = endpoint === EModelEndpoint.anthropic || 
isAnthropicAgent; + const shouldShowDirectAttach = isDocumentSupportedEndpoint(agent?.provider ?? endpoint); - if (!shouldShowDirectUpload) { + if (shouldShowDirectAttach) { + items.push({ + label: localize('com_ui_upload_provider'), + onClick: () => { + setToolResource(EToolResources.direct_attach); + onAction('multimodal'); + }, + icon: , + }); + } else { items.push({ label: localize('com_ui_upload_image_input'), onClick: () => { @@ -111,17 +123,6 @@ const AttachFileMenu = ({ }); } - if (shouldShowDirectUpload) { - items.push({ - label: localize('com_ui_upload_provider'), - onClick: () => { - setToolResource(EToolResources.direct_upload); - onAction('anthropic_multimodal'); - }, - icon: , - }); - } - if (capabilities.ocrEnabled) { items.push({ label: localize('com_ui_upload_ocr_text'), diff --git a/client/src/hooks/Agents/useAgentCapabilities.ts b/client/src/hooks/Agents/useAgentCapabilities.ts index 74d0d0d260..6c86a8602c 100644 --- a/client/src/hooks/Agents/useAgentCapabilities.ts +++ b/client/src/hooks/Agents/useAgentCapabilities.ts @@ -9,7 +9,7 @@ interface AgentCapabilitiesResult { fileSearchEnabled: boolean; webSearchEnabled: boolean; codeEnabled: boolean; - directUploadEnabled: boolean; + directAttachEnabled: boolean; } export default function useAgentCapabilities( @@ -50,8 +50,8 @@ export default function useAgentCapabilities( [capabilities], ); - const directUploadEnabled = useMemo( - () => capabilities?.includes(AgentCapabilities.direct_upload) ?? false, + const directAttachEnabled = useMemo( + () => capabilities?.includes(AgentCapabilities.direct_attach) ?? 
false, [capabilities], ); @@ -63,6 +63,6 @@ export default function useAgentCapabilities( artifactsEnabled, webSearchEnabled, fileSearchEnabled, - directUploadEnabled, + directAttachEnabled, }; } diff --git a/packages/api/src/files/index.ts b/packages/api/src/files/index.ts index fa156f15f1..a19584efca 100644 --- a/packages/api/src/files/index.ts +++ b/packages/api/src/files/index.ts @@ -2,3 +2,4 @@ export * from './mistral/crud'; export * from './audio'; export * from './text'; export * from './parse'; +export * from './validation'; diff --git a/api/server/services/Files/validation/pdfValidator.js b/packages/api/src/files/validation.ts similarity index 56% rename from api/server/services/Files/validation/pdfValidator.js rename to packages/api/src/files/validation.ts index 56414e8484..3d6b8ed192 100644 --- a/api/server/services/Files/validation/pdfValidator.js +++ b/packages/api/src/files/validation.ts @@ -1,13 +1,36 @@ -const { logger } = require('~/config'); -const { anthropicPdfSizeLimit } = require('librechat-data-provider'); +import { anthropicPdfSizeLimit, EModelEndpoint } from 'librechat-data-provider'; + +export interface PDFValidationResult { + isValid: boolean; + error?: string; +} + +export async function validatePdf( + pdfBuffer: Buffer, + fileSize: number, + endpoint: EModelEndpoint, +): Promise { + if (endpoint === EModelEndpoint.anthropic) { + return validateAnthropicPdf(pdfBuffer, fileSize); + } + + if (endpoint === EModelEndpoint.openAI || endpoint === EModelEndpoint.azureOpenAI) { + return validateOpenAIPdf(fileSize); + } + + return { isValid: true }; +} /** * Validates if a PDF meets Anthropic's requirements - * @param {Buffer} pdfBuffer - The PDF file as a buffer - * @param {number} fileSize - The file size in bytes - * @returns {Promise<{isValid: boolean, error?: string}>} + * @param pdfBuffer - The PDF file as a buffer + * @param fileSize - The file size in bytes + * @returns Promise that resolves to validation result */ -async function 
validateAnthropicPdf(pdfBuffer, fileSize) { +async function validateAnthropicPdf( + pdfBuffer: Buffer, + fileSize: number, +): Promise { try { if (fileSize > anthropicPdfSizeLimit) { return { @@ -53,13 +76,9 @@ async function validateAnthropicPdf(pdfBuffer, fileSize) { }; } - logger.debug( - `PDF validation passed: ${Math.round(fileSize / 1024)}KB, ~${estimatedPages} pages`, - ); - return { isValid: true }; } catch (error) { - logger.error('PDF validation error:', error); + console.error('PDF validation error:', error); return { isValid: false, error: 'Failed to validate PDF file', @@ -67,6 +86,13 @@ async function validateAnthropicPdf(pdfBuffer, fileSize) { } } -module.exports = { - validateAnthropicPdf, -}; +async function validateOpenAIPdf(fileSize: number): Promise { + if (fileSize > 10 * 1024 * 1024) { + return { + isValid: false, + error: "PDF file size exceeds OpenAI's 10MB limit", + }; + } + + return { isValid: true }; +} diff --git a/packages/data-provider/src/config.ts b/packages/data-provider/src/config.ts index 209efcbfa6..28ee2a6077 100644 --- a/packages/data-provider/src/config.ts +++ b/packages/data-provider/src/config.ts @@ -175,7 +175,7 @@ export enum Capabilities { export enum AgentCapabilities { hide_sequential_outputs = 'hide_sequential_outputs', end_after_tools = 'end_after_tools', - direct_upload = 'direct_upload', + direct_attach = 'direct_attach', execute_code = 'execute_code', file_search = 'file_search', web_search = 'web_search', @@ -249,6 +249,7 @@ export const assistantEndpointSchema = baseEndpointSchema.merge( export type TAssistantEndpoint = z.infer; export const defaultAgentCapabilities = [ + AgentCapabilities.direct_attach, AgentCapabilities.execute_code, AgentCapabilities.file_search, AgentCapabilities.web_search, @@ -257,7 +258,6 @@ export const defaultAgentCapabilities = [ AgentCapabilities.tools, AgentCapabilities.chain, AgentCapabilities.ocr, - AgentCapabilities.direct_upload, ]; export const agentsEndpointSchema = 
baseEndpointSchema diff --git a/packages/data-provider/src/schemas.ts b/packages/data-provider/src/schemas.ts index 0e9548b1d0..f6093ec24a 100644 --- a/packages/data-provider/src/schemas.ts +++ b/packages/data-provider/src/schemas.ts @@ -31,6 +31,19 @@ export enum EModelEndpoint { gptPlugins = 'gptPlugins', } +/** + * Endpoints that support direct PDF processing in the agent system + */ +export const documentSupportedEndpoints = new Set([ + EModelEndpoint.anthropic, + EModelEndpoint.openAI, + EModelEndpoint.azureOpenAI, +]); + +export const isDocumentSupportedEndpoint = (endpoint: EModelEndpoint): boolean => { + return documentSupportedEndpoints.has(endpoint); +}; + export const paramEndpoints = new Set([ EModelEndpoint.agents, EModelEndpoint.openAI, diff --git a/packages/data-provider/src/types/assistants.ts b/packages/data-provider/src/types/assistants.ts index 3637dbf6ba..88ae1a4b8a 100644 --- a/packages/data-provider/src/types/assistants.ts +++ b/packages/data-provider/src/types/assistants.ts @@ -27,7 +27,7 @@ export enum Tools { export enum EToolResources { code_interpreter = 'code_interpreter', - direct_upload = 'direct_upload', + direct_attach = 'direct_attach', execute_code = 'execute_code', file_search = 'file_search', image_edit = 'image_edit',