From 2a50c372efa7d999eed355c3e7657742513a8e37 Mon Sep 17 00:00:00 2001
From: Danny Avila
Date: Mon, 12 Jan 2026 23:02:08 -0500
Subject: [PATCH] =?UTF-8?q?=F0=9F=AA=99=20refactor:=20Collected=20Usage=20?=
 =?UTF-8?q?&=20Anthropic=20Prompt=20Caching=20(#11319)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 🔧 refactor: Improve token calculation in AgentClient.recordCollectedUsage

- Updated the token calculation logic to sum output tokens directly from all entries, addressing issues with negative values in parallel execution scenarios.
- Added comments for clarity on the usage of input tokens and output tokens.
- Introduced a new test file for comprehensive testing of the recordCollectedUsage function, covering various execution scenarios including sequential and parallel processing, cache token handling, and model fallback logic.

* 🔧 refactor: Anthropic `promptCache` handling in LLM configuration

* 🔧 test: Add comprehensive test for cache token handling in recordCollectedUsage

- Introduced a new test case to validate the handling of cache tokens across multiple tool calls in the recordCollectedUsage function.
- Ensured correct calculations for input and output tokens, including scenarios with cache creation and reading.
- Verified the expected interactions with token spending methods to enhance the robustness of the token management logic.
---
 api/package.json                              |   2 +-
 api/server/controllers/agents/client.js       |  27 +-
 .../agents/recordCollectedUsage.spec.js       | 712 ++++++++++++++++++
 package-lock.json                             |  17 +-
 packages/api/package.json                     |   2 +-
 .../api/src/endpoints/anthropic/llm.spec.ts   |  88 ++-
 packages/api/src/endpoints/anthropic/llm.ts   |   6 +
 .../endpoints/openai/config.anthropic.spec.ts |  14 +-
 8 files changed, 828 insertions(+), 40 deletions(-)
 create mode 100644 api/server/controllers/agents/recordCollectedUsage.spec.js

diff --git a/api/package.json b/api/package.json
index 9e134bd32a..c2f0dd9801 100644
--- a/api/package.json
+++ b/api/package.json
@@ -46,7 +46,7 @@
     "@googleapis/youtube": "^20.0.0",
     "@keyv/redis": "^4.3.3",
     "@langchain/core": "^0.3.80",
-    "@librechat/agents": "^3.0.66",
+    "@librechat/agents": "^3.0.77",
     "@librechat/api": "*",
     "@librechat/data-schemas": "*",
     "@microsoft/microsoft-graph-client": "^3.0.7",
diff --git a/api/server/controllers/agents/client.js b/api/server/controllers/agents/client.js
index 79e63d1c7f..2b5872411b 100644
--- a/api/server/controllers/agents/client.js
+++ b/api/server/controllers/agents/client.js
@@ -784,6 +784,7 @@ class AgentClient extends BaseClient {
     if (!collectedUsage || !collectedUsage.length) {
       return;
     }
+    // Use first entry's input_tokens as the base input (represents initial user message context)
     // Support both OpenAI format (input_token_details) and Anthropic format (cache_*_input_tokens)
     const firstUsage = collectedUsage[0];
     const input_tokens =
@@ -795,10 +796,11 @@
         Number(firstUsage?.cache_read_input_tokens) ||
         0);
 
-    let output_tokens = 0;
-    let previousTokens = input_tokens; // Start with original input
-    for (let i = 0; i < collectedUsage.length; i++) {
-      const usage = collectedUsage[i];
+    // Sum output_tokens directly from all entries - works for both sequential and parallel execution
+    // This avoids the incremental calculation that produced negative values for parallel agents
+    let total_output_tokens = 0;
+
+    for (const usage of collectedUsage) {
       if (!usage) {
         continue;
       }
@@ -811,6 +813,9 @@ class AgentClient extends BaseClient {
      const cache_read =
        Number(usage.input_token_details?.cache_read) ||
        Number(usage.cache_read_input_tokens) || 0;
+      // Accumulate output tokens for the usage summary
+      total_output_tokens += Number(usage.output_tokens) || 0;
+
       const txMetadata = {
         context,
         balance,
@@ -821,18 +826,6 @@
         model: usage.model ?? model ?? this.model ?? this.options.agent.model_parameters.model,
       };
 
-      if (i > 0) {
-        // Count new tokens generated (input_tokens minus previous accumulated tokens)
-        output_tokens +=
-          (Number(usage.input_tokens) || 0) + cache_creation + cache_read - previousTokens;
-      }
-
-      // Add this message's output tokens
-      output_tokens += Number(usage.output_tokens) || 0;
-
-      // Update previousTokens to include this message's output
-      previousTokens += Number(usage.output_tokens) || 0;
-
       if (cache_creation > 0 || cache_read > 0) {
         spendStructuredTokens(txMetadata, {
           promptTokens: {
@@ -862,7 +855,7 @@
 
     this.usage = {
       input_tokens,
-      output_tokens,
+      output_tokens: total_output_tokens,
     };
   }
 
diff --git a/api/server/controllers/agents/recordCollectedUsage.spec.js b/api/server/controllers/agents/recordCollectedUsage.spec.js
new file mode 100644
index 0000000000..6904f2ed39
--- /dev/null
+++ b/api/server/controllers/agents/recordCollectedUsage.spec.js
@@ -0,0 +1,712 @@
+/**
+ * Tests for AgentClient.recordCollectedUsage
+ *
+ * This is a critical function that handles token spending for agent LLM calls.
+ * It must correctly handle:
+ * - Sequential execution (single agent with tool calls)
+ * - Parallel execution (multiple agents with independent inputs)
+ * - Cache token handling (OpenAI and Anthropic formats)
+ */
+
+const { EModelEndpoint } = require('librechat-data-provider');
+
+// Mock dependencies before requiring the module
+const mockSpendTokens = jest.fn().mockResolvedValue();
+const mockSpendStructuredTokens = jest.fn().mockResolvedValue();
+
+jest.mock('~/models/spendTokens', () => ({
+  spendTokens: (...args) => mockSpendTokens(...args),
+  spendStructuredTokens: (...args) => mockSpendStructuredTokens(...args),
+}));
+
+jest.mock('~/config', () => ({
+  logger: {
+    debug: jest.fn(),
+    error: jest.fn(),
+    warn: jest.fn(),
+    info: jest.fn(),
+  },
+  getMCPManager: jest.fn(() => ({
+    formatInstructionsForContext: jest.fn(),
+  })),
+}));
+
+jest.mock('@librechat/agents', () => ({
+  ...jest.requireActual('@librechat/agents'),
+  createMetadataAggregator: () => ({
+    handleLLMEnd: jest.fn(),
+    collected: [],
+  }),
+}));
+
+const AgentClient = require('./client');
+
+describe('AgentClient - recordCollectedUsage', () => {
+  let client;
+  let mockAgent;
+  let mockOptions;
+
+  beforeEach(() => {
+    jest.clearAllMocks();
+
+    mockAgent = {
+      id: 'agent-123',
+      endpoint: EModelEndpoint.openAI,
+      provider: EModelEndpoint.openAI,
+      model_parameters: {
+        model: 'gpt-4',
+      },
+    };
+
+    mockOptions = {
+      req: {
+        user: { id: 'user-123' },
+        body: { model: 'gpt-4', endpoint: EModelEndpoint.openAI },
+      },
+      res: {},
+      agent: mockAgent,
+      endpointTokenConfig: {},
+    };
+
+    client = new AgentClient(mockOptions);
+    client.conversationId = 'convo-123';
+    client.user = 'user-123';
+  });
+
+  describe('basic functionality', () => {
+    it('should return early if collectedUsage is empty', async () => {
+      await client.recordCollectedUsage({
+        collectedUsage: [],
+        balance: { enabled: true },
+        transactions: { enabled: true },
+      });
+
+      expect(mockSpendTokens).not.toHaveBeenCalled();
+      expect(mockSpendStructuredTokens).not.toHaveBeenCalled();
expect(client.usage).toBeUndefined(); + }); + + it('should return early if collectedUsage is null', async () => { + await client.recordCollectedUsage({ + collectedUsage: null, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).not.toHaveBeenCalled(); + expect(client.usage).toBeUndefined(); + }); + + it('should handle single usage entry correctly', async () => { + const collectedUsage = [{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' }]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledTimes(1); + expect(mockSpendTokens).toHaveBeenCalledWith( + expect.objectContaining({ + conversationId: 'convo-123', + user: 'user-123', + model: 'gpt-4', + }), + { promptTokens: 100, completionTokens: 50 }, + ); + expect(client.usage.input_tokens).toBe(100); + expect(client.usage.output_tokens).toBe(50); + }); + + it('should skip null entries in collectedUsage', async () => { + const collectedUsage = [ + { input_tokens: 100, output_tokens: 50, model: 'gpt-4' }, + null, + { input_tokens: 200, output_tokens: 60, model: 'gpt-4' }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledTimes(2); + }); + }); + + describe('sequential execution (single agent with tool calls)', () => { + it('should calculate tokens correctly for sequential tool calls', async () => { + // Sequential flow: output of call N becomes part of input for call N+1 + // Call 1: input=100, output=50 + // Call 2: input=150 (100+50), output=30 + // Call 3: input=180 (150+30), output=20 + const collectedUsage = [ + { input_tokens: 100, output_tokens: 50, model: 'gpt-4' }, + { input_tokens: 150, output_tokens: 30, model: 'gpt-4' }, + { input_tokens: 180, output_tokens: 20, model: 'gpt-4' }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledTimes(3); + // Total output should be sum of all output_tokens: 50 + 30 + 20 = 100 + expect(client.usage.output_tokens).toBe(100); + expect(client.usage.input_tokens).toBe(100); // First entry's input + }); + }); + + describe('parallel execution (multiple agents)', () => { + it('should handle parallel agents with independent input tokens', async () => { + // Parallel agents have INDEPENDENT input tokens (not cumulative) + // Agent A: input=100, output=50 + // Agent B: input=80, output=40 (different context, not 100+50) + const collectedUsage = [ + { input_tokens: 100, output_tokens: 50, model: 'gpt-4' }, + { input_tokens: 80, output_tokens: 40, model: 'gpt-4' }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledTimes(2); + // Expected total output: 50 + 40 = 90 + // output_tokens must be positive and should reflect total output + expect(client.usage.output_tokens).toBeGreaterThan(0); + }); + + it('should NOT produce negative output_tokens for parallel execution', async () => { + // Critical bug scenario: parallel agents where second agent has LOWER input tokens + const collectedUsage = [ + { input_tokens: 200, output_tokens: 100, model: 'gpt-4' }, + { input_tokens: 50, output_tokens: 30, model: 'gpt-4' }, + ]; + + await client.recordCollectedUsage({ + 
collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + // output_tokens MUST be positive for proper token tracking + expect(client.usage.output_tokens).toBeGreaterThan(0); + // Correct value should be 100 + 30 = 130 + }); + + it('should calculate correct total output for parallel agents', async () => { + // Three parallel agents with independent contexts + const collectedUsage = [ + { input_tokens: 100, output_tokens: 50, model: 'gpt-4' }, + { input_tokens: 120, output_tokens: 60, model: 'gpt-4-turbo' }, + { input_tokens: 80, output_tokens: 40, model: 'claude-3' }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledTimes(3); + // Total output should be 50 + 60 + 40 = 150 + expect(client.usage.output_tokens).toBe(150); + }); + + it('should handle worst-case parallel scenario without negative tokens', async () => { + // Extreme case: first agent has very high input, subsequent have low + const collectedUsage = [ + { input_tokens: 1000, output_tokens: 500, model: 'gpt-4' }, + { input_tokens: 100, output_tokens: 50, model: 'gpt-4' }, + { input_tokens: 50, output_tokens: 25, model: 'gpt-4' }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + // Must be positive, should be 500 + 50 + 25 = 575 + expect(client.usage.output_tokens).toBeGreaterThan(0); + expect(client.usage.output_tokens).toBe(575); + }); + }); + + describe('real-world scenarios', () => { + it('should correctly sum output tokens for sequential tool calls with growing context', async () => { + // Real production data: Claude Opus with multiple tool calls + // Context grows as tool results are added, but output_tokens should only count model generations + const collectedUsage = [ + { + input_tokens: 31596, + output_tokens: 151, + total_tokens: 31747, + input_token_details: { cache_read: 0, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 35368, + output_tokens: 150, + total_tokens: 35518, + input_token_details: { cache_read: 0, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 58362, + output_tokens: 295, + total_tokens: 58657, + input_token_details: { cache_read: 0, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 112604, + output_tokens: 193, + total_tokens: 112797, + input_token_details: { cache_read: 0, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 257440, + output_tokens: 2217, + total_tokens: 259657, + input_token_details: { cache_read: 0, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + // input_tokens should be first entry's input (initial context) + expect(client.usage.input_tokens).toBe(31596); + + // output_tokens should be sum of all model outputs: 151 + 150 + 295 + 193 + 2217 = 3006 + // NOT the inflated value from incremental calculation (338,559) + expect(client.usage.output_tokens).toBe(3006); + + // Verify spendTokens was called for each entry with correct values + expect(mockSpendTokens).toHaveBeenCalledTimes(5); + expect(mockSpendTokens).toHaveBeenNthCalledWith( + 1, + expect.objectContaining({ model: 'claude-opus-4-5-20251101' }), + { promptTokens: 31596, completionTokens: 151 }, + ); 
+ expect(mockSpendTokens).toHaveBeenNthCalledWith( + 5, + expect.objectContaining({ model: 'claude-opus-4-5-20251101' }), + { promptTokens: 257440, completionTokens: 2217 }, + ); + }); + + it('should handle single followup message correctly', async () => { + // Real production data: followup to the above conversation + const collectedUsage = [ + { + input_tokens: 263406, + output_tokens: 257, + total_tokens: 263663, + input_token_details: { cache_read: 0, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(client.usage.input_tokens).toBe(263406); + expect(client.usage.output_tokens).toBe(257); + + expect(mockSpendTokens).toHaveBeenCalledTimes(1); + expect(mockSpendTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'claude-opus-4-5-20251101' }), + { promptTokens: 263406, completionTokens: 257 }, + ); + }); + + it('should ensure output_tokens > 0 check passes for BaseClient.sendMessage', async () => { + // This verifies the fix for the duplicate token spending bug + // BaseClient.sendMessage checks: if (usage != null && Number(usage[this.outputTokensKey]) > 0) + const collectedUsage = [ + { + input_tokens: 31596, + output_tokens: 151, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 35368, + output_tokens: 150, + model: 'claude-opus-4-5-20251101', + }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + const usage = client.getStreamUsage(); + + // The check that was failing before the fix + expect(usage).not.toBeNull(); + expect(Number(usage.output_tokens)).toBeGreaterThan(0); + + // Verify correct value + expect(usage.output_tokens).toBe(301); // 151 + 150 + }); + + it('should correctly handle cache tokens with multiple tool calls', async () => { + // Real production data: Claude Opus with cache tokens (prompt caching) + // First entry has cache_creation, subsequent entries have cache_read + const collectedUsage = [ + { + input_tokens: 788, + output_tokens: 163, + total_tokens: 951, + input_token_details: { cache_read: 0, cache_creation: 30808 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 3802, + output_tokens: 149, + total_tokens: 3951, + input_token_details: { cache_read: 30808, cache_creation: 768 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 26808, + output_tokens: 225, + total_tokens: 27033, + input_token_details: { cache_read: 31576, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 80912, + output_tokens: 204, + total_tokens: 81116, + input_token_details: { cache_read: 31576, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 136454, + output_tokens: 206, + total_tokens: 136660, + input_token_details: { cache_read: 31576, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 146316, + output_tokens: 224, + total_tokens: 146540, + input_token_details: { cache_read: 31576, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 150402, + output_tokens: 1248, + total_tokens: 151650, + input_token_details: { cache_read: 31576, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 156268, + output_tokens: 139, + total_tokens: 156407, + input_token_details: { cache_read: 31576, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, 
+ { + input_tokens: 167126, + output_tokens: 2961, + total_tokens: 170087, + input_token_details: { cache_read: 31576, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + // input_tokens = first entry's input + cache_creation + cache_read + // = 788 + 30808 + 0 = 31596 + expect(client.usage.input_tokens).toBe(31596); + + // output_tokens = sum of all output_tokens + // = 163 + 149 + 225 + 204 + 206 + 224 + 1248 + 139 + 2961 = 5519 + expect(client.usage.output_tokens).toBe(5519); + + // First 2 entries have cache tokens, should use spendStructuredTokens + // Remaining 7 entries have cache_read but no cache_creation, still structured + expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(9); + expect(mockSpendTokens).toHaveBeenCalledTimes(0); + + // Verify first entry uses structured tokens with cache_creation + expect(mockSpendStructuredTokens).toHaveBeenNthCalledWith( + 1, + expect.objectContaining({ model: 'claude-opus-4-5-20251101' }), + { + promptTokens: { input: 788, write: 30808, read: 0 }, + completionTokens: 163, + }, + ); + + // Verify second entry uses structured tokens with both cache_creation and cache_read + expect(mockSpendStructuredTokens).toHaveBeenNthCalledWith( + 2, + expect.objectContaining({ model: 'claude-opus-4-5-20251101' }), + { + promptTokens: { input: 3802, write: 768, read: 30808 }, + completionTokens: 149, + }, + ); + }); + }); + + describe('cache token handling', () => { + it('should handle OpenAI format cache tokens (input_token_details)', async () => { + const collectedUsage = [ + { + input_tokens: 100, + output_tokens: 50, + model: 'gpt-4', + input_token_details: { + cache_creation: 20, + cache_read: 10, + }, + }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1); + expect(mockSpendStructuredTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'gpt-4' }), + { + promptTokens: { + input: 100, + write: 20, + read: 10, + }, + completionTokens: 50, + }, + ); + }); + + it('should handle Anthropic format cache tokens (cache_*_input_tokens)', async () => { + const collectedUsage = [ + { + input_tokens: 100, + output_tokens: 50, + model: 'claude-3', + cache_creation_input_tokens: 25, + cache_read_input_tokens: 15, + }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1); + expect(mockSpendStructuredTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'claude-3' }), + { + promptTokens: { + input: 100, + write: 25, + read: 15, + }, + completionTokens: 50, + }, + ); + }); + + it('should use spendTokens for entries without cache tokens', async () => { + const collectedUsage = [{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' }]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledTimes(1); + expect(mockSpendStructuredTokens).not.toHaveBeenCalled(); + }); + + it('should handle mixed cache and non-cache entries', async () => { + const collectedUsage = [ + { input_tokens: 100, output_tokens: 50, model: 'gpt-4' }, + { + input_tokens: 150, + output_tokens: 30, + model: 'gpt-4', + 
input_token_details: { cache_creation: 10, cache_read: 5 }, + }, + { input_tokens: 200, output_tokens: 20, model: 'gpt-4' }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledTimes(2); + expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1); + }); + + it('should include cache tokens in total input calculation', async () => { + const collectedUsage = [ + { + input_tokens: 100, + output_tokens: 50, + model: 'gpt-4', + input_token_details: { + cache_creation: 20, + cache_read: 10, + }, + }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + // Total input should include cache tokens: 100 + 20 + 10 = 130 + expect(client.usage.input_tokens).toBe(130); + }); + }); + + describe('model fallback', () => { + it('should use usage.model when available', async () => { + const collectedUsage = [{ input_tokens: 100, output_tokens: 50, model: 'gpt-4-turbo' }]; + + await client.recordCollectedUsage({ + model: 'fallback-model', + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'gpt-4-turbo' }), + expect.any(Object), + ); + }); + + it('should fallback to param model when usage.model is missing', async () => { + const collectedUsage = [{ input_tokens: 100, output_tokens: 50 }]; + + await client.recordCollectedUsage({ + model: 'param-model', + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'param-model' }), + expect.any(Object), + ); + }); + + it('should fallback to client.model when param model is missing', async () => { + client.model = 'client-model'; + const collectedUsage = [{ input_tokens: 100, output_tokens: 50 }]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'client-model' }), + expect.any(Object), + ); + }); + + it('should fallback to agent model_parameters.model as last resort', async () => { + const collectedUsage = [{ input_tokens: 100, output_tokens: 50 }]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'gpt-4' }), + expect.any(Object), + ); + }); + }); + + describe('getStreamUsage integration', () => { + it('should return the usage object set by recordCollectedUsage', async () => { + const collectedUsage = [{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' }]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + const usage = client.getStreamUsage(); + expect(usage).toEqual({ + input_tokens: 100, + output_tokens: 50, + }); + }); + + it('should return undefined before recordCollectedUsage is called', () => { + const usage = client.getStreamUsage(); + expect(usage).toBeUndefined(); + }); + + it('should have output_tokens > 0 for BaseClient.sendMessage check', async () => { + // This test verifies the usage will pass the check in BaseClient.sendMessage: + // if (usage != null && Number(usage[this.outputTokensKey]) > 0) + const 
collectedUsage = [ + { input_tokens: 200, output_tokens: 100, model: 'gpt-4' }, + { input_tokens: 50, output_tokens: 30, model: 'gpt-4' }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + const usage = client.getStreamUsage(); + expect(usage).not.toBeNull(); + expect(Number(usage.output_tokens)).toBeGreaterThan(0); + }); + }); +}); diff --git a/package-lock.json b/package-lock.json index 3337696267..6456b21325 100644 --- a/package-lock.json +++ b/package-lock.json @@ -60,7 +60,7 @@ "@googleapis/youtube": "^20.0.0", "@keyv/redis": "^4.3.3", "@langchain/core": "^0.3.80", - "@librechat/agents": "^3.0.66", + "@librechat/agents": "^3.0.77", "@librechat/api": "*", "@librechat/data-schemas": "*", "@microsoft/microsoft-graph-client": "^3.0.7", @@ -12660,9 +12660,9 @@ } }, "node_modules/@librechat/agents": { - "version": "3.0.66", - "resolved": "https://registry.npmjs.org/@librechat/agents/-/agents-3.0.66.tgz", - "integrity": "sha512-JpQo7w+/yLM3dJ46lyGrm4gPTjiHERwcpojw7drvpYWqOU4e2jmjK0JbNxQ0jP00q+nDhPG+mqJ2qQU7TVraOQ==", + "version": "3.0.77", + "resolved": "https://registry.npmjs.org/@librechat/agents/-/agents-3.0.77.tgz", + "integrity": "sha512-Wr9d8bjJAQSl03nEgnAPG6jBQT1fL3sNV3TFDN1FvFQt6WGfdok838Cbcn+/tSGXSPJcICTxNkMT7VN8P6bCPw==", "license": "MIT", "dependencies": { "@langchain/anthropic": "^0.3.26", @@ -12686,6 +12686,7 @@ "https-proxy-agent": "^7.0.6", "mathjs": "^15.1.0", "nanoid": "^3.3.7", + "okapibm25": "^1.4.1", "openai": "5.8.2" }, "engines": { @@ -34310,6 +34311,12 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/okapibm25": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/okapibm25/-/okapibm25-1.4.1.tgz", + "integrity": "sha512-UHmeH4MAtZXGFVncwbY7pfFvDVNxpsyM3W66aGPU0SHj1+ld59ty+9lJ0ifcrcnPUl1XdYoDgb06ObyCnpTs3g==", + "license": "MIT" + }, "node_modules/ollama": { "version": "0.5.18", "resolved": "https://registry.npmjs.org/ollama/-/ollama-0.5.18.tgz", @@ -43169,7 +43176,7 @@ "@google/genai": "^1.19.0", "@keyv/redis": "^4.3.3", "@langchain/core": "^0.3.80", - "@librechat/agents": "^3.0.66", + "@librechat/agents": "^3.0.77", "@librechat/data-schemas": "*", "@modelcontextprotocol/sdk": "^1.25.2", "@smithy/node-http-handler": "^4.4.5", diff --git a/packages/api/package.json b/packages/api/package.json index 8538264ceb..5f5576e293 100644 --- a/packages/api/package.json +++ b/packages/api/package.json @@ -88,7 +88,7 @@ "@google/genai": "^1.19.0", "@keyv/redis": "^4.3.3", "@langchain/core": "^0.3.80", - "@librechat/agents": "^3.0.66", + "@librechat/agents": "^3.0.77", "@librechat/data-schemas": "*", "@modelcontextprotocol/sdk": "^1.25.2", "@smithy/node-http-handler": "^4.4.5", diff --git a/packages/api/src/endpoints/anthropic/llm.spec.ts b/packages/api/src/endpoints/anthropic/llm.spec.ts index 0e457b60c2..c15d5445ed 100644 --- a/packages/api/src/endpoints/anthropic/llm.spec.ts +++ b/packages/api/src/endpoints/anthropic/llm.spec.ts @@ -87,7 +87,7 @@ describe('getLLMConfig', () => { expect(result.llmConfig.thinking).toHaveProperty('budget_tokens', 2000); }); - it('should add "context-1m" beta header for claude-sonnet-4 model', () => { + it('should add "context-1m" beta header and promptCache boolean for claude-sonnet-4 model', () => { const modelOptions = { model: 'claude-sonnet-4-20250514', promptCache: true, @@ -98,9 +98,10 @@ describe('getLLMConfig', () => { expect(clientOptions?.defaultHeaders).toHaveProperty('anthropic-beta'); const defaultHeaders = 
clientOptions?.defaultHeaders as Record; expect(defaultHeaders['anthropic-beta']).toBe('context-1m-2025-08-07'); + expect(result.llmConfig.promptCache).toBe(true); }); - it('should add "context-1m" beta header for claude-sonnet-4 model formats', () => { + it('should add "context-1m" beta header and promptCache boolean for claude-sonnet-4 model formats', () => { const modelVariations = [ 'claude-sonnet-4-20250514', 'claude-sonnet-4-latest', @@ -115,10 +116,11 @@ describe('getLLMConfig', () => { expect(clientOptions?.defaultHeaders).toHaveProperty('anthropic-beta'); const defaultHeaders = clientOptions?.defaultHeaders as Record; expect(defaultHeaders['anthropic-beta']).toBe('context-1m-2025-08-07'); + expect(result.llmConfig.promptCache).toBe(true); }); }); - it('should not add beta headers for claude-opus-4-5 model (prompt caching no longer needs header)', () => { + it('should pass promptCache boolean for claude-opus-4-5 model (no beta header needed)', () => { const modelOptions = { model: 'claude-opus-4-5', promptCache: true, @@ -126,9 +128,10 @@ describe('getLLMConfig', () => { const result = getLLMConfig('test-key', { modelOptions }); const clientOptions = result.llmConfig.clientOptions; expect(clientOptions?.defaultHeaders).toBeUndefined(); + expect(result.llmConfig.promptCache).toBe(true); }); - it('should not add beta headers for claude-opus-4-5 model formats (prompt caching no longer needs header)', () => { + it('should pass promptCache boolean for claude-opus-4-5 model formats (no beta header needed)', () => { const modelVariations = [ 'claude-opus-4-5', 'claude-opus-4-5-20250420', @@ -141,6 +144,7 @@ describe('getLLMConfig', () => { const result = getLLMConfig('test-key', { modelOptions }); const clientOptions = result.llmConfig.clientOptions; expect(clientOptions?.defaultHeaders).toBeUndefined(); + expect(result.llmConfig.promptCache).toBe(true); }); }); @@ -299,10 +303,11 @@ describe('getLLMConfig', () => { }, }); - // claude-3-5-sonnet supports prompt caching and should get the max-tokens header + // claude-3-5-sonnet supports prompt caching and should get the max-tokens header and promptCache boolean expect(result.llmConfig.clientOptions?.defaultHeaders).toEqual({ 'anthropic-beta': 'max-tokens-3-5-sonnet-2024-07-15', }); + expect(result.llmConfig.promptCache).toBe(true); }); it('should handle thinking and thinkingBudget options', () => { @@ -512,6 +517,8 @@ describe('getLLMConfig', () => { expect(result.llmConfig.clientOptions?.defaultHeaders).toEqual({ 'anthropic-beta': 'token-efficient-tools-2025-02-19,output-128k-2025-02-19', }); + // Should pass promptCache boolean + expect(result.llmConfig.promptCache).toBe(true); }); it('should handle web search functionality like production', () => { @@ -1160,21 +1167,66 @@ describe('getLLMConfig', () => { it('should handle prompt cache support logic for different models', () => { const testCases = [ // Models that support prompt cache (and have other beta headers) - { model: 'claude-3-5-sonnet', promptCache: true, shouldHaveHeaders: true }, - { model: 'claude-3.5-sonnet-20241022', promptCache: true, shouldHaveHeaders: true }, - { model: 'claude-3-7-sonnet', promptCache: true, shouldHaveHeaders: true }, - { model: 'claude-3.7-sonnet-20250109', promptCache: true, shouldHaveHeaders: true }, - { model: 'claude-sonnet-4-20250514', promptCache: true, shouldHaveHeaders: true }, + { + model: 'claude-3-5-sonnet', + promptCache: true, + shouldHaveHeaders: true, + shouldHavePromptCache: true, + }, + { + model: 'claude-3.5-sonnet-20241022', + 
+        promptCache: true,
+        shouldHaveHeaders: true,
+        shouldHavePromptCache: true,
+      },
+      {
+        model: 'claude-3-7-sonnet',
+        promptCache: true,
+        shouldHaveHeaders: true,
+        shouldHavePromptCache: true,
+      },
+      {
+        model: 'claude-3.7-sonnet-20250109',
+        promptCache: true,
+        shouldHaveHeaders: true,
+        shouldHavePromptCache: true,
+      },
+      {
+        model: 'claude-sonnet-4-20250514',
+        promptCache: true,
+        shouldHaveHeaders: true,
+        shouldHavePromptCache: true,
+      },
       // Models that support prompt cache but have no additional beta headers needed
-      { model: 'claude-3-opus', promptCache: true, shouldHaveHeaders: false },
+      {
+        model: 'claude-3-opus',
+        promptCache: true,
+        shouldHaveHeaders: false,
+        shouldHavePromptCache: true,
+      },
       // Models that don't support prompt cache
-      { model: 'claude-3-5-sonnet-latest', promptCache: true, shouldHaveHeaders: false },
-      { model: 'claude-3.5-sonnet-latest', promptCache: true, shouldHaveHeaders: false },
+      {
+        model: 'claude-3-5-sonnet-latest',
+        promptCache: true,
+        shouldHaveHeaders: false,
+        shouldHavePromptCache: false,
+      },
+      {
+        model: 'claude-3.5-sonnet-latest',
+        promptCache: true,
+        shouldHaveHeaders: false,
+        shouldHavePromptCache: false,
+      },
       // Prompt cache disabled
-      { model: 'claude-3-5-sonnet', promptCache: false, shouldHaveHeaders: false },
+      {
+        model: 'claude-3-5-sonnet',
+        promptCache: false,
+        shouldHaveHeaders: false,
+        shouldHavePromptCache: false,
+      },
     ];
 
-    testCases.forEach(({ model, promptCache, shouldHaveHeaders }) => {
+    testCases.forEach(({ model, promptCache, shouldHaveHeaders, shouldHavePromptCache }) => {
       const result = getLLMConfig('test-key', {
         modelOptions: { model, promptCache },
       });
@@ -1187,6 +1239,12 @@ describe('getLLMConfig', () => {
       } else {
         expect(headers).toBeUndefined();
       }
+
+      if (shouldHavePromptCache) {
+        expect(result.llmConfig.promptCache).toBe(true);
+      } else {
+        expect(result.llmConfig.promptCache).toBeUndefined();
+      }
     });
   });
 });
diff --git a/packages/api/src/endpoints/anthropic/llm.ts b/packages/api/src/endpoints/anthropic/llm.ts
index 408ad2a77c..34ec354365 100644
--- a/packages/api/src/endpoints/anthropic/llm.ts
+++ b/packages/api/src/endpoints/anthropic/llm.ts
@@ -155,6 +155,12 @@ function getLLMConfig(
   const supportsCacheControl =
     systemOptions.promptCache === true &&
     checkPromptCacheSupport(requestOptions.model ?? '');
+
+  /** Pass promptCache boolean for downstream cache_control application */
+  if (supportsCacheControl) {
+    (requestOptions as Record<string, unknown>).promptCache = true;
+  }
+
   const headers = getClaudeHeaders(requestOptions.model ??
'', supportsCacheControl); if (headers && requestOptions.clientOptions) { requestOptions.clientOptions.defaultHeaders = headers; diff --git a/packages/api/src/endpoints/openai/config.anthropic.spec.ts b/packages/api/src/endpoints/openai/config.anthropic.spec.ts index eeb17a311d..7109341e8c 100644 --- a/packages/api/src/endpoints/openai/config.anthropic.spec.ts +++ b/packages/api/src/endpoints/openai/config.anthropic.spec.ts @@ -39,6 +39,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { type: 'enabled', budget_tokens: 2000, }, + promptCache: true, }, }, configOptions: { @@ -87,6 +88,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { type: 'enabled', budget_tokens: 3000, }, + promptCache: true, }, }, configOptions: { @@ -134,6 +136,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { user_id: 'user123', }, topK: 50, + promptCache: true, }, }, configOptions: { @@ -175,6 +178,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { metadata: { user_id: 'user456', }, + promptCache: true, }, }, configOptions: { @@ -187,7 +191,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { }); }); - it('should apply custom headers without anthropic-beta for models that dont need it', () => { + it('should apply custom headers and promptCache for models that support caching', () => { const apiKey = 'sk-custom'; const endpoint = 'Anthropic (via LiteLLM)'; const options = { @@ -218,6 +222,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { metadata: { user_id: undefined, }, + promptCache: true, }, }, configOptions: { @@ -300,6 +305,9 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { stream: true, topP: 0.9, maxTokens: 2048, + modelKwargs: { + promptCache: true, + }, // temperature is dropped // modelKwargs.topK is dropped // modelKwargs.metadata is dropped completely @@ -379,6 +387,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { metadata: { user_id: 'searchUser', }, + promptCache: true, }, }, configOptions: { @@ -425,6 +434,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { user_id: 'testUser', }, topK: 40, + promptCache: true, }, }, configOptions: { @@ -470,6 +480,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { metadata: { user_id: 'addUser', }, + promptCache: true, customParam1: 'value1', // Unknown params added to modelKwargs customParam2: 42, }, @@ -519,6 +530,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { metadata: { user_id: 'bothUser', }, + promptCache: true, customParam: 'customValue', // topK is dropped },