From 2a50c372efa7d999eed355c3e7657742513a8e37 Mon Sep 17 00:00:00 2001
From: Danny Avila
Date: Mon, 12 Jan 2026 23:02:08 -0500
Subject: [PATCH] =?UTF-8?q?=F0=9F=AA=99=20refactor:=20Collected=20Usage=20?=
 =?UTF-8?q?&=20Anthropic=20Prompt=20Caching=20(#11319)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* 🔧 refactor: Improve token calculation in AgentClient.recordCollectedUsage

- Updated the token calculation logic to sum output tokens directly from all entries, addressing issues with negative values in parallel execution scenarios.
- Added comments for clarity on the usage of input tokens and output tokens.
- Introduced a new test file for comprehensive testing of the recordCollectedUsage function, covering various execution scenarios including sequential and parallel processing, cache token handling, and model fallback logic.

* 🔧 refactor: Anthropic `promptCache` handling in LLM configuration

* 🔧 test: Add comprehensive test for cache token handling in recordCollectedUsage

- Introduced a new test case to validate the handling of cache tokens across multiple tool calls in the recordCollectedUsage function.
- Ensured correct calculations for input and output tokens, including scenarios with cache creation and reading.
- Verified the expected interactions with token spending methods to enhance the robustness of the token management logic.
---
 api/package.json                              |   2 +-
 api/server/controllers/agents/client.js       |  27 +-
 .../agents/recordCollectedUsage.spec.js       | 712 ++++++++++++++++++
 package-lock.json                             |  17 +-
 packages/api/package.json                     |   2 +-
 .../api/src/endpoints/anthropic/llm.spec.ts   |  88 ++-
 packages/api/src/endpoints/anthropic/llm.ts   |   6 +
 .../endpoints/openai/config.anthropic.spec.ts |  14 +-
 8 files changed, 828 insertions(+), 40 deletions(-)
 create mode 100644 api/server/controllers/agents/recordCollectedUsage.spec.js

diff --git a/api/package.json b/api/package.json
index 9e134bd32a..c2f0dd9801 100644
--- a/api/package.json
+++ b/api/package.json
@@ -46,7 +46,7 @@
     "@googleapis/youtube": "^20.0.0",
     "@keyv/redis": "^4.3.3",
     "@langchain/core": "^0.3.80",
-    "@librechat/agents": "^3.0.66",
+    "@librechat/agents": "^3.0.77",
     "@librechat/api": "*",
     "@librechat/data-schemas": "*",
     "@microsoft/microsoft-graph-client": "^3.0.7",
diff --git a/api/server/controllers/agents/client.js b/api/server/controllers/agents/client.js
index 79e63d1c7f..2b5872411b 100644
--- a/api/server/controllers/agents/client.js
+++ b/api/server/controllers/agents/client.js
@@ -784,6 +784,7 @@ class AgentClient extends BaseClient {
     if (!collectedUsage || !collectedUsage.length) {
       return;
     }
+    // Use first entry's input_tokens as the base input (represents initial user message context)
     // Support both OpenAI format (input_token_details) and Anthropic format (cache_*_input_tokens)
     const firstUsage = collectedUsage[0];
     const input_tokens =
@@ -795,10 +796,11 @@
         Number(firstUsage?.cache_read_input_tokens) ||
         0);
 
-    let output_tokens = 0;
-    let previousTokens = input_tokens; // Start with original input
-    for (let i = 0; i < collectedUsage.length; i++) {
-      const usage = collectedUsage[i];
+    // Sum output_tokens directly from all entries - works for both sequential and parallel execution
+    // This avoids the incremental calculation that produced negative values for parallel agents
+    let total_output_tokens = 0;
+
+    for (const usage of collectedUsage) {
       if (!usage) {
         continue;
       }
@@ -811,6 +813,9 @@ class AgentClient extends BaseClient {
      const cache_read =
        Number(usage.input_token_details?.cache_read) ||
        Number(usage.cache_read_input_tokens) || 0;
+      // Accumulate output tokens for the usage summary
+      total_output_tokens += Number(usage.output_tokens) || 0;
+
       const txMetadata = {
         context,
         balance,
@@ -821,18 +826,6 @@
         model: usage.model ?? model ?? this.model ?? this.options.agent.model_parameters.model,
       };
 
-      if (i > 0) {
-        // Count new tokens generated (input_tokens minus previous accumulated tokens)
-        output_tokens +=
-          (Number(usage.input_tokens) || 0) + cache_creation + cache_read - previousTokens;
-      }
-
-      // Add this message's output tokens
-      output_tokens += Number(usage.output_tokens) || 0;
-
-      // Update previousTokens to include this message's output
-      previousTokens += Number(usage.output_tokens) || 0;
-
       if (cache_creation > 0 || cache_read > 0) {
         spendStructuredTokens(txMetadata, {
           promptTokens: {
@@ -862,7 +855,7 @@
 
     this.usage = {
       input_tokens,
-      output_tokens,
+      output_tokens: total_output_tokens,
     };
   }
 
diff --git a/api/server/controllers/agents/recordCollectedUsage.spec.js b/api/server/controllers/agents/recordCollectedUsage.spec.js
new file mode 100644
index 0000000000..6904f2ed39
--- /dev/null
+++ b/api/server/controllers/agents/recordCollectedUsage.spec.js
@@ -0,0 +1,712 @@
+/**
+ * Tests for AgentClient.recordCollectedUsage
+ *
+ * This is a critical function that handles token spending for agent LLM calls.
+ * It must correctly handle:
+ * - Sequential execution (single agent with tool calls)
+ * - Parallel execution (multiple agents with independent inputs)
+ * - Cache token handling (OpenAI and Anthropic formats)
+ */
+
+const { EModelEndpoint } = require('librechat-data-provider');
+
+// Mock dependencies before requiring the module
+const mockSpendTokens = jest.fn().mockResolvedValue();
+const mockSpendStructuredTokens = jest.fn().mockResolvedValue();
+
+jest.mock('~/models/spendTokens', () => ({
+  spendTokens: (...args) => mockSpendTokens(...args),
+  spendStructuredTokens: (...args) => mockSpendStructuredTokens(...args),
+}));
+
+jest.mock('~/config', () => ({
+  logger: {
+    debug: jest.fn(),
+    error: jest.fn(),
+    warn: jest.fn(),
+    info: jest.fn(),
+  },
+  getMCPManager: jest.fn(() => ({
+    formatInstructionsForContext: jest.fn(),
+  })),
+}));
+
+jest.mock('@librechat/agents', () => ({
+  ...jest.requireActual('@librechat/agents'),
+  createMetadataAggregator: () => ({
+    handleLLMEnd: jest.fn(),
+    collected: [],
+  }),
+}));
+
+const AgentClient = require('./client');
+
+describe('AgentClient - recordCollectedUsage', () => {
+  let client;
+  let mockAgent;
+  let mockOptions;
+
+  beforeEach(() => {
+    jest.clearAllMocks();
+
+    mockAgent = {
+      id: 'agent-123',
+      endpoint: EModelEndpoint.openAI,
+      provider: EModelEndpoint.openAI,
+      model_parameters: {
+        model: 'gpt-4',
+      },
+    };
+
+    mockOptions = {
+      req: {
+        user: { id: 'user-123' },
+        body: { model: 'gpt-4', endpoint: EModelEndpoint.openAI },
+      },
+      res: {},
+      agent: mockAgent,
+      endpointTokenConfig: {},
+    };
+
+    client = new AgentClient(mockOptions);
+    client.conversationId = 'convo-123';
+    client.user = 'user-123';
+  });
+
+  describe('basic functionality', () => {
+    it('should return early if collectedUsage is empty', async () => {
+      await client.recordCollectedUsage({
+        collectedUsage: [],
+        balance: { enabled: true },
+        transactions: { enabled: true },
+      });
+
+      expect(mockSpendTokens).not.toHaveBeenCalled();
+      expect(mockSpendStructuredTokens).not.toHaveBeenCalled();
expect(client.usage).toBeUndefined(); + }); + + it('should return early if collectedUsage is null', async () => { + await client.recordCollectedUsage({ + collectedUsage: null, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).not.toHaveBeenCalled(); + expect(client.usage).toBeUndefined(); + }); + + it('should handle single usage entry correctly', async () => { + const collectedUsage = [{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' }]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledTimes(1); + expect(mockSpendTokens).toHaveBeenCalledWith( + expect.objectContaining({ + conversationId: 'convo-123', + user: 'user-123', + model: 'gpt-4', + }), + { promptTokens: 100, completionTokens: 50 }, + ); + expect(client.usage.input_tokens).toBe(100); + expect(client.usage.output_tokens).toBe(50); + }); + + it('should skip null entries in collectedUsage', async () => { + const collectedUsage = [ + { input_tokens: 100, output_tokens: 50, model: 'gpt-4' }, + null, + { input_tokens: 200, output_tokens: 60, model: 'gpt-4' }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledTimes(2); + }); + }); + + describe('sequential execution (single agent with tool calls)', () => { + it('should calculate tokens correctly for sequential tool calls', async () => { + // Sequential flow: output of call N becomes part of input for call N+1 + // Call 1: input=100, output=50 + // Call 2: input=150 (100+50), output=30 + // Call 3: input=180 (150+30), output=20 + const collectedUsage = [ + { input_tokens: 100, output_tokens: 50, model: 'gpt-4' }, + { input_tokens: 150, output_tokens: 30, model: 'gpt-4' }, + { input_tokens: 180, output_tokens: 20, model: 'gpt-4' }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledTimes(3); + // Total output should be sum of all output_tokens: 50 + 30 + 20 = 100 + expect(client.usage.output_tokens).toBe(100); + expect(client.usage.input_tokens).toBe(100); // First entry's input + }); + }); + + describe('parallel execution (multiple agents)', () => { + it('should handle parallel agents with independent input tokens', async () => { + // Parallel agents have INDEPENDENT input tokens (not cumulative) + // Agent A: input=100, output=50 + // Agent B: input=80, output=40 (different context, not 100+50) + const collectedUsage = [ + { input_tokens: 100, output_tokens: 50, model: 'gpt-4' }, + { input_tokens: 80, output_tokens: 40, model: 'gpt-4' }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledTimes(2); + // Expected total output: 50 + 40 = 90 + // output_tokens must be positive and should reflect total output + expect(client.usage.output_tokens).toBeGreaterThan(0); + }); + + it('should NOT produce negative output_tokens for parallel execution', async () => { + // Critical bug scenario: parallel agents where second agent has LOWER input tokens + const collectedUsage = [ + { input_tokens: 200, output_tokens: 100, model: 'gpt-4' }, + { input_tokens: 50, output_tokens: 30, model: 'gpt-4' }, + ]; + + await client.recordCollectedUsage({ + 
collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + // output_tokens MUST be positive for proper token tracking + expect(client.usage.output_tokens).toBeGreaterThan(0); + // Correct value should be 100 + 30 = 130 + }); + + it('should calculate correct total output for parallel agents', async () => { + // Three parallel agents with independent contexts + const collectedUsage = [ + { input_tokens: 100, output_tokens: 50, model: 'gpt-4' }, + { input_tokens: 120, output_tokens: 60, model: 'gpt-4-turbo' }, + { input_tokens: 80, output_tokens: 40, model: 'claude-3' }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledTimes(3); + // Total output should be 50 + 60 + 40 = 150 + expect(client.usage.output_tokens).toBe(150); + }); + + it('should handle worst-case parallel scenario without negative tokens', async () => { + // Extreme case: first agent has very high input, subsequent have low + const collectedUsage = [ + { input_tokens: 1000, output_tokens: 500, model: 'gpt-4' }, + { input_tokens: 100, output_tokens: 50, model: 'gpt-4' }, + { input_tokens: 50, output_tokens: 25, model: 'gpt-4' }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + // Must be positive, should be 500 + 50 + 25 = 575 + expect(client.usage.output_tokens).toBeGreaterThan(0); + expect(client.usage.output_tokens).toBe(575); + }); + }); + + describe('real-world scenarios', () => { + it('should correctly sum output tokens for sequential tool calls with growing context', async () => { + // Real production data: Claude Opus with multiple tool calls + // Context grows as tool results are added, but output_tokens should only count model generations + const collectedUsage = [ + { + input_tokens: 31596, + output_tokens: 151, + total_tokens: 31747, + input_token_details: { cache_read: 0, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 35368, + output_tokens: 150, + total_tokens: 35518, + input_token_details: { cache_read: 0, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 58362, + output_tokens: 295, + total_tokens: 58657, + input_token_details: { cache_read: 0, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 112604, + output_tokens: 193, + total_tokens: 112797, + input_token_details: { cache_read: 0, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 257440, + output_tokens: 2217, + total_tokens: 259657, + input_token_details: { cache_read: 0, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + // input_tokens should be first entry's input (initial context) + expect(client.usage.input_tokens).toBe(31596); + + // output_tokens should be sum of all model outputs: 151 + 150 + 295 + 193 + 2217 = 3006 + // NOT the inflated value from incremental calculation (338,559) + expect(client.usage.output_tokens).toBe(3006); + + // Verify spendTokens was called for each entry with correct values + expect(mockSpendTokens).toHaveBeenCalledTimes(5); + expect(mockSpendTokens).toHaveBeenNthCalledWith( + 1, + expect.objectContaining({ model: 'claude-opus-4-5-20251101' }), + { promptTokens: 31596, completionTokens: 151 }, + ); 
+ expect(mockSpendTokens).toHaveBeenNthCalledWith( + 5, + expect.objectContaining({ model: 'claude-opus-4-5-20251101' }), + { promptTokens: 257440, completionTokens: 2217 }, + ); + }); + + it('should handle single followup message correctly', async () => { + // Real production data: followup to the above conversation + const collectedUsage = [ + { + input_tokens: 263406, + output_tokens: 257, + total_tokens: 263663, + input_token_details: { cache_read: 0, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(client.usage.input_tokens).toBe(263406); + expect(client.usage.output_tokens).toBe(257); + + expect(mockSpendTokens).toHaveBeenCalledTimes(1); + expect(mockSpendTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'claude-opus-4-5-20251101' }), + { promptTokens: 263406, completionTokens: 257 }, + ); + }); + + it('should ensure output_tokens > 0 check passes for BaseClient.sendMessage', async () => { + // This verifies the fix for the duplicate token spending bug + // BaseClient.sendMessage checks: if (usage != null && Number(usage[this.outputTokensKey]) > 0) + const collectedUsage = [ + { + input_tokens: 31596, + output_tokens: 151, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 35368, + output_tokens: 150, + model: 'claude-opus-4-5-20251101', + }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + const usage = client.getStreamUsage(); + + // The check that was failing before the fix + expect(usage).not.toBeNull(); + expect(Number(usage.output_tokens)).toBeGreaterThan(0); + + // Verify correct value + expect(usage.output_tokens).toBe(301); // 151 + 150 + }); + + it('should correctly handle cache tokens with multiple tool calls', async () => { + // Real production data: Claude Opus with cache tokens (prompt caching) + // First entry has cache_creation, subsequent entries have cache_read + const collectedUsage = [ + { + input_tokens: 788, + output_tokens: 163, + total_tokens: 951, + input_token_details: { cache_read: 0, cache_creation: 30808 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 3802, + output_tokens: 149, + total_tokens: 3951, + input_token_details: { cache_read: 30808, cache_creation: 768 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 26808, + output_tokens: 225, + total_tokens: 27033, + input_token_details: { cache_read: 31576, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 80912, + output_tokens: 204, + total_tokens: 81116, + input_token_details: { cache_read: 31576, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 136454, + output_tokens: 206, + total_tokens: 136660, + input_token_details: { cache_read: 31576, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 146316, + output_tokens: 224, + total_tokens: 146540, + input_token_details: { cache_read: 31576, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 150402, + output_tokens: 1248, + total_tokens: 151650, + input_token_details: { cache_read: 31576, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + { + input_tokens: 156268, + output_tokens: 139, + total_tokens: 156407, + input_token_details: { cache_read: 31576, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, 
+ { + input_tokens: 167126, + output_tokens: 2961, + total_tokens: 170087, + input_token_details: { cache_read: 31576, cache_creation: 0 }, + model: 'claude-opus-4-5-20251101', + }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + // input_tokens = first entry's input + cache_creation + cache_read + // = 788 + 30808 + 0 = 31596 + expect(client.usage.input_tokens).toBe(31596); + + // output_tokens = sum of all output_tokens + // = 163 + 149 + 225 + 204 + 206 + 224 + 1248 + 139 + 2961 = 5519 + expect(client.usage.output_tokens).toBe(5519); + + // First 2 entries have cache tokens, should use spendStructuredTokens + // Remaining 7 entries have cache_read but no cache_creation, still structured + expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(9); + expect(mockSpendTokens).toHaveBeenCalledTimes(0); + + // Verify first entry uses structured tokens with cache_creation + expect(mockSpendStructuredTokens).toHaveBeenNthCalledWith( + 1, + expect.objectContaining({ model: 'claude-opus-4-5-20251101' }), + { + promptTokens: { input: 788, write: 30808, read: 0 }, + completionTokens: 163, + }, + ); + + // Verify second entry uses structured tokens with both cache_creation and cache_read + expect(mockSpendStructuredTokens).toHaveBeenNthCalledWith( + 2, + expect.objectContaining({ model: 'claude-opus-4-5-20251101' }), + { + promptTokens: { input: 3802, write: 768, read: 30808 }, + completionTokens: 149, + }, + ); + }); + }); + + describe('cache token handling', () => { + it('should handle OpenAI format cache tokens (input_token_details)', async () => { + const collectedUsage = [ + { + input_tokens: 100, + output_tokens: 50, + model: 'gpt-4', + input_token_details: { + cache_creation: 20, + cache_read: 10, + }, + }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1); + expect(mockSpendStructuredTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'gpt-4' }), + { + promptTokens: { + input: 100, + write: 20, + read: 10, + }, + completionTokens: 50, + }, + ); + }); + + it('should handle Anthropic format cache tokens (cache_*_input_tokens)', async () => { + const collectedUsage = [ + { + input_tokens: 100, + output_tokens: 50, + model: 'claude-3', + cache_creation_input_tokens: 25, + cache_read_input_tokens: 15, + }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1); + expect(mockSpendStructuredTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'claude-3' }), + { + promptTokens: { + input: 100, + write: 25, + read: 15, + }, + completionTokens: 50, + }, + ); + }); + + it('should use spendTokens for entries without cache tokens', async () => { + const collectedUsage = [{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' }]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledTimes(1); + expect(mockSpendStructuredTokens).not.toHaveBeenCalled(); + }); + + it('should handle mixed cache and non-cache entries', async () => { + const collectedUsage = [ + { input_tokens: 100, output_tokens: 50, model: 'gpt-4' }, + { + input_tokens: 150, + output_tokens: 30, + model: 'gpt-4', + 
input_token_details: { cache_creation: 10, cache_read: 5 }, + }, + { input_tokens: 200, output_tokens: 20, model: 'gpt-4' }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledTimes(2); + expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1); + }); + + it('should include cache tokens in total input calculation', async () => { + const collectedUsage = [ + { + input_tokens: 100, + output_tokens: 50, + model: 'gpt-4', + input_token_details: { + cache_creation: 20, + cache_read: 10, + }, + }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + // Total input should include cache tokens: 100 + 20 + 10 = 130 + expect(client.usage.input_tokens).toBe(130); + }); + }); + + describe('model fallback', () => { + it('should use usage.model when available', async () => { + const collectedUsage = [{ input_tokens: 100, output_tokens: 50, model: 'gpt-4-turbo' }]; + + await client.recordCollectedUsage({ + model: 'fallback-model', + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'gpt-4-turbo' }), + expect.any(Object), + ); + }); + + it('should fallback to param model when usage.model is missing', async () => { + const collectedUsage = [{ input_tokens: 100, output_tokens: 50 }]; + + await client.recordCollectedUsage({ + model: 'param-model', + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'param-model' }), + expect.any(Object), + ); + }); + + it('should fallback to client.model when param model is missing', async () => { + client.model = 'client-model'; + const collectedUsage = [{ input_tokens: 100, output_tokens: 50 }]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'client-model' }), + expect.any(Object), + ); + }); + + it('should fallback to agent model_parameters.model as last resort', async () => { + const collectedUsage = [{ input_tokens: 100, output_tokens: 50 }]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + expect(mockSpendTokens).toHaveBeenCalledWith( + expect.objectContaining({ model: 'gpt-4' }), + expect.any(Object), + ); + }); + }); + + describe('getStreamUsage integration', () => { + it('should return the usage object set by recordCollectedUsage', async () => { + const collectedUsage = [{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' }]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + const usage = client.getStreamUsage(); + expect(usage).toEqual({ + input_tokens: 100, + output_tokens: 50, + }); + }); + + it('should return undefined before recordCollectedUsage is called', () => { + const usage = client.getStreamUsage(); + expect(usage).toBeUndefined(); + }); + + it('should have output_tokens > 0 for BaseClient.sendMessage check', async () => { + // This test verifies the usage will pass the check in BaseClient.sendMessage: + // if (usage != null && Number(usage[this.outputTokensKey]) > 0) + const 
collectedUsage = [ + { input_tokens: 200, output_tokens: 100, model: 'gpt-4' }, + { input_tokens: 50, output_tokens: 30, model: 'gpt-4' }, + ]; + + await client.recordCollectedUsage({ + collectedUsage, + balance: { enabled: true }, + transactions: { enabled: true }, + }); + + const usage = client.getStreamUsage(); + expect(usage).not.toBeNull(); + expect(Number(usage.output_tokens)).toBeGreaterThan(0); + }); + }); +}); diff --git a/package-lock.json b/package-lock.json index 3337696267..6456b21325 100644 --- a/package-lock.json +++ b/package-lock.json @@ -60,7 +60,7 @@ "@googleapis/youtube": "^20.0.0", "@keyv/redis": "^4.3.3", "@langchain/core": "^0.3.80", - "@librechat/agents": "^3.0.66", + "@librechat/agents": "^3.0.77", "@librechat/api": "*", "@librechat/data-schemas": "*", "@microsoft/microsoft-graph-client": "^3.0.7", @@ -12660,9 +12660,9 @@ } }, "node_modules/@librechat/agents": { - "version": "3.0.66", - "resolved": "https://registry.npmjs.org/@librechat/agents/-/agents-3.0.66.tgz", - "integrity": "sha512-JpQo7w+/yLM3dJ46lyGrm4gPTjiHERwcpojw7drvpYWqOU4e2jmjK0JbNxQ0jP00q+nDhPG+mqJ2qQU7TVraOQ==", + "version": "3.0.77", + "resolved": "https://registry.npmjs.org/@librechat/agents/-/agents-3.0.77.tgz", + "integrity": "sha512-Wr9d8bjJAQSl03nEgnAPG6jBQT1fL3sNV3TFDN1FvFQt6WGfdok838Cbcn+/tSGXSPJcICTxNkMT7VN8P6bCPw==", "license": "MIT", "dependencies": { "@langchain/anthropic": "^0.3.26", @@ -12686,6 +12686,7 @@ "https-proxy-agent": "^7.0.6", "mathjs": "^15.1.0", "nanoid": "^3.3.7", + "okapibm25": "^1.4.1", "openai": "5.8.2" }, "engines": { @@ -34310,6 +34311,12 @@ "url": "https://github.com/sponsors/ljharb" } }, + "node_modules/okapibm25": { + "version": "1.4.1", + "resolved": "https://registry.npmjs.org/okapibm25/-/okapibm25-1.4.1.tgz", + "integrity": "sha512-UHmeH4MAtZXGFVncwbY7pfFvDVNxpsyM3W66aGPU0SHj1+ld59ty+9lJ0ifcrcnPUl1XdYoDgb06ObyCnpTs3g==", + "license": "MIT" + }, "node_modules/ollama": { "version": "0.5.18", "resolved": "https://registry.npmjs.org/ollama/-/ollama-0.5.18.tgz", @@ -43169,7 +43176,7 @@ "@google/genai": "^1.19.0", "@keyv/redis": "^4.3.3", "@langchain/core": "^0.3.80", - "@librechat/agents": "^3.0.66", + "@librechat/agents": "^3.0.77", "@librechat/data-schemas": "*", "@modelcontextprotocol/sdk": "^1.25.2", "@smithy/node-http-handler": "^4.4.5", diff --git a/packages/api/package.json b/packages/api/package.json index 8538264ceb..5f5576e293 100644 --- a/packages/api/package.json +++ b/packages/api/package.json @@ -88,7 +88,7 @@ "@google/genai": "^1.19.0", "@keyv/redis": "^4.3.3", "@langchain/core": "^0.3.80", - "@librechat/agents": "^3.0.66", + "@librechat/agents": "^3.0.77", "@librechat/data-schemas": "*", "@modelcontextprotocol/sdk": "^1.25.2", "@smithy/node-http-handler": "^4.4.5", diff --git a/packages/api/src/endpoints/anthropic/llm.spec.ts b/packages/api/src/endpoints/anthropic/llm.spec.ts index 0e457b60c2..c15d5445ed 100644 --- a/packages/api/src/endpoints/anthropic/llm.spec.ts +++ b/packages/api/src/endpoints/anthropic/llm.spec.ts @@ -87,7 +87,7 @@ describe('getLLMConfig', () => { expect(result.llmConfig.thinking).toHaveProperty('budget_tokens', 2000); }); - it('should add "context-1m" beta header for claude-sonnet-4 model', () => { + it('should add "context-1m" beta header and promptCache boolean for claude-sonnet-4 model', () => { const modelOptions = { model: 'claude-sonnet-4-20250514', promptCache: true, @@ -98,9 +98,10 @@ describe('getLLMConfig', () => { expect(clientOptions?.defaultHeaders).toHaveProperty('anthropic-beta'); const defaultHeaders = 
clientOptions?.defaultHeaders as Record; expect(defaultHeaders['anthropic-beta']).toBe('context-1m-2025-08-07'); + expect(result.llmConfig.promptCache).toBe(true); }); - it('should add "context-1m" beta header for claude-sonnet-4 model formats', () => { + it('should add "context-1m" beta header and promptCache boolean for claude-sonnet-4 model formats', () => { const modelVariations = [ 'claude-sonnet-4-20250514', 'claude-sonnet-4-latest', @@ -115,10 +116,11 @@ describe('getLLMConfig', () => { expect(clientOptions?.defaultHeaders).toHaveProperty('anthropic-beta'); const defaultHeaders = clientOptions?.defaultHeaders as Record; expect(defaultHeaders['anthropic-beta']).toBe('context-1m-2025-08-07'); + expect(result.llmConfig.promptCache).toBe(true); }); }); - it('should not add beta headers for claude-opus-4-5 model (prompt caching no longer needs header)', () => { + it('should pass promptCache boolean for claude-opus-4-5 model (no beta header needed)', () => { const modelOptions = { model: 'claude-opus-4-5', promptCache: true, @@ -126,9 +128,10 @@ describe('getLLMConfig', () => { const result = getLLMConfig('test-key', { modelOptions }); const clientOptions = result.llmConfig.clientOptions; expect(clientOptions?.defaultHeaders).toBeUndefined(); + expect(result.llmConfig.promptCache).toBe(true); }); - it('should not add beta headers for claude-opus-4-5 model formats (prompt caching no longer needs header)', () => { + it('should pass promptCache boolean for claude-opus-4-5 model formats (no beta header needed)', () => { const modelVariations = [ 'claude-opus-4-5', 'claude-opus-4-5-20250420', @@ -141,6 +144,7 @@ describe('getLLMConfig', () => { const result = getLLMConfig('test-key', { modelOptions }); const clientOptions = result.llmConfig.clientOptions; expect(clientOptions?.defaultHeaders).toBeUndefined(); + expect(result.llmConfig.promptCache).toBe(true); }); }); @@ -299,10 +303,11 @@ describe('getLLMConfig', () => { }, }); - // claude-3-5-sonnet supports prompt caching and should get the max-tokens header + // claude-3-5-sonnet supports prompt caching and should get the max-tokens header and promptCache boolean expect(result.llmConfig.clientOptions?.defaultHeaders).toEqual({ 'anthropic-beta': 'max-tokens-3-5-sonnet-2024-07-15', }); + expect(result.llmConfig.promptCache).toBe(true); }); it('should handle thinking and thinkingBudget options', () => { @@ -512,6 +517,8 @@ describe('getLLMConfig', () => { expect(result.llmConfig.clientOptions?.defaultHeaders).toEqual({ 'anthropic-beta': 'token-efficient-tools-2025-02-19,output-128k-2025-02-19', }); + // Should pass promptCache boolean + expect(result.llmConfig.promptCache).toBe(true); }); it('should handle web search functionality like production', () => { @@ -1160,21 +1167,66 @@ describe('getLLMConfig', () => { it('should handle prompt cache support logic for different models', () => { const testCases = [ // Models that support prompt cache (and have other beta headers) - { model: 'claude-3-5-sonnet', promptCache: true, shouldHaveHeaders: true }, - { model: 'claude-3.5-sonnet-20241022', promptCache: true, shouldHaveHeaders: true }, - { model: 'claude-3-7-sonnet', promptCache: true, shouldHaveHeaders: true }, - { model: 'claude-3.7-sonnet-20250109', promptCache: true, shouldHaveHeaders: true }, - { model: 'claude-sonnet-4-20250514', promptCache: true, shouldHaveHeaders: true }, + { + model: 'claude-3-5-sonnet', + promptCache: true, + shouldHaveHeaders: true, + shouldHavePromptCache: true, + }, + { + model: 'claude-3.5-sonnet-20241022', + 
+        promptCache: true,
+        shouldHaveHeaders: true,
+        shouldHavePromptCache: true,
+      },
+      {
+        model: 'claude-3-7-sonnet',
+        promptCache: true,
+        shouldHaveHeaders: true,
+        shouldHavePromptCache: true,
+      },
+      {
+        model: 'claude-3.7-sonnet-20250109',
+        promptCache: true,
+        shouldHaveHeaders: true,
+        shouldHavePromptCache: true,
+      },
+      {
+        model: 'claude-sonnet-4-20250514',
+        promptCache: true,
+        shouldHaveHeaders: true,
+        shouldHavePromptCache: true,
+      },
       // Models that support prompt cache but have no additional beta headers needed
-      { model: 'claude-3-opus', promptCache: true, shouldHaveHeaders: false },
+      {
+        model: 'claude-3-opus',
+        promptCache: true,
+        shouldHaveHeaders: false,
+        shouldHavePromptCache: true,
+      },
       // Models that don't support prompt cache
-      { model: 'claude-3-5-sonnet-latest', promptCache: true, shouldHaveHeaders: false },
-      { model: 'claude-3.5-sonnet-latest', promptCache: true, shouldHaveHeaders: false },
+      {
+        model: 'claude-3-5-sonnet-latest',
+        promptCache: true,
+        shouldHaveHeaders: false,
+        shouldHavePromptCache: false,
+      },
+      {
+        model: 'claude-3.5-sonnet-latest',
+        promptCache: true,
+        shouldHaveHeaders: false,
+        shouldHavePromptCache: false,
+      },
       // Prompt cache disabled
-      { model: 'claude-3-5-sonnet', promptCache: false, shouldHaveHeaders: false },
+      {
+        model: 'claude-3-5-sonnet',
+        promptCache: false,
+        shouldHaveHeaders: false,
+        shouldHavePromptCache: false,
+      },
     ];
 
-    testCases.forEach(({ model, promptCache, shouldHaveHeaders }) => {
+    testCases.forEach(({ model, promptCache, shouldHaveHeaders, shouldHavePromptCache }) => {
       const result = getLLMConfig('test-key', {
         modelOptions: { model, promptCache },
       });
@@ -1187,6 +1239,12 @@ describe('getLLMConfig', () => {
       } else {
         expect(headers).toBeUndefined();
       }
+
+      if (shouldHavePromptCache) {
+        expect(result.llmConfig.promptCache).toBe(true);
+      } else {
+        expect(result.llmConfig.promptCache).toBeUndefined();
+      }
     });
   });
 });
diff --git a/packages/api/src/endpoints/anthropic/llm.ts b/packages/api/src/endpoints/anthropic/llm.ts
index 408ad2a77c..34ec354365 100644
--- a/packages/api/src/endpoints/anthropic/llm.ts
+++ b/packages/api/src/endpoints/anthropic/llm.ts
@@ -155,6 +155,12 @@ function getLLMConfig(
   const supportsCacheControl =
     systemOptions.promptCache === true &&
     checkPromptCacheSupport(requestOptions.model ?? '');
+
+  /** Pass promptCache boolean for downstream cache_control application */
+  if (supportsCacheControl) {
+    (requestOptions as Record<string, unknown>).promptCache = true;
+  }
+
   const headers = getClaudeHeaders(requestOptions.model ??
'', supportsCacheControl); if (headers && requestOptions.clientOptions) { requestOptions.clientOptions.defaultHeaders = headers; diff --git a/packages/api/src/endpoints/openai/config.anthropic.spec.ts b/packages/api/src/endpoints/openai/config.anthropic.spec.ts index eeb17a311d..7109341e8c 100644 --- a/packages/api/src/endpoints/openai/config.anthropic.spec.ts +++ b/packages/api/src/endpoints/openai/config.anthropic.spec.ts @@ -39,6 +39,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { type: 'enabled', budget_tokens: 2000, }, + promptCache: true, }, }, configOptions: { @@ -87,6 +88,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { type: 'enabled', budget_tokens: 3000, }, + promptCache: true, }, }, configOptions: { @@ -134,6 +136,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { user_id: 'user123', }, topK: 50, + promptCache: true, }, }, configOptions: { @@ -175,6 +178,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { metadata: { user_id: 'user456', }, + promptCache: true, }, }, configOptions: { @@ -187,7 +191,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { }); }); - it('should apply custom headers without anthropic-beta for models that dont need it', () => { + it('should apply custom headers and promptCache for models that support caching', () => { const apiKey = 'sk-custom'; const endpoint = 'Anthropic (via LiteLLM)'; const options = { @@ -218,6 +222,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { metadata: { user_id: undefined, }, + promptCache: true, }, }, configOptions: { @@ -300,6 +305,9 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { stream: true, topP: 0.9, maxTokens: 2048, + modelKwargs: { + promptCache: true, + }, // temperature is dropped // modelKwargs.topK is dropped // modelKwargs.metadata is dropped completely @@ -379,6 +387,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { metadata: { user_id: 'searchUser', }, + promptCache: true, }, }, configOptions: { @@ -425,6 +434,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { user_id: 'testUser', }, topK: 40, + promptCache: true, }, }, configOptions: { @@ -470,6 +480,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { metadata: { user_id: 'addUser', }, + promptCache: true, customParam1: 'value1', // Unknown params added to modelKwargs customParam2: 42, }, @@ -519,6 +530,7 @@ describe('getOpenAIConfig - Anthropic Compatibility', () => { metadata: { user_id: 'bothUser', }, + promptCache: true, customParam: 'customValue', // topK is dropped },