💰 fix: Multi-Agent Token Spending & Prevent Double-Spend (#11433)

* fix: Token Spending Logic for Multi-Agents on Abort Scenarios
  * Implemented logic to skip token spending if a conversation is aborted, preventing double-spending.
  * Introduced `spendCollectedUsage` function to handle token spending for multiple models during aborts, ensuring accurate accounting for parallel agents.
  * Updated `GenerationJobManager` to store and retrieve collected usage data for improved abort handling.
  * Added comprehensive tests for the new functionality, covering various scenarios including cache token handling and parallel agent usage.
* fix: Memory Context Handling for Multi-Agents
  * Refactored `buildMessages` method to pass memory context to parallel agents, ensuring they share the same user context.
  * Improved handling of memory context when no existing instructions are present for parallel agents.
  * Added comprehensive tests to verify memory context propagation and behavior under various scenarios, including cases with no memory available and empty agent configurations.
  * Enhanced logging for better traceability of memory context additions to agents.
* chore: Memory Context Documentation for Parallel Agents
  * Updated documentation in the `AgentClient` class to clarify the in-place mutation of agentConfig objects when passing memory context to parallel agents.
  * Added notes on the implications of mutating objects directly to ensure all parallel agents receive the correct memory context before execution.
* chore: UsageMetadata Interface docs for Token Spending
  * Expanded the UsageMetadata interface to support both OpenAI and Anthropic cache token formats.
  * Added detailed documentation for cache token properties, including mutually exclusive fields for different model types.
  * Improved clarity on how to access cache token details for accurate token spending tracking.
* fix: Enhance Token Spending Logic in Abort Middleware
  * Refactored `spendCollectedUsage` function to utilize Promise.all for concurrent token spending, improving performance and ensuring all operations complete before clearing the collectedUsage array.
  * Added documentation to clarify the importance of clearing the collectedUsage array to prevent double-spending in abort scenarios.
  * Updated tests to verify the correct behavior of the spending logic and the clearing of the array after spending operations.
2026-03-09 09:32:36 +01:00 · 2026-01-20 14:43:19 -05:00 · 2026-01-20 14:43:19 -05:00 · 36c5a88c4e
commit 36c5a88c4e
parent 32e6f3b8e5
11 changed files with 1440 additions and 28 deletions
--- a/api/server/controllers/agents/client.js
+++ b/api/server/controllers/agents/client.js
@ -522,14 +522,36 @@ class AgentClient extends BaseClient {
    }

    const withoutKeys = await this.useMemory();
-    if (withoutKeys) {
-      systemContent += `${memoryInstructions}\n\n# Existing memory about the user:\n${withoutKeys}`;
+    const memoryContext = withoutKeys
+      ? `${memoryInstructions}\n\n# Existing memory about the user:\n${withoutKeys}`
+      : '';
+    if (memoryContext) {
+      systemContent += memoryContext;
    }

    if (systemContent) {
      this.options.agent.instructions = systemContent;
    }

+    /**
+     * Pass memory context to parallel agents (addedConvo) so they have the same user context.
+     *
+     * NOTE: This intentionally mutates the agentConfig objects in place. The agentConfigs Map
+     * holds references to config objects that will be passed to the graph runtime. Mutating
+     * them here ensures all parallel agents receive the memory context before execution starts.
+     * Creating new objects would not work because the Map references would still point to the old objects.
+     */
+    if (memoryContext && this.agentConfigs?.size > 0) {
+      for (const [agentId, agentConfig] of this.agentConfigs.entries()) {
+        if (agentConfig.instructions) {
+          agentConfig.instructions = agentConfig.instructions + '\n\n' + memoryContext;
+        } else {
+          agentConfig.instructions = memoryContext;
+        }
+        logger.debug(`[AgentClient] Added memory context to parallel agent: ${agentId}`);
+      }
+    }
+
    return result;
  }

@ -1084,11 +1106,20 @@ class AgentClient extends BaseClient {
          this.artifactPromises.push(...attachments);
        }

-        await this.recordCollectedUsage({
-          context: 'message',
-          balance: balanceConfig,
-          transactions: transactionsConfig,
-        });
+        /** Skip token spending if aborted - the abort handler (abortMiddleware.js) handles it
+        This prevents double-spending when user aborts via `/api/agents/chat/abort` */
+        const wasAborted = abortController?.signal?.aborted;
+        if (!wasAborted) {
+          await this.recordCollectedUsage({
+            context: 'message',
+            balance: balanceConfig,
+            transactions: transactionsConfig,
+          });
+        } else {
+          logger.debug(
+            '[api/server/controllers/agents/client.js #chatCompletion] Skipping token spending - handled by abort middleware',
+          );
+        }
      } catch (err) {
        logger.error(
          '[api/server/controllers/agents/client.js #chatCompletion] Error in cleanup phase',
--- a/api/server/controllers/agents/client.test.js
+++ b/api/server/controllers/agents/client.test.js
@ -1849,4 +1849,224 @@ describe('AgentClient - titleConvo', () => {
      });
    });
  });
+
+  describe('buildMessages - memory context for parallel agents', () => {
+    let client;
+    let mockReq;
+    let mockRes;
+    let mockAgent;
+    let mockOptions;
+
+    beforeEach(() => {
+      jest.clearAllMocks();
+
+      mockAgent = {
+        id: 'primary-agent',
+        name: 'Primary Agent',
+        endpoint: EModelEndpoint.openAI,
+        provider: EModelEndpoint.openAI,
+        instructions: 'Primary agent instructions',
+        model_parameters: {
+          model: 'gpt-4',
+        },
+        tools: [],
+      };
+
+      mockReq = {
+        user: {
+          id: 'user-123',
+          personalization: {
+            memories: true,
+          },
+        },
+        body: {
+          endpoint: EModelEndpoint.openAI,
+        },
+        config: {
+          memory: {
+            disabled: false,
+          },
+        },
+      };
+
+      mockRes = {};
+
+      mockOptions = {
+        req: mockReq,
+        res: mockRes,
+        agent: mockAgent,
+        endpoint: EModelEndpoint.agents,
+      };
+
+      client = new AgentClient(mockOptions);
+      client.conversationId = 'convo-123';
+      client.responseMessageId = 'response-123';
+      client.shouldSummarize = false;
+      client.maxContextTokens = 4096;
+    });
+
+    it('should pass memory context to parallel agents (addedConvo)', async () => {
+      const memoryContent = 'User prefers dark mode. User is a software developer.';
+      client.useMemory = jest.fn().mockResolvedValue(memoryContent);
+
+      const parallelAgent1 = {
+        id: 'parallel-agent-1',
+        name: 'Parallel Agent 1',
+        instructions: 'Parallel agent 1 instructions',
+        provider: EModelEndpoint.openAI,
+      };
+
+      const parallelAgent2 = {
+        id: 'parallel-agent-2',
+        name: 'Parallel Agent 2',
+        instructions: 'Parallel agent 2 instructions',
+        provider: EModelEndpoint.anthropic,
+      };
+
+      client.agentConfigs = new Map([
+        ['parallel-agent-1', parallelAgent1],
+        ['parallel-agent-2', parallelAgent2],
+      ]);
+
+      const messages = [
+        {
+          messageId: 'msg-1',
+          parentMessageId: null,
+          sender: 'User',
+          text: 'Hello',
+          isCreatedByUser: true,
+        },
+      ];
+
+      await client.buildMessages(messages, null, {
+        instructions: 'Base instructions',
+        additional_instructions: null,
+      });
+
+      expect(client.useMemory).toHaveBeenCalled();
+
+      expect(client.options.agent.instructions).toContain('Base instructions');
+      expect(client.options.agent.instructions).toContain(memoryContent);
+
+      expect(parallelAgent1.instructions).toContain('Parallel agent 1 instructions');
+      expect(parallelAgent1.instructions).toContain(memoryContent);
+
+      expect(parallelAgent2.instructions).toContain('Parallel agent 2 instructions');
+      expect(parallelAgent2.instructions).toContain(memoryContent);
+    });
+
+    it('should not modify parallel agents when no memory context is available', async () => {
+      client.useMemory = jest.fn().mockResolvedValue(undefined);
+
+      const parallelAgent = {
+        id: 'parallel-agent-1',
+        name: 'Parallel Agent 1',
+        instructions: 'Original parallel instructions',
+        provider: EModelEndpoint.openAI,
+      };
+
+      client.agentConfigs = new Map([['parallel-agent-1', parallelAgent]]);
+
+      const messages = [
+        {
+          messageId: 'msg-1',
+          parentMessageId: null,
+          sender: 'User',
+          text: 'Hello',
+          isCreatedByUser: true,
+        },
+      ];
+
+      await client.buildMessages(messages, null, {
+        instructions: 'Base instructions',
+        additional_instructions: null,
+      });
+
+      expect(parallelAgent.instructions).toBe('Original parallel instructions');
+    });
+
+    it('should handle parallel agents without existing instructions', async () => {
+      const memoryContent = 'User is a data scientist.';
+      client.useMemory = jest.fn().mockResolvedValue(memoryContent);
+
+      const parallelAgentNoInstructions = {
+        id: 'parallel-agent-no-instructions',
+        name: 'Parallel Agent No Instructions',
+        provider: EModelEndpoint.openAI,
+      };
+
+      client.agentConfigs = new Map([
+        ['parallel-agent-no-instructions', parallelAgentNoInstructions],
+      ]);
+
+      const messages = [
+        {
+          messageId: 'msg-1',
+          parentMessageId: null,
+          sender: 'User',
+          text: 'Hello',
+          isCreatedByUser: true,
+        },
+      ];
+
+      await client.buildMessages(messages, null, {
+        instructions: null,
+        additional_instructions: null,
+      });
+
+      expect(parallelAgentNoInstructions.instructions).toContain(memoryContent);
+    });
+
+    it('should not modify agentConfigs when none exist', async () => {
+      const memoryContent = 'User prefers concise responses.';
+      client.useMemory = jest.fn().mockResolvedValue(memoryContent);
+
+      client.agentConfigs = null;
+
+      const messages = [
+        {
+          messageId: 'msg-1',
+          parentMessageId: null,
+          sender: 'User',
+          text: 'Hello',
+          isCreatedByUser: true,
+        },
+      ];
+
+      await expect(
+        client.buildMessages(messages, null, {
+          instructions: 'Base instructions',
+          additional_instructions: null,
+        }),
+      ).resolves.not.toThrow();
+
+      expect(client.options.agent.instructions).toContain(memoryContent);
+    });
+
+    it('should handle empty agentConfigs map', async () => {
+      const memoryContent = 'User likes detailed explanations.';
+      client.useMemory = jest.fn().mockResolvedValue(memoryContent);
+
+      client.agentConfigs = new Map();
+
+      const messages = [
+        {
+          messageId: 'msg-1',
+          parentMessageId: null,
+          sender: 'User',
+          text: 'Hello',
+          isCreatedByUser: true,
+        },
+      ];
+
+      await expect(
+        client.buildMessages(messages, null, {
+          instructions: 'Base instructions',
+          additional_instructions: null,
+        }),
+      ).resolves.not.toThrow();
+
+      expect(client.options.agent.instructions).toContain(memoryContent);
+    });
+  });
 });
--- a/api/server/middleware/abortMiddleware.js
+++ b/api/server/middleware/abortMiddleware.js
@ -7,13 +7,89 @@ const {
  sanitizeMessageForTransmit,
 } = require('@librechat/api');
 const { isAssistantsEndpoint, ErrorTypes } = require('librechat-data-provider');
+const { spendTokens, spendStructuredTokens } = require('~/models/spendTokens');
 const { truncateText, smartTruncateText } = require('~/app/clients/prompts');
 const clearPendingReq = require('~/cache/clearPendingReq');
 const { sendError } = require('~/server/middleware/error');
-const { spendTokens } = require('~/models/spendTokens');
 const { saveMessage, getConvo } = require('~/models');
 const { abortRun } = require('./abortRun');

+/**
+ * Spend tokens for all models from collected usage.
+ * This handles both sequential and parallel agent execution.
+ *
+ * IMPORTANT: After spending, this function clears the collectedUsage array
+ * to prevent double-spending. The array is shared with AgentClient.collectedUsage,
+ * so clearing it here prevents the finally block from also spending tokens.
+ *
+ * @param {Object} params
+ * @param {string} params.userId - User ID
+ * @param {string} params.conversationId - Conversation ID
+ * @param {Array<Object>} params.collectedUsage - Usage metadata from all models
+ * @param {string} [params.fallbackModel] - Fallback model name if not in usage
+ */
+async function spendCollectedUsage({ userId, conversationId, collectedUsage, fallbackModel }) {
+  if (!collectedUsage || collectedUsage.length === 0) {
+    return;
+  }
+
+  const spendPromises = [];
+
+  for (const usage of collectedUsage) {
+    if (!usage) {
+      continue;
+    }
+
+    // Support both OpenAI format (input_token_details) and Anthropic format (cache_*_input_tokens)
+    const cache_creation =
+      Number(usage.input_token_details?.cache_creation) ||
+      Number(usage.cache_creation_input_tokens) ||
+      0;
+    const cache_read =
+      Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
+
+    const txMetadata = {
+      context: 'abort',
+      conversationId,
+      user: userId,
+      model: usage.model ?? fallbackModel,
+    };
+
+    if (cache_creation > 0 || cache_read > 0) {
+      spendPromises.push(
+        spendStructuredTokens(txMetadata, {
+          promptTokens: {
+            input: usage.input_tokens,
+            write: cache_creation,
+            read: cache_read,
+          },
+          completionTokens: usage.output_tokens,
+        }).catch((err) => {
+          logger.error('[abortMiddleware] Error spending structured tokens for abort', err);
+        }),
+      );
+      continue;
+    }
+
+    spendPromises.push(
+      spendTokens(txMetadata, {
+        promptTokens: usage.input_tokens,
+        completionTokens: usage.output_tokens,
+      }).catch((err) => {
+        logger.error('[abortMiddleware] Error spending tokens for abort', err);
+      }),
+    );
+  }
+
+  // Wait for all token spending to complete
+  await Promise.all(spendPromises);
+
+  // Clear the array to prevent double-spending from the AgentClient finally block.
+  // The collectedUsage array is shared by reference with AgentClient.collectedUsage,
+  // so clearing it here ensures recordCollectedUsage() sees an empty array and returns early.
+  collectedUsage.length = 0;
+}
+
 /**
 * Abort an active message generation.
 * Uses GenerationJobManager for all agent requests.
@ -39,9 +115,8 @@ async function abortMessage(req, res) {
    return;
  }

-  const { jobData, content, text } = abortResult;
+  const { jobData, content, text, collectedUsage } = abortResult;

-  // Count tokens and spend them
  const completionTokens = await countTokens(text);
  const promptTokens = jobData?.promptTokens ?? 0;

@ -62,10 +137,21 @@ async function abortMessage(req, res) {
    tokenCount: completionTokens,
  };

-  await spendTokens(
-    { ...responseMessage, context: 'incomplete', user: userId },
-    { promptTokens, completionTokens },
-  );
+  // Spend tokens for ALL models from collectedUsage (handles parallel agents/addedConvo)
+  if (collectedUsage && collectedUsage.length > 0) {
+    await spendCollectedUsage({
+      userId,
+      conversationId: jobData?.conversationId,
+      collectedUsage,
+      fallbackModel: jobData?.model,
+    });
+  } else {
+    // Fallback: no collected usage, use text-based token counting for primary model only
+    await spendTokens(
+      { ...responseMessage, context: 'incomplete', user: userId },
+      { promptTokens, completionTokens },
+    );
+  }

  await saveMessage(
    req,
--- a/api/server/middleware/abortMiddleware.spec.js
+++ b/api/server/middleware/abortMiddleware.spec.js
@ -0,0 +1,428 @@
+/**
+ * Tests for abortMiddleware - spendCollectedUsage function
+ *
+ * This tests the token spending logic for abort scenarios,
+ * particularly for parallel agents (addedConvo) where multiple
+ * models need their tokens spent.
+ */
+
+const mockSpendTokens = jest.fn().mockResolvedValue();
+const mockSpendStructuredTokens = jest.fn().mockResolvedValue();
+
+jest.mock('~/models/spendTokens', () => ({
+  spendTokens: (...args) => mockSpendTokens(...args),
+  spendStructuredTokens: (...args) => mockSpendStructuredTokens(...args),
+}));
+
+jest.mock('@librechat/data-schemas', () => ({
+  logger: {
+    debug: jest.fn(),
+    error: jest.fn(),
+    warn: jest.fn(),
+    info: jest.fn(),
+  },
+}));
+
+jest.mock('@librechat/api', () => ({
+  countTokens: jest.fn().mockResolvedValue(100),
+  isEnabled: jest.fn().mockReturnValue(false),
+  sendEvent: jest.fn(),
+  GenerationJobManager: {
+    abortJob: jest.fn(),
+  },
+  sanitizeMessageForTransmit: jest.fn((msg) => msg),
+}));
+
+jest.mock('librechat-data-provider', () => ({
+  isAssistantsEndpoint: jest.fn().mockReturnValue(false),
+  ErrorTypes: { INVALID_REQUEST: 'INVALID_REQUEST', NO_SYSTEM_MESSAGES: 'NO_SYSTEM_MESSAGES' },
+}));
+
+jest.mock('~/app/clients/prompts', () => ({
+  truncateText: jest.fn((text) => text),
+  smartTruncateText: jest.fn((text) => text),
+}));
+
+jest.mock('~/cache/clearPendingReq', () => jest.fn().mockResolvedValue());
+
+jest.mock('~/server/middleware/error', () => ({
+  sendError: jest.fn(),
+}));
+
+jest.mock('~/models', () => ({
+  saveMessage: jest.fn().mockResolvedValue(),
+  getConvo: jest.fn().mockResolvedValue({ title: 'Test Chat' }),
+}));
+
+jest.mock('./abortRun', () => ({
+  abortRun: jest.fn(),
+}));
+
+// Import the module after mocks are set up
+// We need to extract the spendCollectedUsage function for testing
+// Since it's not exported, we'll test it through the handleAbort flow
+
+describe('abortMiddleware - spendCollectedUsage', () => {
+  beforeEach(() => {
+    jest.clearAllMocks();
+  });
+
+  describe('spendCollectedUsage logic', () => {
+    // spendCollectedUsage is not exported, so this is a local replica wired to the mocks above.
+    // NOTE(review): these tests exercise the replica, not the middleware - keep the two in sync.
+
+    const spendCollectedUsage = async ({
+      userId,
+      conversationId,
+      collectedUsage,
+      fallbackModel,
+    }) => {
+      if (!collectedUsage || collectedUsage.length === 0) {
+        return;
+      }
+
+      const spendPromises = [];
+
+      for (const usage of collectedUsage) {
+        if (!usage) {
+          continue;
+        }
+
+        const cache_creation =
+          Number(usage.input_token_details?.cache_creation) ||
+          Number(usage.cache_creation_input_tokens) ||
+          0;
+        const cache_read =
+          Number(usage.input_token_details?.cache_read) ||
+          Number(usage.cache_read_input_tokens) ||
+          0;
+
+        const txMetadata = {
+          context: 'abort',
+          conversationId,
+          user: userId,
+          model: usage.model ?? fallbackModel,
+        };
+
+        if (cache_creation > 0 || cache_read > 0) {
+          spendPromises.push(
+            mockSpendStructuredTokens(txMetadata, {
+              promptTokens: {
+                input: usage.input_tokens,
+                write: cache_creation,
+                read: cache_read,
+              },
+              completionTokens: usage.output_tokens,
+            }).catch(() => {
+              // Swallow the rejection so Promise.all below cannot reject (nothing is logged here)
+            }),
+          );
+          continue;
+        }
+
+        spendPromises.push(
+          mockSpendTokens(txMetadata, {
+            promptTokens: usage.input_tokens,
+            completionTokens: usage.output_tokens,
+          }).catch(() => {
+            // Swallow the rejection so Promise.all below cannot reject (nothing is logged here)
+          }),
+        );
+      }
+
+      // Wait for every queued spend call to settle before touching the array
+      await Promise.all(spendPromises);
+
+      // Empty the caller's array in place so a second pass over it finds nothing to spend
+      collectedUsage.length = 0;
+    };
+
+    it('should return early if collectedUsage is empty', async () => {
+      await spendCollectedUsage({
+        userId: 'user-123',
+        conversationId: 'convo-123',
+        collectedUsage: [],
+        fallbackModel: 'gpt-4',
+      });
+
+      expect(mockSpendTokens).not.toHaveBeenCalled();
+      expect(mockSpendStructuredTokens).not.toHaveBeenCalled();
+    });
+
+    it('should return early if collectedUsage is null', async () => {
+      await spendCollectedUsage({
+        userId: 'user-123',
+        conversationId: 'convo-123',
+        collectedUsage: null,
+        fallbackModel: 'gpt-4',
+      });
+
+      expect(mockSpendTokens).not.toHaveBeenCalled();
+      expect(mockSpendStructuredTokens).not.toHaveBeenCalled();
+    });
+
+    it('should skip null entries in collectedUsage', async () => {
+      const collectedUsage = [
+        { input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
+        null,
+        { input_tokens: 200, output_tokens: 60, model: 'gpt-4' },
+      ];
+
+      await spendCollectedUsage({
+        userId: 'user-123',
+        conversationId: 'convo-123',
+        collectedUsage,
+        fallbackModel: 'gpt-4',
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledTimes(2);
+    });
+
+    it('should spend tokens for single model', async () => {
+      const collectedUsage = [{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' }];
+
+      await spendCollectedUsage({
+        userId: 'user-123',
+        conversationId: 'convo-123',
+        collectedUsage,
+        fallbackModel: 'gpt-4',
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledTimes(1);
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({
+          context: 'abort',
+          conversationId: 'convo-123',
+          user: 'user-123',
+          model: 'gpt-4',
+        }),
+        { promptTokens: 100, completionTokens: 50 },
+      );
+    });
+
+    it('should spend tokens for multiple models (parallel agents)', async () => {
+      const collectedUsage = [
+        { input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
+        { input_tokens: 80, output_tokens: 40, model: 'claude-3' },
+        { input_tokens: 120, output_tokens: 60, model: 'gemini-pro' },
+      ];
+
+      await spendCollectedUsage({
+        userId: 'user-123',
+        conversationId: 'convo-123',
+        collectedUsage,
+        fallbackModel: 'gpt-4',
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledTimes(3);
+
+      // Verify each model was called
+      expect(mockSpendTokens).toHaveBeenNthCalledWith(
+        1,
+        expect.objectContaining({ model: 'gpt-4' }),
+        { promptTokens: 100, completionTokens: 50 },
+      );
+      expect(mockSpendTokens).toHaveBeenNthCalledWith(
+        2,
+        expect.objectContaining({ model: 'claude-3' }),
+        { promptTokens: 80, completionTokens: 40 },
+      );
+      expect(mockSpendTokens).toHaveBeenNthCalledWith(
+        3,
+        expect.objectContaining({ model: 'gemini-pro' }),
+        { promptTokens: 120, completionTokens: 60 },
+      );
+    });
+
+    it('should use fallbackModel when usage.model is missing', async () => {
+      const collectedUsage = [{ input_tokens: 100, output_tokens: 50 }];
+
+      await spendCollectedUsage({
+        userId: 'user-123',
+        conversationId: 'convo-123',
+        collectedUsage,
+        fallbackModel: 'fallback-model',
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'fallback-model' }),
+        expect.any(Object),
+      );
+    });
+
+    it('should use spendStructuredTokens for OpenAI format cache tokens', async () => {
+      const collectedUsage = [
+        {
+          input_tokens: 100,
+          output_tokens: 50,
+          model: 'gpt-4',
+          input_token_details: {
+            cache_creation: 20,
+            cache_read: 10,
+          },
+        },
+      ];
+
+      await spendCollectedUsage({
+        userId: 'user-123',
+        conversationId: 'convo-123',
+        collectedUsage,
+        fallbackModel: 'gpt-4',
+      });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1);
+      expect(mockSpendTokens).not.toHaveBeenCalled();
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'gpt-4', context: 'abort' }),
+        {
+          promptTokens: {
+            input: 100,
+            write: 20,
+            read: 10,
+          },
+          completionTokens: 50,
+        },
+      );
+    });
+
+    it('should use spendStructuredTokens for Anthropic format cache tokens', async () => {
+      const collectedUsage = [
+        {
+          input_tokens: 100,
+          output_tokens: 50,
+          model: 'claude-3',
+          cache_creation_input_tokens: 25,
+          cache_read_input_tokens: 15,
+        },
+      ];
+
+      await spendCollectedUsage({
+        userId: 'user-123',
+        conversationId: 'convo-123',
+        collectedUsage,
+        fallbackModel: 'claude-3',
+      });
+
+      expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1);
+      expect(mockSpendTokens).not.toHaveBeenCalled();
+      expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
+        expect.objectContaining({ model: 'claude-3' }),
+        {
+          promptTokens: {
+            input: 100,
+            write: 25,
+            read: 15,
+          },
+          completionTokens: 50,
+        },
+      );
+    });
+
+    it('should handle mixed cache and non-cache entries', async () => {
+      const collectedUsage = [
+        { input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
+        {
+          input_tokens: 150,
+          output_tokens: 30,
+          model: 'claude-3',
+          cache_creation_input_tokens: 20,
+          cache_read_input_tokens: 10,
+        },
+        { input_tokens: 200, output_tokens: 20, model: 'gemini-pro' },
+      ];
+
+      await spendCollectedUsage({
+        userId: 'user-123',
+        conversationId: 'convo-123',
+        collectedUsage,
+        fallbackModel: 'gpt-4',
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledTimes(2);
+      expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1);
+    });
+
+    it('should handle real-world parallel agent abort scenario', async () => {
+      // Simulates: Primary agent (gemini) + addedConvo agent (gpt-5) aborted mid-stream
+      const collectedUsage = [
+        { input_tokens: 31596, output_tokens: 151, model: 'gemini-3-flash-preview' },
+        { input_tokens: 28000, output_tokens: 120, model: 'gpt-5.2' },
+      ];
+
+      await spendCollectedUsage({
+        userId: 'user-123',
+        conversationId: 'convo-123',
+        collectedUsage,
+        fallbackModel: 'gemini-3-flash-preview',
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledTimes(2);
+
+      // Primary model
+      expect(mockSpendTokens).toHaveBeenNthCalledWith(
+        1,
+        expect.objectContaining({ model: 'gemini-3-flash-preview' }),
+        { promptTokens: 31596, completionTokens: 151 },
+      );
+
+      // Parallel model (addedConvo)
+      expect(mockSpendTokens).toHaveBeenNthCalledWith(
+        2,
+        expect.objectContaining({ model: 'gpt-5.2' }),
+        { promptTokens: 28000, completionTokens: 120 },
+      );
+    });
+
+    it('should clear collectedUsage array after spending to prevent double-spending', async () => {
+      // This tests the race condition fix: after abort middleware spends tokens,
+      // the collectedUsage array is cleared so AgentClient.recordCollectedUsage()
+      // (which shares the same array reference) sees an empty array and returns early.
+      const collectedUsage = [
+        { input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
+        { input_tokens: 80, output_tokens: 40, model: 'claude-3' },
+      ];
+
+      expect(collectedUsage.length).toBe(2);
+
+      await spendCollectedUsage({
+        userId: 'user-123',
+        conversationId: 'convo-123',
+        collectedUsage,
+        fallbackModel: 'gpt-4',
+      });
+
+      expect(mockSpendTokens).toHaveBeenCalledTimes(2);
+
+      // The array should be cleared after spending
+      expect(collectedUsage.length).toBe(0);
+    });
+
+    it('should await all token spending operations before clearing array', async () => {
+      // Ensure we don't clear the array before spending completes
+      let spendCallCount = 0;
+      mockSpendTokens.mockImplementation(async () => {
+        spendCallCount++;
+        // Simulate async delay
+        await new Promise((resolve) => setTimeout(resolve, 10));
+      });
+
+      const collectedUsage = [
+        { input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
+        { input_tokens: 80, output_tokens: 40, model: 'claude-3' },
+      ];
+
+      await spendCollectedUsage({
+        userId: 'user-123',
+        conversationId: 'convo-123',
+        collectedUsage,
+        fallbackModel: 'gpt-4',
+      });
+
+      // Both spend calls should have completed
+      expect(spendCallCount).toBe(2);
+
+      // Array should be cleared after awaiting
+      expect(collectedUsage.length).toBe(0);
+    });
+  });
+});
--- a/api/server/services/Endpoints/agents/initialize.js
+++ b/api/server/services/Endpoints/agents/initialize.js
@ -3,10 +3,11 @@ const { createContentAggregator } = require('@librechat/agents');
 const {
  initializeAgent,
  validateAgentModel,
-  getCustomEndpointConfig,
-  createSequentialChainEdges,
  createEdgeCollector,
  filterOrphanedEdges,
+  GenerationJobManager,
+  getCustomEndpointConfig,
+  createSequentialChainEdges,
 } = require('@librechat/api');
 const {
  EModelEndpoint,
@ -314,6 +315,10 @@ const initializeClient = async ({ req, res, signal, endpointOption }) => {
    endpoint: isEphemeralAgentId(primaryConfig.id) ? primaryConfig.endpoint : EModelEndpoint.agents,
  });

+  if (streamId) {
+    GenerationJobManager.setCollectedUsage(streamId, collectedUsage);
+  }
+
  return { client, userMCPAuthMap };
 };

--- a/packages/api/src/stream/GenerationJobManager.ts
+++ b/packages/api/src/stream/GenerationJobManager.ts
@ -1,9 +1,11 @@
 import { logger } from '@librechat/data-schemas';
 import type { StandardGraph } from '@librechat/agents';
-import type { Agents } from 'librechat-data-provider';
+import { parseTextParts } from 'librechat-data-provider';
+import type { Agents, TMessageContentParts } from 'librechat-data-provider';
 import type {
  SerializableJobData,
  IEventTransport,
+  UsageMetadata,
  AbortResult,
  IJobStore,
 } from './interfaces/IJobStore';
@ -585,7 +587,14 @@ class GenerationJobManagerClass {

    if (!jobData) {
      logger.warn(`[GenerationJobManager] Cannot abort - job not found: ${streamId}`);
-      return { success: false, jobData: null, content: [], finalEvent: null };
+      return {
+        text: '',
+        content: [],
+        jobData: null,
+        success: false,
+        finalEvent: null,
+        collectedUsage: [],
+      };
    }

    // Emit abort signal for cross-replica support (Redis mode)
@ -599,15 +608,21 @@ class GenerationJobManagerClass {
      runtime.abortController.abort();
    }

-    // Get content before clearing state
+    /** Content before clearing state */
    const result = await this.jobStore.getContentParts(streamId);
    const content = result?.content ?? [];

-    // Detect "early abort" - aborted before any generation happened (e.g., during tool loading)
-    // In this case, no messages were saved to DB, so frontend shouldn't navigate to conversation
+    /** Collected usage for all models */
+    const collectedUsage = this.jobStore.getCollectedUsage(streamId);
+
+    /** Text from content parts for fallback token counting */
+    const text = parseTextParts(content as TMessageContentParts[]);
+
+    /** Detect "early abort" - aborted before any generation happened (e.g., during tool loading)
+    In this case, no messages were saved to DB, so frontend shouldn't navigate to conversation */
    const isEarlyAbort = content.length === 0 && !jobData.responseMessageId;

-    // Create final event for abort
+    /** Final event for abort */
    const userMessageId = jobData.userMessage?.messageId;

    const abortFinalEvent: t.ServerSentEvent = {
@ -669,6 +684,8 @@ class GenerationJobManagerClass {
      jobData,
      content,
      finalEvent: abortFinalEvent,
+      text,
+      collectedUsage,
    };
  }

@ -933,6 +950,18 @@ class GenerationJobManagerClass {
    this.jobStore.setContentParts(streamId, contentParts);
  }

+  /**
+   * Register the array that accumulates per-model token usage for a stream.
+   * Ignored when the stream has no runtime state; abortJob reads it back from the job store.
+   */
+  setCollectedUsage(streamId: string, collectedUsage: UsageMetadata[]): void {
+    // Synchronous runtime-state lookup keeps this guard cheap
+    const isTracked = this.runtimeState.has(streamId);
+    if (isTracked) {
+      this.jobStore.setCollectedUsage(streamId, collectedUsage);
+    }
+  }
+
  /**
   * Set reference to the graph instance.
   */
--- a/packages/api/src/stream/tests/collectedUsage.spec.ts
+++ b/packages/api/src/stream/tests/collectedUsage.spec.ts
@ -0,0 +1,482 @@
+/**
+ * Tests for collected usage functionality in GenerationJobManager.
+ *
+ * This tests the storage and retrieval of collectedUsage for abort handling,
+ * ensuring all models (including parallel agents from addedConvo) have their
+ * tokens spent when a conversation is aborted.
+ */
+
+import type { UsageMetadata } from '../interfaces/IJobStore';
+
+describe('CollectedUsage - InMemoryJobStore', () => {
+  beforeEach(() => {
+    jest.resetModules();
+  });
+
+  it('should store and retrieve collectedUsage', async () => {
+    const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
+    const store = new InMemoryJobStore();
+    await store.initialize();
+
+    const streamId = 'test-stream-1';
+    await store.createJob(streamId, 'user-1');
+
+    const collectedUsage: UsageMetadata[] = [
+      { input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
+      { input_tokens: 80, output_tokens: 40, model: 'claude-3' },
+    ];
+
+    store.setCollectedUsage(streamId, collectedUsage);
+    const retrieved = store.getCollectedUsage(streamId);
+    // Identity, not deep equality: the store must hand back the live array, never a copy
+    expect(retrieved).toBe(collectedUsage);
+    expect(retrieved).toHaveLength(2);
+
+    await store.destroy();
+  });
+
+  it('should return empty array when no collectedUsage set', async () => {
+    const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
+    const jobStore = new InMemoryJobStore();
+    await jobStore.initialize();
+
+    const jobId = 'test-stream-2';
+    await jobStore.createJob(jobId, 'user-1');
+
+    // Nothing was registered for this job, so the getter falls back to []
+    const usage = jobStore.getCollectedUsage(jobId);
+    expect(usage).toEqual([]);
+
+    await jobStore.destroy();
+  });
+
+  it('should return empty array for non-existent stream', async () => {
+    const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
+    const jobStore = new InMemoryJobStore();
+    await jobStore.initialize();
+
+    // No createJob call: the stream id is unknown to the store
+    const usage = jobStore.getCollectedUsage('non-existent-stream');
+    expect(usage).toEqual([]);
+
+    await jobStore.destroy();
+  });
+
+  it('should update collectedUsage when set multiple times', async () => {
+    const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
+    const jobStore = new InMemoryJobStore();
+    await jobStore.initialize();
+
+    const jobId = 'test-stream-3';
+    await jobStore.createJob(jobId, 'user-1');
+
+    const firstUsage: UsageMetadata[] = [{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' }];
+    jobStore.setCollectedUsage(jobId, firstUsage);
+
+    // A second registration replaces the first array with a longer one
+    const secondUsage: UsageMetadata[] = [
+      { input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
+      { input_tokens: 80, output_tokens: 40, model: 'claude-3' },
+    ];
+    jobStore.setCollectedUsage(jobId, secondUsage);
+
+    const stored = jobStore.getCollectedUsage(jobId);
+    expect(stored).toHaveLength(2);
+
+    await jobStore.destroy();
+  });
+
+  it('should clear collectedUsage when clearContentState is called', async () => {
+    const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
+    const jobStore = new InMemoryJobStore();
+    await jobStore.initialize();
+
+    const jobId = 'test-stream-4';
+    await jobStore.createJob(jobId, 'user-1');
+
+    const usage: UsageMetadata[] = [{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' }];
+    jobStore.setCollectedUsage(jobId, usage);
+
+    // Sanity check: the entry is visible before the state is cleared
+    expect(jobStore.getCollectedUsage(jobId)).toHaveLength(1);
+
+    jobStore.clearContentState(jobId);
+
+    // After clearing, the getter falls back to an empty array
+    expect(jobStore.getCollectedUsage(jobId)).toEqual([]);
+
+    await jobStore.destroy();
+  });
+
+  it('should clear collectedUsage when job is deleted', async () => {
+    const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
+    const jobStore = new InMemoryJobStore();
+    await jobStore.initialize();
+
+    const jobId = 'test-stream-5';
+    await jobStore.createJob(jobId, 'user-1');
+
+    const usage: UsageMetadata[] = [{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' }];
+    jobStore.setCollectedUsage(jobId, usage);
+
+    // Deleting the job must also drop the usage registered for it
+    await jobStore.deleteJob(jobId);
+
+    const remaining = jobStore.getCollectedUsage(jobId);
+    expect(remaining).toEqual([]);
+
+    await jobStore.destroy();
+  });
+});
+
+describe('CollectedUsage - GenerationJobManager', () => {
+  beforeEach(() => {
+    jest.resetModules();
+  });
+
+  it('should set and retrieve collectedUsage through manager', async () => {
+    const { GenerationJobManager } = await import('../GenerationJobManager');
+    const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
+    const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
+
+    GenerationJobManager.configure({
+      jobStore: new InMemoryJobStore(),
+      eventTransport: new InMemoryEventTransport(),
+      isRedis: false,
+      cleanupOnComplete: false,
+    });
+
+    await GenerationJobManager.initialize();
+
+    const jobId = `manager-test-${Date.now()}`;
+    await GenerationJobManager.createJob(jobId, 'user-1');
+
+    const usage: UsageMetadata[] = [
+      { input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
+      { input_tokens: 80, output_tokens: 40, model: 'claude-3' },
+    ];
+
+    GenerationJobManager.setCollectedUsage(jobId, usage);
+
+    // The registered usage is read back through the abort result
+    const aborted = await GenerationJobManager.abortJob(jobId);
+
+    expect(aborted.collectedUsage).toEqual(usage);
+    expect(aborted.collectedUsage).toHaveLength(2);
+
+    await GenerationJobManager.destroy();
+  });
+
+  it('should return empty collectedUsage when none set', async () => {
+    const { GenerationJobManager } = await import('../GenerationJobManager');
+    const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
+    const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
+
+    GenerationJobManager.configure({
+      jobStore: new InMemoryJobStore(),
+      eventTransport: new InMemoryEventTransport(),
+      isRedis: false,
+      cleanupOnComplete: false,
+    });
+
+    await GenerationJobManager.initialize();
+
+    const jobId = `no-usage-test-${Date.now()}`;
+    await GenerationJobManager.createJob(jobId, 'user-1');
+
+    // No setCollectedUsage call precedes the abort
+    const aborted = await GenerationJobManager.abortJob(jobId);
+    expect(aborted.collectedUsage).toEqual([]);
+
+    await GenerationJobManager.destroy();
+  });
+
+  it('should not set collectedUsage if job does not exist', async () => {
+    const { GenerationJobManager } = await import('../GenerationJobManager');
+    const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
+    const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
+
+    GenerationJobManager.configure({
+      jobStore: new InMemoryJobStore(),
+      eventTransport: new InMemoryEventTransport(),
+      isRedis: false,
+    });
+
+    await GenerationJobManager.initialize();
+
+    const missingId = 'non-existent-stream';
+    const usage: UsageMetadata[] = [{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' }];
+
+    // Registering usage for an unknown stream is a silent no-op, not an error
+    GenerationJobManager.setCollectedUsage(missingId, usage);
+
+    // Aborting the same unknown stream reports failure
+    const aborted = await GenerationJobManager.abortJob(missingId);
+    expect(aborted.success).toBe(false);
+
+    await GenerationJobManager.destroy();
+  });
+});
+
+describe('AbortJob - Text and CollectedUsage', () => {
+  beforeEach(() => {
+    jest.resetModules();
+  });
+
+  it('should extract text from content parts on abort', async () => {
+    const { GenerationJobManager } = await import('../GenerationJobManager');
+    const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
+    const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
+
+    GenerationJobManager.configure({
+      jobStore: new InMemoryJobStore(),
+      eventTransport: new InMemoryEventTransport(),
+      isRedis: false,
+      cleanupOnComplete: false,
+    });
+
+    await GenerationJobManager.initialize();
+
+    const jobId = `text-extract-${Date.now()}`;
+    await GenerationJobManager.createJob(jobId, 'user-1');
+
+    // Two text parts whose concatenation is the expected abort text
+    const parts = [
+      { type: 'text', text: 'Hello ' },
+      { type: 'text', text: 'world!' },
+    ];
+    GenerationJobManager.setContentParts(jobId, parts as never);
+
+    const aborted = await GenerationJobManager.abortJob(jobId);
+
+    expect(aborted.text).toBe('Hello world!');
+    expect(aborted.success).toBe(true);
+
+    await GenerationJobManager.destroy();
+  });
+
+  it('should return empty text when no content parts', async () => {
+    const { GenerationJobManager } = await import('../GenerationJobManager');
+    const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
+    const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
+
+    GenerationJobManager.configure({
+      jobStore: new InMemoryJobStore(),
+      eventTransport: new InMemoryEventTransport(),
+      isRedis: false,
+      cleanupOnComplete: false,
+    });
+
+    await GenerationJobManager.initialize();
+
+    const jobId = `empty-text-${Date.now()}`;
+    await GenerationJobManager.createJob(jobId, 'user-1');
+
+    // No content parts were set, so there is no text to extract
+    const aborted = await GenerationJobManager.abortJob(jobId);
+    expect(aborted.text).toBe('');
+
+    await GenerationJobManager.destroy();
+  });
+
+  it('should return both text and collectedUsage on abort', async () => {
+    const { GenerationJobManager } = await import('../GenerationJobManager');
+    const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
+    const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
+
+    GenerationJobManager.configure({
+      jobStore: new InMemoryJobStore(),
+      eventTransport: new InMemoryEventTransport(),
+      isRedis: false,
+      cleanupOnComplete: false,
+    });
+
+    await GenerationJobManager.initialize();
+
+    const streamId = `full-abort-${Date.now()}`;
+    await GenerationJobManager.createJob(streamId, 'user-1');
+
+    // Set content parts
+    const contentParts = [{ type: 'text', text: 'Partial response...' }];
+    GenerationJobManager.setContentParts(streamId, contentParts as never);
+
+    // Set collected usage
+    const collectedUsage: UsageMetadata[] = [
+      { input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
+      { input_tokens: 80, output_tokens: 40, model: 'claude-3' },
+    ];
+    GenerationJobManager.setCollectedUsage(streamId, collectedUsage);
+
+    const abortResult = await GenerationJobManager.abortJob(streamId);
+    // Usage is checked against literals; comparing the array with itself proves nothing
+    expect(abortResult.success).toBe(true);
+    expect(abortResult.text).toBe('Partial response...');
+    expect(abortResult.collectedUsage.map((u) => u.model)).toEqual(['gpt-4', 'claude-3']);
+    expect(abortResult.content).toHaveLength(1);
+
+    await GenerationJobManager.destroy();
+  });
+
+  it('should return empty values for non-existent job', async () => {
+    const { GenerationJobManager } = await import('../GenerationJobManager');
+    const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
+    const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
+
+    GenerationJobManager.configure({
+      jobStore: new InMemoryJobStore(),
+      eventTransport: new InMemoryEventTransport(),
+      isRedis: false,
+    });
+
+    await GenerationJobManager.initialize();
+
+    const aborted = await GenerationJobManager.abortJob('non-existent-job');
+    // Every field of the failure result is an empty placeholder
+    expect(aborted.success).toBe(false);
+    expect(aborted.text).toBe('');
+    expect(aborted.collectedUsage).toEqual([]);
+    expect(aborted.content).toEqual([]);
+    expect(aborted.jobData).toBeNull();
+
+    await GenerationJobManager.destroy();
+  });
+});
+
+describe('Real-world Scenarios', () => {
+  beforeEach(() => {
+    jest.resetModules();
+  });
+
+  it('should handle parallel agent abort with collected usage', async () => {
+    /**
+     * Scenario: an addedConvo conversation (parallel agents) is aborted by the user
+     * - Primary agent: gemini-3-flash-preview
+     * - Parallel agent: gpt-5.2
+     * The abort result must carry usage for both so each model's tokens can be spent
+     */
+    const { GenerationJobManager } = await import('../GenerationJobManager');
+    const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
+    const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
+
+    GenerationJobManager.configure({
+      jobStore: new InMemoryJobStore(),
+      eventTransport: new InMemoryEventTransport(),
+      isRedis: false,
+      cleanupOnComplete: false,
+    });
+
+    await GenerationJobManager.initialize();
+
+    const jobId = `parallel-abort-${Date.now()}`;
+    await GenerationJobManager.createJob(jobId, 'user-1');
+
+    // Text the primary agent had produced before the abort
+    const parts = [
+      { type: 'text', text: 'Primary agent output...' },
+      { type: 'text', text: 'More content...' },
+    ];
+    GenerationJobManager.setContentParts(jobId, parts as never);
+
+    // One usage entry per agent, primary first
+    const usage: UsageMetadata[] = [
+      {
+        input_tokens: 31596,
+        output_tokens: 151,
+        model: 'gemini-3-flash-preview',
+      },
+      {
+        input_tokens: 28000,
+        output_tokens: 120,
+        model: 'gpt-5.2',
+      },
+    ];
+    GenerationJobManager.setCollectedUsage(jobId, usage);
+
+    // User stops the generation
+    const aborted = await GenerationJobManager.abortJob(jobId);
+
+    // Both agents' entries come back, in registration order
+    expect(aborted.success).toBe(true);
+    expect(aborted.collectedUsage).toHaveLength(2);
+    expect(aborted.collectedUsage[0].model).toBe('gemini-3-flash-preview');
+    expect(aborted.collectedUsage[1].model).toBe('gpt-5.2');
+
+    // The produced text is returned alongside the usage
+    expect(aborted.text).toContain('Primary agent output');
+
+    await GenerationJobManager.destroy();
+  });
+
+  it('should handle abort with cache tokens from Anthropic', async () => {
+    const { GenerationJobManager } = await import('../GenerationJobManager');
+    const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
+    const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
+
+    GenerationJobManager.configure({
+      jobStore: new InMemoryJobStore(),
+      eventTransport: new InMemoryEventTransport(),
+      isRedis: false,
+      cleanupOnComplete: false,
+    });
+
+    await GenerationJobManager.initialize();
+
+    const jobId = `cache-abort-${Date.now()}`;
+    await GenerationJobManager.createJob(jobId, 'user-1');
+
+    // Usage entry carrying the top-level (Anthropic-style) cache token fields
+    const usage: UsageMetadata[] = [
+      {
+        input_tokens: 788,
+        output_tokens: 163,
+        cache_creation_input_tokens: 30808,
+        cache_read_input_tokens: 0,
+        model: 'claude-opus-4-5-20251101',
+      },
+    ];
+    GenerationJobManager.setCollectedUsage(jobId, usage);
+
+    const aborted = await GenerationJobManager.abortJob(jobId);
+    // The cache creation count must come through the abort untouched
+    expect(aborted.collectedUsage[0].cache_creation_input_tokens).toBe(30808);
+
+    await GenerationJobManager.destroy();
+  });
+
+  it('should handle abort with sequential tool calls usage', async () => {
+    /**
+     * Scenario: Single agent with multiple tool calls, aborted mid-execution
+     * Usage accumulates for each LLM call
+     */
+    const { GenerationJobManager } = await import('../GenerationJobManager');
+    const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
+    const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
+
+    GenerationJobManager.configure({
+      jobStore: new InMemoryJobStore(),
+      eventTransport: new InMemoryEventTransport(),
+      isRedis: false,
+      cleanupOnComplete: false,
+    });
+
+    await GenerationJobManager.initialize();
+
+    const streamId = `sequential-abort-${Date.now()}`;
+    await GenerationJobManager.createJob(streamId, 'user-1');
+
+    // Usage from multiple sequential LLM calls (tool use pattern)
+    const collectedUsage: UsageMetadata[] = [
+      { input_tokens: 100, output_tokens: 50, model: 'gpt-4' }, // Initial call
+      { input_tokens: 150, output_tokens: 30, model: 'gpt-4' }, // After tool result 1
+      { input_tokens: 180, output_tokens: 20, model: 'gpt-4' }, // After tool result 2 (aborted here)
+    ];
+    GenerationJobManager.setCollectedUsage(streamId, collectedUsage);
+
+    const abortResult = await GenerationJobManager.abortJob(streamId);
+    // All three entries must be present, in call order, for proper token accounting
+    expect(abortResult.collectedUsage).toHaveLength(3);
+    expect(abortResult.collectedUsage.map((u) => u.input_tokens)).toEqual([100, 150, 180]);
+
+    await GenerationJobManager.destroy();
+  });
+});
--- a/packages/api/src/stream/implementations/InMemoryJobStore.ts
+++ b/packages/api/src/stream/implementations/InMemoryJobStore.ts
@ -1,7 +1,12 @@
 import { logger } from '@librechat/data-schemas';
 import type { StandardGraph } from '@librechat/agents';
 import type { Agents } from 'librechat-data-provider';
-import type { IJobStore, SerializableJobData, JobStatus } from '~/stream/interfaces/IJobStore';
+import type {
+  SerializableJobData,
+  UsageMetadata,
+  IJobStore,
+  JobStatus,
+} from '~/stream/interfaces/IJobStore';

 /**
 * Content state for a job - volatile, in-memory only.
@ -10,6 +15,7 @@ import type { IJobStore, SerializableJobData, JobStatus } from '~/stream/interfa
 interface ContentState {
  contentParts: Agents.MessageContentComplex[];
  graphRef: WeakRef<StandardGraph> | null;
+  collectedUsage: UsageMetadata[]; // [] until setCollectedUsage stores an array
 }

 /**
@ -240,6 +246,7 @@ export class InMemoryJobStore implements IJobStore {
      this.contentState.set(streamId, {
        contentParts: [],
        graphRef: new WeakRef(graph),
+        collectedUsage: [],
      });
    }
  }
@ -252,10 +259,30 @@ export class InMemoryJobStore implements IJobStore {
    if (existing) {
      existing.contentParts = contentParts;
    } else {
-      this.contentState.set(streamId, { contentParts, graphRef: null });
+      this.contentState.set(streamId, { contentParts, graphRef: null, collectedUsage: [] });
    }
  }

+  /**
+   * Attach the collectedUsage array to a job, creating its content state if absent.
+   */
+  setCollectedUsage(streamId: string, collectedUsage: UsageMetadata[]): void {
+    const state = this.contentState.get(streamId);
+    if (!state) {
+      this.contentState.set(streamId, { contentParts: [], graphRef: null, collectedUsage });
+      return;
+    }
+    state.collectedUsage = collectedUsage;
+  }
+
+  /**
+   * Read back the usage array registered for a job; empty array when none exists.
+   */
+  getCollectedUsage(streamId: string): UsageMetadata[] {
+    const usage = this.contentState.get(streamId)?.collectedUsage;
+    return usage ?? [];
+  }
+
  /**
   * Get content parts for a job.
   * Returns live content from stored reference.
--- a/packages/api/src/stream/implementations/RedisJobStore.ts
+++ b/packages/api/src/stream/implementations/RedisJobStore.ts
@ -1,9 +1,14 @@
 import { logger } from '@librechat/data-schemas';
 import { createContentAggregator } from '@librechat/agents';
-import type { IJobStore, SerializableJobData, JobStatus } from '~/stream/interfaces/IJobStore';
 import type { StandardGraph } from '@librechat/agents';
 import type { Agents } from 'librechat-data-provider';
 import type { Redis, Cluster } from 'ioredis';
+import type {
+  SerializableJobData,
+  UsageMetadata,
+  IJobStore,
+  JobStatus,
+} from '~/stream/interfaces/IJobStore';

 /**
 * Key prefixes for Redis storage.
@ -90,6 +95,13 @@ export class RedisJobStore implements IJobStore {
   */
  private localGraphCache = new Map<string, WeakRef<StandardGraph>>();

+  /**
+   * Instance-local map of streamId to the collectedUsage array registered for that job.
+   * It is never written to Redis, so only the instance that stored an array can read it.
+   * For cross-replica abort, the abort handler falls back to text-based token counting.
+   */
+  private localCollectedUsageCache = new Map<string, UsageMetadata[]>();
+
  /** Cleanup interval in ms (1 minute) */
  private cleanupIntervalMs = 60000;

@ -227,6 +239,7 @@ export class RedisJobStore implements IJobStore {
  async deleteJob(streamId: string): Promise<void> {
    // Clear local caches
    this.localGraphCache.delete(streamId);
+    this.localCollectedUsageCache.delete(streamId);

    // Note: userJobs cleanup is handled lazily via self-healing in getActiveJobIdsByUser
    // In cluster mode, separate runningJobs (global) from stream-specific keys (same slot)
@ -290,6 +303,7 @@ export class RedisJobStore implements IJobStore {
      if (!job) {
        await this.redis.srem(KEYS.runningJobs, streamId);
        this.localGraphCache.delete(streamId);
+        this.localCollectedUsageCache.delete(streamId);
        cleaned++;
        continue;
      }
@ -298,6 +312,7 @@ export class RedisJobStore implements IJobStore {
      if (job.status !== 'running') {
        await this.redis.srem(KEYS.runningJobs, streamId);
        this.localGraphCache.delete(streamId);
+        this.localCollectedUsageCache.delete(streamId);
        cleaned++;
        continue;
      }
@ -382,6 +397,7 @@ export class RedisJobStore implements IJobStore {
    }
    // Clear local caches
    this.localGraphCache.clear();
+    this.localCollectedUsageCache.clear();
    // Don't close the Redis connection - it's shared
    logger.info('[RedisJobStore] Destroyed');
  }
@ -406,11 +422,28 @@ export class RedisJobStore implements IJobStore {
   * No-op for Redis - content parts are reconstructed from chunks.
   * Metadata (agentId, groupId) is embedded directly on content parts by the agent runtime.
   */
-  setContentParts(_streamId: string, _contentParts: Agents.MessageContentComplex[]): void {
+  setContentParts(): void {
    // Content parts are reconstructed from chunks during getContentParts
    // No separate storage needed
  }

+  /**
+   * Keep the caller's collectedUsage array (by reference) in this instance's local map.
+   * Nothing is written to Redis here, so other replicas cannot read it back.
+   * Replaces any array previously stored under the same streamId.
+   */
+  setCollectedUsage(streamId: string, collectedUsage: UsageMetadata[]): void {
+    this.localCollectedUsageCache.set(streamId, collectedUsage);
+  }
+
+  /**
+   * Look up the array stored by setCollectedUsage on this instance.
+   * Returns an empty array when this instance holds nothing for the streamId.
+   */
+  getCollectedUsage(streamId: string): UsageMetadata[] {
+    return this.localCollectedUsageCache.get(streamId) ?? [];
+  }
+
  /**
   * Get aggregated content - tries local cache first, falls back to Redis reconstruction.
   *
@ -528,6 +561,7 @@ export class RedisJobStore implements IJobStore {
  clearContentState(streamId: string): void {
    // Clear local caches immediately
    this.localGraphCache.delete(streamId);
+    this.localCollectedUsageCache.delete(streamId);

    // Fire and forget - async cleanup for Redis
    this.clearContentStateAsync(streamId).catch((err) => {
--- a/packages/api/src/stream/index.ts
+++ b/packages/api/src/stream/index.ts
@ -5,11 +5,12 @@ export {
 } from './GenerationJobManager';

 export type {
-  AbortResult,
  SerializableJobData,
+  IEventTransport,
+  UsageMetadata,
+  AbortResult,
  JobStatus,
  IJobStore,
-  IEventTransport,
 } from './interfaces/IJobStore';

 export { createStreamServices } from './createStreamServices';
--- a/packages/api/src/stream/interfaces/IJobStore.ts
+++ b/packages/api/src/stream/interfaces/IJobStore.ts
@ -45,6 +45,54 @@ export interface SerializableJobData {
  promptTokens?: number;
 }

+/**
+ * Usage metadata for token spending across different LLM providers.
+ *
+ * Two cache token formats are supported (mutually exclusive by convention, not by type):
+ *
+ * **OpenAI format** (GPT-4, o1, etc.):
+ * - Uses `input_token_details.cache_creation` and `input_token_details.cache_read`
+ * - Cache tokens are nested under the `input_token_details` object
+ *
+ * **Anthropic format** (Claude models):
+ * - Uses `cache_creation_input_tokens` and `cache_read_input_tokens`
+ * - Cache tokens are top-level properties
+ *
+ * Every field is optional, so read both formats and default to zero:
+ * ```typescript
+ * const cacheCreation = usage.input_token_details?.cache_creation
+ *   || usage.cache_creation_input_tokens || 0;
+ * ```
+ */
+export interface UsageMetadata {
+  /** Total input tokens (prompt tokens) */
+  input_tokens?: number;
+  /** Total output tokens (completion tokens) */
+  output_tokens?: number;
+  /** Model identifier that generated this usage */
+  model?: string;
+  /**
+   * OpenAI-style cache token details.
+   * Present for OpenAI models (GPT-4, o1, etc.)
+   */
+  input_token_details?: {
+    /** Tokens written to cache */
+    cache_creation?: number;
+    /** Tokens read from cache */
+    cache_read?: number;
+  };
+  /**
+   * Anthropic-style cache creation tokens.
+   * Present for Claude models. Mutually exclusive with input_token_details.
+   */
+  cache_creation_input_tokens?: number;
+  /**
+   * Anthropic-style cache read tokens.
+   * Present for Claude models. Mutually exclusive with input_token_details.
+   */
+  cache_read_input_tokens?: number;
+}
+
 /**
 * Result returned from aborting a job - contains all data needed
 * for token spending and message saving without storing callbacks
@ -58,6 +106,10 @@ export interface AbortResult {
  content: Agents.MessageContentComplex[];
  /** Final event to send to client */
  finalEvent: unknown;
+  /** Concatenated text from all content parts for token counting fallback */
+  text: string;
+  /** Collected usage metadata from all models for token spending */
+  collectedUsage: UsageMetadata[];
 }

 /**
@ -210,6 +262,23 @@ export interface IJobStore {
   * @param runSteps - Run steps to save
   */
  saveRunSteps?(streamId: string, runSteps: Agents.RunStep[]): Promise<void>;
+
+  /**
+   * Register the array that accumulates token usage from all models during generation.
+   * The in-memory and Redis stores keep the array by reference rather than copying it.
+   *
+   * @param streamId - The stream identifier
+   * @param collectedUsage - Usage entries collected so far; stored as-is
+   */
+  setCollectedUsage(streamId: string, collectedUsage: UsageMetadata[]): void;
+
+  /**
+   * Get the collected usage previously registered for a job.
+   *
+   * @param streamId - The stream identifier
+   * @returns The registered usage array, or an empty array when none is stored
+   */
+  getCollectedUsage(streamId: string): UsageMetadata[];
 }

 /**