💰 fix: Multi-Agent Token Spending & Prevent Double-Spend (#11433)

* fix: Token Spending Logic for Multi-Agents in Abort Scenarios

* Implemented logic to skip token spending when a conversation is aborted, preventing double-spending (see the sketch after this list).
* Introduced the `spendCollectedUsage` function to handle token spending for multiple models during aborts, ensuring accurate accounting for parallel agents.
* Updated `GenerationJobManager` to store and retrieve collected usage data for improved abort handling.
* Added comprehensive tests for the new functionality, covering various scenarios including cache token handling and parallel agent usage.
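A minimal sketch of the double-spend guard described above, assuming a boolean abort flag and a generic per-model `spend` callback; neither is the actual LibreChat API, and the real logic lives in the client's usage recording and the abort middleware:

```ts
type UsageMetadata = { model?: string; input_tokens: number; output_tokens: number };

/**
 * Hypothetical guard: the normal completion path skips spending when the request
 * was aborted, because the abort path spends the collected usage instead.
 */
async function recordUsageUnlessAborted(
  aborted: boolean,
  collectedUsage: UsageMetadata[],
  spend: (usage: UsageMetadata) => Promise<void>,
): Promise<void> {
  if (aborted) {
    // Spending here too would double-count: the abort handler owns these entries.
    return;
  }
  for (const usage of collectedUsage) {
    await spend(usage);
  }
}
```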

* fix: Memory Context Handling for Multi-Agents

* Refactored the `buildMessages` method to pass memory context to parallel agents, ensuring they share the same user context (see the sketch after this list).
* Improved handling of memory context when a parallel agent has no existing instructions.
* Added comprehensive tests to verify memory context propagation and behavior under various scenarios, including cases with no memory available and empty agent configurations.
* Enhanced logging for better traceability of memory context additions to agents.
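A rough sketch of the propagation described above, assuming an `instructions` field on each agent config; the property names and the helper are illustrative, not the actual `buildMessages` internals:

```ts
type AgentConfig = { instructions?: string };

/**
 * Illustrative helper: inject the user's memory context into every parallel
 * agent config, mutating each object in place so existing references see it.
 */
function addMemoryToParallelAgents(
  agentConfigs: Map<string, AgentConfig>,
  memoryContext: string,
): void {
  if (!memoryContext) {
    return;
  }
  for (const [agentId, config] of agentConfigs) {
    // If the agent has no instructions yet, the memory context becomes them.
    config.instructions = config.instructions
      ? `${config.instructions}\n\n${memoryContext}`
      : memoryContext;
    console.debug(`[addMemoryToParallelAgents] added memory context to agent ${agentId}`);
  }
}
```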

* chore: Memory Context Documentation for Parallel Agents

* Updated documentation in the `AgentClient` class to clarify the in-place mutation of `agentConfig` objects when passing memory context to parallel agents.
* Added notes on the implications of mutating objects directly, ensuring all parallel agents receive the correct memory context before execution (illustrated below).
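The point the documentation makes, reduced to a toy example with illustrative shapes: only in-place mutation is visible through references that parallel agents already hold, while reassigning the variable is not.

```ts
let config: { instructions?: string } = {};
const heldByParallelAgent = config; // a parallel agent captured this reference earlier

config.instructions = 'memory context'; // in-place mutation: visible via heldByParallelAgent
// config = { instructions: 'memory context' }; // reassignment would leave heldByParallelAgent unchanged

console.log(heldByParallelAgent.instructions); // 'memory context'
```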

* chore: UsageMetadata Interface docs for Token Spending

* Expanded the `UsageMetadata` interface to support both OpenAI and Anthropic cache token formats (see the sketch after this list).
* Added detailed documentation for cache token properties, including mutually exclusive fields for different model types.
* Improved clarity on how to access cache token details for accurate token spending tracking.
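A sketch of the documented shape: the OpenAI-style nesting and Anthropic-style field names below follow those providers' public conventions, but treat the exact interface in the codebase as an assumption.

```ts
interface UsageMetadata {
  model?: string;
  input_tokens: number;
  output_tokens: number;
  /** OpenAI-style nested cache details */
  input_token_details?: {
    cache_read?: number;
    cache_creation?: number;
  };
  /** Anthropic-style top-level cache fields (used instead of the nested form) */
  cache_creation_input_tokens?: number;
  cache_read_input_tokens?: number;
}
```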

* fix: Enhance Token Spending Logic in Abort Middleware

* Refactored the `spendCollectedUsage` function to use `Promise.all` for concurrent token spending, improving performance and ensuring all spend operations complete before the `collectedUsage` array is cleared (see the sketch after this list).
* Added documentation to clarify the importance of clearing the `collectedUsage` array to prevent double-spending in abort scenarios.
* Updated tests to verify the correct behavior of the spending logic and the clearing of the array after spending operations.
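A sketch of the concurrency-plus-clearing pattern; the generic signature is assumed, and the real `spendCollectedUsage` takes request- and model-specific context:

```ts
async function spendCollectedUsage<T>(
  collectedUsage: T[],
  spend: (usage: T) => Promise<void>,
): Promise<void> {
  // Spend every model's usage concurrently and wait for all spends to complete.
  await Promise.all(collectedUsage.map((usage) => spend(usage)));
  // Clear in place (length = 0): every holder of this array reference now sees
  // it empty, so a later abort/finalize pass cannot spend the same entries twice.
  collectedUsage.length = 0;
}
```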
Danny Avila committed 2026-01-20 14:43:19 -05:00
commit 36c5a88c4e (parent 32e6f3b8e5)
11 changed files with 1440 additions and 28 deletions

@@ -1,9 +1,11 @@
 import { logger } from '@librechat/data-schemas';
 import type { StandardGraph } from '@librechat/agents';
-import type { Agents } from 'librechat-data-provider';
+import { parseTextParts } from 'librechat-data-provider';
+import type { Agents, TMessageContentParts } from 'librechat-data-provider';
 import type {
   SerializableJobData,
   IEventTransport,
+  UsageMetadata,
   AbortResult,
   IJobStore,
 } from './interfaces/IJobStore';
@@ -585,7 +587,14 @@ class GenerationJobManagerClass {
     if (!jobData) {
       logger.warn(`[GenerationJobManager] Cannot abort - job not found: ${streamId}`);
-      return { success: false, jobData: null, content: [], finalEvent: null };
+      return {
+        text: '',
+        content: [],
+        jobData: null,
+        success: false,
+        finalEvent: null,
+        collectedUsage: [],
+      };
     }

     // Emit abort signal for cross-replica support (Redis mode)
@@ -599,15 +608,21 @@
       runtime.abortController.abort();
     }

-    // Get content before clearing state
+    /** Content before clearing state */
     const result = await this.jobStore.getContentParts(streamId);
     const content = result?.content ?? [];

-    // Detect "early abort" - aborted before any generation happened (e.g., during tool loading)
-    // In this case, no messages were saved to DB, so frontend shouldn't navigate to conversation
+    /** Collected usage for all models */
+    const collectedUsage = this.jobStore.getCollectedUsage(streamId);
+    /** Text from content parts for fallback token counting */
+    const text = parseTextParts(content as TMessageContentParts[]);
+    /** Detect "early abort" - aborted before any generation happened (e.g., during tool loading)
+     * In this case, no messages were saved to DB, so frontend shouldn't navigate to conversation */
     const isEarlyAbort = content.length === 0 && !jobData.responseMessageId;

-    // Create final event for abort
+    /** Final event for abort */
     const userMessageId = jobData.userMessage?.messageId;
     const abortFinalEvent: t.ServerSentEvent = {
@@ -669,6 +684,8 @@
       jobData,
       content,
       finalEvent: abortFinalEvent,
+      text,
+      collectedUsage,
     };
   }
@@ -933,6 +950,18 @@ class GenerationJobManagerClass {
     this.jobStore.setContentParts(streamId, contentParts);
   }

+  /**
+   * Set reference to the collectedUsage array.
+   * This array accumulates token usage from all models during generation.
+   */
+  setCollectedUsage(streamId: string, collectedUsage: UsageMetadata[]): void {
+    // Use runtime state check for performance (sync check)
+    if (!this.runtimeState.has(streamId)) {
+      return;
+    }
+    this.jobStore.setCollectedUsage(streamId, collectedUsage);
+  }
+
   /**
    * Set reference to the graph instance.
    */