💰 fix: Multi-Agent Token Spending & Prevent Double-Spend (#11433)

* fix: Token Spending Logic for Multi-Agents on Abort Scenarios * Implemented logic to skip token spending if a conversation is aborted, preventing double-spending. * Introduced `spendCollectedUsage` function to handle token spending for multiple models during aborts, ensuring accurate accounting for parallel agents. * Updated `GenerationJobManager` to store and retrieve collected usage data for improved abort handling. * Added comprehensive tests for the new functionality, covering various scenarios including cache token handling and parallel agent usage. * fix: Memory Context Handling for Multi-Agents * Refactored `buildMessages` method to pass memory context to parallel agents, ensuring they share the same user context. * Improved handling of memory context when no existing instructions are present for parallel agents. * Added comprehensive tests to verify memory context propagation and behavior under various scenarios, including cases with no memory available and empty agent configurations. * Enhanced logging for better traceability of memory context additions to agents. * chore: Memory Context Documentation for Parallel Agents * Updated documentation in the `AgentClient` class to clarify the in-place mutation of agentConfig objects when passing memory context to parallel agents. * Added notes on the implications of mutating objects directly to ensure all parallel agents receive the correct memory context before execution. * chore: UsageMetadata Interface docs for Token Spending * Expanded the UsageMetadata interface to support both OpenAI and Anthropic cache token formats. * Added detailed documentation for cache token properties, including mutually exclusive fields for different model types. * Improved clarity on how to access cache token details for accurate token spending tracking. * fix: Enhance Token Spending Logic in Abort Middleware * Refactored `spendCollectedUsage` function to utilize Promise.all for concurrent token spending, improving performance and ensuring all operations complete before clearing the collectedUsage array. * Added documentation to clarify the importance of clearing the collectedUsage array to prevent double-spending in abort scenarios. * Updated tests to verify the correct behavior of the spending logic and the clearing of the array after spending operations.
2026-03-09 17:42:38 +01:00 · 2026-01-20 14:43:19 -05:00 · 2026-01-20 14:43:19 -05:00 · 36c5a88c4e
commit 36c5a88c4e
parent 32e6f3b8e5
11 changed files with 1440 additions and 28 deletions
--- a/api/server/middleware/abortMiddleware.js
+++ b/api/server/middleware/abortMiddleware.js
@ -7,13 +7,89 @@ const {
  sanitizeMessageForTransmit,
 } = require('@librechat/api');
 const { isAssistantsEndpoint, ErrorTypes } = require('librechat-data-provider');
+const { spendTokens, spendStructuredTokens } = require('~/models/spendTokens');
 const { truncateText, smartTruncateText } = require('~/app/clients/prompts');
 const clearPendingReq = require('~/cache/clearPendingReq');
 const { sendError } = require('~/server/middleware/error');
-const { spendTokens } = require('~/models/spendTokens');
 const { saveMessage, getConvo } = require('~/models');
 const { abortRun } = require('./abortRun');

+/**
+ * Spend tokens for all models from collected usage.
+ * This handles both sequential and parallel agent execution.
+ *
+ * IMPORTANT: After spending, this function clears the collectedUsage array
+ * to prevent double-spending. The array is shared with AgentClient.collectedUsage,
+ * so clearing it here prevents the finally block from also spending tokens.
+ *
+ * @param {Object} params
+ * @param {string} params.userId - User ID
+ * @param {string} params.conversationId - Conversation ID
+ * @param {Array<Object>} params.collectedUsage - Usage metadata from all models
+ * @param {string} [params.fallbackModel] - Fallback model name if not in usage
+ */
+async function spendCollectedUsage({ userId, conversationId, collectedUsage, fallbackModel }) {
+  if (!collectedUsage || collectedUsage.length === 0) {
+    return;
+  }
+
+  const spendPromises = [];
+
+  for (const usage of collectedUsage) {
+    if (!usage) {
+      continue;
+    }
+
+    // Support both OpenAI format (input_token_details) and Anthropic format (cache_*_input_tokens)
+    const cache_creation =
+      Number(usage.input_token_details?.cache_creation) ||
+      Number(usage.cache_creation_input_tokens) ||
+      0;
+    const cache_read =
+      Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
+
+    const txMetadata = {
+      context: 'abort',
+      conversationId,
+      user: userId,
+      model: usage.model ?? fallbackModel,
+    };
+
+    if (cache_creation > 0 || cache_read > 0) {
+      spendPromises.push(
+        spendStructuredTokens(txMetadata, {
+          promptTokens: {
+            input: usage.input_tokens,
+            write: cache_creation,
+            read: cache_read,
+          },
+          completionTokens: usage.output_tokens,
+        }).catch((err) => {
+          logger.error('[abortMiddleware] Error spending structured tokens for abort', err);
+        }),
+      );
+      continue;
+    }
+
+    spendPromises.push(
+      spendTokens(txMetadata, {
+        promptTokens: usage.input_tokens,
+        completionTokens: usage.output_tokens,
+      }).catch((err) => {
+        logger.error('[abortMiddleware] Error spending tokens for abort', err);
+      }),
+    );
+  }
+
+  // Wait for all token spending to complete
+  await Promise.all(spendPromises);
+
+  // Clear the array to prevent double-spending from the AgentClient finally block.
+  // The collectedUsage array is shared by reference with AgentClient.collectedUsage,
+  // so clearing it here ensures recordCollectedUsage() sees an empty array and returns early.
+  collectedUsage.length = 0;
+}
+
 /**
 * Abort an active message generation.
 * Uses GenerationJobManager for all agent requests.
@ -39,9 +115,8 @@ async function abortMessage(req, res) {
    return;
  }

-  const { jobData, content, text } = abortResult;
+  const { jobData, content, text, collectedUsage } = abortResult;

-  // Count tokens and spend them
  const completionTokens = await countTokens(text);
  const promptTokens = jobData?.promptTokens ?? 0;

@ -62,10 +137,21 @@ async function abortMessage(req, res) {
    tokenCount: completionTokens,
  };

-  await spendTokens(
-    { ...responseMessage, context: 'incomplete', user: userId },
-    { promptTokens, completionTokens },
-  );
+  // Spend tokens for ALL models from collectedUsage (handles parallel agents/addedConvo)
+  if (collectedUsage && collectedUsage.length > 0) {
+    await spendCollectedUsage({
+      userId,
+      conversationId: jobData?.conversationId,
+      collectedUsage,
+      fallbackModel: jobData?.model,
+    });
+  } else {
+    // Fallback: no collected usage, use text-based token counting for primary model only
+    await spendTokens(
+      { ...responseMessage, context: 'incomplete', user: userId },
+      { promptTokens, completionTokens },
+    );
+  }

  await saveMessage(
    req,