💰 fix: Multi-Agent Token Spending & Prevent Double-Spend (#11433)

* fix: Token Spending Logic for Multi-Agents in Abort Scenarios

* Implemented logic to skip token spending when a conversation is aborted, preventing double-spending (see the sketch after this list).
* Introduced the `spendCollectedUsage` function to handle token spending for multiple models during aborts, ensuring accurate accounting for parallel agents.
* Updated `GenerationJobManager` to store and retrieve collected usage data for improved abort handling.
* Added comprehensive tests for the new functionality, covering various scenarios including cache token handling and parallel agent usage.
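A minimal sketch of the double-spend guard described above, assuming a boolean abort flag and a generic per-model `spend` callback; neither is the actual LibreChat API, and the real logic lives in the client's usage recording and the abort middleware:

```ts
type UsageMetadata = { model?: string; input_tokens: number; output_tokens: number };

/**
 * Hypothetical guard: the normal completion path skips spending when the request
 * was aborted, because the abort path spends the collected usage instead.
 */
async function recordUsageUnlessAborted(
  aborted: boolean,
  collectedUsage: UsageMetadata[],
  spend: (usage: UsageMetadata) => Promise<void>,
): Promise<void> {
  if (aborted) {
    // Spending here too would double-count: the abort handler owns these entries.
    return;
  }
  for (const usage of collectedUsage) {
    await spend(usage);
  }
}
```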

* fix: Memory Context Handling for Multi-Agents

* Refactored the `buildMessages` method to pass memory context to parallel agents, ensuring they share the same user context (see the sketch after this list).
* Improved handling of memory context when a parallel agent has no existing instructions.
* Added comprehensive tests to verify memory context propagation and behavior under various scenarios, including cases with no memory available and empty agent configurations.
* Enhanced logging for better traceability of memory context additions to agents.
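A rough sketch of the propagation described above, assuming an `instructions` field on each agent config; the property names and the helper are illustrative, not the actual `buildMessages` internals:

```ts
type AgentConfig = { instructions?: string };

/**
 * Illustrative helper: inject the user's memory context into every parallel
 * agent config, mutating each object in place so existing references see it.
 */
function addMemoryToParallelAgents(
  agentConfigs: Map<string, AgentConfig>,
  memoryContext: string,
): void {
  if (!memoryContext) {
    return;
  }
  for (const [agentId, config] of agentConfigs) {
    // If the agent has no instructions yet, the memory context becomes them.
    config.instructions = config.instructions
      ? `${config.instructions}\n\n${memoryContext}`
      : memoryContext;
    console.debug(`[addMemoryToParallelAgents] added memory context to agent ${agentId}`);
  }
}
```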

* chore: Memory Context Documentation for Parallel Agents

* Updated documentation in the `AgentClient` class to clarify the in-place mutation of `agentConfig` objects when passing memory context to parallel agents.
* Added notes on the implications of mutating objects directly, ensuring all parallel agents receive the correct memory context before execution (illustrated below).
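The point the documentation makes, reduced to a toy example with illustrative shapes: only in-place mutation is visible through references that parallel agents already hold, while reassigning the variable is not.

```ts
let config: { instructions?: string } = {};
const heldByParallelAgent = config; // a parallel agent captured this reference earlier

config.instructions = 'memory context'; // in-place mutation: visible via heldByParallelAgent
// config = { instructions: 'memory context' }; // reassignment would leave heldByParallelAgent unchanged

console.log(heldByParallelAgent.instructions); // 'memory context'
```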

* chore: UsageMetadata Interface docs for Token Spending

* Expanded the `UsageMetadata` interface to support both OpenAI and Anthropic cache token formats (see the sketch after this list).
* Added detailed documentation for cache token properties, including mutually exclusive fields for different model types.
* Improved clarity on how to access cache token details for accurate token spending tracking.
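A sketch of the documented shape: the OpenAI-style nesting and Anthropic-style field names below follow those providers' public conventions, but treat the exact interface in the codebase as an assumption.

```ts
interface UsageMetadata {
  model?: string;
  input_tokens: number;
  output_tokens: number;
  /** OpenAI-style nested cache details */
  input_token_details?: {
    cache_read?: number;
    cache_creation?: number;
  };
  /** Anthropic-style top-level cache fields (used instead of the nested form) */
  cache_creation_input_tokens?: number;
  cache_read_input_tokens?: number;
}
```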

* fix: Enhance Token Spending Logic in Abort Middleware

* Refactored the `spendCollectedUsage` function to use `Promise.all` for concurrent token spending, improving performance and ensuring all spend operations complete before the `collectedUsage` array is cleared (see the sketch after this list).
* Added documentation to clarify the importance of clearing the `collectedUsage` array to prevent double-spending in abort scenarios.
* Updated tests to verify the correct behavior of the spending logic and the clearing of the array after spending operations.
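A sketch of the concurrency-plus-clearing pattern; the generic signature is assumed, and the real `spendCollectedUsage` takes request- and model-specific context:

```ts
async function spendCollectedUsage<T>(
  collectedUsage: T[],
  spend: (usage: T) => Promise<void>,
): Promise<void> {
  // Spend every model's usage concurrently and wait for all spends to complete.
  await Promise.all(collectedUsage.map((usage) => spend(usage)));
  // Clear in place (length = 0): every holder of this array reference now sees
  // it empty, so a later abort/finalize pass cannot spend the same entries twice.
  collectedUsage.length = 0;
}
```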
Danny Avila committed 2026-01-20 14:43:19 -05:00
commit 36c5a88c4e (parent 32e6f3b8e5)
11 changed files with 1440 additions and 28 deletions

@@ -1,9 +1,11 @@
 import { logger } from '@librechat/data-schemas';
 import type { StandardGraph } from '@librechat/agents';
-import type { Agents } from 'librechat-data-provider';
+import { parseTextParts } from 'librechat-data-provider';
+import type { Agents, TMessageContentParts } from 'librechat-data-provider';
 import type {
   SerializableJobData,
   IEventTransport,
+  UsageMetadata,
   AbortResult,
   IJobStore,
 } from './interfaces/IJobStore';
@@ -585,7 +587,14 @@ class GenerationJobManagerClass {
     if (!jobData) {
       logger.warn(`[GenerationJobManager] Cannot abort - job not found: ${streamId}`);
-      return { success: false, jobData: null, content: [], finalEvent: null };
+      return {
+        text: '',
+        content: [],
+        jobData: null,
+        success: false,
+        finalEvent: null,
+        collectedUsage: [],
+      };
     }

     // Emit abort signal for cross-replica support (Redis mode)
@@ -599,15 +608,21 @@
       runtime.abortController.abort();
     }

-    // Get content before clearing state
+    /** Content before clearing state */
     const result = await this.jobStore.getContentParts(streamId);
     const content = result?.content ?? [];

-    // Detect "early abort" - aborted before any generation happened (e.g., during tool loading)
-    // In this case, no messages were saved to DB, so frontend shouldn't navigate to conversation
+    /** Collected usage for all models */
+    const collectedUsage = this.jobStore.getCollectedUsage(streamId);
+    /** Text from content parts for fallback token counting */
+    const text = parseTextParts(content as TMessageContentParts[]);
+    /** Detect "early abort" - aborted before any generation happened (e.g., during tool loading)
+     * In this case, no messages were saved to DB, so frontend shouldn't navigate to conversation */
     const isEarlyAbort = content.length === 0 && !jobData.responseMessageId;

-    // Create final event for abort
+    /** Final event for abort */
     const userMessageId = jobData.userMessage?.messageId;
     const abortFinalEvent: t.ServerSentEvent = {
@@ -669,6 +684,8 @@
       jobData,
       content,
       finalEvent: abortFinalEvent,
+      text,
+      collectedUsage,
     };
   }
@@ -933,6 +950,18 @@ class GenerationJobManagerClass {
     this.jobStore.setContentParts(streamId, contentParts);
   }

+  /**
+   * Set reference to the collectedUsage array.
+   * This array accumulates token usage from all models during generation.
+   */
+  setCollectedUsage(streamId: string, collectedUsage: UsageMetadata[]): void {
+    // Use runtime state check for performance (sync check)
+    if (!this.runtimeState.has(streamId)) {
+      return;
+    }
+    this.jobStore.setCollectedUsage(streamId, collectedUsage);
+  }
+
   /**
    * Set reference to the graph instance.
    */