Mirror of https://github.com/danny-avila/LibreChat.git, synced 2026-01-22 18:26:12 +01:00
💰 fix: Multi-Agent Token Spending & Prevent Double-Spend (#11433)
* fix: Token Spending Logic for Multi-Agents on Abort Scenarios
  * Implemented logic to skip token spending if a conversation is aborted, preventing double-spending.
  * Introduced `spendCollectedUsage` function to handle token spending for multiple models during aborts, ensuring accurate accounting for parallel agents.
  * Updated `GenerationJobManager` to store and retrieve collected usage data for improved abort handling.
  * Added comprehensive tests for the new functionality, covering various scenarios including cache token handling and parallel agent usage.
* fix: Memory Context Handling for Multi-Agents
  * Refactored `buildMessages` method to pass memory context to parallel agents, ensuring they share the same user context.
  * Improved handling of memory context when no existing instructions are present for parallel agents.
  * Added comprehensive tests to verify memory context propagation and behavior under various scenarios, including cases with no memory available and empty agent configurations.
  * Enhanced logging for better traceability of memory context additions to agents.
* chore: Memory Context Documentation for Parallel Agents
  * Updated documentation in the `AgentClient` class to clarify the in-place mutation of agentConfig objects when passing memory context to parallel agents.
  * Added notes on the implications of mutating objects directly to ensure all parallel agents receive the correct memory context before execution.
* chore: UsageMetadata Interface docs for Token Spending
  * Expanded the UsageMetadata interface to support both OpenAI and Anthropic cache token formats.
  * Added detailed documentation for cache token properties, including mutually exclusive fields for different model types.
  * Improved clarity on how to access cache token details for accurate token spending tracking.
* fix: Enhance Token Spending Logic in Abort Middleware
  * Refactored `spendCollectedUsage` function to utilize Promise.all for concurrent token spending, improving performance and ensuring all operations complete before clearing the collectedUsage array.
  * Added documentation to clarify the importance of clearing the collectedUsage array to prevent double-spending in abort scenarios.
  * Updated tests to verify the correct behavior of the spending logic and the clearing of the array after spending operations.
parent 32e6f3b8e5
commit 36c5a88c4e
11 changed files with 1440 additions and 28 deletions
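The commit's UsageMetadata changes describe supporting both the OpenAI cache token format (nested under input_token_details) and the Anthropic format (top-level cache_creation_input_tokens / cache_read_input_tokens), which is exactly what spendCollectedUsage reads in the diff below. Here is a minimal standalone sketch of that dual-format handling; the helper name getCacheTokenDetails is hypothetical, introduced only for illustration, since the actual change inlines this logic inside spendCollectedUsage:

// Hypothetical helper, for illustration only: normalizes cache token details from either
// provider format. The field names mirror the ones read in spendCollectedUsage below.
function getCacheTokenDetails(usage) {
  // OpenAI-style: usage.input_token_details.{cache_creation, cache_read}
  // Anthropic-style: usage.cache_creation_input_tokens / usage.cache_read_input_tokens
  const cache_creation =
    Number(usage?.input_token_details?.cache_creation) ||
    Number(usage?.cache_creation_input_tokens) ||
    0;
  const cache_read =
    Number(usage?.input_token_details?.cache_read) ||
    Number(usage?.cache_read_input_tokens) ||
    0;
  return { cache_creation, cache_read };
}

// OpenAI-format usage metadata
getCacheTokenDetails({ input_tokens: 1200, output_tokens: 80, input_token_details: { cache_read: 1000 } });
// => { cache_creation: 0, cache_read: 1000 }

// Anthropic-format usage metadata
getCacheTokenDetails({ input_tokens: 1200, output_tokens: 80, cache_creation_input_tokens: 900 });
// => { cache_creation: 900, cache_read: 0 }

When either value is non-zero, the abort path in the diff routes spending through spendStructuredTokens so cache writes and reads are accounted for separately; otherwise it falls back to plain spendTokens.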
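The memory-context bullets in the commit message describe mutating each parallel agent's config in place inside buildMessages so every agent shares the same user memory context, and the AgentClient documentation added by this commit calls out that in-place mutation explicitly. A rough sketch of the pattern under stated assumptions: the helper name, the Map-of-configs shape, and appending to an instructions field are all illustrative, not the repository's actual API.

// Illustrative only: propagate a memory context string to parallel agent configs by
// mutating the shared config objects in place, so agents holding references see it.
function addMemoryContextToParallelAgents(agentConfigs, memoryContext) {
  if (!memoryContext || !agentConfigs || agentConfigs.size === 0) {
    return; // nothing to propagate (no memory available, or no parallel agents configured)
  }
  for (const [agentId, agentConfig] of agentConfigs.entries()) {
    if (agentConfig.instructions) {
      // Agent already has instructions: append the memory context to them
      agentConfig.instructions = `${agentConfig.instructions}\n\n${memoryContext}`;
    } else {
      // No existing instructions: the memory context becomes the instructions
      agentConfig.instructions = memoryContext;
    }
    console.log(`[sketch] added memory context to parallel agent ${agentId}`);
  }
}

Because the config objects are mutated rather than copied, every consumer that already holds a reference to them sees the memory context before execution, which is the trade-off the commit's added documentation notes.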
@@ -7,13 +7,89 @@ const {
   sanitizeMessageForTransmit,
 } = require('@librechat/api');
 const { isAssistantsEndpoint, ErrorTypes } = require('librechat-data-provider');
+const { spendTokens, spendStructuredTokens } = require('~/models/spendTokens');
 const { truncateText, smartTruncateText } = require('~/app/clients/prompts');
 const clearPendingReq = require('~/cache/clearPendingReq');
 const { sendError } = require('~/server/middleware/error');
-const { spendTokens } = require('~/models/spendTokens');
 const { saveMessage, getConvo } = require('~/models');
 const { abortRun } = require('./abortRun');
 
+/**
+ * Spend tokens for all models from collected usage.
+ * This handles both sequential and parallel agent execution.
+ *
+ * IMPORTANT: After spending, this function clears the collectedUsage array
+ * to prevent double-spending. The array is shared with AgentClient.collectedUsage,
+ * so clearing it here prevents the finally block from also spending tokens.
+ *
+ * @param {Object} params
+ * @param {string} params.userId - User ID
+ * @param {string} params.conversationId - Conversation ID
+ * @param {Array<Object>} params.collectedUsage - Usage metadata from all models
+ * @param {string} [params.fallbackModel] - Fallback model name if not in usage
+ */
+async function spendCollectedUsage({ userId, conversationId, collectedUsage, fallbackModel }) {
+  if (!collectedUsage || collectedUsage.length === 0) {
+    return;
+  }
+
+  const spendPromises = [];
+
+  for (const usage of collectedUsage) {
+    if (!usage) {
+      continue;
+    }
+
+    // Support both OpenAI format (input_token_details) and Anthropic format (cache_*_input_tokens)
+    const cache_creation =
+      Number(usage.input_token_details?.cache_creation) ||
+      Number(usage.cache_creation_input_tokens) ||
+      0;
+    const cache_read =
+      Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
+
+    const txMetadata = {
+      context: 'abort',
+      conversationId,
+      user: userId,
+      model: usage.model ?? fallbackModel,
+    };
+
+    if (cache_creation > 0 || cache_read > 0) {
+      spendPromises.push(
+        spendStructuredTokens(txMetadata, {
+          promptTokens: {
+            input: usage.input_tokens,
+            write: cache_creation,
+            read: cache_read,
+          },
+          completionTokens: usage.output_tokens,
+        }).catch((err) => {
+          logger.error('[abortMiddleware] Error spending structured tokens for abort', err);
+        }),
+      );
+      continue;
+    }
+
+    spendPromises.push(
+      spendTokens(txMetadata, {
+        promptTokens: usage.input_tokens,
+        completionTokens: usage.output_tokens,
+      }).catch((err) => {
+        logger.error('[abortMiddleware] Error spending tokens for abort', err);
+      }),
+    );
+  }
+
+  // Wait for all token spending to complete
+  await Promise.all(spendPromises);
+
+  // Clear the array to prevent double-spending from the AgentClient finally block.
+  // The collectedUsage array is shared by reference with AgentClient.collectedUsage,
+  // so clearing it here ensures recordCollectedUsage() sees an empty array and returns early.
+  collectedUsage.length = 0;
+}
+
 /**
  * Abort an active message generation.
  * Uses GenerationJobManager for all agent requests.

@@ -39,9 +115,8 @@ async function abortMessage(req, res) {
     return;
   }
 
-  const { jobData, content, text } = abortResult;
+  const { jobData, content, text, collectedUsage } = abortResult;
 
   // Count tokens and spend them
   const completionTokens = await countTokens(text);
   const promptTokens = jobData?.promptTokens ?? 0;

@@ -62,10 +137,21 @@ async function abortMessage(req, res) {
     tokenCount: completionTokens,
   };
 
-  await spendTokens(
-    { ...responseMessage, context: 'incomplete', user: userId },
-    { promptTokens, completionTokens },
-  );
+  // Spend tokens for ALL models from collectedUsage (handles parallel agents/addedConvo)
+  if (collectedUsage && collectedUsage.length > 0) {
+    await spendCollectedUsage({
+      userId,
+      conversationId: jobData?.conversationId,
+      collectedUsage,
+      fallbackModel: jobData?.model,
+    });
+  } else {
+    // Fallback: no collected usage, use text-based token counting for primary model only
+    await spendTokens(
+      { ...responseMessage, context: 'incomplete', user: userId },
+      { promptTokens, completionTokens },
+    );
+  }
 
   await saveMessage(
     req,
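The double-spend guard in spendCollectedUsage relies on collectedUsage being the same array object that AgentClient.collectedUsage points to: the abort path spends first and then truncates the array in place, so the client's finally-block pass (recordCollectedUsage, per the comment in the diff) finds nothing left to spend. A self-contained sketch of that interaction, with the spending itself elided and both functions as stand-ins rather than the actual LibreChat implementations:

// One array, shared by reference between the abort middleware and the agent client.
const collectedUsage = [];

// Stand-in for the finally-block spending pass (the real one lives in AgentClient).
async function recordCollectedUsage(usageArray) {
  if (!usageArray || usageArray.length === 0) {
    return; // abort path already spent and cleared the usage: nothing left to do
  }
  // ...spend tokens for each entry...
}

// Stand-in for the abort path in spendCollectedUsage: spend, then truncate in place.
async function spendAndClear(usageArray) {
  // ...await Promise.all(spendPromises)...
  usageArray.length = 0; // truncating in place empties the array for every holder of the reference
}

(async () => {
  collectedUsage.push({ model: 'gpt-4o', input_tokens: 10, output_tokens: 5 });
  await spendAndClear(collectedUsage); // abort path settles the usage
  await recordCollectedUsage(collectedUsage); // sees length 0 and returns early: no double-spend
})();

Reassigning the variable (collectedUsage = []) would not have the same effect, since the other holder of the reference would still see the populated array; that is why the diff uses collectedUsage.length = 0.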