Mirror of https://github.com/danny-avila/LibreChat.git, synced 2026-01-24 19:26:14 +01:00
🪙 refactor: Collected Usage & Anthropic Prompt Caching (#11319)
Some checks are pending
Docker Dev Branch Images Build / build (Dockerfile, lc-dev, node) (push) Waiting to run
Docker Dev Branch Images Build / build (Dockerfile.multi, lc-dev-api, api-build) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile, librechat-dev, node) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile.multi, librechat-dev-api, api-build) (push) Waiting to run
Sync Locize Translations & Create Translation PR / Sync Translation Keys with Locize (push) Waiting to run
Sync Locize Translations & Create Translation PR / Create Translation PR on Version Published (push) Blocked by required conditions
* 🔧 refactor: Improve token calculation in AgentClient.recordCollectedUsage
  - Updated the token calculation logic to sum output tokens directly from all entries, addressing issues with negative values in parallel execution scenarios.
  - Added comments for clarity on the usage of input tokens and output tokens.
  - Introduced a new test file for comprehensive testing of the recordCollectedUsage function, covering various execution scenarios including sequential and parallel processing, cache token handling, and model fallback logic.
* 🔧 refactor: Anthropic `promptCache` handling in LLM configuration
* 🔧 test: Add comprehensive test for cache token handling in recordCollectedUsage
  - Introduced a new test case to validate the handling of cache tokens across multiple tool calls in the recordCollectedUsage function.
  - Ensured correct calculations for input and output tokens, including scenarios with cache creation and reading.
  - Verified the expected interactions with token spending methods to enhance the robustness of the token management logic.
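To make the first bullet concrete, here is a minimal sketch (not the repository implementation) comparing the removed incremental calculation with the new direct sum, using hypothetical usage entries from agents running in parallel. Because parallel agents each report the same base input_tokens rather than a cumulative total, the old delta (input_tokens minus previously accumulated tokens) goes negative and the total undercounts; summing output_tokens from every entry does not.

// Sketch only; sample values are hypothetical, cache fields omitted.
const collectedUsage = [
  { input_tokens: 1000, output_tokens: 200 },
  { input_tokens: 1000, output_tokens: 150 }, // parallel agent: same base context, not cumulative
  { input_tokens: 1000, output_tokens: 300 }, // another parallel agent
];

// Old incremental calculation (simplified from the code removed in this commit):
let output_tokens = 0;
let previousTokens = collectedUsage[0].input_tokens;
for (let i = 0; i < collectedUsage.length; i++) {
  const usage = collectedUsage[i];
  if (i > 0) {
    // Assumes each entry's input_tokens grew by all prior output; false for parallel agents.
    output_tokens += (Number(usage.input_tokens) || 0) - previousTokens;
  }
  output_tokens += Number(usage.output_tokens) || 0;
  previousTokens += Number(usage.output_tokens) || 0;
}
console.log(output_tokens); // 100: undercounts, and the per-step deltas were negative

// New calculation: sum output_tokens from every entry, so ordering and parallelism no longer matter.
const total_output_tokens = collectedUsage.reduce(
  (sum, usage) => sum + (Number(usage.output_tokens) || 0),
  0,
);
console.log(total_output_tokens); // 650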
parent 1329e16d3a
commit 2a50c372ef
8 changed files with 828 additions and 40 deletions
Diff excerpt (AgentClient.recordCollectedUsage):

@@ -784,6 +784,7 @@ class AgentClient extends BaseClient {
     if (!collectedUsage || !collectedUsage.length) {
       return;
     }
+    // Use first entry's input_tokens as the base input (represents initial user message context)
     // Support both OpenAI format (input_token_details) and Anthropic format (cache_*_input_tokens)
     const firstUsage = collectedUsage[0];
     const input_tokens =
@@ -795,10 +796,11 @@ class AgentClient extends BaseClient {
         Number(firstUsage?.cache_read_input_tokens) ||
         0);
 
-    let output_tokens = 0;
-    let previousTokens = input_tokens; // Start with original input
-    for (let i = 0; i < collectedUsage.length; i++) {
-      const usage = collectedUsage[i];
+    // Sum output_tokens directly from all entries - works for both sequential and parallel execution
+    // This avoids the incremental calculation that produced negative values for parallel agents
+    let total_output_tokens = 0;
+
+    for (const usage of collectedUsage) {
       if (!usage) {
         continue;
       }
@@ -811,6 +813,9 @@ class AgentClient extends BaseClient {
       const cache_read =
         Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
 
+      // Accumulate output tokens for the usage summary
+      total_output_tokens += Number(usage.output_tokens) || 0;
+
       const txMetadata = {
         context,
         balance,
@@ -821,18 +826,6 @@ class AgentClient extends BaseClient {
         model: usage.model ?? model ?? this.model ?? this.options.agent.model_parameters.model,
       };
 
-      if (i > 0) {
-        // Count new tokens generated (input_tokens minus previous accumulated tokens)
-        output_tokens +=
-          (Number(usage.input_tokens) || 0) + cache_creation + cache_read - previousTokens;
-      }
-
-      // Add this message's output tokens
-      output_tokens += Number(usage.output_tokens) || 0;
-
-      // Update previousTokens to include this message's output
-      previousTokens += Number(usage.output_tokens) || 0;
-
       if (cache_creation > 0 || cache_read > 0) {
         spendStructuredTokens(txMetadata, {
           promptTokens: {
@@ -862,7 +855,7 @@ class AgentClient extends BaseClient {
 
     this.usage = {
       input_tokens,
-      output_tokens,
+      output_tokens: total_output_tokens,
     };
   }
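For reference, a small sketch (not repository code; sample values are hypothetical) of the two usage shapes the loop above normalizes. The OpenAI-style field names come from the input_token_details access in the diff, and the Anthropic-style names from cache_creation_input_tokens / cache_read_input_tokens; the cache_creation extraction below is assumed to mirror the cache_read line shown in the hunk.

// Hypothetical payloads illustrating the two cache-token formats handled above.
const anthropicStyle = {
  input_tokens: 12, // Anthropic reports non-cached input separately
  cache_creation_input_tokens: 900,
  cache_read_input_tokens: 0,
  output_tokens: 250,
};
const openAIStyle = {
  input_tokens: 912,
  input_token_details: { cache_creation: 900, cache_read: 0 },
  output_tokens: 250,
};

// Normalization sketch: prefer input_token_details, fall back to the *_input_tokens fields
// (the cache_read branch matches the diff; cache_creation is assumed analogous).
function cacheTokens(usage) {
  const cache_creation =
    Number(usage.input_token_details?.cache_creation) ||
    Number(usage.cache_creation_input_tokens) ||
    0;
  const cache_read =
    Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
  return { cache_creation, cache_read };
}

console.log(cacheTokens(anthropicStyle)); // { cache_creation: 900, cache_read: 0 }
console.log(cacheTokens(openAIStyle)); // { cache_creation: 900, cache_read: 0 }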