Mirror of https://github.com/danny-avila/LibreChat.git (synced 2026-01-22 10:16:13 +01:00)
💰 fix: Multi-Agent Token Spending & Prevent Double-Spend (#11433)
* fix: Token Spending Logic for Multi-Agents on Abort Scenarios
* Implemented logic to skip token spending if a conversation is aborted, preventing double-spending.
* Introduced `spendCollectedUsage` function to handle token spending for multiple models during aborts, ensuring accurate accounting for parallel agents.
* Updated `GenerationJobManager` to store and retrieve collected usage data for improved abort handling.
* Added comprehensive tests for the new functionality, covering various scenarios including cache token handling and parallel agent usage.
* fix: Memory Context Handling for Multi-Agents
* Refactored `buildMessages` method to pass memory context to parallel agents, ensuring they share the same user context.
* Improved handling of memory context when no existing instructions are present for parallel agents.
* Added comprehensive tests to verify memory context propagation and behavior under various scenarios, including cases with no memory available and empty agent configurations.
* Enhanced logging for better traceability of memory context additions to agents.
* chore: Memory Context Documentation for Parallel Agents
* Updated documentation in the `AgentClient` class to clarify the in-place mutation of agentConfig objects when passing memory context to parallel agents.
* Added notes on the implications of mutating objects directly to ensure all parallel agents receive the correct memory context before execution.
* chore: UsageMetadata Interface docs for Token Spending
* Expanded the UsageMetadata interface to support both OpenAI and Anthropic cache token formats.
* Added detailed documentation for cache token properties, including mutually exclusive fields for different model types.
* Improved clarity on how to access cache token details for accurate token spending tracking.
* fix: Enhance Token Spending Logic in Abort Middleware
* Refactored `spendCollectedUsage` function to utilize Promise.all for concurrent token spending, improving performance and ensuring all operations complete before clearing the collectedUsage array.
* Added documentation to clarify the importance of clearing the collectedUsage array to prevent double-spending in abort scenarios.
* Updated tests to verify the correct behavior of the spending logic and the clearing of the array after spending operations.
Parent 32e6f3b8e5 / commit 36c5a88c4e
11 changed files with 1440 additions and 28 deletions
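The central mechanism behind the double-spend fix is that the `collectedUsage` array is shared by reference between `AgentClient` and the abort middleware (wired through `GenerationJobManager.setCollectedUsage`), so whichever side spends first can truncate the array in place and the other side then sees it as empty. Below is a minimal, self-contained TypeScript sketch of that pattern; the names are illustrative, not the actual PR code:

```typescript
/** Illustrative shape of one usage entry (mirrors the spirit of UsageMetadata). */
interface UsageEntry {
  input_tokens?: number;
  output_tokens?: number;
  model?: string;
}

/** Abort-side spend: settle every model captured so far, then clear in place. */
async function spendOnAbort(collectedUsage: UsageEntry[]): Promise<void> {
  if (!collectedUsage || collectedUsage.length === 0) {
    return;
  }
  await Promise.all(
    collectedUsage.map(async (usage) => {
      if (!usage) {
        return;
      }
      // Stand-in for spendTokens / spendStructuredTokens.
      console.log(`abort spend: ${usage.model} in=${usage.input_tokens} out=${usage.output_tokens}`);
    }),
  );
  // Truncate in place: every other holder of this array reference now sees [].
  collectedUsage.length = 0;
}

/** Client-side cleanup: skips spending when aborted or already settled. */
async function recordCollectedUsage(collectedUsage: UsageEntry[], aborted: boolean): Promise<void> {
  if (aborted || collectedUsage.length === 0) {
    return; // the abort path already spent and cleared the usage
  }
  console.log(`normal spend for ${collectedUsage.length} usage entries`);
}

async function demo(): Promise<void> {
  // One shared array instance, as both agents push their usage into it.
  const collectedUsage: UsageEntry[] = [
    { input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
    { input_tokens: 80, output_tokens: 40, model: 'claude-3' },
  ];
  await spendOnAbort(collectedUsage); // spends both models exactly once
  await recordCollectedUsage(collectedUsage, true); // no-op, so no double-spend
}

void demo();
```

The `collectedUsage.length = 0;` line in the `abortMiddleware.js` hunk below relies on exactly this shared-reference behavior.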
|
|
@@ -522,14 +522,36 @@ class AgentClient extends BaseClient {
|
|||
}
|
||||
|
||||
const withoutKeys = await this.useMemory();
|
||||
if (withoutKeys) {
|
||||
systemContent += `${memoryInstructions}\n\n# Existing memory about the user:\n${withoutKeys}`;
|
||||
const memoryContext = withoutKeys
|
||||
? `${memoryInstructions}\n\n# Existing memory about the user:\n${withoutKeys}`
|
||||
: '';
|
||||
if (memoryContext) {
|
||||
systemContent += memoryContext;
|
||||
}
|
||||
|
||||
if (systemContent) {
|
||||
this.options.agent.instructions = systemContent;
|
||||
}
|
||||
|
||||
/**
|
||||
* Pass memory context to parallel agents (addedConvo) so they have the same user context.
|
||||
*
|
||||
* NOTE: This intentionally mutates the agentConfig objects in place. The agentConfigs Map
|
||||
* holds references to config objects that will be passed to the graph runtime. Mutating
|
||||
* them here ensures all parallel agents receive the memory context before execution starts.
|
||||
* Creating new objects would not work because the Map references would still point to the old objects.
|
||||
*/
|
||||
if (memoryContext && this.agentConfigs?.size > 0) {
|
||||
for (const [agentId, agentConfig] of this.agentConfigs.entries()) {
|
||||
if (agentConfig.instructions) {
|
||||
agentConfig.instructions = agentConfig.instructions + '\n\n' + memoryContext;
|
||||
} else {
|
||||
agentConfig.instructions = memoryContext;
|
||||
}
|
||||
logger.debug(`[AgentClient] Added memory context to parallel agent: ${agentId}`);
|
||||
}
|
||||
}
|
||||
|
||||
return result;
|
||||
}
|
||||
|
||||
|
|
@@ -1084,11 +1106,20 @@ class AgentClient extends BaseClient {
|
|||
this.artifactPromises.push(...attachments);
|
||||
}
|
||||
|
||||
await this.recordCollectedUsage({
|
||||
context: 'message',
|
||||
balance: balanceConfig,
|
||||
transactions: transactionsConfig,
|
||||
});
|
||||
/** Skip token spending if aborted - the abort handler (abortMiddleware.js) handles it
|
||||
This prevents double-spending when user aborts via `/api/agents/chat/abort` */
|
||||
const wasAborted = abortController?.signal?.aborted;
|
||||
if (!wasAborted) {
|
||||
await this.recordCollectedUsage({
|
||||
context: 'message',
|
||||
balance: balanceConfig,
|
||||
transactions: transactionsConfig,
|
||||
});
|
||||
} else {
|
||||
logger.debug(
|
||||
'[api/server/controllers/agents/client.js #chatCompletion] Skipping token spending - handled by abort middleware',
|
||||
);
|
||||
}
|
||||
} catch (err) {
|
||||
logger.error(
|
||||
'[api/server/controllers/agents/client.js #chatCompletion] Error in cleanup phase',
|
||||
|
|
|
|||
|
|
@@ -1849,4 +1849,224 @@ describe('AgentClient - titleConvo', () => {
|
|||
});
|
||||
});
|
||||
});
|
||||
|
||||
describe('buildMessages - memory context for parallel agents', () => {
|
||||
let client;
|
||||
let mockReq;
|
||||
let mockRes;
|
||||
let mockAgent;
|
||||
let mockOptions;
|
||||
|
||||
beforeEach(() => {
|
||||
jest.clearAllMocks();
|
||||
|
||||
mockAgent = {
|
||||
id: 'primary-agent',
|
||||
name: 'Primary Agent',
|
||||
endpoint: EModelEndpoint.openAI,
|
||||
provider: EModelEndpoint.openAI,
|
||||
instructions: 'Primary agent instructions',
|
||||
model_parameters: {
|
||||
model: 'gpt-4',
|
||||
},
|
||||
tools: [],
|
||||
};
|
||||
|
||||
mockReq = {
|
||||
user: {
|
||||
id: 'user-123',
|
||||
personalization: {
|
||||
memories: true,
|
||||
},
|
||||
},
|
||||
body: {
|
||||
endpoint: EModelEndpoint.openAI,
|
||||
},
|
||||
config: {
|
||||
memory: {
|
||||
disabled: false,
|
||||
},
|
||||
},
|
||||
};
|
||||
|
||||
mockRes = {};
|
||||
|
||||
mockOptions = {
|
||||
req: mockReq,
|
||||
res: mockRes,
|
||||
agent: mockAgent,
|
||||
endpoint: EModelEndpoint.agents,
|
||||
};
|
||||
|
||||
client = new AgentClient(mockOptions);
|
||||
client.conversationId = 'convo-123';
|
||||
client.responseMessageId = 'response-123';
|
||||
client.shouldSummarize = false;
|
||||
client.maxContextTokens = 4096;
|
||||
});
|
||||
|
||||
it('should pass memory context to parallel agents (addedConvo)', async () => {
|
||||
const memoryContent = 'User prefers dark mode. User is a software developer.';
|
||||
client.useMemory = jest.fn().mockResolvedValue(memoryContent);
|
||||
|
||||
const parallelAgent1 = {
|
||||
id: 'parallel-agent-1',
|
||||
name: 'Parallel Agent 1',
|
||||
instructions: 'Parallel agent 1 instructions',
|
||||
provider: EModelEndpoint.openAI,
|
||||
};
|
||||
|
||||
const parallelAgent2 = {
|
||||
id: 'parallel-agent-2',
|
||||
name: 'Parallel Agent 2',
|
||||
instructions: 'Parallel agent 2 instructions',
|
||||
provider: EModelEndpoint.anthropic,
|
||||
};
|
||||
|
||||
client.agentConfigs = new Map([
|
||||
['parallel-agent-1', parallelAgent1],
|
||||
['parallel-agent-2', parallelAgent2],
|
||||
]);
|
||||
|
||||
const messages = [
|
||||
{
|
||||
messageId: 'msg-1',
|
||||
parentMessageId: null,
|
||||
sender: 'User',
|
||||
text: 'Hello',
|
||||
isCreatedByUser: true,
|
||||
},
|
||||
];
|
||||
|
||||
await client.buildMessages(messages, null, {
|
||||
instructions: 'Base instructions',
|
||||
additional_instructions: null,
|
||||
});
|
||||
|
||||
expect(client.useMemory).toHaveBeenCalled();
|
||||
|
||||
expect(client.options.agent.instructions).toContain('Base instructions');
|
||||
expect(client.options.agent.instructions).toContain(memoryContent);
|
||||
|
||||
expect(parallelAgent1.instructions).toContain('Parallel agent 1 instructions');
|
||||
expect(parallelAgent1.instructions).toContain(memoryContent);
|
||||
|
||||
expect(parallelAgent2.instructions).toContain('Parallel agent 2 instructions');
|
||||
expect(parallelAgent2.instructions).toContain(memoryContent);
|
||||
});
|
||||
|
||||
it('should not modify parallel agents when no memory context is available', async () => {
|
||||
client.useMemory = jest.fn().mockResolvedValue(undefined);
|
||||
|
||||
const parallelAgent = {
|
||||
id: 'parallel-agent-1',
|
||||
name: 'Parallel Agent 1',
|
||||
instructions: 'Original parallel instructions',
|
||||
provider: EModelEndpoint.openAI,
|
||||
};
|
||||
|
||||
client.agentConfigs = new Map([['parallel-agent-1', parallelAgent]]);
|
||||
|
||||
const messages = [
|
||||
{
|
||||
messageId: 'msg-1',
|
||||
parentMessageId: null,
|
||||
sender: 'User',
|
||||
text: 'Hello',
|
||||
isCreatedByUser: true,
|
||||
},
|
||||
];
|
||||
|
||||
await client.buildMessages(messages, null, {
|
||||
instructions: 'Base instructions',
|
||||
additional_instructions: null,
|
||||
});
|
||||
|
||||
expect(parallelAgent.instructions).toBe('Original parallel instructions');
|
||||
});
|
||||
|
||||
it('should handle parallel agents without existing instructions', async () => {
|
||||
const memoryContent = 'User is a data scientist.';
|
||||
client.useMemory = jest.fn().mockResolvedValue(memoryContent);
|
||||
|
||||
const parallelAgentNoInstructions = {
|
||||
id: 'parallel-agent-no-instructions',
|
||||
name: 'Parallel Agent No Instructions',
|
||||
provider: EModelEndpoint.openAI,
|
||||
};
|
||||
|
||||
client.agentConfigs = new Map([
|
||||
['parallel-agent-no-instructions', parallelAgentNoInstructions],
|
||||
]);
|
||||
|
||||
const messages = [
|
||||
{
|
||||
messageId: 'msg-1',
|
||||
parentMessageId: null,
|
||||
sender: 'User',
|
||||
text: 'Hello',
|
||||
isCreatedByUser: true,
|
||||
},
|
||||
];
|
||||
|
||||
await client.buildMessages(messages, null, {
|
||||
instructions: null,
|
||||
additional_instructions: null,
|
||||
});
|
||||
|
||||
expect(parallelAgentNoInstructions.instructions).toContain(memoryContent);
|
||||
});
|
||||
|
||||
it('should not modify agentConfigs when none exist', async () => {
|
||||
const memoryContent = 'User prefers concise responses.';
|
||||
client.useMemory = jest.fn().mockResolvedValue(memoryContent);
|
||||
|
||||
client.agentConfigs = null;
|
||||
|
||||
const messages = [
|
||||
{
|
||||
messageId: 'msg-1',
|
||||
parentMessageId: null,
|
||||
sender: 'User',
|
||||
text: 'Hello',
|
||||
isCreatedByUser: true,
|
||||
},
|
||||
];
|
||||
|
||||
await expect(
|
||||
client.buildMessages(messages, null, {
|
||||
instructions: 'Base instructions',
|
||||
additional_instructions: null,
|
||||
}),
|
||||
).resolves.not.toThrow();
|
||||
|
||||
expect(client.options.agent.instructions).toContain(memoryContent);
|
||||
});
|
||||
|
||||
it('should handle empty agentConfigs map', async () => {
|
||||
const memoryContent = 'User likes detailed explanations.';
|
||||
client.useMemory = jest.fn().mockResolvedValue(memoryContent);
|
||||
|
||||
client.agentConfigs = new Map();
|
||||
|
||||
const messages = [
|
||||
{
|
||||
messageId: 'msg-1',
|
||||
parentMessageId: null,
|
||||
sender: 'User',
|
||||
text: 'Hello',
|
||||
isCreatedByUser: true,
|
||||
},
|
||||
];
|
||||
|
||||
await expect(
|
||||
client.buildMessages(messages, null, {
|
||||
instructions: 'Base instructions',
|
||||
additional_instructions: null,
|
||||
}),
|
||||
).resolves.not.toThrow();
|
||||
|
||||
expect(client.options.agent.instructions).toContain(memoryContent);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
|
|
@@ -7,13 +7,89 @@ const {
|
|||
sanitizeMessageForTransmit,
|
||||
} = require('@librechat/api');
|
||||
const { isAssistantsEndpoint, ErrorTypes } = require('librechat-data-provider');
|
||||
const { spendTokens, spendStructuredTokens } = require('~/models/spendTokens');
|
||||
const { truncateText, smartTruncateText } = require('~/app/clients/prompts');
|
||||
const clearPendingReq = require('~/cache/clearPendingReq');
|
||||
const { sendError } = require('~/server/middleware/error');
|
||||
const { spendTokens } = require('~/models/spendTokens');
|
||||
const { saveMessage, getConvo } = require('~/models');
|
||||
const { abortRun } = require('./abortRun');
|
||||
|
||||
/**
|
||||
* Spend tokens for all models from collected usage.
|
||||
* This handles both sequential and parallel agent execution.
|
||||
*
|
||||
* IMPORTANT: After spending, this function clears the collectedUsage array
|
||||
* to prevent double-spending. The array is shared with AgentClient.collectedUsage,
|
||||
* so clearing it here prevents the finally block from also spending tokens.
|
||||
*
|
||||
* @param {Object} params
|
||||
* @param {string} params.userId - User ID
|
||||
* @param {string} params.conversationId - Conversation ID
|
||||
* @param {Array<Object>} params.collectedUsage - Usage metadata from all models
|
||||
* @param {string} [params.fallbackModel] - Fallback model name if not in usage
|
||||
*/
|
||||
async function spendCollectedUsage({ userId, conversationId, collectedUsage, fallbackModel }) {
|
||||
if (!collectedUsage || collectedUsage.length === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const spendPromises = [];
|
||||
|
||||
for (const usage of collectedUsage) {
|
||||
if (!usage) {
|
||||
continue;
|
||||
}
|
||||
|
||||
// Support both OpenAI format (input_token_details) and Anthropic format (cache_*_input_tokens)
|
||||
const cache_creation =
|
||||
Number(usage.input_token_details?.cache_creation) ||
|
||||
Number(usage.cache_creation_input_tokens) ||
|
||||
0;
|
||||
const cache_read =
|
||||
Number(usage.input_token_details?.cache_read) || Number(usage.cache_read_input_tokens) || 0;
|
||||
|
||||
const txMetadata = {
|
||||
context: 'abort',
|
||||
conversationId,
|
||||
user: userId,
|
||||
model: usage.model ?? fallbackModel,
|
||||
};
|
||||
|
||||
if (cache_creation > 0 || cache_read > 0) {
|
||||
spendPromises.push(
|
||||
spendStructuredTokens(txMetadata, {
|
||||
promptTokens: {
|
||||
input: usage.input_tokens,
|
||||
write: cache_creation,
|
||||
read: cache_read,
|
||||
},
|
||||
completionTokens: usage.output_tokens,
|
||||
}).catch((err) => {
|
||||
logger.error('[abortMiddleware] Error spending structured tokens for abort', err);
|
||||
}),
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
spendPromises.push(
|
||||
spendTokens(txMetadata, {
|
||||
promptTokens: usage.input_tokens,
|
||||
completionTokens: usage.output_tokens,
|
||||
}).catch((err) => {
|
||||
logger.error('[abortMiddleware] Error spending tokens for abort', err);
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// Wait for all token spending to complete
|
||||
await Promise.all(spendPromises);
|
||||
|
||||
// Clear the array to prevent double-spending from the AgentClient finally block.
|
||||
// The collectedUsage array is shared by reference with AgentClient.collectedUsage,
|
||||
// so clearing it here ensures recordCollectedUsage() sees an empty array and returns early.
|
||||
collectedUsage.length = 0;
|
||||
}
|
||||
|
||||
/**
|
||||
* Abort an active message generation.
|
||||
* Uses GenerationJobManager for all agent requests.
|
||||
|
|
@@ -39,9 +115,8 @@ async function abortMessage(req, res) {
|
|||
return;
|
||||
}
|
||||
|
||||
const { jobData, content, text } = abortResult;
|
||||
const { jobData, content, text, collectedUsage } = abortResult;
|
||||
|
||||
// Count tokens and spend them
|
||||
const completionTokens = await countTokens(text);
|
||||
const promptTokens = jobData?.promptTokens ?? 0;
|
||||
|
||||
|
|
@@ -62,10 +137,21 @@ async function abortMessage(req, res) {
|
|||
tokenCount: completionTokens,
|
||||
};
|
||||
|
||||
await spendTokens(
|
||||
{ ...responseMessage, context: 'incomplete', user: userId },
|
||||
{ promptTokens, completionTokens },
|
||||
);
|
||||
// Spend tokens for ALL models from collectedUsage (handles parallel agents/addedConvo)
|
||||
if (collectedUsage && collectedUsage.length > 0) {
|
||||
await spendCollectedUsage({
|
||||
userId,
|
||||
conversationId: jobData?.conversationId,
|
||||
collectedUsage,
|
||||
fallbackModel: jobData?.model,
|
||||
});
|
||||
} else {
|
||||
// Fallback: no collected usage, use text-based token counting for primary model only
|
||||
await spendTokens(
|
||||
{ ...responseMessage, context: 'incomplete', user: userId },
|
||||
{ promptTokens, completionTokens },
|
||||
);
|
||||
}
|
||||
|
||||
await saveMessage(
|
||||
req,
|
||||
|
|
|
|||
api/server/middleware/abortMiddleware.spec.js (new file, 428 lines)
|
|
@@ -0,0 +1,428 @@
|
|||
/**
|
||||
* Tests for abortMiddleware - spendCollectedUsage function
|
||||
*
|
||||
* This tests the token spending logic for abort scenarios,
|
||||
* particularly for parallel agents (addedConvo) where multiple
|
||||
* models need their tokens spent.
|
||||
*/
|
||||
|
||||
const mockSpendTokens = jest.fn().mockResolvedValue();
|
||||
const mockSpendStructuredTokens = jest.fn().mockResolvedValue();
|
||||
|
||||
jest.mock('~/models/spendTokens', () => ({
|
||||
spendTokens: (...args) => mockSpendTokens(...args),
|
||||
spendStructuredTokens: (...args) => mockSpendStructuredTokens(...args),
|
||||
}));
|
||||
|
||||
jest.mock('@librechat/data-schemas', () => ({
|
||||
logger: {
|
||||
debug: jest.fn(),
|
||||
error: jest.fn(),
|
||||
warn: jest.fn(),
|
||||
info: jest.fn(),
|
||||
},
|
||||
}));
|
||||
|
||||
jest.mock('@librechat/api', () => ({
|
||||
countTokens: jest.fn().mockResolvedValue(100),
|
||||
isEnabled: jest.fn().mockReturnValue(false),
|
||||
sendEvent: jest.fn(),
|
||||
GenerationJobManager: {
|
||||
abortJob: jest.fn(),
|
||||
},
|
||||
sanitizeMessageForTransmit: jest.fn((msg) => msg),
|
||||
}));
|
||||
|
||||
jest.mock('librechat-data-provider', () => ({
|
||||
isAssistantsEndpoint: jest.fn().mockReturnValue(false),
|
||||
ErrorTypes: { INVALID_REQUEST: 'INVALID_REQUEST', NO_SYSTEM_MESSAGES: 'NO_SYSTEM_MESSAGES' },
|
||||
}));
|
||||
|
||||
jest.mock('~/app/clients/prompts', () => ({
|
||||
truncateText: jest.fn((text) => text),
|
||||
smartTruncateText: jest.fn((text) => text),
|
||||
}));
|
||||
|
||||
jest.mock('~/cache/clearPendingReq', () => jest.fn().mockResolvedValue());
|
||||
|
||||
jest.mock('~/server/middleware/error', () => ({
|
||||
sendError: jest.fn(),
|
||||
}));
|
||||
|
||||
jest.mock('~/models', () => ({
|
||||
saveMessage: jest.fn().mockResolvedValue(),
|
||||
getConvo: jest.fn().mockResolvedValue({ title: 'Test Chat' }),
|
||||
}));
|
||||
|
||||
jest.mock('./abortRun', () => ({
|
||||
abortRun: jest.fn(),
|
||||
}));
|
||||
|
||||
// Import the module after mocks are set up
|
||||
// We need to extract the spendCollectedUsage function for testing
|
||||
// Since it's not exported, we'll test it through the handleAbort flow
|
||||
|
||||
describe('abortMiddleware - spendCollectedUsage', () => {
|
||||
beforeEach(() => {
|
||||
jest.clearAllMocks();
|
||||
});
|
||||
|
||||
describe('spendCollectedUsage logic', () => {
|
||||
// Since spendCollectedUsage is not exported, we test the logic directly
|
||||
// by replicating the function here for unit testing
|
||||
|
||||
const spendCollectedUsage = async ({
|
||||
userId,
|
||||
conversationId,
|
||||
collectedUsage,
|
||||
fallbackModel,
|
||||
}) => {
|
||||
if (!collectedUsage || collectedUsage.length === 0) {
|
||||
return;
|
||||
}
|
||||
|
||||
const spendPromises = [];
|
||||
|
||||
for (const usage of collectedUsage) {
|
||||
if (!usage) {
|
||||
continue;
|
||||
}
|
||||
|
||||
const cache_creation =
|
||||
Number(usage.input_token_details?.cache_creation) ||
|
||||
Number(usage.cache_creation_input_tokens) ||
|
||||
0;
|
||||
const cache_read =
|
||||
Number(usage.input_token_details?.cache_read) ||
|
||||
Number(usage.cache_read_input_tokens) ||
|
||||
0;
|
||||
|
||||
const txMetadata = {
|
||||
context: 'abort',
|
||||
conversationId,
|
||||
user: userId,
|
||||
model: usage.model ?? fallbackModel,
|
||||
};
|
||||
|
||||
if (cache_creation > 0 || cache_read > 0) {
|
||||
spendPromises.push(
|
||||
mockSpendStructuredTokens(txMetadata, {
|
||||
promptTokens: {
|
||||
input: usage.input_tokens,
|
||||
write: cache_creation,
|
||||
read: cache_read,
|
||||
},
|
||||
completionTokens: usage.output_tokens,
|
||||
}).catch(() => {
|
||||
// Log error but don't throw
|
||||
}),
|
||||
);
|
||||
continue;
|
||||
}
|
||||
|
||||
spendPromises.push(
|
||||
mockSpendTokens(txMetadata, {
|
||||
promptTokens: usage.input_tokens,
|
||||
completionTokens: usage.output_tokens,
|
||||
}).catch(() => {
|
||||
// Log error but don't throw
|
||||
}),
|
||||
);
|
||||
}
|
||||
|
||||
// Wait for all token spending to complete
|
||||
await Promise.all(spendPromises);
|
||||
|
||||
// Clear the array to prevent double-spending
|
||||
collectedUsage.length = 0;
|
||||
};
|
||||
|
||||
it('should return early if collectedUsage is empty', async () => {
|
||||
await spendCollectedUsage({
|
||||
userId: 'user-123',
|
||||
conversationId: 'convo-123',
|
||||
collectedUsage: [],
|
||||
fallbackModel: 'gpt-4',
|
||||
});
|
||||
|
||||
expect(mockSpendTokens).not.toHaveBeenCalled();
|
||||
expect(mockSpendStructuredTokens).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should return early if collectedUsage is null', async () => {
|
||||
await spendCollectedUsage({
|
||||
userId: 'user-123',
|
||||
conversationId: 'convo-123',
|
||||
collectedUsage: null,
|
||||
fallbackModel: 'gpt-4',
|
||||
});
|
||||
|
||||
expect(mockSpendTokens).not.toHaveBeenCalled();
|
||||
expect(mockSpendStructuredTokens).not.toHaveBeenCalled();
|
||||
});
|
||||
|
||||
it('should skip null entries in collectedUsage', async () => {
|
||||
const collectedUsage = [
|
||||
{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
|
||||
null,
|
||||
{ input_tokens: 200, output_tokens: 60, model: 'gpt-4' },
|
||||
];
|
||||
|
||||
await spendCollectedUsage({
|
||||
userId: 'user-123',
|
||||
conversationId: 'convo-123',
|
||||
collectedUsage,
|
||||
fallbackModel: 'gpt-4',
|
||||
});
|
||||
|
||||
expect(mockSpendTokens).toHaveBeenCalledTimes(2);
|
||||
});
|
||||
|
||||
it('should spend tokens for single model', async () => {
|
||||
const collectedUsage = [{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' }];
|
||||
|
||||
await spendCollectedUsage({
|
||||
userId: 'user-123',
|
||||
conversationId: 'convo-123',
|
||||
collectedUsage,
|
||||
fallbackModel: 'gpt-4',
|
||||
});
|
||||
|
||||
expect(mockSpendTokens).toHaveBeenCalledTimes(1);
|
||||
expect(mockSpendTokens).toHaveBeenCalledWith(
|
||||
expect.objectContaining({
|
||||
context: 'abort',
|
||||
conversationId: 'convo-123',
|
||||
user: 'user-123',
|
||||
model: 'gpt-4',
|
||||
}),
|
||||
{ promptTokens: 100, completionTokens: 50 },
|
||||
);
|
||||
});
|
||||
|
||||
it('should spend tokens for multiple models (parallel agents)', async () => {
|
||||
const collectedUsage = [
|
||||
{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
|
||||
{ input_tokens: 80, output_tokens: 40, model: 'claude-3' },
|
||||
{ input_tokens: 120, output_tokens: 60, model: 'gemini-pro' },
|
||||
];
|
||||
|
||||
await spendCollectedUsage({
|
||||
userId: 'user-123',
|
||||
conversationId: 'convo-123',
|
||||
collectedUsage,
|
||||
fallbackModel: 'gpt-4',
|
||||
});
|
||||
|
||||
expect(mockSpendTokens).toHaveBeenCalledTimes(3);
|
||||
|
||||
// Verify each model was called
|
||||
expect(mockSpendTokens).toHaveBeenNthCalledWith(
|
||||
1,
|
||||
expect.objectContaining({ model: 'gpt-4' }),
|
||||
{ promptTokens: 100, completionTokens: 50 },
|
||||
);
|
||||
expect(mockSpendTokens).toHaveBeenNthCalledWith(
|
||||
2,
|
||||
expect.objectContaining({ model: 'claude-3' }),
|
||||
{ promptTokens: 80, completionTokens: 40 },
|
||||
);
|
||||
expect(mockSpendTokens).toHaveBeenNthCalledWith(
|
||||
3,
|
||||
expect.objectContaining({ model: 'gemini-pro' }),
|
||||
{ promptTokens: 120, completionTokens: 60 },
|
||||
);
|
||||
});
|
||||
|
||||
it('should use fallbackModel when usage.model is missing', async () => {
|
||||
const collectedUsage = [{ input_tokens: 100, output_tokens: 50 }];
|
||||
|
||||
await spendCollectedUsage({
|
||||
userId: 'user-123',
|
||||
conversationId: 'convo-123',
|
||||
collectedUsage,
|
||||
fallbackModel: 'fallback-model',
|
||||
});
|
||||
|
||||
expect(mockSpendTokens).toHaveBeenCalledWith(
|
||||
expect.objectContaining({ model: 'fallback-model' }),
|
||||
expect.any(Object),
|
||||
);
|
||||
});
|
||||
|
||||
it('should use spendStructuredTokens for OpenAI format cache tokens', async () => {
|
||||
const collectedUsage = [
|
||||
{
|
||||
input_tokens: 100,
|
||||
output_tokens: 50,
|
||||
model: 'gpt-4',
|
||||
input_token_details: {
|
||||
cache_creation: 20,
|
||||
cache_read: 10,
|
||||
},
|
||||
},
|
||||
];
|
||||
|
||||
await spendCollectedUsage({
|
||||
userId: 'user-123',
|
||||
conversationId: 'convo-123',
|
||||
collectedUsage,
|
||||
fallbackModel: 'gpt-4',
|
||||
});
|
||||
|
||||
expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1);
|
||||
expect(mockSpendTokens).not.toHaveBeenCalled();
|
||||
expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
|
||||
expect.objectContaining({ model: 'gpt-4', context: 'abort' }),
|
||||
{
|
||||
promptTokens: {
|
||||
input: 100,
|
||||
write: 20,
|
||||
read: 10,
|
||||
},
|
||||
completionTokens: 50,
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
it('should use spendStructuredTokens for Anthropic format cache tokens', async () => {
|
||||
const collectedUsage = [
|
||||
{
|
||||
input_tokens: 100,
|
||||
output_tokens: 50,
|
||||
model: 'claude-3',
|
||||
cache_creation_input_tokens: 25,
|
||||
cache_read_input_tokens: 15,
|
||||
},
|
||||
];
|
||||
|
||||
await spendCollectedUsage({
|
||||
userId: 'user-123',
|
||||
conversationId: 'convo-123',
|
||||
collectedUsage,
|
||||
fallbackModel: 'claude-3',
|
||||
});
|
||||
|
||||
expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1);
|
||||
expect(mockSpendTokens).not.toHaveBeenCalled();
|
||||
expect(mockSpendStructuredTokens).toHaveBeenCalledWith(
|
||||
expect.objectContaining({ model: 'claude-3' }),
|
||||
{
|
||||
promptTokens: {
|
||||
input: 100,
|
||||
write: 25,
|
||||
read: 15,
|
||||
},
|
||||
completionTokens: 50,
|
||||
},
|
||||
);
|
||||
});
|
||||
|
||||
it('should handle mixed cache and non-cache entries', async () => {
|
||||
const collectedUsage = [
|
||||
{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
|
||||
{
|
||||
input_tokens: 150,
|
||||
output_tokens: 30,
|
||||
model: 'claude-3',
|
||||
cache_creation_input_tokens: 20,
|
||||
cache_read_input_tokens: 10,
|
||||
},
|
||||
{ input_tokens: 200, output_tokens: 20, model: 'gemini-pro' },
|
||||
];
|
||||
|
||||
await spendCollectedUsage({
|
||||
userId: 'user-123',
|
||||
conversationId: 'convo-123',
|
||||
collectedUsage,
|
||||
fallbackModel: 'gpt-4',
|
||||
});
|
||||
|
||||
expect(mockSpendTokens).toHaveBeenCalledTimes(2);
|
||||
expect(mockSpendStructuredTokens).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
|
||||
it('should handle real-world parallel agent abort scenario', async () => {
|
||||
// Simulates: Primary agent (gemini) + addedConvo agent (gpt-5) aborted mid-stream
|
||||
const collectedUsage = [
|
||||
{ input_tokens: 31596, output_tokens: 151, model: 'gemini-3-flash-preview' },
|
||||
{ input_tokens: 28000, output_tokens: 120, model: 'gpt-5.2' },
|
||||
];
|
||||
|
||||
await spendCollectedUsage({
|
||||
userId: 'user-123',
|
||||
conversationId: 'convo-123',
|
||||
collectedUsage,
|
||||
fallbackModel: 'gemini-3-flash-preview',
|
||||
});
|
||||
|
||||
expect(mockSpendTokens).toHaveBeenCalledTimes(2);
|
||||
|
||||
// Primary model
|
||||
expect(mockSpendTokens).toHaveBeenNthCalledWith(
|
||||
1,
|
||||
expect.objectContaining({ model: 'gemini-3-flash-preview' }),
|
||||
{ promptTokens: 31596, completionTokens: 151 },
|
||||
);
|
||||
|
||||
// Parallel model (addedConvo)
|
||||
expect(mockSpendTokens).toHaveBeenNthCalledWith(
|
||||
2,
|
||||
expect.objectContaining({ model: 'gpt-5.2' }),
|
||||
{ promptTokens: 28000, completionTokens: 120 },
|
||||
);
|
||||
});
|
||||
|
||||
it('should clear collectedUsage array after spending to prevent double-spending', async () => {
|
||||
// This tests the race condition fix: after abort middleware spends tokens,
|
||||
// the collectedUsage array is cleared so AgentClient.recordCollectedUsage()
|
||||
// (which shares the same array reference) sees an empty array and returns early.
|
||||
const collectedUsage = [
|
||||
{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
|
||||
{ input_tokens: 80, output_tokens: 40, model: 'claude-3' },
|
||||
];
|
||||
|
||||
expect(collectedUsage.length).toBe(2);
|
||||
|
||||
await spendCollectedUsage({
|
||||
userId: 'user-123',
|
||||
conversationId: 'convo-123',
|
||||
collectedUsage,
|
||||
fallbackModel: 'gpt-4',
|
||||
});
|
||||
|
||||
expect(mockSpendTokens).toHaveBeenCalledTimes(2);
|
||||
|
||||
// The array should be cleared after spending
|
||||
expect(collectedUsage.length).toBe(0);
|
||||
});
|
||||
|
||||
it('should await all token spending operations before clearing array', async () => {
|
||||
// Ensure we don't clear the array before spending completes
|
||||
let spendCallCount = 0;
|
||||
mockSpendTokens.mockImplementation(async () => {
|
||||
spendCallCount++;
|
||||
// Simulate async delay
|
||||
await new Promise((resolve) => setTimeout(resolve, 10));
|
||||
});
|
||||
|
||||
const collectedUsage = [
|
||||
{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
|
||||
{ input_tokens: 80, output_tokens: 40, model: 'claude-3' },
|
||||
];
|
||||
|
||||
await spendCollectedUsage({
|
||||
userId: 'user-123',
|
||||
conversationId: 'convo-123',
|
||||
collectedUsage,
|
||||
fallbackModel: 'gpt-4',
|
||||
});
|
||||
|
||||
// Both spend calls should have completed
|
||||
expect(spendCallCount).toBe(2);
|
||||
|
||||
// Array should be cleared after awaiting
|
||||
expect(collectedUsage.length).toBe(0);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
@@ -3,10 +3,11 @@ const { createContentAggregator } = require('@librechat/agents');
|
|||
const {
|
||||
initializeAgent,
|
||||
validateAgentModel,
|
||||
getCustomEndpointConfig,
|
||||
createSequentialChainEdges,
|
||||
createEdgeCollector,
|
||||
filterOrphanedEdges,
|
||||
GenerationJobManager,
|
||||
getCustomEndpointConfig,
|
||||
createSequentialChainEdges,
|
||||
} = require('@librechat/api');
|
||||
const {
|
||||
EModelEndpoint,
|
||||
|
|
@@ -314,6 +315,10 @@ const initializeClient = async ({ req, res, signal, endpointOption }) => {
|
|||
endpoint: isEphemeralAgentId(primaryConfig.id) ? primaryConfig.endpoint : EModelEndpoint.agents,
|
||||
});
|
||||
|
||||
if (streamId) {
|
||||
GenerationJobManager.setCollectedUsage(streamId, collectedUsage);
|
||||
}
|
||||
|
||||
return { client, userMCPAuthMap };
|
||||
};
|
||||
|
||||
|
|
|
|||
|
|
@@ -1,9 +1,11 @@
|
|||
import { logger } from '@librechat/data-schemas';
|
||||
import type { StandardGraph } from '@librechat/agents';
|
||||
import type { Agents } from 'librechat-data-provider';
|
||||
import { parseTextParts } from 'librechat-data-provider';
|
||||
import type { Agents, TMessageContentParts } from 'librechat-data-provider';
|
||||
import type {
|
||||
SerializableJobData,
|
||||
IEventTransport,
|
||||
UsageMetadata,
|
||||
AbortResult,
|
||||
IJobStore,
|
||||
} from './interfaces/IJobStore';
|
||||
|
|
@@ -585,7 +587,14 @@ class GenerationJobManagerClass {
|
|||
|
||||
if (!jobData) {
|
||||
logger.warn(`[GenerationJobManager] Cannot abort - job not found: ${streamId}`);
|
||||
return { success: false, jobData: null, content: [], finalEvent: null };
|
||||
return {
|
||||
text: '',
|
||||
content: [],
|
||||
jobData: null,
|
||||
success: false,
|
||||
finalEvent: null,
|
||||
collectedUsage: [],
|
||||
};
|
||||
}
|
||||
|
||||
// Emit abort signal for cross-replica support (Redis mode)
|
||||
|
|
@@ -599,15 +608,21 @@ class GenerationJobManagerClass {
|
|||
runtime.abortController.abort();
|
||||
}
|
||||
|
||||
// Get content before clearing state
|
||||
/** Content before clearing state */
|
||||
const result = await this.jobStore.getContentParts(streamId);
|
||||
const content = result?.content ?? [];
|
||||
|
||||
// Detect "early abort" - aborted before any generation happened (e.g., during tool loading)
|
||||
// In this case, no messages were saved to DB, so frontend shouldn't navigate to conversation
|
||||
/** Collected usage for all models */
|
||||
const collectedUsage = this.jobStore.getCollectedUsage(streamId);
|
||||
|
||||
/** Text from content parts for fallback token counting */
|
||||
const text = parseTextParts(content as TMessageContentParts[]);
|
||||
|
||||
/** Detect "early abort" - aborted before any generation happened (e.g., during tool loading)
|
||||
In this case, no messages were saved to DB, so frontend shouldn't navigate to conversation */
|
||||
const isEarlyAbort = content.length === 0 && !jobData.responseMessageId;
|
||||
|
||||
// Create final event for abort
|
||||
/** Final event for abort */
|
||||
const userMessageId = jobData.userMessage?.messageId;
|
||||
|
||||
const abortFinalEvent: t.ServerSentEvent = {
|
||||
|
|
@@ -669,6 +684,8 @@ class GenerationJobManagerClass {
|
|||
jobData,
|
||||
content,
|
||||
finalEvent: abortFinalEvent,
|
||||
text,
|
||||
collectedUsage,
|
||||
};
|
||||
}
|
||||
|
||||
|
|
@@ -933,6 +950,18 @@ class GenerationJobManagerClass {
|
|||
this.jobStore.setContentParts(streamId, contentParts);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set reference to the collectedUsage array.
|
||||
* This array accumulates token usage from all models during generation.
|
||||
*/
|
||||
setCollectedUsage(streamId: string, collectedUsage: UsageMetadata[]): void {
|
||||
// Use runtime state check for performance (sync check)
|
||||
if (!this.runtimeState.has(streamId)) {
|
||||
return;
|
||||
}
|
||||
this.jobStore.setCollectedUsage(streamId, collectedUsage);
|
||||
}
|
||||
|
||||
/**
|
||||
* Set reference to the graph instance.
|
||||
*/
|
||||
|
|
|
|||
packages/api/src/stream/__tests__/collectedUsage.spec.ts (new file, 482 lines)
|
|
@@ -0,0 +1,482 @@
|
|||
/**
|
||||
* Tests for collected usage functionality in GenerationJobManager.
|
||||
*
|
||||
* This tests the storage and retrieval of collectedUsage for abort handling,
|
||||
* ensuring all models (including parallel agents from addedConvo) have their
|
||||
* tokens spent when a conversation is aborted.
|
||||
*/
|
||||
|
||||
import type { UsageMetadata } from '../interfaces/IJobStore';
|
||||
|
||||
describe('CollectedUsage - InMemoryJobStore', () => {
|
||||
beforeEach(() => {
|
||||
jest.resetModules();
|
||||
});
|
||||
|
||||
it('should store and retrieve collectedUsage', async () => {
|
||||
const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
|
||||
const store = new InMemoryJobStore();
|
||||
await store.initialize();
|
||||
|
||||
const streamId = 'test-stream-1';
|
||||
await store.createJob(streamId, 'user-1');
|
||||
|
||||
const collectedUsage: UsageMetadata[] = [
|
||||
{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
|
||||
{ input_tokens: 80, output_tokens: 40, model: 'claude-3' },
|
||||
];
|
||||
|
||||
store.setCollectedUsage(streamId, collectedUsage);
|
||||
const retrieved = store.getCollectedUsage(streamId);
|
||||
|
||||
expect(retrieved).toEqual(collectedUsage);
|
||||
expect(retrieved).toHaveLength(2);
|
||||
|
||||
await store.destroy();
|
||||
});
|
||||
|
||||
it('should return empty array when no collectedUsage set', async () => {
|
||||
const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
|
||||
const store = new InMemoryJobStore();
|
||||
await store.initialize();
|
||||
|
||||
const streamId = 'test-stream-2';
|
||||
await store.createJob(streamId, 'user-1');
|
||||
|
||||
const retrieved = store.getCollectedUsage(streamId);
|
||||
|
||||
expect(retrieved).toEqual([]);
|
||||
|
||||
await store.destroy();
|
||||
});
|
||||
|
||||
it('should return empty array for non-existent stream', async () => {
|
||||
const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
|
||||
const store = new InMemoryJobStore();
|
||||
await store.initialize();
|
||||
|
||||
const retrieved = store.getCollectedUsage('non-existent-stream');
|
||||
|
||||
expect(retrieved).toEqual([]);
|
||||
|
||||
await store.destroy();
|
||||
});
|
||||
|
||||
it('should update collectedUsage when set multiple times', async () => {
|
||||
const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
|
||||
const store = new InMemoryJobStore();
|
||||
await store.initialize();
|
||||
|
||||
const streamId = 'test-stream-3';
|
||||
await store.createJob(streamId, 'user-1');
|
||||
|
||||
const usage1: UsageMetadata[] = [{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' }];
|
||||
store.setCollectedUsage(streamId, usage1);
|
||||
|
||||
// Simulate more usage being added
|
||||
const usage2: UsageMetadata[] = [
|
||||
{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
|
||||
{ input_tokens: 80, output_tokens: 40, model: 'claude-3' },
|
||||
];
|
||||
store.setCollectedUsage(streamId, usage2);
|
||||
|
||||
const retrieved = store.getCollectedUsage(streamId);
|
||||
expect(retrieved).toHaveLength(2);
|
||||
|
||||
await store.destroy();
|
||||
});
|
||||
|
||||
it('should clear collectedUsage when clearContentState is called', async () => {
|
||||
const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
|
||||
const store = new InMemoryJobStore();
|
||||
await store.initialize();
|
||||
|
||||
const streamId = 'test-stream-4';
|
||||
await store.createJob(streamId, 'user-1');
|
||||
|
||||
const collectedUsage: UsageMetadata[] = [
|
||||
{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
|
||||
];
|
||||
store.setCollectedUsage(streamId, collectedUsage);
|
||||
|
||||
expect(store.getCollectedUsage(streamId)).toHaveLength(1);
|
||||
|
||||
store.clearContentState(streamId);
|
||||
|
||||
expect(store.getCollectedUsage(streamId)).toEqual([]);
|
||||
|
||||
await store.destroy();
|
||||
});
|
||||
|
||||
it('should clear collectedUsage when job is deleted', async () => {
|
||||
const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
|
||||
const store = new InMemoryJobStore();
|
||||
await store.initialize();
|
||||
|
||||
const streamId = 'test-stream-5';
|
||||
await store.createJob(streamId, 'user-1');
|
||||
|
||||
const collectedUsage: UsageMetadata[] = [
|
||||
{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
|
||||
];
|
||||
store.setCollectedUsage(streamId, collectedUsage);
|
||||
|
||||
await store.deleteJob(streamId);
|
||||
|
||||
expect(store.getCollectedUsage(streamId)).toEqual([]);
|
||||
|
||||
await store.destroy();
|
||||
});
|
||||
});
|
||||
|
||||
describe('CollectedUsage - GenerationJobManager', () => {
|
||||
beforeEach(() => {
|
||||
jest.resetModules();
|
||||
});
|
||||
|
||||
it('should set and retrieve collectedUsage through manager', async () => {
|
||||
const { GenerationJobManager } = await import('../GenerationJobManager');
|
||||
const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
|
||||
const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
|
||||
|
||||
GenerationJobManager.configure({
|
||||
jobStore: new InMemoryJobStore(),
|
||||
eventTransport: new InMemoryEventTransport(),
|
||||
isRedis: false,
|
||||
cleanupOnComplete: false,
|
||||
});
|
||||
|
||||
await GenerationJobManager.initialize();
|
||||
|
||||
const streamId = `manager-test-${Date.now()}`;
|
||||
await GenerationJobManager.createJob(streamId, 'user-1');
|
||||
|
||||
const collectedUsage: UsageMetadata[] = [
|
||||
{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
|
||||
{ input_tokens: 80, output_tokens: 40, model: 'claude-3' },
|
||||
];
|
||||
|
||||
GenerationJobManager.setCollectedUsage(streamId, collectedUsage);
|
||||
|
||||
// Retrieve through abort
|
||||
const abortResult = await GenerationJobManager.abortJob(streamId);
|
||||
|
||||
expect(abortResult.collectedUsage).toEqual(collectedUsage);
|
||||
expect(abortResult.collectedUsage).toHaveLength(2);
|
||||
|
||||
await GenerationJobManager.destroy();
|
||||
});
|
||||
|
||||
it('should return empty collectedUsage when none set', async () => {
|
||||
const { GenerationJobManager } = await import('../GenerationJobManager');
|
||||
const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
|
||||
const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
|
||||
|
||||
GenerationJobManager.configure({
|
||||
jobStore: new InMemoryJobStore(),
|
||||
eventTransport: new InMemoryEventTransport(),
|
||||
isRedis: false,
|
||||
cleanupOnComplete: false,
|
||||
});
|
||||
|
||||
await GenerationJobManager.initialize();
|
||||
|
||||
const streamId = `no-usage-test-${Date.now()}`;
|
||||
await GenerationJobManager.createJob(streamId, 'user-1');
|
||||
|
||||
const abortResult = await GenerationJobManager.abortJob(streamId);
|
||||
|
||||
expect(abortResult.collectedUsage).toEqual([]);
|
||||
|
||||
await GenerationJobManager.destroy();
|
||||
});
|
||||
|
||||
it('should not set collectedUsage if job does not exist', async () => {
|
||||
const { GenerationJobManager } = await import('../GenerationJobManager');
|
||||
const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
|
||||
const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
|
||||
|
||||
GenerationJobManager.configure({
|
||||
jobStore: new InMemoryJobStore(),
|
||||
eventTransport: new InMemoryEventTransport(),
|
||||
isRedis: false,
|
||||
});
|
||||
|
||||
await GenerationJobManager.initialize();
|
||||
|
||||
const collectedUsage: UsageMetadata[] = [
|
||||
{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
|
||||
];
|
||||
|
||||
// This should not throw, just silently do nothing
|
||||
GenerationJobManager.setCollectedUsage('non-existent-stream', collectedUsage);
|
||||
|
||||
const abortResult = await GenerationJobManager.abortJob('non-existent-stream');
|
||||
expect(abortResult.success).toBe(false);
|
||||
|
||||
await GenerationJobManager.destroy();
|
||||
});
|
||||
});
|
||||
|
||||
describe('AbortJob - Text and CollectedUsage', () => {
|
||||
beforeEach(() => {
|
||||
jest.resetModules();
|
||||
});
|
||||
|
||||
it('should extract text from content parts on abort', async () => {
|
||||
const { GenerationJobManager } = await import('../GenerationJobManager');
|
||||
const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
|
||||
const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
|
||||
|
||||
GenerationJobManager.configure({
|
||||
jobStore: new InMemoryJobStore(),
|
||||
eventTransport: new InMemoryEventTransport(),
|
||||
isRedis: false,
|
||||
cleanupOnComplete: false,
|
||||
});
|
||||
|
||||
await GenerationJobManager.initialize();
|
||||
|
||||
const streamId = `text-extract-${Date.now()}`;
|
||||
await GenerationJobManager.createJob(streamId, 'user-1');
|
||||
|
||||
// Set content parts with text
|
||||
const contentParts = [
|
||||
{ type: 'text', text: 'Hello ' },
|
||||
{ type: 'text', text: 'world!' },
|
||||
];
|
||||
GenerationJobManager.setContentParts(streamId, contentParts as never);
|
||||
|
||||
const abortResult = await GenerationJobManager.abortJob(streamId);
|
||||
|
||||
expect(abortResult.text).toBe('Hello world!');
|
||||
expect(abortResult.success).toBe(true);
|
||||
|
||||
await GenerationJobManager.destroy();
|
||||
});
|
||||
|
||||
it('should return empty text when no content parts', async () => {
|
||||
const { GenerationJobManager } = await import('../GenerationJobManager');
|
||||
const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
|
||||
const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
|
||||
|
||||
GenerationJobManager.configure({
|
||||
jobStore: new InMemoryJobStore(),
|
||||
eventTransport: new InMemoryEventTransport(),
|
||||
isRedis: false,
|
||||
cleanupOnComplete: false,
|
||||
});
|
||||
|
||||
await GenerationJobManager.initialize();
|
||||
|
||||
const streamId = `empty-text-${Date.now()}`;
|
||||
await GenerationJobManager.createJob(streamId, 'user-1');
|
||||
|
||||
const abortResult = await GenerationJobManager.abortJob(streamId);
|
||||
|
||||
expect(abortResult.text).toBe('');
|
||||
|
||||
await GenerationJobManager.destroy();
|
||||
});
|
||||
|
||||
it('should return both text and collectedUsage on abort', async () => {
|
||||
const { GenerationJobManager } = await import('../GenerationJobManager');
|
||||
const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
|
||||
const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
|
||||
|
||||
GenerationJobManager.configure({
|
||||
jobStore: new InMemoryJobStore(),
|
||||
eventTransport: new InMemoryEventTransport(),
|
||||
isRedis: false,
|
||||
cleanupOnComplete: false,
|
||||
});
|
||||
|
||||
await GenerationJobManager.initialize();
|
||||
|
||||
const streamId = `full-abort-${Date.now()}`;
|
||||
await GenerationJobManager.createJob(streamId, 'user-1');
|
||||
|
||||
// Set content parts
|
||||
const contentParts = [{ type: 'text', text: 'Partial response...' }];
|
||||
GenerationJobManager.setContentParts(streamId, contentParts as never);
|
||||
|
||||
// Set collected usage
|
||||
const collectedUsage: UsageMetadata[] = [
|
||||
{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' },
|
||||
{ input_tokens: 80, output_tokens: 40, model: 'claude-3' },
|
||||
];
|
||||
GenerationJobManager.setCollectedUsage(streamId, collectedUsage);
|
||||
|
||||
const abortResult = await GenerationJobManager.abortJob(streamId);
|
||||
|
||||
expect(abortResult.success).toBe(true);
|
||||
expect(abortResult.text).toBe('Partial response...');
|
||||
expect(abortResult.collectedUsage).toEqual(collectedUsage);
|
||||
expect(abortResult.content).toHaveLength(1);
|
||||
|
||||
await GenerationJobManager.destroy();
|
||||
});
|
||||
|
||||
it('should return empty values for non-existent job', async () => {
|
||||
const { GenerationJobManager } = await import('../GenerationJobManager');
|
||||
const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
|
||||
const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
|
||||
|
||||
GenerationJobManager.configure({
|
||||
jobStore: new InMemoryJobStore(),
|
||||
eventTransport: new InMemoryEventTransport(),
|
||||
isRedis: false,
|
||||
});
|
||||
|
||||
await GenerationJobManager.initialize();
|
||||
|
||||
const abortResult = await GenerationJobManager.abortJob('non-existent-job');
|
||||
|
||||
expect(abortResult.success).toBe(false);
|
||||
expect(abortResult.text).toBe('');
|
||||
expect(abortResult.collectedUsage).toEqual([]);
|
||||
expect(abortResult.content).toEqual([]);
|
||||
expect(abortResult.jobData).toBeNull();
|
||||
|
||||
await GenerationJobManager.destroy();
|
||||
});
|
||||
});
|
||||
|
||||
describe('Real-world Scenarios', () => {
|
||||
beforeEach(() => {
|
||||
jest.resetModules();
|
||||
});
|
||||
|
||||
it('should handle parallel agent abort with collected usage', async () => {
|
||||
/**
|
||||
* Scenario: User aborts a conversation with addedConvo (parallel agents)
|
||||
* - Primary agent: gemini-3-flash-preview
|
||||
* - Parallel agent: gpt-5.2
|
||||
* Both should have their tokens spent on abort
|
||||
*/
|
||||
const { GenerationJobManager } = await import('../GenerationJobManager');
|
||||
const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
|
||||
const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
|
||||
|
||||
GenerationJobManager.configure({
|
||||
jobStore: new InMemoryJobStore(),
|
||||
eventTransport: new InMemoryEventTransport(),
|
||||
isRedis: false,
|
||||
cleanupOnComplete: false,
|
||||
});
|
||||
|
||||
await GenerationJobManager.initialize();
|
||||
|
||||
const streamId = `parallel-abort-${Date.now()}`;
|
||||
await GenerationJobManager.createJob(streamId, 'user-1');
|
||||
|
||||
// Simulate content from primary agent
|
||||
const contentParts = [
|
||||
{ type: 'text', text: 'Primary agent output...' },
|
||||
{ type: 'text', text: 'More content...' },
|
||||
];
|
||||
GenerationJobManager.setContentParts(streamId, contentParts as never);
|
||||
|
||||
// Simulate collected usage from both agents (as would happen during generation)
|
||||
const collectedUsage: UsageMetadata[] = [
|
||||
{
|
||||
input_tokens: 31596,
|
||||
output_tokens: 151,
|
||||
model: 'gemini-3-flash-preview',
|
||||
},
|
||||
{
|
||||
input_tokens: 28000,
|
||||
output_tokens: 120,
|
||||
model: 'gpt-5.2',
|
||||
},
|
||||
];
|
||||
GenerationJobManager.setCollectedUsage(streamId, collectedUsage);
|
||||
|
||||
// Abort the job
|
||||
const abortResult = await GenerationJobManager.abortJob(streamId);
|
||||
|
||||
// Verify both models' usage is returned
|
||||
expect(abortResult.success).toBe(true);
|
||||
expect(abortResult.collectedUsage).toHaveLength(2);
|
||||
expect(abortResult.collectedUsage[0].model).toBe('gemini-3-flash-preview');
|
||||
expect(abortResult.collectedUsage[1].model).toBe('gpt-5.2');
|
||||
|
||||
// Verify text is extracted
|
||||
expect(abortResult.text).toContain('Primary agent output');
|
||||
|
||||
await GenerationJobManager.destroy();
|
||||
});
|
||||
|
||||
it('should handle abort with cache tokens from Anthropic', async () => {
|
||||
const { GenerationJobManager } = await import('../GenerationJobManager');
|
||||
const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
|
||||
const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
|
||||
|
||||
GenerationJobManager.configure({
|
||||
jobStore: new InMemoryJobStore(),
|
||||
eventTransport: new InMemoryEventTransport(),
|
||||
isRedis: false,
|
||||
cleanupOnComplete: false,
|
||||
});
|
||||
|
||||
await GenerationJobManager.initialize();
|
||||
|
||||
const streamId = `cache-abort-${Date.now()}`;
|
||||
await GenerationJobManager.createJob(streamId, 'user-1');
|
||||
|
||||
// Anthropic-style cache tokens
|
||||
const collectedUsage: UsageMetadata[] = [
|
||||
{
|
||||
input_tokens: 788,
|
||||
output_tokens: 163,
|
||||
cache_creation_input_tokens: 30808,
|
||||
cache_read_input_tokens: 0,
|
||||
model: 'claude-opus-4-5-20251101',
|
||||
},
|
||||
];
|
||||
GenerationJobManager.setCollectedUsage(streamId, collectedUsage);
|
||||
|
||||
const abortResult = await GenerationJobManager.abortJob(streamId);
|
||||
|
||||
expect(abortResult.collectedUsage[0].cache_creation_input_tokens).toBe(30808);
|
||||
|
||||
await GenerationJobManager.destroy();
|
||||
});
|
||||
|
||||
it('should handle abort with sequential tool calls usage', async () => {
|
||||
/**
|
||||
* Scenario: Single agent with multiple tool calls, aborted mid-execution
|
||||
* Usage accumulates for each LLM call
|
||||
*/
|
||||
const { GenerationJobManager } = await import('../GenerationJobManager');
|
||||
const { InMemoryJobStore } = await import('../implementations/InMemoryJobStore');
|
||||
const { InMemoryEventTransport } = await import('../implementations/InMemoryEventTransport');
|
||||
|
||||
GenerationJobManager.configure({
|
||||
jobStore: new InMemoryJobStore(),
|
||||
eventTransport: new InMemoryEventTransport(),
|
||||
isRedis: false,
|
||||
cleanupOnComplete: false,
|
||||
});
|
||||
|
||||
await GenerationJobManager.initialize();
|
||||
|
||||
const streamId = `sequential-abort-${Date.now()}`;
|
||||
await GenerationJobManager.createJob(streamId, 'user-1');
|
||||
|
||||
// Usage from multiple sequential LLM calls (tool use pattern)
|
||||
const collectedUsage: UsageMetadata[] = [
|
||||
{ input_tokens: 100, output_tokens: 50, model: 'gpt-4' }, // Initial call
|
||||
{ input_tokens: 150, output_tokens: 30, model: 'gpt-4' }, // After tool result 1
|
||||
{ input_tokens: 180, output_tokens: 20, model: 'gpt-4' }, // After tool result 2 (aborted here)
|
||||
];
|
||||
GenerationJobManager.setCollectedUsage(streamId, collectedUsage);
|
||||
|
||||
const abortResult = await GenerationJobManager.abortJob(streamId);
|
||||
|
||||
expect(abortResult.collectedUsage).toHaveLength(3);
|
||||
// All three entries should be present for proper token accounting
|
||||
|
||||
await GenerationJobManager.destroy();
|
||||
});
|
||||
});
|
||||
|
|
@@ -1,7 +1,12 @@
|
|||
import { logger } from '@librechat/data-schemas';
|
||||
import type { StandardGraph } from '@librechat/agents';
|
||||
import type { Agents } from 'librechat-data-provider';
|
||||
import type { IJobStore, SerializableJobData, JobStatus } from '~/stream/interfaces/IJobStore';
|
||||
import type {
|
||||
SerializableJobData,
|
||||
UsageMetadata,
|
||||
IJobStore,
|
||||
JobStatus,
|
||||
} from '~/stream/interfaces/IJobStore';
|
||||
|
||||
/**
|
||||
* Content state for a job - volatile, in-memory only.
|
||||
|
|
@@ -10,6 +15,7 @@ import type { IJobStore, SerializableJobData, JobStatus } from '~/stream/interfa
|
|||
interface ContentState {
|
||||
contentParts: Agents.MessageContentComplex[];
|
||||
graphRef: WeakRef<StandardGraph> | null;
|
||||
collectedUsage: UsageMetadata[];
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@@ -240,6 +246,7 @@ export class InMemoryJobStore implements IJobStore {
|
|||
this.contentState.set(streamId, {
|
||||
contentParts: [],
|
||||
graphRef: new WeakRef(graph),
|
||||
collectedUsage: [],
|
||||
});
|
||||
}
|
||||
}
|
||||
|
|
@@ -252,10 +259,30 @@ export class InMemoryJobStore implements IJobStore {
|
|||
if (existing) {
|
||||
existing.contentParts = contentParts;
|
||||
} else {
|
||||
this.contentState.set(streamId, { contentParts, graphRef: null });
|
||||
this.contentState.set(streamId, { contentParts, graphRef: null, collectedUsage: [] });
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Set collected usage reference for a job.
|
||||
*/
|
||||
setCollectedUsage(streamId: string, collectedUsage: UsageMetadata[]): void {
|
||||
const existing = this.contentState.get(streamId);
|
||||
if (existing) {
|
||||
existing.collectedUsage = collectedUsage;
|
||||
} else {
|
||||
this.contentState.set(streamId, { contentParts: [], graphRef: null, collectedUsage });
|
||||
}
|
||||
}
|
||||
|
||||
/**
|
||||
* Get collected usage for a job.
|
||||
*/
|
||||
getCollectedUsage(streamId: string): UsageMetadata[] {
|
||||
const state = this.contentState.get(streamId);
|
||||
return state?.collectedUsage ?? [];
|
||||
}
|
||||
|
||||
/**
|
||||
* Get content parts for a job.
|
||||
* Returns live content from stored reference.
|
||||
|
|
|
|||
|
|
@@ -1,9 +1,14 @@
|
|||
import { logger } from '@librechat/data-schemas';
|
||||
import { createContentAggregator } from '@librechat/agents';
|
||||
import type { IJobStore, SerializableJobData, JobStatus } from '~/stream/interfaces/IJobStore';
|
||||
import type { StandardGraph } from '@librechat/agents';
|
||||
import type { Agents } from 'librechat-data-provider';
|
||||
import type { Redis, Cluster } from 'ioredis';
|
||||
import type {
|
||||
SerializableJobData,
|
||||
UsageMetadata,
|
||||
IJobStore,
|
||||
JobStatus,
|
||||
} from '~/stream/interfaces/IJobStore';
|
||||
|
||||
/**
|
||||
* Key prefixes for Redis storage.
|
||||
|
|
@@ -90,6 +95,13 @@ export class RedisJobStore implements IJobStore {
|
|||
*/
|
||||
private localGraphCache = new Map<string, WeakRef<StandardGraph>>();
|
||||
|
||||
/**
|
||||
* Local cache for collectedUsage arrays.
|
||||
* Generation happens on a single instance, so collectedUsage is only available locally.
|
||||
* For cross-replica abort, the abort handler falls back to text-based token counting.
|
||||
*/
|
||||
private localCollectedUsageCache = new Map<string, UsageMetadata[]>();
|
||||
|
||||
/** Cleanup interval in ms (1 minute) */
|
||||
private cleanupIntervalMs = 60000;
|
||||
|
||||
|
|
@@ -227,6 +239,7 @@ export class RedisJobStore implements IJobStore {
|
|||
async deleteJob(streamId: string): Promise<void> {
|
||||
// Clear local caches
|
||||
this.localGraphCache.delete(streamId);
|
||||
this.localCollectedUsageCache.delete(streamId);
|
||||
|
||||
// Note: userJobs cleanup is handled lazily via self-healing in getActiveJobIdsByUser
|
||||
// In cluster mode, separate runningJobs (global) from stream-specific keys (same slot)
|
||||
|
|
@@ -290,6 +303,7 @@ export class RedisJobStore implements IJobStore {
|
|||
if (!job) {
|
||||
await this.redis.srem(KEYS.runningJobs, streamId);
|
||||
this.localGraphCache.delete(streamId);
|
||||
this.localCollectedUsageCache.delete(streamId);
|
||||
cleaned++;
|
||||
continue;
|
||||
}
|
||||
|
|
@@ -298,6 +312,7 @@ export class RedisJobStore implements IJobStore {
|
|||
if (job.status !== 'running') {
|
||||
await this.redis.srem(KEYS.runningJobs, streamId);
|
||||
this.localGraphCache.delete(streamId);
|
||||
this.localCollectedUsageCache.delete(streamId);
|
||||
cleaned++;
|
||||
continue;
|
||||
}
|
||||
|
|
@@ -382,6 +397,7 @@ export class RedisJobStore implements IJobStore {
|
|||
}
|
||||
// Clear local caches
|
||||
this.localGraphCache.clear();
|
||||
this.localCollectedUsageCache.clear();
|
||||
// Don't close the Redis connection - it's shared
|
||||
logger.info('[RedisJobStore] Destroyed');
|
||||
}
|
||||
|
|
@@ -406,11 +422,28 @@ export class RedisJobStore implements IJobStore {
|
|||
* No-op for Redis - content parts are reconstructed from chunks.
|
||||
* Metadata (agentId, groupId) is embedded directly on content parts by the agent runtime.
|
||||
*/
|
||||
setContentParts(_streamId: string, _contentParts: Agents.MessageContentComplex[]): void {
|
||||
setContentParts(): void {
|
||||
// Content parts are reconstructed from chunks during getContentParts
|
||||
// No separate storage needed
|
||||
}
|
||||
|
||||
/**
|
||||
* Store collectedUsage reference in local cache.
|
||||
* This is used for abort handling to spend tokens for all models.
|
||||
* Note: Only available on the generating instance; cross-replica abort uses fallback.
|
||||
*/
|
||||
setCollectedUsage(streamId: string, collectedUsage: UsageMetadata[]): void {
|
||||
this.localCollectedUsageCache.set(streamId, collectedUsage);
|
||||
}
|
||||
|
||||
/**
|
||||
* Get collected usage for a job.
|
||||
* Only available if this is the generating instance.
|
||||
*/
|
||||
getCollectedUsage(streamId: string): UsageMetadata[] {
|
||||
return this.localCollectedUsageCache.get(streamId) ?? [];
|
||||
}
|
||||
|
||||
/**
|
||||
* Get aggregated content - tries local cache first, falls back to Redis reconstruction.
|
||||
*
|
||||
|
|
@@ -528,6 +561,7 @@ export class RedisJobStore implements IJobStore {
|
|||
clearContentState(streamId: string): void {
|
||||
// Clear local caches immediately
|
||||
this.localGraphCache.delete(streamId);
|
||||
this.localCollectedUsageCache.delete(streamId);
|
||||
|
||||
// Fire and forget - async cleanup for Redis
|
||||
this.clearContentStateAsync(streamId).catch((err) => {
|
||||
|
|
|
|||
|
|
@@ -5,11 +5,12 @@ export {
|
|||
} from './GenerationJobManager';
|
||||
|
||||
export type {
|
||||
AbortResult,
|
||||
SerializableJobData,
|
||||
IEventTransport,
|
||||
UsageMetadata,
|
||||
AbortResult,
|
||||
JobStatus,
|
||||
IJobStore,
|
||||
IEventTransport,
|
||||
} from './interfaces/IJobStore';
|
||||
|
||||
export { createStreamServices } from './createStreamServices';
|
||||
|
|
|
|||
|
|
@@ -45,6 +45,54 @@ export interface SerializableJobData {
|
|||
promptTokens?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Usage metadata for token spending across different LLM providers.
|
||||
*
|
||||
* This interface supports two mutually exclusive cache token formats:
|
||||
*
|
||||
* **OpenAI format** (GPT-4, o1, etc.):
|
||||
* - Uses `input_token_details.cache_creation` and `input_token_details.cache_read`
|
||||
* - Cache tokens are nested under the `input_token_details` object
|
||||
*
|
||||
* **Anthropic format** (Claude models):
|
||||
* - Uses `cache_creation_input_tokens` and `cache_read_input_tokens`
|
||||
* - Cache tokens are top-level properties
|
||||
*
|
||||
* When processing usage data, check both formats:
|
||||
* ```typescript
|
||||
* const cacheCreation = usage.input_token_details?.cache_creation
|
||||
* || usage.cache_creation_input_tokens || 0;
|
||||
* ```
|
||||
*/
|
||||
export interface UsageMetadata {
|
||||
/** Total input tokens (prompt tokens) */
|
||||
input_tokens?: number;
|
||||
/** Total output tokens (completion tokens) */
|
||||
output_tokens?: number;
|
||||
/** Model identifier that generated this usage */
|
||||
model?: string;
|
||||
/**
|
||||
* OpenAI-style cache token details.
|
||||
* Present for OpenAI models (GPT-4, o1, etc.)
|
||||
*/
|
||||
input_token_details?: {
|
||||
/** Tokens written to cache */
|
||||
cache_creation?: number;
|
||||
/** Tokens read from cache */
|
||||
cache_read?: number;
|
||||
};
|
||||
/**
|
||||
* Anthropic-style cache creation tokens.
|
||||
* Present for Claude models. Mutually exclusive with input_token_details.
|
||||
*/
|
||||
cache_creation_input_tokens?: number;
|
||||
/**
|
||||
* Anthropic-style cache read tokens.
|
||||
* Present for Claude models. Mutually exclusive with input_token_details.
|
||||
*/
|
||||
cache_read_input_tokens?: number;
|
||||
}
|
||||
|
||||
/**
|
||||
* Result returned from aborting a job - contains all data needed
|
||||
* for token spending and message saving without storing callbacks
|
||||
|
|
@@ -58,6 +106,10 @@ export interface AbortResult {
|
|||
content: Agents.MessageContentComplex[];
|
||||
/** Final event to send to client */
|
||||
finalEvent: unknown;
|
||||
/** Concatenated text from all content parts for token counting fallback */
|
||||
text: string;
|
||||
/** Collected usage metadata from all models for token spending */
|
||||
collectedUsage: UsageMetadata[];
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
@@ -210,6 +262,23 @@ export interface IJobStore {
|
|||
* @param runSteps - Run steps to save
|
||||
*/
|
||||
saveRunSteps?(streamId: string, runSteps: Agents.RunStep[]): Promise<void>;
|
||||
|
||||
/**
|
||||
* Set collected usage reference for a job.
|
||||
* This array accumulates token usage from all models during generation.
|
||||
*
|
||||
* @param streamId - The stream identifier
|
||||
* @param collectedUsage - Array of usage metadata from all models
|
||||
*/
|
||||
setCollectedUsage(streamId: string, collectedUsage: UsageMetadata[]): void;
|
||||
|
||||
/**
|
||||
* Get collected usage for a job.
|
||||
*
|
||||
* @param streamId - The stream identifier
|
||||
* @returns Array of usage metadata or empty array
|
||||
*/
|
||||
getCollectedUsage(streamId: string): UsageMetadata[];
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
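For orientation, the new `GenerationJobManager`/`IJobStore` surface added above is consumed roughly as in the sketch below during an abort. This is a simplified, hypothetical sketch rather than the real handler: the actual code in `abortMiddleware.js` also saves the partial message and emits the final SSE event, and the local `UsageMetadata` mirror here is an assumption made to keep the snippet self-contained.

```typescript
/** Assumed local mirror of the UsageMetadata interface added above. */
interface UsageMetadata {
  input_tokens?: number;
  output_tokens?: number;
  model?: string;
}

/** Assumed minimal shape of what GenerationJobManager.abortJob resolves to (see AbortResult above). */
interface AbortOutcome {
  success: boolean;
  text: string;
  collectedUsage: UsageMetadata[];
}

async function settleAbortedGeneration(
  outcome: AbortOutcome,
  spendAll: (usage: UsageMetadata[]) => Promise<void>,
  spendFromText: (text: string) => Promise<void>,
): Promise<void> {
  if (!outcome.success) {
    return; // job not found: nothing to settle
  }
  if (outcome.collectedUsage.length > 0) {
    // Exact per-model accounting, covering parallel (addedConvo) agents.
    await spendAll(outcome.collectedUsage);
  } else {
    // Cross-replica abort (Redis mode) or early abort: count the partial text instead.
    await spendFromText(outcome.text);
  }
}
```

Splitting the spend into "exact usage when available, text-based fallback otherwise" mirrors the design choice documented in `RedisJobStore`: collected usage only lives on the generating instance, so a cross-replica abort still gets charged from the partial text.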