Mirror of https://github.com/danny-avila/LibreChat.git (synced 2025-12-19 01:40:15 +01:00)
⚡ refactor: Optimize & Standardize Tokenizer Usage (#10777)
* refactor: Token Limit Processing with Enhanced Efficiency
  - Added a new test suite for `processTextWithTokenLimit`, ensuring comprehensive coverage of various scenarios including under, at, and exceeding token limits.
  - Refactored the `processTextWithTokenLimit` function to utilize a ratio-based estimation method, significantly reducing the number of token counting function calls compared to the previous binary search approach.
  - Improved handling of edge cases and variable token density, ensuring accurate truncation and performance across diverse text inputs.
  - Included direct comparisons with the old implementation to validate correctness and efficiency improvements.

* refactor: Remove Tokenizer Route and Related References
  - Deleted the tokenizer route from the server and removed its references from the routes index and server files, streamlining the API structure.
  - This change simplifies the routing configuration by eliminating unused endpoints.

* refactor: Migrate countTokens Utility to API Module
  - Removed the local countTokens utility and integrated it into the @librechat/api module for centralized access.
  - Updated various files to reference the new countTokens import from the API module, ensuring consistent usage across the application.
  - Cleaned up unused references and imports related to the previous countTokens implementation.

* refactor: Centralize escapeRegExp Utility in API Module
  - Moved the escapeRegExp function from local utility files to the @librechat/api module for consistent usage across the application.
  - Updated imports in various files to reference the new centralized escapeRegExp function, ensuring cleaner code and reducing redundancy.
  - Removed duplicate implementations of escapeRegExp from multiple files, streamlining the codebase.

* refactor: Enhance Token Counting Flexibility in Text Processing
  - Updated the `processTextWithTokenLimit` function to accept both synchronous and asynchronous token counting functions, improving its versatility.
  - Introduced a new `TokenCountFn` type to define the token counting function signature.
  - Added comprehensive tests to validate the behavior of `processTextWithTokenLimit` with both sync and async token counting functions, ensuring consistent results.
  - Implemented a wrapper to track call counts for the `countTokens` function, optimizing performance and reducing unnecessary calls.
  - Enhanced existing tests to compare the performance of the new implementation against the old one, demonstrating significant improvements in efficiency.

* chore: documentation for Truncation Safety Buffer in Token Processing
  - Added a safety buffer multiplier to the character position estimates during text truncation to prevent overshooting token limits.
  - Updated the `processTextWithTokenLimit` function to utilize the new `TRUNCATION_SAFETY_BUFFER` constant, enhancing the accuracy of token limit processing.
  - Improved documentation to clarify the rationale behind the buffer and its impact on performance and efficiency in token counting.
This commit is contained in: parent b2387cc6fa, commit 8bdc808074

19 changed files with 925 additions and 107 deletions
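For orientation before the diffs, here is a minimal usage sketch of the refactored helper. This is hypothetical caller code, not part of this commit, and it assumes `processTextWithTokenLimit` and `countTokens` are re-exported from the `@librechat/api` package root, as the `utils/index.ts` diff below suggests:

import { processTextWithTokenLimit, countTokens } from '@librechat/api';

/** Hypothetical caller: cap attachment text at 100k tokens before sending it to a model */
async function limitAttachmentText(rawText: string): Promise<string> {
  const { text, tokenCount, wasTruncated } = await processTextWithTokenLimit({
    text: rawText,
    tokenLimit: 100_000,
    tokenCountFn: countTokens, // sync or async counters both satisfy TokenCountFn
  });
  if (wasTruncated) {
    console.warn(`Attachment truncated to ${tokenCount} tokens`);
  }
  return text;
}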
@@ -2,6 +2,7 @@ import { SystemCategories } from 'librechat-data-provider';
 import type { IPromptGroupDocument as IPromptGroup } from '@librechat/data-schemas';
 import type { Types } from 'mongoose';
 import type { PromptGroupsListResponse } from '~/types';
+import { escapeRegExp } from '~/utils/common';
 
 /**
  * Formats prompt groups for the paginated /groups endpoint response
@@ -101,7 +102,6 @@ export function buildPromptGroupFilter({
 
   // Handle name filter - convert to regex for case-insensitive search
   if (name) {
-    const escapeRegExp = (str: string) => str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
    filter.name = new RegExp(escapeRegExp(name), 'i');
   }
 
@@ -48,3 +48,12 @@ export function optionalChainWithEmptyCheck(
   }
   return values[values.length - 1];
 }
+
+/**
+ * Escapes special characters in a string for use in a regular expression.
+ * @param str - The string to escape.
+ * @returns The escaped string safe for use in RegExp.
+ */
+export function escapeRegExp(str: string): string {
+  return str.replace(/[.*+?^${}()|[\]\\]/g, '\\$&');
+}
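As a quick illustration of why the centralized helper matters (a sketch, not code from this commit): unescaped user input containing regex metacharacters would either mis-match or throw when passed to `new RegExp`:

import { escapeRegExp } from '~/utils/common';

const userInput = 'C++ (advanced)'; // '+' and '(' are regex metacharacters
// new RegExp(userInput, 'i') would throw: "Invalid regular expression: nothing to repeat"
const safe = new RegExp(escapeRegExp(userInput), 'i');
console.log(safe.test('Intro to c++ (Advanced)')); // true: matches the literal text, case-insensitively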
@@ -17,7 +17,7 @@ export * from './promise';
 export * from './sanitizeTitle';
 export * from './tempChatRetention';
 export * from './text';
-export { default as Tokenizer } from './tokenizer';
+export { default as Tokenizer, countTokens } from './tokenizer';
 export * from './yaml';
 export * from './http';
 export * from './tokens';
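With the added export, consumers can pull both the tokenizer singleton and the async wrapper from one place. A sketch, assuming the package root re-exports `utils/index.ts`:

import { Tokenizer, countTokens } from '@librechat/api';

async function demo(): Promise<void> {
  const syncCount = Tokenizer.getTokenCount('Hello, world!', 'cl100k_base');
  const asyncCount = await countTokens('Hello, world!');
  console.log(syncCount === asyncCount); // true: same tiktoken encoder under the hood
}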
851 packages/api/src/utils/text.spec.ts (new file)
@@ -0,0 +1,851 @@
import { processTextWithTokenLimit, TokenCountFn } from './text';
import Tokenizer, { countTokens } from './tokenizer';

jest.mock('@librechat/data-schemas', () => ({
  logger: {
    debug: jest.fn(),
    warn: jest.fn(),
    error: jest.fn(),
  },
}));

/**
 * OLD IMPLEMENTATION (Binary Search) - kept for comparison testing
 * This is the original algorithm that caused CPU spikes
 */
async function processTextWithTokenLimitOLD({
  text,
  tokenLimit,
  tokenCountFn,
}: {
  text: string;
  tokenLimit: number;
  tokenCountFn: TokenCountFn;
}): Promise<{ text: string; tokenCount: number; wasTruncated: boolean }> {
  const originalTokenCount = await tokenCountFn(text);

  if (originalTokenCount <= tokenLimit) {
    return {
      text,
      tokenCount: originalTokenCount,
      wasTruncated: false,
    };
  }

  let low = 0;
  let high = text.length;
  let bestText = '';

  while (low <= high) {
    const mid = Math.floor((low + high) / 2);
    const truncatedText = text.substring(0, mid);
    const tokenCount = await tokenCountFn(truncatedText);

    if (tokenCount <= tokenLimit) {
      bestText = truncatedText;
      low = mid + 1;
    } else {
      high = mid - 1;
    }
  }

  const finalTokenCount = await tokenCountFn(bestText);

  return {
    text: bestText,
    tokenCount: finalTokenCount,
    wasTruncated: true,
  };
}

/**
 * Creates a wrapper around Tokenizer.getTokenCount that tracks call count
 */
const createRealTokenCounter = () => {
  let callCount = 0;
  const tokenCountFn = (text: string): number => {
    callCount++;
    return Tokenizer.getTokenCount(text, 'cl100k_base');
  };
  return {
    tokenCountFn,
    getCallCount: () => callCount,
    resetCallCount: () => {
      callCount = 0;
    },
  };
};

/**
 * Creates a wrapper around the async countTokens function that tracks call count
 */
const createCountTokensCounter = () => {
  let callCount = 0;
  const tokenCountFn = async (text: string): Promise<number> => {
    callCount++;
    return countTokens(text);
  };
  return {
    tokenCountFn,
    getCallCount: () => callCount,
    resetCallCount: () => {
      callCount = 0;
    },
  };
};

describe('processTextWithTokenLimit', () => {
  /**
   * Creates a mock token count function that simulates realistic token counting.
   * Roughly 4 characters per token (common for English text).
   * Tracks call count to verify efficiency.
   */
  const createMockTokenCounter = () => {
    let callCount = 0;
    const tokenCountFn = (text: string): number => {
      callCount++;
      return Math.ceil(text.length / 4);
    };
    return {
      tokenCountFn,
      getCallCount: () => callCount,
      resetCallCount: () => {
        callCount = 0;
      },
    };
  };

  /** Creates a string of specified character length */
  const createTextOfLength = (charLength: number): string => {
    return 'a'.repeat(charLength);
  };

  /** Creates realistic text content with varied token density */
  const createRealisticText = (approximateTokens: number): string => {
    const words = [
      'the', 'quick', 'brown', 'fox', 'jumps', 'over', 'lazy', 'dog',
      'lorem', 'ipsum', 'dolor', 'sit', 'amet', 'consectetur', 'adipiscing', 'elit',
      'sed', 'do', 'eiusmod', 'tempor', 'incididunt', 'ut', 'labore', 'et',
      'dolore', 'magna', 'aliqua', 'enim', 'ad', 'minim', 'veniam',
      'authentication', 'implementation', 'configuration', 'documentation',
    ];
    const result: string[] = [];
    for (let i = 0; i < approximateTokens; i++) {
      result.push(words[i % words.length]);
    }
    return result.join(' ');
  };

  describe('tokenCountFn flexibility (sync and async)', () => {
    it('should work with synchronous tokenCountFn', async () => {
      const syncTokenCountFn = (text: string): number => Math.ceil(text.length / 4);
      const text = 'Hello, world! This is a test message.';
      const tokenLimit = 5;

      const result = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: syncTokenCountFn,
      });

      expect(result.wasTruncated).toBe(true);
      expect(result.tokenCount).toBeLessThanOrEqual(tokenLimit);
    });

    it('should work with asynchronous tokenCountFn', async () => {
      const asyncTokenCountFn = async (text: string): Promise<number> => {
        await new Promise((resolve) => setTimeout(resolve, 1));
        return Math.ceil(text.length / 4);
      };
      const text = 'Hello, world! This is a test message.';
      const tokenLimit = 5;

      const result = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: asyncTokenCountFn,
      });

      expect(result.wasTruncated).toBe(true);
      expect(result.tokenCount).toBeLessThanOrEqual(tokenLimit);
    });

    it('should produce equivalent results with sync and async tokenCountFn', async () => {
      const syncTokenCountFn = (text: string): number => Math.ceil(text.length / 4);
      const asyncTokenCountFn = async (text: string): Promise<number> => Math.ceil(text.length / 4);
      const text = 'a'.repeat(8000);
      const tokenLimit = 1000;

      const syncResult = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: syncTokenCountFn,
      });

      const asyncResult = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: asyncTokenCountFn,
      });

      expect(syncResult.tokenCount).toBe(asyncResult.tokenCount);
      expect(syncResult.wasTruncated).toBe(asyncResult.wasTruncated);
      expect(syncResult.text.length).toBe(asyncResult.text.length);
    });
  });

  describe('when text is under the token limit', () => {
    it('should return original text unchanged', async () => {
      const { tokenCountFn } = createMockTokenCounter();
      const text = 'Hello, world!';
      const tokenLimit = 100;

      const result = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn,
      });

      expect(result.text).toBe(text);
      expect(result.wasTruncated).toBe(false);
    });

    it('should return correct token count', async () => {
      const { tokenCountFn } = createMockTokenCounter();
      const text = 'Hello, world!';
      const tokenLimit = 100;

      const result = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn,
      });

      expect(result.tokenCount).toBe(Math.ceil(text.length / 4));
    });

    it('should only call tokenCountFn once when under limit', async () => {
      const { tokenCountFn, getCallCount } = createMockTokenCounter();
      const text = 'Hello, world!';
      const tokenLimit = 100;

      await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn,
      });

      expect(getCallCount()).toBe(1);
    });
  });

  describe('when text is exactly at the token limit', () => {
    it('should return original text unchanged', async () => {
      const { tokenCountFn } = createMockTokenCounter();
      const text = createTextOfLength(400);
      const tokenLimit = 100;

      const result = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn,
      });

      expect(result.text).toBe(text);
      expect(result.wasTruncated).toBe(false);
      expect(result.tokenCount).toBe(tokenLimit);
    });
  });

  describe('when text exceeds the token limit', () => {
    it('should truncate text to fit within limit', async () => {
      const { tokenCountFn } = createMockTokenCounter();
      const text = createTextOfLength(8000);
      const tokenLimit = 1000;

      const result = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn,
      });

      expect(result.wasTruncated).toBe(true);
      expect(result.tokenCount).toBeLessThanOrEqual(tokenLimit);
      expect(result.text.length).toBeLessThan(text.length);
    });

    it('should truncate text to be close to but not exceed the limit', async () => {
      const { tokenCountFn } = createMockTokenCounter();
      const text = createTextOfLength(8000);
      const tokenLimit = 1000;

      const result = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn,
      });

      expect(result.tokenCount).toBeLessThanOrEqual(tokenLimit);
      expect(result.tokenCount).toBeGreaterThan(tokenLimit * 0.9);
    });
  });

  describe('efficiency - tokenCountFn call count', () => {
    it('should call tokenCountFn at most 7 times for large text (vs ~17 for binary search)', async () => {
      const { tokenCountFn, getCallCount } = createMockTokenCounter();
      const text = createTextOfLength(400000);
      const tokenLimit = 50000;

      await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn,
      });

      expect(getCallCount()).toBeLessThanOrEqual(7);
    });

    it('should typically call tokenCountFn only 2-3 times for standard truncation', async () => {
      const { tokenCountFn, getCallCount } = createMockTokenCounter();
      const text = createTextOfLength(40000);
      const tokenLimit = 5000;

      await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn,
      });

      expect(getCallCount()).toBeLessThanOrEqual(3);
    });

    it('should call tokenCountFn only once when text is under limit', async () => {
      const { tokenCountFn, getCallCount } = createMockTokenCounter();
      const text = createTextOfLength(1000);
      const tokenLimit = 10000;

      await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn,
      });

      expect(getCallCount()).toBe(1);
    });

    it('should handle very large text (100k+ tokens) efficiently', async () => {
      const { tokenCountFn, getCallCount } = createMockTokenCounter();
      const text = createTextOfLength(500000);
      const tokenLimit = 100000;

      await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn,
      });

      expect(getCallCount()).toBeLessThanOrEqual(7);
    });
  });

  describe('edge cases', () => {
    it('should handle empty text', async () => {
      const { tokenCountFn } = createMockTokenCounter();
      const text = '';
      const tokenLimit = 100;

      const result = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn,
      });

      expect(result.text).toBe('');
      expect(result.tokenCount).toBe(0);
      expect(result.wasTruncated).toBe(false);
    });

    it('should handle token limit of 1', async () => {
      const { tokenCountFn } = createMockTokenCounter();
      const text = createTextOfLength(1000);
      const tokenLimit = 1;

      const result = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn,
      });

      expect(result.wasTruncated).toBe(true);
      expect(result.tokenCount).toBeLessThanOrEqual(tokenLimit);
    });

    it('should handle text that is just slightly over the limit', async () => {
      const { tokenCountFn } = createMockTokenCounter();
      const text = createTextOfLength(404);
      const tokenLimit = 100;

      const result = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn,
      });

      expect(result.wasTruncated).toBe(true);
      expect(result.tokenCount).toBeLessThanOrEqual(tokenLimit);
    });
  });

  describe('correctness with variable token density', () => {
    it('should handle text with varying token density', async () => {
      const variableDensityTokenCounter = (text: string): number => {
        const shortWords = (text.match(/\s+/g) || []).length;
        return Math.ceil(text.length / 4) + shortWords;
      };

      const text = 'This is a test with many short words and some longer concatenated words too';
      const tokenLimit = 10;

      const result = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: variableDensityTokenCounter,
      });

      expect(result.wasTruncated).toBe(true);
      expect(result.tokenCount).toBeLessThanOrEqual(tokenLimit);
    });
  });

  describe('direct comparison with OLD binary search implementation', () => {
    it('should produce equivalent results to the old implementation', async () => {
      const oldCounter = createMockTokenCounter();
      const newCounter = createMockTokenCounter();
      const text = createTextOfLength(8000);
      const tokenLimit = 1000;

      const oldResult = await processTextWithTokenLimitOLD({
        text,
        tokenLimit,
        tokenCountFn: oldCounter.tokenCountFn,
      });

      const newResult = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: newCounter.tokenCountFn,
      });

      expect(newResult.wasTruncated).toBe(oldResult.wasTruncated);
      expect(newResult.tokenCount).toBeLessThanOrEqual(tokenLimit);
      expect(oldResult.tokenCount).toBeLessThanOrEqual(tokenLimit);
    });

    it('should use significantly fewer tokenCountFn calls than old implementation (400k chars)', async () => {
      const oldCounter = createMockTokenCounter();
      const newCounter = createMockTokenCounter();
      const text = createTextOfLength(400000);
      const tokenLimit = 50000;

      await processTextWithTokenLimitOLD({
        text,
        tokenLimit,
        tokenCountFn: oldCounter.tokenCountFn,
      });

      await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: newCounter.tokenCountFn,
      });

      const oldCalls = oldCounter.getCallCount();
      const newCalls = newCounter.getCallCount();

      console.log(
        `[400k chars] OLD implementation: ${oldCalls} calls, NEW implementation: ${newCalls} calls`,
      );
      console.log(`[400k chars] Reduction: ${((1 - newCalls / oldCalls) * 100).toFixed(1)}%`);

      expect(newCalls).toBeLessThan(oldCalls);
      expect(newCalls).toBeLessThanOrEqual(7);
    });

    it('should use significantly fewer tokenCountFn calls than old implementation (500k chars, 100k token limit)', async () => {
      const oldCounter = createMockTokenCounter();
      const newCounter = createMockTokenCounter();
      const text = createTextOfLength(500000);
      const tokenLimit = 100000;

      await processTextWithTokenLimitOLD({
        text,
        tokenLimit,
        tokenCountFn: oldCounter.tokenCountFn,
      });

      await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: newCounter.tokenCountFn,
      });

      const oldCalls = oldCounter.getCallCount();
      const newCalls = newCounter.getCallCount();

      console.log(
        `[500k chars] OLD implementation: ${oldCalls} calls, NEW implementation: ${newCalls} calls`,
      );
      console.log(`[500k chars] Reduction: ${((1 - newCalls / oldCalls) * 100).toFixed(1)}%`);

      expect(newCalls).toBeLessThan(oldCalls);
    });

    it('should achieve at least 70% reduction in tokenCountFn calls', async () => {
      const oldCounter = createMockTokenCounter();
      const newCounter = createMockTokenCounter();
      const text = createTextOfLength(500000);
      const tokenLimit = 100000;

      await processTextWithTokenLimitOLD({
        text,
        tokenLimit,
        tokenCountFn: oldCounter.tokenCountFn,
      });

      await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: newCounter.tokenCountFn,
      });

      const oldCalls = oldCounter.getCallCount();
      const newCalls = newCounter.getCallCount();
      const reduction = 1 - newCalls / oldCalls;

      console.log(
        `Efficiency improvement: ${(reduction * 100).toFixed(1)}% fewer tokenCountFn calls`,
      );

      expect(reduction).toBeGreaterThanOrEqual(0.7);
    });

    it('should simulate the reported scenario (122k tokens, 100k limit)', async () => {
      const oldCounter = createMockTokenCounter();
      const newCounter = createMockTokenCounter();
      const text = createTextOfLength(489564);
      const tokenLimit = 100000;

      await processTextWithTokenLimitOLD({
        text,
        tokenLimit,
        tokenCountFn: oldCounter.tokenCountFn,
      });

      await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: newCounter.tokenCountFn,
      });

      const oldCalls = oldCounter.getCallCount();
      const newCalls = newCounter.getCallCount();

      console.log(`[User reported scenario: ~122k tokens]`);
      console.log(`OLD implementation: ${oldCalls} tokenCountFn calls`);
      console.log(`NEW implementation: ${newCalls} tokenCountFn calls`);
      console.log(`Reduction: ${((1 - newCalls / oldCalls) * 100).toFixed(1)}%`);

      expect(newCalls).toBeLessThan(oldCalls);
      expect(newCalls).toBeLessThanOrEqual(7);
    });
  });

  describe('direct comparison with REAL tiktoken tokenizer', () => {
    beforeEach(() => {
      Tokenizer.freeAndResetAllEncoders();
    });

    it('should produce valid truncation with real tokenizer', async () => {
      const counter = createRealTokenCounter();
      const text = createRealisticText(5000);
      const tokenLimit = 1000;

      const result = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: counter.tokenCountFn,
      });

      expect(result.wasTruncated).toBe(true);
      expect(result.tokenCount).toBeLessThanOrEqual(tokenLimit);
      expect(result.text.length).toBeLessThan(text.length);
    });

    it('should use fewer tiktoken calls than old implementation (realistic text)', async () => {
      const oldCounter = createRealTokenCounter();
      const newCounter = createRealTokenCounter();
      const text = createRealisticText(15000);
      const tokenLimit = 5000;

      await processTextWithTokenLimitOLD({
        text,
        tokenLimit,
        tokenCountFn: oldCounter.tokenCountFn,
      });

      Tokenizer.freeAndResetAllEncoders();

      await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: newCounter.tokenCountFn,
      });

      const oldCalls = oldCounter.getCallCount();
      const newCalls = newCounter.getCallCount();

      console.log(`[Real tiktoken ~15k tokens] OLD: ${oldCalls} calls, NEW: ${newCalls} calls`);
      console.log(`[Real tiktoken] Reduction: ${((1 - newCalls / oldCalls) * 100).toFixed(1)}%`);

      expect(newCalls).toBeLessThan(oldCalls);
    });

    it('should handle the reported user scenario with real tokenizer (~120k tokens)', async () => {
      const oldCounter = createRealTokenCounter();
      const newCounter = createRealTokenCounter();
      const text = createRealisticText(120000);
      const tokenLimit = 100000;

      const startOld = performance.now();
      await processTextWithTokenLimitOLD({
        text,
        tokenLimit,
        tokenCountFn: oldCounter.tokenCountFn,
      });
      const timeOld = performance.now() - startOld;

      Tokenizer.freeAndResetAllEncoders();

      const startNew = performance.now();
      const result = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: newCounter.tokenCountFn,
      });
      const timeNew = performance.now() - startNew;

      const oldCalls = oldCounter.getCallCount();
      const newCalls = newCounter.getCallCount();

      console.log(`\n[REAL TIKTOKEN - User reported scenario: ~120k tokens]`);
      console.log(`OLD implementation: ${oldCalls} tiktoken calls, ${timeOld.toFixed(0)}ms`);
      console.log(`NEW implementation: ${newCalls} tiktoken calls, ${timeNew.toFixed(0)}ms`);
      console.log(`Call reduction: ${((1 - newCalls / oldCalls) * 100).toFixed(1)}%`);
      console.log(`Time reduction: ${((1 - timeNew / timeOld) * 100).toFixed(1)}%`);
      console.log(
        `Result: truncated=${result.wasTruncated}, tokens=${result.tokenCount}/${tokenLimit}\n`,
      );

      expect(newCalls).toBeLessThan(oldCalls);
      expect(result.tokenCount).toBeLessThanOrEqual(tokenLimit);
      expect(newCalls).toBeLessThanOrEqual(7);
    });

    it('should achieve at least 70% reduction with real tokenizer', async () => {
      const oldCounter = createRealTokenCounter();
      const newCounter = createRealTokenCounter();
      const text = createRealisticText(50000);
      const tokenLimit = 10000;

      await processTextWithTokenLimitOLD({
        text,
        tokenLimit,
        tokenCountFn: oldCounter.tokenCountFn,
      });

      Tokenizer.freeAndResetAllEncoders();

      await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: newCounter.tokenCountFn,
      });

      const oldCalls = oldCounter.getCallCount();
      const newCalls = newCounter.getCallCount();
      const reduction = 1 - newCalls / oldCalls;

      console.log(
        `[Real tiktoken 50k tokens] OLD: ${oldCalls}, NEW: ${newCalls}, Reduction: ${(reduction * 100).toFixed(1)}%`,
      );

      expect(reduction).toBeGreaterThanOrEqual(0.7);
    });
  });

  describe('using countTokens async function from @librechat/api', () => {
    beforeEach(() => {
      Tokenizer.freeAndResetAllEncoders();
    });

    it('countTokens should return correct token count', async () => {
      const text = 'Hello, world!';
      const count = await countTokens(text);

      expect(count).toBeGreaterThan(0);
      expect(typeof count).toBe('number');
    });

    it('countTokens should handle empty string', async () => {
      const count = await countTokens('');
      expect(count).toBe(0);
    });

    it('should work with processTextWithTokenLimit using countTokens', async () => {
      const counter = createCountTokensCounter();
      const text = createRealisticText(5000);
      const tokenLimit = 1000;

      const result = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: counter.tokenCountFn,
      });

      expect(result.wasTruncated).toBe(true);
      expect(result.tokenCount).toBeLessThanOrEqual(tokenLimit);
      expect(result.text.length).toBeLessThan(text.length);
    });

    it('should use fewer countTokens calls than old implementation', async () => {
      const oldCounter = createCountTokensCounter();
      const newCounter = createCountTokensCounter();
      const text = createRealisticText(15000);
      const tokenLimit = 5000;

      await processTextWithTokenLimitOLD({
        text,
        tokenLimit,
        tokenCountFn: oldCounter.tokenCountFn,
      });

      Tokenizer.freeAndResetAllEncoders();

      await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: newCounter.tokenCountFn,
      });

      const oldCalls = oldCounter.getCallCount();
      const newCalls = newCounter.getCallCount();

      console.log(`[countTokens ~15k tokens] OLD: ${oldCalls} calls, NEW: ${newCalls} calls`);
      console.log(`[countTokens] Reduction: ${((1 - newCalls / oldCalls) * 100).toFixed(1)}%`);

      expect(newCalls).toBeLessThan(oldCalls);
    });

    it('should handle user reported scenario with countTokens (~120k tokens)', async () => {
      const oldCounter = createCountTokensCounter();
      const newCounter = createCountTokensCounter();
      const text = createRealisticText(120000);
      const tokenLimit = 100000;

      const startOld = performance.now();
      await processTextWithTokenLimitOLD({
        text,
        tokenLimit,
        tokenCountFn: oldCounter.tokenCountFn,
      });
      const timeOld = performance.now() - startOld;

      Tokenizer.freeAndResetAllEncoders();

      const startNew = performance.now();
      const result = await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: newCounter.tokenCountFn,
      });
      const timeNew = performance.now() - startNew;

      const oldCalls = oldCounter.getCallCount();
      const newCalls = newCounter.getCallCount();

      console.log(`\n[countTokens - User reported scenario: ~120k tokens]`);
      console.log(`OLD implementation: ${oldCalls} countTokens calls, ${timeOld.toFixed(0)}ms`);
      console.log(`NEW implementation: ${newCalls} countTokens calls, ${timeNew.toFixed(0)}ms`);
      console.log(`Call reduction: ${((1 - newCalls / oldCalls) * 100).toFixed(1)}%`);
      console.log(`Time reduction: ${((1 - timeNew / timeOld) * 100).toFixed(1)}%`);
      console.log(
        `Result: truncated=${result.wasTruncated}, tokens=${result.tokenCount}/${tokenLimit}\n`,
      );

      expect(newCalls).toBeLessThan(oldCalls);
      expect(result.tokenCount).toBeLessThanOrEqual(tokenLimit);
      expect(newCalls).toBeLessThanOrEqual(7);
    });

    it('should achieve at least 70% reduction with countTokens', async () => {
      const oldCounter = createCountTokensCounter();
      const newCounter = createCountTokensCounter();
      const text = createRealisticText(50000);
      const tokenLimit = 10000;

      await processTextWithTokenLimitOLD({
        text,
        tokenLimit,
        tokenCountFn: oldCounter.tokenCountFn,
      });

      Tokenizer.freeAndResetAllEncoders();

      await processTextWithTokenLimit({
        text,
        tokenLimit,
        tokenCountFn: newCounter.tokenCountFn,
      });

      const oldCalls = oldCounter.getCallCount();
      const newCalls = newCounter.getCallCount();
      const reduction = 1 - newCalls / oldCalls;

      console.log(
        `[countTokens 50k tokens] OLD: ${oldCalls}, NEW: ${newCalls}, Reduction: ${(reduction * 100).toFixed(1)}%`,
      );

      expect(reduction).toBeGreaterThanOrEqual(0.7);
    });
  });
});
@@ -1,11 +1,39 @@
 import { logger } from '@librechat/data-schemas';
 
+/** Token count function that can be sync or async */
+export type TokenCountFn = (text: string) => number | Promise<number>;
+
+/**
+ * Safety buffer multiplier applied to character position estimates during truncation.
+ *
+ * We use 98% (0.98) rather than 100% to intentionally undershoot the target on the first attempt.
+ * This is necessary because:
+ * - Token density varies across text (some regions may have more tokens per character than the average)
+ * - The ratio-based estimate assumes uniform token distribution, which is rarely true
+ * - Undershooting is safer than overshooting: exceeding the limit requires another iteration,
+ *   while being slightly under is acceptable
+ * - In practice, this buffer reduces refinement iterations from 2-3 down to 0-1 in most cases
+ *
+ * @example
+ * // If text has 1000 chars and 250 tokens (4 chars/token average), targeting 100 tokens:
+ * // Without buffer: estimate = 1000 * (100/250) = 400 chars → might yield 105 tokens (over!)
+ * // With 0.98 buffer: estimate = 400 * 0.98 = 392 chars → likely yields 97-99 tokens (safe)
+ */
+const TRUNCATION_SAFETY_BUFFER = 0.98;
+
 /**
  * Processes text content by counting tokens and truncating if it exceeds the specified limit.
+ * Uses ratio-based estimation to minimize expensive tokenCountFn calls.
  *
  * @param text - The text content to process
  * @param tokenLimit - The maximum number of tokens allowed
- * @param tokenCountFn - Function to count tokens
+ * @param tokenCountFn - Function to count tokens (can be sync or async)
  * @returns Promise resolving to object with processed text, token count, and truncation status
+ *
+ * @remarks
+ * This function uses a ratio-based estimation algorithm instead of binary search.
+ * Binary search would require O(log n) tokenCountFn calls (~17 for 100k chars),
+ * while this approach typically requires only 2-3 calls for a 90%+ reduction in CPU usage.
  */
 export async function processTextWithTokenLimit({
   text,
@@ -14,7 +42,7 @@ export async function processTextWithTokenLimit({
 }: {
   text: string;
   tokenLimit: number;
-  tokenCountFn: (text: string) => number;
+  tokenCountFn: TokenCountFn;
 }): Promise<{ text: string; tokenCount: number; wasTruncated: boolean }> {
   const originalTokenCount = await tokenCountFn(text);
 
@@ -26,40 +54,34 @@ export async function processTextWithTokenLimit({
     };
   }
 
-  /**
-   * Doing binary search here to find the truncation point efficiently
-   * (May be a better way to go about this)
-   */
-  let low = 0;
-  let high = text.length;
-  let bestText = '';
-
   logger.debug(
     `[textTokenLimiter] Text content exceeds token limit: ${originalTokenCount} > ${tokenLimit}, truncating...`,
   );
 
-  while (low <= high) {
-    const mid = Math.floor((low + high) / 2);
-    const truncatedText = text.substring(0, mid);
-    const tokenCount = await tokenCountFn(truncatedText);
+  const ratio = tokenLimit / originalTokenCount;
+  let charPosition = Math.floor(text.length * ratio * TRUNCATION_SAFETY_BUFFER);
 
-    if (tokenCount <= tokenLimit) {
-      bestText = truncatedText;
-      low = mid + 1;
-    } else {
-      high = mid - 1;
-    }
+  let truncatedText = text.substring(0, charPosition);
+  let tokenCount = await tokenCountFn(truncatedText);
+
+  const maxIterations = 5;
+  let iterations = 0;
+
+  while (tokenCount > tokenLimit && iterations < maxIterations && charPosition > 0) {
+    const overageRatio = tokenLimit / tokenCount;
+    charPosition = Math.floor(charPosition * overageRatio * TRUNCATION_SAFETY_BUFFER);
+    truncatedText = text.substring(0, charPosition);
+    tokenCount = await tokenCountFn(truncatedText);
+    iterations++;
   }
 
-  const finalTokenCount = await tokenCountFn(bestText);
-
   logger.warn(
-    `[textTokenLimiter] Text truncated from ${originalTokenCount} to ${finalTokenCount} tokens (limit: ${tokenLimit})`,
+    `[textTokenLimiter] Text truncated from ${originalTokenCount} to ${tokenCount} tokens (limit: ${tokenLimit})`,
   );
 
   return {
-    text: bestText,
-    tokenCount: finalTokenCount,
+    text: truncatedText,
+    tokenCount,
     wasTruncated: true,
   };
 }
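To make the call-count savings concrete, here is the first-pass arithmetic for the reported scenario, worked as a standalone sketch (the values mirror the spec's mock counter at ~4 chars/token; this is illustration, not library code):

const textLength = 489564; // characters, as in the spec's reported-scenario test
const originalTokenCount = 122391; // = Math.ceil(489564 / 4) with the mock counter
const tokenLimit = 100000;
const TRUNCATION_SAFETY_BUFFER = 0.98;

// First estimate: scale the character length by the token ratio, then undershoot by 2%.
const ratio = tokenLimit / originalTokenCount; // ≈ 0.817
const charPosition = Math.floor(textLength * ratio * TRUNCATION_SAFETY_BUFFER); // 392000

// That costs one tokenCountFn call to measure the full text and one to verify the
// estimate, typically 2-3 calls total, versus roughly 19 probes for binary search
// over 489,564 characters (log2(489564) ≈ 18.9) plus initial and final counts.
console.log(charPosition);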
@@ -75,4 +75,14 @@ class Tokenizer {
 
 const TokenizerSingleton = new Tokenizer();
 
+/**
+ * Counts the number of tokens in a given text using tiktoken.
+ * This is an async wrapper around Tokenizer.getTokenCount for compatibility.
+ * @param text - The text to be tokenized. Defaults to an empty string if not provided.
+ * @returns The number of tokens in the provided text.
+ */
+export async function countTokens(text = ''): Promise<number> {
+  return TokenizerSingleton.getTokenCount(text, 'cl100k_base');
+}
+
 export default TokenizerSingleton;
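Finally, a short usage sketch for the new wrapper (hypothetical call site; the import path assumes the package-root re-export shown earlier):

import { countTokens } from '@librechat/api';

/** Hypothetical pre-flight check before sending a prompt to a model */
async function fitsTokenBudget(prompt: string, budget: number): Promise<boolean> {
  const tokens = await countTokens(prompt); // cl100k_base encoding under the hood
  return tokens <= budget;
}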