mirror of
https://github.com/danny-avila/LibreChat.git
synced 2026-03-12 02:52:36 +01:00
⚡ refactor: Replace tiktoken with ai-tokenizer (#12175)
Some checks are pending
Docker Dev Branch Images Build / build (Dockerfile, lc-dev, node) (push) Waiting to run
Docker Dev Branch Images Build / build (Dockerfile.multi, lc-dev-api, api-build) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile, librechat-dev, node) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile.multi, librechat-dev-api, api-build) (push) Waiting to run
Sync Locize Translations & Create Translation PR / Sync Translation Keys with Locize (push) Waiting to run
Sync Locize Translations & Create Translation PR / Create Translation PR on Version Published (push) Blocked by required conditions
Some checks are pending
Docker Dev Branch Images Build / build (Dockerfile, lc-dev, node) (push) Waiting to run
Docker Dev Branch Images Build / build (Dockerfile.multi, lc-dev-api, api-build) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile, librechat-dev, node) (push) Waiting to run
Docker Dev Images Build / build (Dockerfile.multi, librechat-dev-api, api-build) (push) Waiting to run
Sync Locize Translations & Create Translation PR / Sync Translation Keys with Locize (push) Waiting to run
Sync Locize Translations & Create Translation PR / Create Translation PR on Version Published (push) Blocked by required conditions
* chore: Update dependencies by adding ai-tokenizer and removing tiktoken - Added ai-tokenizer version 1.0.6 to package.json and package-lock.json across multiple packages. - Removed tiktoken version 1.0.15 from package.json and package-lock.json in the same locations, streamlining dependency management. * refactor: replace js-tiktoken with ai-tokenizer - Added support for 'claude' encoding in the AgentClient class to improve model compatibility. - Updated Tokenizer class to utilize 'ai-tokenizer' for both 'o200k_base' and 'claude' encodings, replacing the previous 'tiktoken' dependency. - Refactored tests to reflect changes in tokenizer behavior and ensure accurate token counting for both encoding types. - Removed deprecated references to 'tiktoken' and adjusted related tests for improved clarity and functionality. * chore: remove tiktoken mocks from DALLE3 tests - Eliminated mock implementations of 'tiktoken' from DALLE3-related test files to streamline test setup and align with recent dependency updates. - Adjusted related test structures to ensure compatibility with the new tokenizer implementation. * chore: Add distinct encoding support for Anthropic Claude models - Introduced a new method `getEncoding` in the AgentClient class to handle the specific BPE tokenizer for Claude models, ensuring compatibility with the distinct encoding requirements. - Updated documentation to clarify the encoding logic for Claude and other models. * docs: Update return type documentation for getEncoding method in AgentClient - Clarified the return type of the getEncoding method to specify that it can return an EncodingName or undefined, enhancing code readability and type safety. * refactor: Tokenizer class and error handling - Exported the EncodingName type for broader usage. - Renamed encodingMap to encodingData for clarity. - Improved error handling in getTokenCount method to ensure recovery attempts are logged and return 0 on failure. - Updated countTokens function documentation to specify the use of 'o200k_base' encoding. * refactor: Simplify encoding documentation and export type - Updated the getEncoding method documentation to clarify the default behavior for non-Anthropic Claude models. - Exported the EncodingName type separately from the Tokenizer module for improved clarity and usage. * test: Update text processing tests for token limits - Adjusted test cases to handle smaller text sizes, changing scenarios from ~120k tokens to ~20k tokens for both the real tokenizer and countTokens functions. - Updated token limits in tests to reflect new constraints, ensuring tests accurately assess performance and call reduction. - Enhanced console log messages for clarity regarding token counts and reductions in the updated scenarios. * refactor: Update Tokenizer imports and exports - Moved Tokenizer and countTokens exports to the tokenizer module for better organization. - Adjusted imports in memory.ts to reflect the new structure, ensuring consistent usage across the codebase. - Updated memory.test.ts to mock the Tokenizer from the correct module path, enhancing test accuracy. * refactor: Tokenizer initialization and error handling - Introduced an async `initEncoding` method to preload tokenizers, improving performance and accuracy in token counting. - Updated `getTokenCount` to handle uninitialized tokenizers more gracefully, ensuring proper recovery and logging on errors. - Removed deprecated synchronous tokenizer retrieval, streamlining the overall tokenizer management process. * test: Enhance tokenizer tests with initialization and encoding checks - Added `beforeAll` hooks to initialize tokenizers for 'o200k_base' and 'claude' encodings before running tests, ensuring proper setup. - Updated tests to validate the loading of encodings and the correctness of token counts for both 'o200k_base' and 'claude'. - Improved test structure to deduplicate concurrent initialization calls, enhancing performance and reliability.
This commit is contained in:
parent
fcb344da47
commit
9a5d7eaa4e
15 changed files with 112 additions and 277 deletions
|
|
@ -1,12 +1,3 @@
|
|||
/**
|
||||
* @file Tokenizer.spec.cjs
|
||||
*
|
||||
* Tests the real TokenizerSingleton (no mocking of `tiktoken`).
|
||||
* Make sure to install `tiktoken` and have it configured properly.
|
||||
*/
|
||||
|
||||
import { logger } from '@librechat/data-schemas';
|
||||
import type { Tiktoken } from 'tiktoken';
|
||||
import Tokenizer from './tokenizer';
|
||||
|
||||
jest.mock('@librechat/data-schemas', () => ({
|
||||
|
|
@ -17,127 +8,49 @@ jest.mock('@librechat/data-schemas', () => ({
|
|||
|
||||
describe('Tokenizer', () => {
|
||||
it('should be a singleton (same instance)', async () => {
|
||||
const AnotherTokenizer = await import('./tokenizer'); // same path
|
||||
const AnotherTokenizer = await import('./tokenizer');
|
||||
expect(Tokenizer).toBe(AnotherTokenizer.default);
|
||||
});
|
||||
|
||||
describe('getTokenizer', () => {
|
||||
it('should create an encoder for an explicit model name (e.g., "gpt-4")', () => {
|
||||
// The real `encoding_for_model` will be called internally
|
||||
// as soon as we pass isModelName = true.
|
||||
const tokenizer = Tokenizer.getTokenizer('gpt-4', true);
|
||||
|
||||
// Basic sanity checks
|
||||
expect(tokenizer).toBeDefined();
|
||||
// You can optionally check certain properties from `tiktoken` if they exist
|
||||
// e.g., expect(typeof tokenizer.encode).toBe('function');
|
||||
describe('initEncoding', () => {
|
||||
it('should load o200k_base encoding', async () => {
|
||||
await Tokenizer.initEncoding('o200k_base');
|
||||
const count = Tokenizer.getTokenCount('Hello, world!', 'o200k_base');
|
||||
expect(count).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it('should create an encoder for a known encoding (e.g., "cl100k_base")', () => {
|
||||
// The real `get_encoding` will be called internally
|
||||
// as soon as we pass isModelName = false.
|
||||
const tokenizer = Tokenizer.getTokenizer('cl100k_base', false);
|
||||
|
||||
expect(tokenizer).toBeDefined();
|
||||
// e.g., expect(typeof tokenizer.encode).toBe('function');
|
||||
it('should load claude encoding', async () => {
|
||||
await Tokenizer.initEncoding('claude');
|
||||
const count = Tokenizer.getTokenCount('Hello, world!', 'claude');
|
||||
expect(count).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it('should return cached tokenizer if previously fetched', () => {
|
||||
const tokenizer1 = Tokenizer.getTokenizer('cl100k_base', false);
|
||||
const tokenizer2 = Tokenizer.getTokenizer('cl100k_base', false);
|
||||
// Should be the exact same instance from the cache
|
||||
expect(tokenizer1).toBe(tokenizer2);
|
||||
});
|
||||
});
|
||||
|
||||
describe('freeAndResetAllEncoders', () => {
|
||||
beforeEach(() => {
|
||||
jest.clearAllMocks();
|
||||
});
|
||||
|
||||
it('should free all encoders and reset tokenizerCallsCount to 1', () => {
|
||||
// By creating two different encodings, we populate the cache
|
||||
Tokenizer.getTokenizer('cl100k_base', false);
|
||||
Tokenizer.getTokenizer('r50k_base', false);
|
||||
|
||||
// Now free them
|
||||
Tokenizer.freeAndResetAllEncoders();
|
||||
|
||||
// The internal cache is cleared
|
||||
expect(Tokenizer.tokenizersCache['cl100k_base']).toBeUndefined();
|
||||
expect(Tokenizer.tokenizersCache['r50k_base']).toBeUndefined();
|
||||
|
||||
// tokenizerCallsCount is reset to 1
|
||||
expect(Tokenizer.tokenizerCallsCount).toBe(1);
|
||||
});
|
||||
|
||||
it('should catch and log errors if freeing fails', () => {
|
||||
// Mock logger.error before the test
|
||||
const mockLoggerError = jest.spyOn(logger, 'error');
|
||||
|
||||
// Set up a problematic tokenizer in the cache
|
||||
Tokenizer.tokenizersCache['cl100k_base'] = {
|
||||
free() {
|
||||
throw new Error('Intentional free error');
|
||||
},
|
||||
} as unknown as Tiktoken;
|
||||
|
||||
// Should not throw uncaught errors
|
||||
Tokenizer.freeAndResetAllEncoders();
|
||||
|
||||
// Verify logger.error was called with correct arguments
|
||||
expect(mockLoggerError).toHaveBeenCalledWith(
|
||||
'[Tokenizer] Free and reset encoders error',
|
||||
expect.any(Error),
|
||||
);
|
||||
|
||||
// Clean up
|
||||
mockLoggerError.mockRestore();
|
||||
Tokenizer.tokenizersCache = {};
|
||||
it('should deduplicate concurrent init calls', async () => {
|
||||
const [, , count] = await Promise.all([
|
||||
Tokenizer.initEncoding('o200k_base'),
|
||||
Tokenizer.initEncoding('o200k_base'),
|
||||
Tokenizer.initEncoding('o200k_base').then(() =>
|
||||
Tokenizer.getTokenCount('test', 'o200k_base'),
|
||||
),
|
||||
]);
|
||||
expect(count).toBeGreaterThan(0);
|
||||
});
|
||||
});
|
||||
|
||||
describe('getTokenCount', () => {
|
||||
beforeEach(() => {
|
||||
jest.clearAllMocks();
|
||||
Tokenizer.freeAndResetAllEncoders();
|
||||
beforeAll(async () => {
|
||||
await Tokenizer.initEncoding('o200k_base');
|
||||
await Tokenizer.initEncoding('claude');
|
||||
});
|
||||
|
||||
it('should return the number of tokens in the given text', () => {
|
||||
const text = 'Hello, world!';
|
||||
const count = Tokenizer.getTokenCount(text, 'cl100k_base');
|
||||
const count = Tokenizer.getTokenCount('Hello, world!', 'o200k_base');
|
||||
expect(count).toBeGreaterThan(0);
|
||||
});
|
||||
|
||||
it('should reset encoders if an error is thrown', () => {
|
||||
// We can simulate an error by temporarily overriding the selected tokenizer's `encode` method.
|
||||
const tokenizer = Tokenizer.getTokenizer('cl100k_base', false);
|
||||
const originalEncode = tokenizer.encode;
|
||||
tokenizer.encode = () => {
|
||||
throw new Error('Forced error');
|
||||
};
|
||||
|
||||
// Despite the forced error, the code should catch and reset, then re-encode
|
||||
const count = Tokenizer.getTokenCount('Hello again', 'cl100k_base');
|
||||
it('should count tokens using claude encoding', () => {
|
||||
const count = Tokenizer.getTokenCount('Hello, world!', 'claude');
|
||||
expect(count).toBeGreaterThan(0);
|
||||
|
||||
// Restore the original encode
|
||||
tokenizer.encode = originalEncode;
|
||||
});
|
||||
|
||||
it('should reset tokenizers after 25 calls', () => {
|
||||
// Spy on freeAndResetAllEncoders
|
||||
const resetSpy = jest.spyOn(Tokenizer, 'freeAndResetAllEncoders');
|
||||
|
||||
// Make 24 calls; should NOT reset yet
|
||||
for (let i = 0; i < 24; i++) {
|
||||
Tokenizer.getTokenCount('test text', 'cl100k_base');
|
||||
}
|
||||
expect(resetSpy).not.toHaveBeenCalled();
|
||||
|
||||
// 25th call triggers the reset
|
||||
Tokenizer.getTokenCount('the 25th call!', 'cl100k_base');
|
||||
expect(resetSpy).toHaveBeenCalledTimes(1);
|
||||
});
|
||||
});
|
||||
});
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue