mirror of
https://github.com/danny-avila/LibreChat.git
synced 2025-09-22 06:00:56 +02:00

* refactor: consolidate tokenizer to singleton * fix: remove legacy tokenizer code, add Tokenizer singleton tests * ci: fix jest open handles
136 lines
4.6 KiB
JavaScript
136 lines
4.6 KiB
JavaScript
/**
 * @file Tokenizer.spec.cjs
 *
 * Tests the real Tokenizer singleton (no mocking of `tiktoken`).
 * Make sure to install `tiktoken` and have it configured properly.
 *
 * The singleton is shared module state, so every test that patches it
 * (spies, fake encoders, overridden methods) undoes its changes in a
 * `finally` block; a failing assertion must not poison later tests.
 */

const Tokenizer = require('./Tokenizer');
const { logger } = require('~/config');

describe('Tokenizer', () => {
  // Release any encoders still held in the cache once the whole suite is done.
  afterAll(() => {
    Tokenizer.freeAndResetAllEncoders();
  });

  it('should be a singleton (same instance)', () => {
    // A second require of the same path must resolve to the same exported object.
    const AnotherTokenizer = require('./Tokenizer');
    expect(Tokenizer).toBe(AnotherTokenizer);
  });

  describe('getTokenizer', () => {
    it('should create an encoder for an explicit model name (e.g., "gpt-4")', () => {
      // Second argument (isModelName) = true: look the encoder up by model name.
      const tokenizer = Tokenizer.getTokenizer('gpt-4', true);

      expect(tokenizer).toBeDefined();
      // The rest of this suite calls `tokenizer.encode`, so pin that contract here.
      expect(typeof tokenizer.encode).toBe('function');
    });

    it('should create an encoder for a known encoding (e.g., "cl100k_base")', () => {
      // Second argument (isModelName) = false: look the encoder up by encoding name.
      const tokenizer = Tokenizer.getTokenizer('cl100k_base', false);

      expect(tokenizer).toBeDefined();
      expect(typeof tokenizer.encode).toBe('function');
    });

    it('should return cached tokenizer if previously fetched', () => {
      const tokenizer1 = Tokenizer.getTokenizer('cl100k_base', false);
      const tokenizer2 = Tokenizer.getTokenizer('cl100k_base', false);

      // Should be the exact same instance from the cache
      expect(tokenizer1).toBe(tokenizer2);
    });
  });

  describe('freeAndResetAllEncoders', () => {
    beforeEach(() => {
      jest.clearAllMocks();
      // Start from an empty cache so a fake encoder installed below never
      // overwrites a real cached encoder left behind by an earlier test.
      Tokenizer.freeAndResetAllEncoders();
    });

    it('should free all encoders and reset tokenizerCallsCount to 1', () => {
      // By creating two different encodings, we populate the cache
      Tokenizer.getTokenizer('cl100k_base', false);
      Tokenizer.getTokenizer('r50k_base', false);

      // Now free them
      Tokenizer.freeAndResetAllEncoders();

      // The internal cache is cleared
      expect(Tokenizer.tokenizersCache['cl100k_base']).toBeUndefined();
      expect(Tokenizer.tokenizersCache['r50k_base']).toBeUndefined();

      // tokenizerCallsCount is reset to 1
      expect(Tokenizer.tokenizerCallsCount).toBe(1);
    });

    it('should catch and log errors if freeing fails', () => {
      // Stub logger.error so the intentional failure is recorded by the spy
      // without being written to the real log output.
      const mockLoggerError = jest.spyOn(logger, 'error').mockImplementation(() => {});

      // Set up a problematic tokenizer in the cache
      Tokenizer.tokenizersCache['cl100k_base'] = {
        free() {
          throw new Error('Intentional free error');
        },
      };

      try {
        // Should not throw uncaught errors
        expect(() => Tokenizer.freeAndResetAllEncoders()).not.toThrow();

        // Verify logger.error was called with correct arguments
        expect(mockLoggerError).toHaveBeenCalledWith(
          '[Tokenizer] Free and reset encoders error',
          expect.any(Error),
        );
      } finally {
        // Clean up even when an assertion above fails.
        mockLoggerError.mockRestore();
        Tokenizer.tokenizersCache = {};
      }
    });
  });

  describe('getTokenCount', () => {
    beforeEach(() => {
      jest.clearAllMocks();
      Tokenizer.freeAndResetAllEncoders();
    });

    it('should return the number of tokens in the given text', () => {
      const text = 'Hello, world!';
      const count = Tokenizer.getTokenCount(text, 'cl100k_base');
      expect(count).toBeGreaterThan(0);
    });

    it('should reset encoders if an error is thrown', () => {
      // Simulate a failure by temporarily overriding the cached tokenizer's `encode` method.
      const tokenizer = Tokenizer.getTokenizer('cl100k_base', false);
      const originalEncode = tokenizer.encode;
      tokenizer.encode = () => {
        throw new Error('Forced error');
      };

      try {
        // Despite the forced error, the code should catch and reset, then re-encode
        const count = Tokenizer.getTokenCount('Hello again', 'cl100k_base');
        expect(count).toBeGreaterThan(0);
      } finally {
        // Restore the original encode even when an assertion above fails.
        tokenizer.encode = originalEncode;
      }
    });

    it('should reset tokenizers after 25 calls', () => {
      // Spy (calling through) on freeAndResetAllEncoders. It is created after
      // `beforeEach`, so the reset performed there is not counted.
      const resetSpy = jest.spyOn(Tokenizer, 'freeAndResetAllEncoders');

      try {
        // Make 24 calls; should NOT reset yet
        for (let i = 0; i < 24; i++) {
          Tokenizer.getTokenCount('test text', 'cl100k_base');
        }
        expect(resetSpy).not.toHaveBeenCalled();

        // 25th call triggers the reset
        Tokenizer.getTokenCount('the 25th call!', 'cl100k_base');
        expect(resetSpy).toHaveBeenCalledTimes(1);
      } finally {
        // Remove the spy so the singleton's own method is back in place.
        resetSpy.mockRestore();
      }
    });
  });
});