mirror of
https://github.com/danny-avila/LibreChat.git
synced 2025-09-22 06:00:56 +02:00

* refactor: consolidate tokenizer to singleton * fix: remove legacy tokenizer code, add Tokenizer singleton tests * ci: fix jest open handles
64 lines
1.8 KiB
JavaScript
64 lines
1.8 KiB
JavaScript
const { encoding_for_model: encodingForModel, get_encoding: getEncoding } = require('tiktoken');
|
|
const { logger } = require('~/config');
|
|
|
|
/**
 * Builds tiktoken encoders on demand, caches them, and counts tokens with them.
 *
 * Encoders are periodically freed and rebuilt (see `resetTokenizersIfNecessary`).
 */
class Tokenizer {
  constructor() {
    /** Encoders built so far, keyed by the `encoding` argument passed to `getTokenizer`. */
    this.tokenizersCache = {};
    /**
     * Incremented on every `resetTokenizersIfNecessary` call and restarted at 1 by
     * `freeAndResetAllEncoders`.
     */
    this.tokenizerCallsCount = 0;
  }

  /**
   * Returns the cached encoder for `encoding`, creating and caching it on a miss.
   *
   * The cache key is `encoding` alone, so `isModelName` and `extendSpecialTokens`
   * only take effect the first time a given key is requested.
   *
   * @param {string} encoding - Encoding name, or a model name when `isModelName` is true.
   * @param {boolean} [isModelName=false] - Resolve `encoding` via `encodingForModel` instead of `getEncoding`.
   * @param {Object} [extendSpecialTokens={}] - Forwarded as the second argument to the tiktoken factory.
   * @returns {Object} The encoder instance.
   */
  getTokenizer(encoding, isModelName = false, extendSpecialTokens = {}) {
    const cached = this.tokenizersCache[encoding];
    if (cached) {
      return cached;
    }

    const tokenizer = isModelName
      ? encodingForModel(encoding, extendSpecialTokens)
      : getEncoding(encoding, extendSpecialTokens);
    this.tokenizersCache[encoding] = tokenizer;
    return tokenizer;
  }

  /**
   * Frees every cached encoder, empties the cache, and restarts the call counter at 1.
   *
   * Never throws: failures are logged. Each encoder is freed independently, so one
   * failing `free()` neither prevents the others from being released nor leaves a
   * possibly unusable encoder in the cache.
   */
  freeAndResetAllEncoders() {
    try {
      for (const key of Object.keys(this.tokenizersCache)) {
        const tokenizer = this.tokenizersCache[key];
        // Evict before freeing so the entry is gone even when free() throws.
        delete this.tokenizersCache[key];
        if (!tokenizer) {
          continue;
        }
        try {
          tokenizer.free();
        } catch (error) {
          logger.error('[Tokenizer] Free and reset encoders error', error);
        }
      }
    } catch (error) {
      // Reached only if the cache itself cannot be enumerated (e.g. it was set to null).
      logger.error('[Tokenizer] Free and reset encoders error', error);
    } finally {
      this.tokenizerCallsCount = 1;
    }
  }

  /**
   * Frees all encoders once the call counter has reached 25, then counts this call.
   *
   * The debug message is emitted only when `this.options.debug` is truthy; the
   * constructor does not define `options`, so a caller has to assign it.
   */
  resetTokenizersIfNecessary() {
    if (this.tokenizerCallsCount >= 25) {
      if (this.options?.debug) {
        logger.debug('[Tokenizer] freeAndResetAllEncoders: reached 25 encodings, resetting...');
      }
      this.freeAndResetAllEncoders();
    }
    this.tokenizerCallsCount++;
  }

  /**
   * Counts the tokens `text` encodes to.
   *
   * If encoding fails, the error is logged, all encoders are freed, and the count
   * is retried once with a freshly built encoder; an error from the retry propagates.
   *
   * @param {string} text - Text to tokenize.
   * @param {string} [encoding='cl100k_base'] - Encoding name passed to `getTokenizer`.
   * @returns {number} Length of the encoded token sequence.
   */
  getTokenCount(text, encoding = 'cl100k_base') {
    this.resetTokenizersIfNecessary();
    try {
      // 'all' is passed as tiktoken's allowed-special argument.
      return this.getTokenizer(encoding).encode(text, 'all').length;
    } catch (error) {
      logger.error('[Tokenizer] Error getting token count:', error);
      this.freeAndResetAllEncoders();
      return this.getTokenizer(encoding).encode(text, 'all').length;
    }
  }
}
|
|
|
|
// Export a single Tokenizer instance (not the class), created when this module is loaded.
module.exports = new Tokenizer();
|