🔄 refactor: Consolidate Tokenizer; Fix Jest Open Handles (#5175)

* refactor: consolidate tokenizer to singleton

* fix: remove legacy tokenizer code, add Tokenizer singleton tests

* ci: fix jest open handles
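Likely context for the Jest fix (an inference; the commit message does not spell it out): tiktoken encoders are WASM-backed and hold native resources until `.free()` is called, so ad-hoc encoder instances created across clients and test files can leave open handles that keep Jest from exiting. A minimal sketch of the teardown pattern — `get_encoding` and `free` are real tiktoken APIs, but the cache shape and hook placement here are illustrative:

```js
// Sketch: free tiktoken encoders in a Jest teardown hook so the process can exit.
const { get_encoding } = require('tiktoken');

const encoderCache = new Map();

function getEncoder(name = 'cl100k_base') {
  if (!encoderCache.has(name)) {
    encoderCache.set(name, get_encoding(name));
  }
  return encoderCache.get(name);
}

afterAll(() => {
  // Release the WASM-backed encoders; otherwise Jest reports open handles.
  for (const encoder of encoderCache.values()) {
    encoder.free();
  }
  encoderCache.clear();
});
```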
Danny Avila committed on 2025-01-03 18:11:14 -05:00 (via GitHub)
commit c26b54c74d · parent bf0a84e45a
11 changed files with 202 additions and 221 deletions

api/app/clients/GoogleClient.js

@@ -6,7 +6,6 @@ const { ChatGoogleVertexAI } = require('@langchain/google-vertexai');
 const { ChatGoogleGenerativeAI } = require('@langchain/google-genai');
 const { GoogleGenerativeAI: GenAI } = require('@google/generative-ai');
 const { AIMessage, HumanMessage, SystemMessage } = require('@langchain/core/messages');
-const { encoding_for_model: encodingForModel, get_encoding: getEncoding } = require('tiktoken');
 const {
   validateVisionModel,
   getResponseSender,
@@ -17,6 +16,7 @@ const {
   AuthKeys,
 } = require('librechat-data-provider');
 const { encodeAndFormat } = require('~/server/services/Files/images');
+const Tokenizer = require('~/server/services/Tokenizer');
 const { getModelMaxTokens } = require('~/utils');
 const { sleep } = require('~/server/utils');
 const { logger } = require('~/config');
@@ -31,7 +31,6 @@ const BaseClient = require('./BaseClient');
 const loc = process.env.GOOGLE_LOC || 'us-central1';
 const publisher = 'google';
 const endpointPrefix = `${loc}-aiplatform.googleapis.com`;
-const tokenizersCache = {};
 const settings = endpointSettings[EModelEndpoint.google];
 const EXCLUDED_GENAI_MODELS = /gemini-(?:1\.0|1-0|pro)/;
@@ -177,25 +176,15 @@ class GoogleClient extends BaseClient {
       // without tripping the stop sequences, so I'm using "||>" instead.
       this.startToken = '||>';
       this.endToken = '';
-      this.gptEncoder = this.constructor.getTokenizer('cl100k_base');
     } else if (isTextModel) {
       this.startToken = '||>';
       this.endToken = '';
-      this.gptEncoder = this.constructor.getTokenizer('text-davinci-003', true, {
-        '<|im_start|>': 100264,
-        '<|im_end|>': 100265,
-      });
     } else {
       // Previously I was trying to use "<|endoftext|>" but there seems to be some bug with OpenAI's token counting
       // system that causes only the first "<|endoftext|>" to be counted as 1 token, and the rest are not treated
       // as a single token. So we're using this instead.
       this.startToken = '||>';
       this.endToken = '';
-      try {
-        this.gptEncoder = this.constructor.getTokenizer(this.modelOptions.model, true);
-      } catch {
-        this.gptEncoder = this.constructor.getTokenizer('text-davinci-003', true);
-      }
     }
     if (!this.modelOptions.stop) {
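Net effect of this hunk: `setOptions` no longer selects a per-instance `gptEncoder` (cl100k_base, or text-davinci-003 with extended special tokens, depending on the model); the final hunk below routes all counting through the shared Tokenizer with a fixed cl100k_base encoding. A before/after sketch of the call path, paraphrased from the removed and added lines (the `countTokens` wrapper is illustrative):

```js
// Before: each GoogleClient instance owned an encoder chosen in setOptions.
//   this.gptEncoder = this.constructor.getTokenizer('cl100k_base');
//   const count = this.gptEncoder.encode(text, 'all').length;

// After: a single cached encoder lives behind the Tokenizer singleton.
const Tokenizer = require('~/server/services/Tokenizer');

function countTokens(text) {
  return Tokenizer.getTokenCount(text, 'cl100k_base');
}
```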
@@ -926,23 +915,18 @@ class GoogleClient extends BaseClient {
     ];
   }

   /* TO-DO: Handle tokens with Google tokenization NOTE: these are required */
-  static getTokenizer(encoding, isModelName = false, extendSpecialTokens = {}) {
-    if (tokenizersCache[encoding]) {
-      return tokenizersCache[encoding];
-    }
-    let tokenizer;
-    if (isModelName) {
-      tokenizer = encodingForModel(encoding, extendSpecialTokens);
-    } else {
-      tokenizer = getEncoding(encoding, extendSpecialTokens);
-    }
-    tokenizersCache[encoding] = tokenizer;
-    return tokenizer;
+  getEncoding() {
+    return 'cl100k_base';
   }

   /**
    * Returns the token count of a given text. It also checks and resets the tokenizers if necessary.
    * @param {string} text - The text to get the token count for.
    * @returns {number} The token count of the given text.
    */
   getTokenCount(text) {
-    return this.gptEncoder.encode(text, 'all').length;
+    const encoding = this.getEncoding();
+    return Tokenizer.getTokenCount(text, encoding);
   }
 }
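For reference, a sketch of what the consolidated `~/server/services/Tokenizer` singleton plausibly looks like. Only the `Tokenizer.getTokenCount(text, encoding)` call is confirmed by this diff; the internals below simply relocate the cache logic from the removed `getTokenizer` above, so treat this as an assumption rather than the actual module:

```js
const { encoding_for_model: encodingForModel, get_encoding: getEncoding } = require('tiktoken');

// Assumed shape of the shared Tokenizer service: one process-wide cache of
// tiktoken encoders instead of a cache (and encoder) per client class.
class Tokenizer {
  constructor() {
    this.tokenizersCache = {};
  }

  getTokenizer(encoding, isModelName = false, extendSpecialTokens = {}) {
    if (this.tokenizersCache[encoding]) {
      return this.tokenizersCache[encoding];
    }
    const tokenizer = isModelName
      ? encodingForModel(encoding, extendSpecialTokens)
      : getEncoding(encoding, extendSpecialTokens);
    this.tokenizersCache[encoding] = tokenizer;
    return tokenizer;
  }

  getTokenCount(text, encoding = 'cl100k_base') {
    return this.getTokenizer(encoding).encode(text, 'all').length;
  }
}

// Exported as a singleton instance, so every client shares the same cache.
module.exports = new Tokenizer();
```

The real module (exercised by the new singleton tests mentioned in the commit message) may also reset or free encoders; that detail is not visible in this diff.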