import { logger } from '@librechat/data-schemas';
import { encoding_for_model as encodingForModel, get_encoding as getEncoding } from 'tiktoken';
import type { Tiktoken, TiktokenModel, TiktokenEncoding } from 'tiktoken';

/** Optional behaviour switches for a {@link Tokenizer} instance. */
interface TokenizerOptions {
  /** When true, a debug line is logged each time the encoder cache is recycled. */
  debug?: boolean;
}

/** Number of counted calls after which all cached encoders are freed and rebuilt. */
const MAX_TOKENIZER_CALLS = 25;

/**
 * Caches tiktoken encoders by encoding/model name and counts tokens with them.
 *
 * Encoders are created lazily, reused across calls, and periodically freed
 * (see {@link Tokenizer.resetTokenizersIfNecessary}).
 */
class Tokenizer {
  /** Live encoders keyed by the encoding or model name they were requested with. */
  tokenizersCache: Record<string, Tiktoken>;
  /** Calls counted since the last cache reset. */
  tokenizerCallsCount: number;
  private options?: TokenizerOptions;

  /**
   * @param options - Optional settings; omit to keep the default (no debug logging).
   */
  constructor(options?: TokenizerOptions) {
    this.tokenizersCache = {};
    this.tokenizerCallsCount = 0;
    this.options = options;
  }

  /**
   * Returns the cached encoder for `encoding`, creating and caching it on first use.
   *
   * NOTE(review): the cache key is `encoding` alone, so `extendSpecialTokens` only
   * takes effect on the call that creates the encoder; later calls with different
   * special tokens receive the already-cached instance.
   *
   * @param encoding - Encoding name, or a model name when `isModelName` is true.
   * @param isModelName - Resolve via `encoding_for_model` instead of `get_encoding`.
   * @param extendSpecialTokens - Extra special tokens forwarded to tiktoken on creation.
   * @returns The encoder associated with `encoding`.
   */
  getTokenizer(
    encoding: TiktokenModel | TiktokenEncoding,
    isModelName = false,
    extendSpecialTokens: Record<string, number> = {},
  ): Tiktoken {
    const cached = this.tokenizersCache[encoding];
    if (cached) {
      return cached;
    }

    const tokenizer = isModelName
      ? encodingForModel(encoding as TiktokenModel, extendSpecialTokens)
      : getEncoding(encoding as TiktokenEncoding, extendSpecialTokens);
    this.tokenizersCache[encoding] = tokenizer;
    return tokenizer;
  }

  /**
   * Frees every cached encoder, empties the cache, and sets the call counter to 1.
   *
   * Each entry is removed from the cache *before* `free()` is attempted, and a
   * failure on one encoder is logged without aborting the rest. This guarantees
   * that an encoder whose `free()` throws is never handed out again by
   * {@link Tokenizer.getTokenizer}. Never throws.
   */
  freeAndResetAllEncoders(): void {
    for (const key of Object.keys(this.tokenizersCache)) {
      const tokenizer = this.tokenizersCache[key];
      // Evict first so a throwing free() cannot leave a broken encoder cached.
      delete this.tokenizersCache[key];
      if (!tokenizer) {
        continue;
      }
      try {
        tokenizer.free();
      } catch (error) {
        logger.error('[Tokenizer] Free and reset encoders error', error);
      }
    }
    this.tokenizerCallsCount = 1;
  }

  /**
   * Recycles all encoders once {@link MAX_TOKENIZER_CALLS} calls have been
   * counted, then counts the current call.
   */
  resetTokenizersIfNecessary(): void {
    if (this.tokenizerCallsCount >= MAX_TOKENIZER_CALLS) {
      if (this.options?.debug) {
        logger.debug(
          `[Tokenizer] freeAndResetAllEncoders: reached ${MAX_TOKENIZER_CALLS} encodings, resetting...`,
        );
      }
      this.freeAndResetAllEncoders();
    }
    this.tokenizerCallsCount++;
  }

  /**
   * Counts the tokens in `text` using the given encoding.
   *
   * If encoding fails, the error is logged, all cached encoders are discarded,
   * and the count is retried once with a freshly created encoder.
   *
   * @param text - Text to tokenize.
   * @param encoding - Encoding name to use; defaults to `cl100k_base`.
   * @returns Number of tokens produced by `encode(text, 'all')`.
   * @throws Whatever tiktoken throws if the retry with a fresh encoder also fails.
   */
  getTokenCount(text: string, encoding: TiktokenModel | TiktokenEncoding = 'cl100k_base'): number {
    this.resetTokenizersIfNecessary();
    try {
      const tokenizer = this.getTokenizer(encoding);
      return tokenizer.encode(text, 'all').length;
    } catch (error) {
      logger.error('[Tokenizer] Error getting token count:', error);
      // Drop every cached encoder so the retry below builds a new one.
      this.freeAndResetAllEncoders();
      const tokenizer = this.getTokenizer(encoding);
      return tokenizer.encode(text, 'all').length;
    }
  }
}

/** Process-wide shared tokenizer instance. */
const TokenizerSingleton = new Tokenizer();

/**
 * Counts the number of tokens in a given text using tiktoken.
 * This is an async wrapper around Tokenizer.getTokenCount for compatibility.
 * @param text - The text to be tokenized. Defaults to an empty string if not provided.
 * @returns The number of tokens in the provided text.
 */
export async function countTokens(text = ''): Promise<number> {
  return TokenizerSingleton.getTokenCount(text, 'cl100k_base');
}

export default TokenizerSingleton;