🔄 refactor: Consolidate Tokenizer; Fix Jest Open Handles (#5175)

* refactor: consolidate tokenizer to singleton

* fix: remove legacy tokenizer code, add Tokenizer singleton tests

* ci: fix jest open handles
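Likely context for the Jest fix (an inference; the commit message does not spell it out): tiktoken encoders are WASM-backed and hold native resources until `.free()` is called, so ad-hoc encoder instances created across clients and test files can leave open handles that keep Jest from exiting. A minimal sketch of the teardown pattern — `get_encoding` and `free` are real tiktoken APIs, but the cache shape and hook placement here are illustrative:

```js
// Sketch: free tiktoken encoders in a Jest teardown hook so the process can exit.
const { get_encoding } = require('tiktoken');

const encoderCache = new Map();

function getEncoder(name = 'cl100k_base') {
  if (!encoderCache.has(name)) {
    encoderCache.set(name, get_encoding(name));
  }
  return encoderCache.get(name);
}

afterAll(() => {
  // Release the WASM-backed encoders; otherwise Jest reports open handles.
  for (const encoder of encoderCache.values()) {
    encoder.free();
  }
  encoderCache.clear();
});
```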
Danny Avila committed on 2025-01-03 18:11:14 -05:00 (via GitHub)
commit c26b54c74d · parent bf0a84e45a
11 changed files with 202 additions and 221 deletions

api/app/clients/GoogleClient.js

@@ -6,7 +6,6 @@ const { ChatGoogleVertexAI } = require('@langchain/google-vertexai');
 const { ChatGoogleGenerativeAI } = require('@langchain/google-genai');
 const { GoogleGenerativeAI: GenAI } = require('@google/generative-ai');
 const { AIMessage, HumanMessage, SystemMessage } = require('@langchain/core/messages');
-const { encoding_for_model: encodingForModel, get_encoding: getEncoding } = require('tiktoken');
 const {
   validateVisionModel,
   getResponseSender,
@@ -17,6 +16,7 @@ const {
   AuthKeys,
 } = require('librechat-data-provider');
 const { encodeAndFormat } = require('~/server/services/Files/images');
+const Tokenizer = require('~/server/services/Tokenizer');
 const { getModelMaxTokens } = require('~/utils');
 const { sleep } = require('~/server/utils');
 const { logger } = require('~/config');
@@ -31,7 +31,6 @@ const BaseClient = require('./BaseClient');
 const loc = process.env.GOOGLE_LOC || 'us-central1';
 const publisher = 'google';
 const endpointPrefix = `${loc}-aiplatform.googleapis.com`;
-const tokenizersCache = {};
 const settings = endpointSettings[EModelEndpoint.google];
 const EXCLUDED_GENAI_MODELS = /gemini-(?:1\.0|1-0|pro)/;
@@ -177,25 +176,15 @@ class GoogleClient extends BaseClient {
       // without tripping the stop sequences, so I'm using "||>" instead.
       this.startToken = '||>';
       this.endToken = '';
-      this.gptEncoder = this.constructor.getTokenizer('cl100k_base');
     } else if (isTextModel) {
       this.startToken = '||>';
       this.endToken = '';
-      this.gptEncoder = this.constructor.getTokenizer('text-davinci-003', true, {
-        '<|im_start|>': 100264,
-        '<|im_end|>': 100265,
-      });
     } else {
       // Previously I was trying to use "<|endoftext|>" but there seems to be some bug with OpenAI's token counting
       // system that causes only the first "<|endoftext|>" to be counted as 1 token, and the rest are not treated
       // as a single token. So we're using this instead.
       this.startToken = '||>';
       this.endToken = '';
-      try {
-        this.gptEncoder = this.constructor.getTokenizer(this.modelOptions.model, true);
-      } catch {
-        this.gptEncoder = this.constructor.getTokenizer('text-davinci-003', true);
-      }
     }
     if (!this.modelOptions.stop) {
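Net effect of this hunk: `setOptions` no longer selects a per-instance `gptEncoder` (cl100k_base, or text-davinci-003 with extended special tokens, depending on the model); the final hunk below routes all counting through the shared Tokenizer with a fixed cl100k_base encoding. A before/after sketch of the call path, paraphrased from the removed and added lines (the `countTokens` wrapper is illustrative):

```js
// Before: each GoogleClient instance owned an encoder chosen in setOptions.
//   this.gptEncoder = this.constructor.getTokenizer('cl100k_base');
//   const count = this.gptEncoder.encode(text, 'all').length;

// After: a single cached encoder lives behind the Tokenizer singleton.
const Tokenizer = require('~/server/services/Tokenizer');

function countTokens(text) {
  return Tokenizer.getTokenCount(text, 'cl100k_base');
}
```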
@@ -926,23 +915,18 @@ class GoogleClient extends BaseClient {
     ];
   }

   /* TO-DO: Handle tokens with Google tokenization NOTE: these are required */
-  static getTokenizer(encoding, isModelName = false, extendSpecialTokens = {}) {
-    if (tokenizersCache[encoding]) {
-      return tokenizersCache[encoding];
-    }
-    let tokenizer;
-    if (isModelName) {
-      tokenizer = encodingForModel(encoding, extendSpecialTokens);
-    } else {
-      tokenizer = getEncoding(encoding, extendSpecialTokens);
-    }
-    tokenizersCache[encoding] = tokenizer;
-    return tokenizer;
+  getEncoding() {
+    return 'cl100k_base';
   }

   /**
    * Returns the token count of a given text. It also checks and resets the tokenizers if necessary.
    * @param {string} text - The text to get the token count for.
    * @returns {number} The token count of the given text.
    */
   getTokenCount(text) {
-    return this.gptEncoder.encode(text, 'all').length;
+    const encoding = this.getEncoding();
+    return Tokenizer.getTokenCount(text, encoding);
   }
 }
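For reference, a sketch of what the consolidated `~/server/services/Tokenizer` singleton plausibly looks like. Only the `Tokenizer.getTokenCount(text, encoding)` call is confirmed by this diff; the internals below simply relocate the cache logic from the removed `getTokenizer` above, so treat this as an assumption rather than the actual module:

```js
const { encoding_for_model: encodingForModel, get_encoding: getEncoding } = require('tiktoken');

// Assumed shape of the shared Tokenizer service: one process-wide cache of
// tiktoken encoders instead of a cache (and encoder) per client class.
class Tokenizer {
  constructor() {
    this.tokenizersCache = {};
  }

  getTokenizer(encoding, isModelName = false, extendSpecialTokens = {}) {
    if (this.tokenizersCache[encoding]) {
      return this.tokenizersCache[encoding];
    }
    const tokenizer = isModelName
      ? encodingForModel(encoding, extendSpecialTokens)
      : getEncoding(encoding, extendSpecialTokens);
    this.tokenizersCache[encoding] = tokenizer;
    return tokenizer;
  }

  getTokenCount(text, encoding = 'cl100k_base') {
    return this.getTokenizer(encoding).encode(text, 'all').length;
  }
}

// Exported as a singleton instance, so every client shares the same cache.
module.exports = new Tokenizer();
```

The real module (exercised by the new singleton tests mentioned in the commit message) may also reset or free encoders; that detail is not visible in this diff.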