LibreChat/api/app/clients/document/tokenSplit.js

const { TokenTextSplitter } = require('@langchain/textsplitters');

/**
 * Splits a given text into token-sized chunks using a `TokenTextSplitter`
 * configured with the provided parameters.
 * Note: limit or memoize use of this function as its calculation is expensive.
 *
 * @param {Object} obj - Configuration object for the text splitting operation.
 * @param {string} obj.text - The text to be split.
 * @param {string} [obj.encodingName='cl100k_base'] - Encoding name forwarded to the splitter.
 * @param {number} [obj.chunkSize=1] - The token size of each chunk.
 * @param {number} [obj.chunkOverlap=0] - The number of chunk elements to be overlapped between adjacent chunks.
 * @param {number} [obj.returnSize] - When greater than 0, only the last `returnSize` chunks are returned.
 * Any other value (omitted, 0, or negative) leaves the result untrimmed.
 *
 * @returns {Promise<string[]>} Resolves to the array of text chunks produced by the splitter.
 * If no text is provided, an empty array is returned without constructing a splitter.
 * If `returnSize` exceeds the number of chunks, all chunks are returned.
 *
 * @async
 * @function tokenSplit
 */
async function tokenSplit({
  text,
  encodingName = 'cl100k_base',
  chunkSize = 1,
  chunkOverlap = 0,
  returnSize,
}) {
  // Skip the (expensive) splitter entirely when there is nothing to split.
  if (!text) {
    return [];
  }

  const splitter = new TokenTextSplitter({
    encodingName,
    chunkSize,
    chunkOverlap,
  });

  const chunks = await splitter.splitText(text);

  // Only a positive returnSize trims the result; a negative slice index keeps
  // the trailing `returnSize` chunks (or everything when there are fewer).
  if (returnSize > 0) {
    return chunks.slice(-returnSize);
  }

  return chunks;
}

module.exports = tokenSplit;
📦 fix: npm warnings; chore: bump deprecated packages (#4707) * chore: bump langchain deps to address vulnerability warnings * chore: bump community package and install textsplitters package * fix: update expected result in tokenSplit tests for accuracy * chore: remove CodeSherpa tools * chore: remove E2B tools and loadToolSuite * chore: remove CodeBrew tool and update related references * chore: remove HumanTool and ChatTool, update tool references * chore: remove Zapier tool from manifest.json and update SerpAPI * chore: remove basic tools * chore: update import path for RecursiveCharacterTextSplitter * chore: update import path for DynamicStructuredTool * chore: remove extractionChain.js and update tool filtering logic * chore: npm audit fix * chore: bump google packages * chore: update DALL-E tool to DALL-E-3 and adjust authentication logic * ci: update message classes * chore: elliptic npm audit fix * chore: update CallbackManager import and remove deprecated tool handling logic * chore: imports order * chore: remove unused code --------- Co-authored-by: Max Sanna <max@maxsanna.com> 2024-11-12 18:51:32 -05:00			`const { TokenTextSplitter } = require('@langchain/textsplitters');`
feat: ConversationSummaryBufferMemory (#973) * refactor: pass model in message edit payload, use encoder in standalone util function * feat: add summaryBuffer helper * refactor(api/messages): use new countTokens helper and add auth middleware at top * wip: ConversationSummaryBufferMemory * refactor: move pre-generation helpers to prompts dir * chore: remove console log * chore: remove test as payload will no longer carry tokenCount * chore: update getMessagesWithinTokenLimit JSDoc * refactor: optimize getMessagesForConversation and also break on summary, feat(ci): getMessagesForConversation tests * refactor(getMessagesForConvo): count '00000000-0000-0000-0000-000000000000' as root message * chore: add newer model to token map * fix: condition was point to prop of array instead of message prop * refactor(BaseClient): use object for refineMessages param, rename 'summary' to 'summaryMessage', add previous_summary refactor(getMessagesWithinTokenLimit): replace text and tokenCount if should summarize, summary, and summaryTokenCount are present fix/refactor(handleContextStrategy): use the right comparison length for context diff, and replace payload first message when a summary is present * chore: log previous_summary if debugging * refactor(formatMessage): assume if role is defined that it's a valid value * refactor(getMessagesWithinTokenLimit): remove summary logic refactor(handleContextStrategy): add usePrevSummary logic in case only summary was pruned refactor(loadHistory): initial message query will return all ordered messages but keep track of the latest summary refactor(getMessagesForConversation): use object for single param, edit jsdoc, edit all files using the method refactor(ChatGPTClient): order messages before buildPrompt is called, TODO: add convoSumBuffMemory logic * fix: undefined handling and summarizing only when shouldRefineContext is true * chore(BaseClient): fix test results omitting system role for summaries and test edge case * chore: export 
summaryBuffer from index file * refactor(OpenAIClient/BaseClient): move refineMessages to subclass, implement LLM initialization for summaryBuffer * feat: add OPENAI_SUMMARIZE to enable summarizing, refactor: rename client prop 'shouldRefineContext' to 'shouldSummarize', change contextStrategy value to 'summarize' from 'refine' * refactor: rename refineMessages method to summarizeMessages for clarity * chore: clarify summary future intent in .env.example * refactor(initializeLLM): handle case for either 'model' or 'modelName' being passed * feat(gptPlugins): enable summarization for plugins * refactor(gptPlugins): utilize new initializeLLM method and formatting methods for messages, use payload array for currentMessages and assign pastMessages sooner * refactor(agents): use ConversationSummaryBufferMemory for both agent types * refactor(formatMessage): optimize original method for langchain, add helper function for langchain messages, add JSDocs and tests * refactor(summaryBuffer): add helper to createSummaryBufferMemory, and use new formatting helpers * fix: forgot to spread formatMessages also took opportunity to pluralize filename * refactor: pass memory to tools, namely openapi specs. 
not used and may never be used by new method but added for testing * ci(formatMessages): add more exhaustive checks for langchain messages * feat: add debug env var for OpenAI * chore: delete unnecessary comments * chore: add extra note about summary feature * fix: remove tokenCount from payload instructions * fix: test fail * fix: only pass instructions to payload when defined or not empty object * refactor: fromPromptMessages is deprecated, use renamed method fromMessages * refactor: use 'includes' instead of 'startsWith' for extended OpenRouter compatibility * fix(PluginsClient.buildPromptBody): handle undefined message strings * chore: log langchain titling error * feat: getModelMaxTokens helper * feat: tokenSplit helper * feat: summary prompts updated * fix: optimize _CUT_OFF_SUMMARIZER prompt * refactor(summaryBuffer): use custom summary prompt, allow prompt to be passed, pass humanPrefix and aiPrefix to memory, along with any future variables, rename messagesToRefine to context * fix(summaryBuffer): handle edge case where messagesToRefine exceeds summary context, refactor(BaseClient): allow custom maxContextTokens to be passed to getMessagesWithinTokenLimit, add defined check before unshifting summaryMessage, update shouldSummarize based on this refactor(OpenAIClient): use getModelMaxTokens, use cut-off message method for summary if no messages were left after pruning * fix(handleContextStrategy): handle case where incoming prompt is bigger than model context * chore: rename refinedContent to splitText * chore: remove unnecessary debug log 2023-09-26 21:02:28 -04:00
			`/**`
			`* Splits a given text by token chunks, based on the provided parameters for the TokenTextSplitter.`
			`* Note: limit or memoize use of this function as its calculation is expensive.`
			`*`
			`* @param {Object} obj - Configuration object for the text splitting operation.`
			`* @param {string} obj.text - The text to be split.`
			`* @param {string} [obj.encodingName='cl100k_base'] - Encoding name. Defaults to 'cl100k_base'.`
			`* @param {number} [obj.chunkSize=1] - The token size of each chunk. Defaults to 1.`
			`* @param {number} [obj.chunkOverlap=0] - The number of chunk elements to be overlapped between adjacent chunks. Defaults to 0.`
			`* @param {number} [obj.returnSize] - If specified and not 0, slices the return array from the end by this amount.`
			`*`
			`* @returns {Promise<Array>} Returns a promise that resolves to an array of text chunks.`
			`* If no text is provided, an empty array is returned.`
			`* If returnSize is specified and not 0, slices the return array from the end by returnSize.`
			`*`
			`* @async`
			`* @function tokenSplit`
			`*/`
			`async function tokenSplit({`
			`text,`
			`encodingName = 'cl100k_base',`
			`chunkSize = 1,`
			`chunkOverlap = 0,`
			`returnSize,`
			`}) {`
			`if (!text) {`
			`return [];`
			`}`

			`const splitter = new TokenTextSplitter({`
			`encodingName,`
			`chunkSize,`
			`chunkOverlap,`
			`});`

			`if (!returnSize) {`
			`return await splitter.splitText(text);`
			`}`

			`const splitText = await splitter.splitText(text);`

			`if (returnSize && returnSize > 0 && splitText.length > 0) {`
			`return splitText.slice(-Math.abs(returnSize));`
			`}`

			`return splitText;`
			`}`

			`module.exports = tokenSplit;`