LibreChat/api/app/clients/document/tokenSplit.js
Danny Avila 95201908e9
📦 fix: npm warnings; chore: bump deprecated packages (#4707)
* chore: bump langchain deps to address vulnerability warnings

* chore: bump community package and install textsplitters package

* fix: update expected result in tokenSplit tests for accuracy

* chore: remove CodeSherpa tools

* chore: remove E2B tools and loadToolSuite

* chore: remove CodeBrew tool and update related references

* chore: remove HumanTool and ChatTool, update tool references

* chore: remove Zapier tool from manifest.json and update SerpAPI

* chore: remove basic tools

* chore: update import path for RecursiveCharacterTextSplitter

* chore: update import path for DynamicStructuredTool

* chore: remove extractionChain.js and update tool filtering logic

* chore: npm audit fix

* chore: bump google packages

* chore: update DALL-E tool to DALL-E-3 and adjust authentication logic

* ci: update message classes

* chore: elliptic npm audit fix

* chore: update CallbackManager import and remove deprecated tool handling logic

* chore: imports order

* chore: remove unused code

---------

Co-authored-by: Max Sanna <max@maxsanna.com>
2024-11-12 18:51:32 -05:00

51 lines
1.6 KiB
JavaScript

const { TokenTextSplitter } = require('@langchain/textsplitters');
/**
* Splits a given text by token chunks, based on the provided parameters for the TokenTextSplitter.
* Note: limit or memoize use of this function as its calculation is expensive.
*
* @param {Object} obj - Configuration object for the text splitting operation.
* @param {string} obj.text - The text to be split.
* @param {string} [obj.encodingName='cl100k_base'] - Encoding name. Defaults to 'cl100k_base'.
* @param {number} [obj.chunkSize=1] - The token size of each chunk. Defaults to 1.
* @param {number} [obj.chunkOverlap=0] - The number of chunk elements to be overlapped between adjacent chunks. Defaults to 0.
* @param {number} [obj.returnSize] - If specified and not 0, slices the return array from the end by this amount.
*
* @returns {Promise<Array>} Returns a promise that resolves to an array of text chunks.
* If no text is provided, an empty array is returned.
* If returnSize is specified and not 0, slices the return array from the end by returnSize.
*
* @async
* @function tokenSplit
*/
async function tokenSplit({
text,
encodingName = 'cl100k_base',
chunkSize = 1,
chunkOverlap = 0,
returnSize,
}) {
if (!text) {
return [];
}
const splitter = new TokenTextSplitter({
encodingName,
chunkSize,
chunkOverlap,
});
if (!returnSize) {
return await splitter.splitText(text);
}
const splitText = await splitter.splitText(text);
if (returnSize && returnSize > 0 && splitText.length > 0) {
return splitText.slice(-Math.abs(returnSize));
}
return splitText;
}
module.exports = tokenSplit;