mirror of
https://github.com/danny-avila/LibreChat.git
synced 2025-12-17 08:50:15 +01:00
52 lines
1.6 KiB
JavaScript
52 lines
1.6 KiB
JavaScript
|
|
const { TokenTextSplitter } = require('langchain/text_splitter');
|
||
|
|
|
||
|
|
/**
|
||
|
|
* Splits a given text by token chunks, based on the provided parameters for the TokenTextSplitter.
|
||
|
|
* Note: limit or memoize use of this function as its calculation is expensive.
|
||
|
|
*
|
||
|
|
* @param {Object} obj - Configuration object for the text splitting operation.
|
||
|
|
* @param {string} obj.text - The text to be split.
|
||
|
|
* @param {string} [obj.encodingName='cl100k_base'] - Encoding name. Defaults to 'cl100k_base'.
|
||
|
|
* @param {number} [obj.chunkSize=1] - The token size of each chunk. Defaults to 1.
|
||
|
|
* @param {number} [obj.chunkOverlap=0] - The number of chunk elements to be overlapped between adjacent chunks. Defaults to 0.
|
||
|
|
* @param {number} [obj.returnSize] - If specified and not 0, slices the return array from the end by this amount.
|
||
|
|
*
|
||
|
|
* @returns {Promise<Array>} Returns a promise that resolves to an array of text chunks.
|
||
|
|
* If no text is provided, an empty array is returned.
|
||
|
|
* If returnSize is specified and not 0, slices the return array from the end by returnSize.
|
||
|
|
*
|
||
|
|
* @async
|
||
|
|
* @function tokenSplit
|
||
|
|
*/
|
||
|
|
async function tokenSplit({
|
||
|
|
text,
|
||
|
|
encodingName = 'cl100k_base',
|
||
|
|
chunkSize = 1,
|
||
|
|
chunkOverlap = 0,
|
||
|
|
returnSize,
|
||
|
|
}) {
|
||
|
|
if (!text) {
|
||
|
|
return [];
|
||
|
|
}
|
||
|
|
|
||
|
|
const splitter = new TokenTextSplitter({
|
||
|
|
encodingName,
|
||
|
|
chunkSize,
|
||
|
|
chunkOverlap,
|
||
|
|
});
|
||
|
|
|
||
|
|
if (!returnSize) {
|
||
|
|
return await splitter.splitText(text);
|
||
|
|
}
|
||
|
|
|
||
|
|
const splitText = await splitter.splitText(text);
|
||
|
|
|
||
|
|
if (returnSize && returnSize > 0 && splitText.length > 0) {
|
||
|
|
return splitText.slice(-Math.abs(returnSize));
|
||
|
|
}
|
||
|
|
|
||
|
|
return splitText;
|
||
|
|
}
|
||
|
|
|
||
|
|
module.exports = tokenSplit;
|