/*
 * Mirror of https://github.com/danny-avila/LibreChat.git
 * Synced 2025-12-17 08:50:15 +01:00
 *
 * - fix: voice setting for autoplayback TTS
 * - fix(useTextToSpeechExternal): resolve stateful playback issues and consolidate state logic
 * - refactor: initialize tts voice and provider schema once per request
 * - fix(tts): edge case, longer text inputs. TODO: use continuous stream for longer text inputs
 * - fix(tts): pause global audio on conversation change
 * - refactor: keyvMongo ban cache to allow db updates for unbanning, to prevent server restart
 * - chore: eslint fix
 * - refactor: make ban cache exclusively keyvMongo
 *
 * 412 lines, 13 KiB, JavaScript
 */
const axios = require('axios');
|
|
const getCustomConfig = require('~/server/services/Config/getCustomConfig');
|
|
const { getRandomVoiceId, createChunkProcessor, splitTextIntoChunks } = require('./streamAudio');
|
|
const { extractEnvVariable } = require('librechat-data-provider');
|
|
const { logger } = require('~/config');
|
|
|
|
/**
 * getProvider function
 * Determines which TTS provider is configured in the given TTS schema.
 * A provider counts as "set" when its entry is non-null and has at least one key.
 *
 * @param {Object} ttsSchema - The TTS schema containing the provider configuration
 * @returns {string} The name of the provider
 * @throws {Error} Throws an error if no schema is given, if multiple providers are set,
 * or if no provider is set
 */
function getProvider(ttsSchema) {
  if (!ttsSchema) {
    throw new Error(
      'No TTS schema is set. Did you configure TTS in the custom config (librechat.yaml)?\n\n# Example TTS configuration',
    );
  }

  // Skip null/undefined entries instead of handing them to Object.keys,
  // which would throw a TypeError and hide the real configuration problem.
  const providers = Object.entries(ttsSchema).filter(
    ([, value]) => value != null && Object.keys(value).length > 0,
  );

  if (providers.length > 1) {
    throw new Error('Multiple providers are set. Please set only one provider.');
  }

  if (providers.length === 0) {
    throw new Error('No provider is set. Please set a provider.');
  }

  // Each entry is a [name, settings] pair; only the name is returned.
  return providers[0][0];
}
|
|
|
|
/**
 * removeUndefined function
 * Recursively deletes every key whose value is `undefined`, and every key whose
 * value is an object that ends up with no keys of its own after cleaning.
 * Other falsy values (`null`, `0`, `''`, `false`) are kept.
 * Arrays are objects as well, so they are traversed the same way.
 *
 * @param {Object} obj - The object to be cleaned; `null`/`undefined` is ignored
 * @returns {void} This function does not return a value. It modifies the input object directly
 */
function removeUndefined(obj) {
  // Nothing to clean; Object.keys would throw on null/undefined.
  if (obj == null) {
    return;
  }

  for (const key of Object.keys(obj)) {
    const value = obj[key];

    if (value && typeof value === 'object') {
      // Clean the nested object first, then drop it if nothing is left.
      removeUndefined(value);
      if (Object.keys(value).length === 0) {
        delete obj[key];
      }
    } else if (value === undefined) {
      delete obj[key];
    }
  }
}
|
|
|
|
/**
 * openAIProvider function
 * This function prepares the URL, request body and headers for a request to the OpenAI TTS.
 * It uses the provided TTS schema, input text, and voice to create the request.
 *
 * The voice is validated only when the schema lists voices; a list containing
 * 'ALL' accepts any voice. When no voices are listed, no `voice` field is sent.
 *
 * @param {TCustomConfig['tts']['openai']} ttsSchema - The TTS schema containing the OpenAI configuration
 * @param {string} input - The text to be converted to speech
 * @param {string} voice - The voice to be used for the speech
 *
 * @returns {Array} An array containing the URL for the API request, the data to be sent, and the headers for the request
 * @throws {Error} Throws an error if the selected voice is not available
 */
function openAIProvider(ttsSchema, input, voice) {
  const url = ttsSchema?.url || 'https://api.openai.com/v1/audio/speech';

  const voices = ttsSchema?.voices;
  const hasVoices = Boolean(voices) && voices.length > 0;

  if (hasVoices && !voices.includes(voice) && !voices.includes('ALL')) {
    throw new Error(`Voice ${voice} is not available.`);
  }

  const data = {
    input,
    model: ttsSchema?.model,
    voice: hasVoices ? voice : undefined,
    backend: ttsSchema?.backend,
  };

  const headers = {
    'Content-Type': 'application/json',
    Authorization: `Bearer ${extractEnvVariable(ttsSchema?.apiKey)}`,
  };

  // Strip unset optional fields so they are not part of the request.
  [data, headers].forEach(removeUndefined);

  return [url, data, headers];
}
|
|
|
|
/**
 * elevenLabsProvider function
 * This function prepares the URL, request body and headers for a request to the Eleven Labs TTS.
 * It uses the provided TTS schema, input text, and voice to create the request.
 *
 * The voice must be listed in the schema's `voices`, unless that list contains 'ALL'.
 * A schema without `voices` therefore rejects every voice.
 *
 * @param {TCustomConfig['tts']['elevenLabs']} ttsSchema - The TTS schema containing the Eleven Labs configuration
 * @param {string} input - The text to be converted to speech
 * @param {string} voice - The voice to be used for the speech
 * @param {boolean} stream - Whether to stream the audio or not
 *
 * @returns {Array} An array containing the URL for the API request, the data to be sent, and the headers for the request
 * @throws {Error} Throws an error if the selected voice is not available
 */
function elevenLabsProvider(ttsSchema, input, voice, stream) {
  // Fall back to an empty list so a missing `voices` entry produces the
  // documented error instead of a TypeError from calling `.includes` on undefined.
  const voices = ttsSchema?.voices ?? [];

  if (!voices.includes(voice) && !voices.includes('ALL')) {
    throw new Error(`Voice ${voice} is not available.`);
  }

  const urlTemplate =
    ttsSchema?.url ||
    `https://api.elevenlabs.io/v1/text-to-speech/{voice_id}${stream ? '/stream' : ''}`;

  // With 'ALL' configured the voice is not checked against a list, so it is
  // encoded before being placed in the URL path.
  const url = urlTemplate.replace('{voice_id}', encodeURIComponent(voice));

  const voiceSettings = ttsSchema?.voice_settings;

  const data = {
    model_id: ttsSchema?.model,
    text: input,
    voice_settings: {
      similarity_boost: voiceSettings?.similarity_boost,
      stability: voiceSettings?.stability,
      style: voiceSettings?.style,
      // NOTE(review): `|| undefined` drops every falsy value, so an explicit
      // `false` is never sent. Confirm this is intended.
      use_speaker_boost: voiceSettings?.use_speaker_boost || undefined,
    },
    pronunciation_dictionary_locators: ttsSchema?.pronunciation_dictionary_locators,
  };

  const headers = {
    'Content-Type': 'application/json',
    'xi-api-key': extractEnvVariable(ttsSchema?.apiKey),
    Accept: 'audio/mpeg',
  };

  // Strip unset optional fields (including an empty `voice_settings` object).
  [data, headers].forEach(removeUndefined);

  return [url, data, headers];
}
|
|
|
|
/**
 * localAIProvider function
 * This function prepares the URL, request body and headers for a request to the LocalAI TTS.
 * It uses the provided TTS schema, input text, and voice to create the request.
 *
 * The voice is validated only when the schema lists voices; a list containing
 * 'ALL' accepts any voice. The selected voice is sent in the `model` field.
 * The `Authorization` header is only added when an API key resolves to a non-empty value.
 *
 * @param {TCustomConfig['tts']['localai']} ttsSchema - The TTS schema containing the LocalAI configuration
 * @param {string} input - The text to be converted to speech
 * @param {string} voice - The voice to be used for the speech
 *
 * @returns {Array} An array containing the URL for the API request, the data to be sent, and the headers for the request
 * @throws {Error} Throws an error if the selected voice is not available
 */
function localAIProvider(ttsSchema, input, voice) {
  // No default endpoint is defined here; the URL comes from the schema as-is.
  const url = ttsSchema?.url;

  const voices = ttsSchema?.voices;
  const hasVoices = Boolean(voices) && voices.length > 0;

  if (hasVoices && !voices.includes(voice) && !voices.includes('ALL')) {
    throw new Error(`Voice ${voice} is not available.`);
  }

  const data = {
    input,
    model: hasVoices ? voice : undefined,
    backend: ttsSchema?.backend,
  };

  const headers = {
    'Content-Type': 'application/json',
  };

  // Resolve the key only when one is configured, and omit the header entirely
  // for a missing or empty key rather than sending a meaningless bearer token.
  const apiKey = ttsSchema?.apiKey ? extractEnvVariable(ttsSchema.apiKey) : '';
  if (apiKey) {
    headers.Authorization = `Bearer ${apiKey}`;
  }

  // Strip unset optional fields so they are not part of the request.
  [data, headers].forEach(removeUndefined);

  return [url, data, headers];
}
|
|
|
|
/**
 * getProviderSchema function
 * Returns the configured provider name and its schema for use with TTS requests.
 * The provider is resolved by `getProvider` from the `tts` section of the config,
 * so a missing config or `tts` section surfaces as that function's error.
 *
 * @param {TCustomConfig} customConfig - The custom config holding the `tts` section
 * @returns {Promise<[string, TProviderSchema]>} The provider name and the matching `tts[provider]` entry
 * @throws {Error} Rejects with the error thrown by `getProvider` when no single provider can be resolved
 */
async function getProviderSchema(customConfig) {
  // Optional chaining lets getProvider report the missing schema itself
  // instead of failing here with a TypeError on a null config.
  const provider = getProvider(customConfig?.tts);
  return [provider, customConfig.tts[provider]];
}
|
|
|
|
/**
 * getVoice function
 * Returns the voice to use for the TTS request.
 * The requested voice is kept when it is one of the schema's concrete voices
 * (entries other than 'ALL', compared case-insensitively). Otherwise the voice is
 * picked by `getRandomVoiceId` from those concrete voices.
 *
 * @param {TProviderSchema} providerSchema - The provider schema; `voices` may be absent
 * @param {string} requestVoice - The voice asked for by the request, if any
 * @returns {Promise<string>} The voice to use
 */
async function getVoice(providerSchema, requestVoice) {
  // `voices` is optional for some providers, so default to an empty list.
  const configuredVoices = providerSchema?.voices ?? [];
  const voices = configuredVoices.filter((voice) => voice && voice.toUpperCase() !== 'ALL');

  // A requested 'ALL' can never match here because 'ALL' entries were filtered out above.
  if (requestVoice && voices.includes(requestVoice)) {
    return requestVoice;
  }

  return getRandomVoiceId(voices);
}
|
|
|
|
/**
 * ttsRequest function
 * Builds the request for the given provider and posts it with axios.
 *
 * @param {string} provider - One of 'openai', 'elevenlabs' or 'localai'
 * @param {TProviderSchema} ttsSchema - The schema of the selected provider
 * @param {object} params
 * @param {string} params.voice - The voice to be used for the speech
 * @param {string} params.input - The text to be converted to speech
 * @param {boolean} [params.stream] - Request a 'stream' response when true (default), an 'arraybuffer' response otherwise
 * @returns {Promise<Object>} The response returned by `axios.post`; callers in this file read the audio from `response.data`
 * @throws {Error} Throws 'Invalid provider' for an unknown provider, or whatever the provider builder throws
 */
async function ttsRequest(provider, ttsSchema, { input, voice, stream = true } = { stream: true }) {
  let request;

  switch (provider) {
    case 'openai':
      request = openAIProvider(ttsSchema, input, voice);
      break;
    case 'elevenlabs':
      request = elevenLabsProvider(ttsSchema, input, voice, stream);
      break;
    case 'localai':
      request = localAIProvider(ttsSchema, input, voice);
      break;
    default:
      throw new Error('Invalid provider');
  }

  const [url, data, headers] = request;
  const responseType = stream ? 'stream' : 'arraybuffer';

  return axios.post(url, data, { headers, responseType });
}
|
|
|
|
/**
 * Handles a text-to-speech request. Extracts input and voice from the request, retrieves the TTS configuration,
 * and sends a request to the appropriate provider. The resulting audio data is piped into the response.
 *
 * Inputs shorter than 4096 characters are sent to the provider in a single request. Longer inputs
 * are split with `splitTextIntoChunks(input, 1000)` and the chunks are requested and piped one after another.
 *
 * @param {Object} req - The request object, which should contain the input text and voice in its body
 * @param {Object} res - The response object, used to send the audio data or an error message
 *
 * @returns {Promise<void>} This function does not return a meaningful value. It sends the audio data or an error message in the response
 */
async function textToSpeech(req, res) {
  const { input } = req.body;

  if (!input) {
    return res.status(400).send('Missing text in request body');
  }

  const customConfig = await getCustomConfig();
  if (!customConfig) {
    // Return here; otherwise a second response would be attempted further down.
    return res.status(500).send('Custom config not found');
  }

  /**
   * Pipes one upstream audio stream into the response.
   * Resolves `true` when the upstream ended, `false` when the response was closed
   * first, and rejects when the upstream emits an error.
   */
  const pipeAudio = (audio, end) =>
    new Promise((resolve, reject) => {
      if (res.writableEnded || res.destroyed) {
        audio.destroy();
        resolve(false);
        return;
      }

      const settle = (done, value) => {
        audio.off('end', onEnd);
        audio.off('error', onError);
        res.off('close', onClose);
        done(value);
      };
      const onEnd = () => settle(resolve, true);
      const onError = (error) => settle(reject, error);
      const onClose = () => {
        // The client is gone; release the upstream instead of waiting for an 'end' that may never come.
        audio.destroy();
        settle(resolve, false);
      };

      audio.once('end', onEnd);
      audio.once('error', onError);
      res.once('close', onClose);

      // The audio content type is only set once there is audio to send,
      // so error responses are not labelled as audio.
      if (!res.headersSent) {
        res.setHeader('Content-Type', 'audio/mpeg');
      }
      audio.pipe(res, { end });
    });

  /** Ends the response with a 500 status, or just closes it if streaming already started. */
  const abortResponse = () => {
    if (!res.headersSent) {
      res.status(500).end();
    } else if (!res.writableEnded) {
      res.end();
    }
  };

  try {
    const [provider, ttsSchema] = await getProviderSchema(customConfig);
    const voice = await getVoice(ttsSchema, req.body.voice);

    if (input.length < 4096) {
      const response = await ttsRequest(provider, ttsSchema, { input, voice });
      await pipeAudio(response.data, true);
      return;
    }

    const textChunks = splitTextIntoChunks(input, 1000);

    for (const chunk of textChunks) {
      try {
        const response = await ttsRequest(provider, ttsSchema, {
          voice,
          input: chunk.text,
          stream: true,
        });

        logger.debug(`[textToSpeech] user: ${req?.user?.id} | writing audio stream`);
        // Only the chunk flagged as finished is allowed to end the response.
        const delivered = await pipeAudio(response.data, chunk.isFinished);

        if (!delivered || chunk.isFinished) {
          break;
        }
      } catch (innerError) {
        logger.error('Error processing update:', chunk, innerError);
        abortResponse();
        return;
      }
    }

    // `headersSent` is true as soon as audio was written, so it cannot tell
    // whether the response still needs to be ended; `writableEnded` can.
    if (!res.writableEnded) {
      res.end();
    }
  } catch (error) {
    logger.error('An error occurred while creating the audio stream:', error);
    if (!res.headersSent) {
      res.status(500).send('An error occurred');
    } else if (!res.writableEnded) {
      res.end();
    }
  }
}
|
|
|
|
/**
 * Streams TTS audio for a message while its text is still being produced.
 * Repeatedly asks the chunk processor created by `createChunkProcessor(req.body.messageId)` for
 * updates, requests audio for each update's `text`, and pipes it into the response.
 * Streaming stops when an update has `isFinished` set, when the request emits 'close',
 * or when an error occurs. An empty update list is retried after a 1250 ms pause, and a
 * string result from the processor is treated as an error.
 *
 * @param {Object} req - The request object; `body.messageId` and `body.voice` are read from it
 * @param {Object} res - The response object, used to send the audio data or an error status
 *
 * @returns {Promise<void>} This function does not return a meaningful value. It writes the audio or an error status to the response
 */
async function streamAudio(req, res) {
  const customConfig = await getCustomConfig();
  if (!customConfig) {
    return res.status(500).send('Custom config not found');
  }

  /**
   * Pipes one upstream audio stream into the response.
   * Resolves `true` when the upstream ended, `false` when the response was closed
   * first, and rejects when the upstream emits an error.
   */
  const pipeAudio = (audio, end) =>
    new Promise((resolve, reject) => {
      if (res.writableEnded || res.destroyed) {
        audio.destroy();
        resolve(false);
        return;
      }

      const settle = (done, value) => {
        audio.off('end', onEnd);
        audio.off('error', onError);
        res.off('close', onClose);
        done(value);
      };
      const onEnd = () => settle(resolve, true);
      const onError = (error) => settle(reject, error);
      const onClose = () => {
        // The client is gone; release the upstream instead of waiting for an 'end' that may never come.
        audio.destroy();
        settle(resolve, false);
      };

      audio.once('end', onEnd);
      audio.once('error', onError);
      res.once('close', onClose);

      // The audio content type is only set once there is audio to send,
      // so error responses are not labelled as audio.
      if (!res.headersSent) {
        res.setHeader('Content-Type', 'audio/mpeg');
      }
      audio.pipe(res, { end });
    });

  /** Ends the response with a 500 status, or just closes it if streaming already started. */
  const abortResponse = () => {
    if (!res.headersSent) {
      res.status(500).end();
    } else if (!res.writableEnded) {
      res.end();
    }
  };

  try {
    // Resolved inside the try block so a configuration error becomes a 500
    // response instead of an unhandled rejection.
    const [provider, ttsSchema] = await getProviderSchema(customConfig);
    const voice = await getVoice(ttsSchema, req.body.voice);

    let shouldContinue = true;

    req.on('close', () => {
      logger.warn('[streamAudio] Audio Stream Request closed by client');
      shouldContinue = false;
    });

    const processChunks = createChunkProcessor(req.body.messageId);

    while (shouldContinue) {
      // Each update is used as `{ text, isFinished }`, e.g.
      // { text: 'This is a test.', isFinished: false }
      const updates = await processChunks();
      if (typeof updates === 'string') {
        logger.error(`Error processing audio stream updates: ${JSON.stringify(updates)}`);
        abortResponse();
        return;
      }

      if (updates.length === 0) {
        // Nothing new yet; wait before polling again.
        await new Promise((resolve) => setTimeout(resolve, 1250));
        continue;
      }

      for (const update of updates) {
        try {
          const response = await ttsRequest(provider, ttsSchema, {
            voice,
            input: update.text,
            stream: true,
          });

          if (!shouldContinue) {
            // The request closed while the audio was being fetched; drop the unused stream.
            response.data.destroy();
            break;
          }

          logger.debug(`[streamAudio] user: ${req?.user?.id} | writing audio stream`);
          // Only the update flagged as finished is allowed to end the response.
          const delivered = await pipeAudio(response.data, update.isFinished);

          if (!delivered || update.isFinished) {
            shouldContinue = false;
            break;
          }
        } catch (innerError) {
          logger.error('Error processing update:', update, innerError);
          abortResponse();
          return;
        }
      }
    }

    // `headersSent` is true as soon as audio was written, so it cannot tell
    // whether the response still needs to be ended; `writableEnded` can.
    if (!res.writableEnded) {
      res.end();
    }
  } catch (error) {
    logger.error('Failed to fetch audio:', error);
    abortResponse();
  }
}
|
|
|
|
module.exports = {
|
|
textToSpeech,
|
|
getProvider,
|
|
streamAudio,
|
|
};
|