const axios = require('axios');
const getCustomConfig = require('~/server/services/Config/getCustomConfig');
const { getRandomVoiceId, createChunkProcessor, splitTextIntoChunks } = require('./streamAudio');
const { extractEnvVariable } = require('librechat-data-provider');
const { logger } = require('~/config');

/**
 * getProvider function
 * This function takes the ttsSchema object and returns the name of the provider
 * If more than one provider is set or no provider is set, it throws an error
 *
 * @param {Object} ttsSchema - The TTS schema containing the provider configuration
 * @returns {string} The name of the provider
 * @throws {Error} Throws an error if multiple providers are set or no provider is set
 */
function getProvider(ttsSchema) {
  if (!ttsSchema) {
    throw new Error(
      `No TTS schema is set. Did you configure TTS in the custom config (librechat.yaml)? https://www.librechat.ai/docs/configuration/stt_tts#tts`,
    );
  }
  const providers = Object.entries(ttsSchema).filter(([, value]) => Object.keys(value).length > 0);

  if (providers.length > 1) {
    throw new Error('Multiple providers are set. Please set only one provider.');
  } else if (providers.length === 0) {
    throw new Error('No provider is set. Please set a provider.');
  } else {
    return providers[0][0];
  }
}

/**
 * removeUndefined function
 * This function takes an object and removes all keys with undefined values
 * It also removes keys with empty objects as values
 *
 * @param {Object} obj - The object to be cleaned
 * @returns {void} This function does not return a value. It modifies the input object directly
 */
function removeUndefined(obj) {
  Object.keys(obj).forEach((key) => {
    if (obj[key] && typeof obj[key] === 'object') {
      removeUndefined(obj[key]);
      if (Object.keys(obj[key]).length === 0) {
        delete obj[key];
      }
    } else if (obj[key] === undefined) {
      delete obj[key];
    }
  });
}

/**
 * This function prepares the necessary data and headers for making a request to the OpenAI TTS
 * It uses the provided TTS schema, input text, and voice to create the request
 *
 * @param {TCustomConfig['tts']['openai']} ttsSchema - The TTS schema containing the OpenAI configuration
 * @param {string} input - The text to be converted to speech
 * @param {string} voice - The voice to be used for the speech
 *
 * @returns {Array} An array containing the URL for the API request, the data to be sent, and the headers for the request
 * If an error occurs, it throws an error with a message indicating that the selected voice is not available
 */
function openAIProvider(ttsSchema, input, voice) {
  const url = ttsSchema?.url || 'https://api.openai.com/v1/audio/speech';

  if (
    ttsSchema?.voices &&
    ttsSchema.voices.length > 0 &&
    !ttsSchema.voices.includes(voice) &&
    !ttsSchema.voices.includes('ALL')
  ) {
    throw new Error(`Voice ${voice} is not available.`);
  }

  let data = {
    input,
    model: ttsSchema?.model,
    voice: ttsSchema?.voices && ttsSchema.voices.length > 0 ? voice : undefined,
    backend: ttsSchema?.backend,
  };

  let headers = {
    'Content-Type': 'application/json',
    Authorization: 'Bearer ' + extractEnvVariable(ttsSchema?.apiKey),
  };

  [data, headers].forEach(removeUndefined);

  return [url, data, headers];
}
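/*
 * For orientation only: a minimal `librechat.yaml` TTS section that getProvider/openAIProvider
 * would accept might look like the sketch below. The exact keys and accepted values are governed
 * by the custom config schema (see the docs URL in getProvider); the values here are hypothetical.
 *
 *   tts:
 *     openai:
 *       apiKey: '${TTS_API_KEY}'
 *       model: 'tts-1'
 *       voices: ['alloy', 'echo', 'fable']
 *
 * With that schema, openAIProvider(ttsSchema, 'Hello world', 'alloy') returns roughly:
 *   [
 *     'https://api.openai.com/v1/audio/speech',
 *     { input: 'Hello world', model: 'tts-1', voice: 'alloy' },
 *     { 'Content-Type': 'application/json', Authorization: 'Bearer <resolved key>' },
 *   ]
 */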
/**
 * elevenLabsProvider function
 * This function prepares the necessary data and headers for making a request to the Eleven Labs TTS
 * It uses the provided TTS schema, input text, and voice to create the request
 *
 * @param {TCustomConfig['tts']['elevenLabs']} ttsSchema - The TTS schema containing the Eleven Labs configuration
 * @param {string} input - The text to be converted to speech
 * @param {string} voice - The voice to be used for the speech
 * @param {boolean} stream - Whether to stream the audio or not
 *
 * @returns {Array} An array containing the URL for the API request, the data to be sent, and the headers for the request
 * @throws {Error} Throws an error if the selected voice is not available
 */
function elevenLabsProvider(ttsSchema, input, voice, stream) {
  let url =
    ttsSchema?.url ||
    `https://api.elevenlabs.io/v1/text-to-speech/{voice_id}${stream ? '/stream' : ''}`;

  if (!ttsSchema?.voices.includes(voice) && !ttsSchema?.voices.includes('ALL')) {
    throw new Error(`Voice ${voice} is not available.`);
  }

  url = url.replace('{voice_id}', voice);

  let data = {
    model_id: ttsSchema?.model,
    text: input,
    // voice_id: voice,
    voice_settings: {
      similarity_boost: ttsSchema?.voice_settings?.similarity_boost,
      stability: ttsSchema?.voice_settings?.stability,
      style: ttsSchema?.voice_settings?.style,
      use_speaker_boost: ttsSchema?.voice_settings?.use_speaker_boost || undefined,
    },
    pronunciation_dictionary_locators: ttsSchema?.pronunciation_dictionary_locators,
  };

  let headers = {
    'Content-Type': 'application/json',
    'xi-api-key': extractEnvVariable(ttsSchema?.apiKey),
    Accept: 'audio/mpeg',
  };

  [data, headers].forEach(removeUndefined);

  return [url, data, headers];
}
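/*
 * Illustration (hypothetical voice id): with stream = true and voice = 'example-voice-id',
 * the default URL template resolves to
 *   https://api.elevenlabs.io/v1/text-to-speech/example-voice-id/stream
 * and the payload carries `text`, `model_id`, and any configured `voice_settings`.
 * removeUndefined drops `voice_settings` entirely when no settings are configured,
 * since an object whose keys were all undefined is deleted as an empty object.
 */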
/**
 * localAIProvider function
 * This function prepares the necessary data and headers for making a request to the LocalAI TTS
 * It uses the provided TTS schema, input text, and voice to create the request
 *
 * @param {TCustomConfig['tts']['localai']} ttsSchema - The TTS schema containing the LocalAI configuration
 * @param {string} input - The text to be converted to speech
 * @param {string} voice - The voice to be used for the speech
 *
 * @returns {Array} An array containing the URL for the API request, the data to be sent, and the headers for the request
 * @throws {Error} Throws an error if the selected voice is not available
 */
function localAIProvider(ttsSchema, input, voice) {
  let url = ttsSchema?.url;

  if (
    ttsSchema?.voices &&
    ttsSchema.voices.length > 0 &&
    !ttsSchema.voices.includes(voice) &&
    !ttsSchema.voices.includes('ALL')
  ) {
    throw new Error(`Voice ${voice} is not available.`);
  }

  let data = {
    input,
    model: ttsSchema?.voices && ttsSchema.voices.length > 0 ? voice : undefined,
    backend: ttsSchema?.backend,
  };

  let headers = {
    'Content-Type': 'application/json',
    Authorization: 'Bearer ' + extractEnvVariable(ttsSchema?.apiKey),
  };

  [data, headers].forEach(removeUndefined);

  if (extractEnvVariable(ttsSchema.apiKey) === '') {
    delete headers.Authorization;
  }

  return [url, data, headers];
}

/**
 * Returns the provider and its schema for use with TTS requests
 *
 * @param {TCustomConfig} customConfig
 * @returns {Promise<[string, TProviderSchema]>}
 */
async function getProviderSchema(customConfig) {
  const provider = getProvider(customConfig.tts);
  return [provider, customConfig.tts[provider]];
}

/**
 * Returns the voice to use for the TTS request, falling back to a random configured voice
 * when the requested voice is missing or not available
 *
 * @param {TProviderSchema} providerSchema
 * @param {string} requestVoice
 * @returns {Promise<string>}
 */
async function getVoice(providerSchema, requestVoice) {
  const voices = providerSchema.voices.filter((voice) => voice && voice.toUpperCase() !== 'ALL');
  let voice = requestVoice;
  if (!voice || !voices.includes(voice) || (voice.toUpperCase() === 'ALL' && voices.length > 1)) {
    voice = getRandomVoiceId(voices);
  }

  return voice;
}

/**
 * Sends a TTS request to the selected provider and returns the axios response
 *
 * @param {string} provider
 * @param {TProviderSchema} ttsSchema
 * @param {object} params
 * @param {string} params.voice
 * @param {string} params.input
 * @param {boolean} [params.stream]
 * @returns {Promise}
 */
async function ttsRequest(provider, ttsSchema, { input, voice, stream = true } = { stream: true }) {
  let [url, data, headers] = [];

  switch (provider) {
    case 'openai':
      [url, data, headers] = openAIProvider(ttsSchema, input, voice);
      break;
    case 'elevenlabs':
      [url, data, headers] = elevenLabsProvider(ttsSchema, input, voice, stream);
      break;
    case 'localai':
      [url, data, headers] = localAIProvider(ttsSchema, input, voice);
      break;
    default:
      throw new Error('Invalid provider');
  }

  if (stream) {
    return await axios.post(url, data, { headers, responseType: 'stream' });
  }

  return await axios.post(url, data, { headers, responseType: 'arraybuffer' });
}
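/*
 * Minimal usage sketch (hypothetical input values), assuming a valid custom config is present:
 *
 *   const customConfig = await getCustomConfig();
 *   const [provider, ttsSchema] = await getProviderSchema(customConfig);
 *   const voice = await getVoice(ttsSchema, 'alloy');
 *   const response = await ttsRequest(provider, ttsSchema, { input: 'Hello!', voice, stream: true });
 *   // response.data is a readable stream of audio bytes when stream is true,
 *   // or a buffer of audio bytes when stream is false.
 */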
/**
 * Handles a text-to-speech request. Extracts input and voice from the request, retrieves the TTS configuration,
 * and sends a request to the appropriate provider. The resulting audio data is sent in the response
 *
 * @param {Object} req - The request object, which should contain the input text and voice in its body
 * @param {Object} res - The response object, used to send the audio data or an error message
 *
 * @returns {Promise<void>} This function does not return a value. It sends the audio data or an error message in the response
 *
 * @throws {Error} Throws an error if the provider is invalid
 */
async function textToSpeech(req, res) {
  const { input } = req.body;

  if (!input) {
    return res.status(400).send('Missing text in request body');
  }

  const customConfig = await getCustomConfig();
  if (!customConfig) {
    return res.status(500).send('Custom config not found');
  }

  try {
    res.setHeader('Content-Type', 'audio/mpeg');
    const [provider, ttsSchema] = await getProviderSchema(customConfig);
    const voice = await getVoice(ttsSchema, req.body.voice);

    if (input.length < 4096) {
      const response = await ttsRequest(provider, ttsSchema, { input, voice });
      response.data.pipe(res);
      return;
    }

    const textChunks = splitTextIntoChunks(input, 1000);

    for (const chunk of textChunks) {
      try {
        const response = await ttsRequest(provider, ttsSchema, {
          voice,
          input: chunk.text,
          stream: true,
        });

        logger.debug(`[textToSpeech] user: ${req?.user?.id} | writing audio stream`);
        await new Promise((resolve) => {
          response.data.pipe(res, { end: chunk.isFinished });
          response.data.on('end', () => {
            resolve();
          });
        });

        if (chunk.isFinished) {
          break;
        }
      } catch (innerError) {
        logger.error('Error processing manual update:', chunk, innerError);
        if (!res.headersSent) {
          res.status(500).end();
        }
        return;
      }
    }

    if (!res.headersSent) {
      res.end();
    }
  } catch (error) {
    logger.error(
      'Error creating the audio stream. Suggestion: check your provider quota. Error:',
      error,
    );
    res.status(500).send('An error occurred');
  }
}

/**
 * Streams synthesized audio for a message as its text chunks become available.
 * Polls for new text chunks, sends each one to the TTS provider, and pipes the resulting
 * audio into the response until the final chunk is written or the client disconnects
 *
 * @param {Object} req - The request object, which should contain the messageId and voice in its body
 * @param {Object} res - The response object, used to send the audio data or an error message
 * @returns {Promise<void>}
 */
async function streamAudio(req, res) {
  res.setHeader('Content-Type', 'audio/mpeg');
  const customConfig = await getCustomConfig();
  if (!customConfig) {
    return res.status(500).send('Custom config not found');
  }

  const [provider, ttsSchema] = await getProviderSchema(customConfig);
  const voice = await getVoice(ttsSchema, req.body.voice);

  try {
    let shouldContinue = true;

    req.on('close', () => {
      logger.warn('[streamAudio] Audio Stream Request closed by client');
      shouldContinue = false;
    });

    const processChunks = createChunkProcessor(req.body.messageId);

    while (shouldContinue) {
      // example updates
      // const updates = [
      //   { text: 'This is a test.', isFinished: false },
      //   { text: 'This is only a test.', isFinished: false },
      //   { text: 'Your voice is like a combination of Fergie and Jesus!', isFinished: true },
      // ];

      const updates = await processChunks();
      if (typeof updates === 'string') {
        logger.error(`Error processing audio stream updates: ${JSON.stringify(updates)}`);
        res.status(500).end();
        return;
      }

      if (updates.length === 0) {
        await new Promise((resolve) => setTimeout(resolve, 1250));
        continue;
      }

      for (const update of updates) {
        try {
          const response = await ttsRequest(provider, ttsSchema, {
            voice,
            input: update.text,
            stream: true,
          });

          if (!shouldContinue) {
            break;
          }

          logger.debug(`[streamAudio] user: ${req?.user?.id} | writing audio stream`);
          await new Promise((resolve) => {
            response.data.pipe(res, { end: update.isFinished });
            response.data.on('end', () => {
              resolve();
            });
          });

          if (update.isFinished) {
            shouldContinue = false;
            break;
          }
        } catch (innerError) {
          logger.error('Error processing update:', update, innerError);
          if (!res.headersSent) {
            res.status(500).end();
          }
          return;
        }
      }

      if (!shouldContinue) {
        break;
      }
    }

    if (!res.headersSent) {
      res.end();
    }
  } catch (error) {
    logger.error('Failed to fetch audio:', error);
    if (!res.headersSent) {
      res.status(500).end();
    }
  }
}

module.exports = {
  textToSpeech,
  getProvider,
  streamAudio,
};
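/*
 * Hypothetical wiring sketch. The module path, route paths, and router setup below are
 * assumptions for illustration, not part of this module:
 *
 *   const express = require('express');
 *   const { textToSpeech, streamAudio } = require('./textToSpeech');
 *
 *   const router = express.Router();
 *   router.post('/speech', textToSpeech);       // one-shot synthesis of req.body.input
 *   router.post('/speech/stream', streamAudio); // incremental synthesis keyed by req.body.messageId
 */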