mirror of https://github.com/danny-avila/LibreChat.git
synced 2025-12-17 08:50:15 +01:00
* feat: update useTextToSpeech and useSpeechToText hooks to support external audio endpoints. This commit updates the useTextToSpeech and useSpeechToText hooks in the Input directory to support external audio endpoints. It introduces the useGetExternalTextToSpeech and useGetExternalSpeechToText hooks, which determine whether the audio endpoints should be set to 'browser' or 'external' based on the value of the endpointTTS and endpointSTT Recoil states. The useTextToSpeech and useSpeechToText hooks now use these new hooks to decide whether to use external audio endpoints.
* feat: add userSelect style to ConversationModeSwitch label
* fix: remove unused updateTokenWebsocket function and import. The updateTokenWebsocket function and its import are no longer used in the OpenAIClient module; this commit removes them to clean up the codebase.
* feat: support external audio endpoints in useTextToSpeech and useSpeechToText hooks (same change as described in the first entry)
* feat: rename AutomaticPlayback component to AutomaticPlaybackSwitch; tests: add AutomaticPlaybackSwitch.spec. The new name better reflects the purpose of the component and aligns with the naming convention used in the codebase.
* feat: update useSpeechToText hook to include interimTranscript. This commit updates the useSpeechToText hook used in client/src/components/Chat/Input/AudioRecorder.tsx to include the interimTranscript state, which allows real-time display of the speech-to-text transcription while the user is still speaking. The interimTranscript is now used to update the text area value during recording.
* feat: add customConfigSpeech API endpoint for retrieving the custom speech configuration
* feat: update store vars; fix: getCustomConfigSpeech
* fix: client tests, removed unused import
* feat: update useCustomConfigSpeechQuery to return an array of custom speech configurations. This commit modifies the useCustomConfigSpeechQuery function in client/src/data-provider/queries.ts to return an array of custom speech configurations instead of a single object, which allows better handling and manipulation of the data in the application.
* refactor: update variable name in speechTab schema
* refactor: remove unused and nested code
* fix: use recoilState
* refactor: update Speech component to use useCallback for setting settings
* fix: tests
* feat: ensure that the settings don't change after modifying them through the UI
* chore: remove comment
* fix: handle errors gracefully in getCustomConfigSpeech and getVoices endpoints
* fix: backend tests
* fix: invalid custom config logging
* chore: add back custom config info logging
* chore: revert loadCustomConfig spec

---------

Co-authored-by: Danny Avila <danny@librechat.ai>
416 lines
13 KiB
JavaScript
const axios = require('axios');
const getCustomConfig = require('~/server/services/Config/getCustomConfig');
const { getRandomVoiceId, createChunkProcessor, splitTextIntoChunks } = require('./streamAudio');
const { extractEnvVariable } = require('librechat-data-provider');
const { logger } = require('~/config');

/**
 * getProvider function
 * This function takes the ttsSchema object and returns the name of the configured provider.
 * If more than one provider is set, or none is set, it throws an error.
 *
 * @param {Object} ttsSchema - The TTS schema containing the provider configuration
 * @returns {string} The name of the provider
 * @throws {Error} Throws an error if multiple providers are set or no provider is set
 */
function getProvider(ttsSchema) {
  if (!ttsSchema) {
    throw new Error(`No TTS schema is set. Did you configure TTS in the custom config (librechat.yaml)?

    https://www.librechat.ai/docs/configuration/stt_tts#tts`);
  }
  const providers = Object.entries(ttsSchema).filter(([, value]) => Object.keys(value).length > 0);

  if (providers.length > 1) {
    throw new Error('Multiple providers are set. Please set only one provider.');
  } else if (providers.length === 0) {
    throw new Error('No provider is set. Please set a provider.');
  } else {
    return providers[0][0];
  }
}

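// A quick illustration of the selection rule above (hypothetical config shape, not from the
// repo): only entries whose value has at least one key count as a configured provider.
//   getProvider({ openai: { apiKey: '${TTS_API_KEY}' }, elevenlabs: {} }); // => 'openai'
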
/**
 * removeUndefined function
 * This function takes an object and removes all keys with undefined values
 * It also removes keys with empty objects as values
 *
 * @param {Object} obj - The object to be cleaned
 * @returns {void} This function does not return a value. It modifies the input object directly
 */
function removeUndefined(obj) {
  Object.keys(obj).forEach((key) => {
    if (obj[key] && typeof obj[key] === 'object') {
      removeUndefined(obj[key]);
      if (Object.keys(obj[key]).length === 0) {
        delete obj[key];
      }
    } else if (obj[key] === undefined) {
      delete obj[key];
    }
  });
}

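// Example of the in-place cleanup (illustrative values only):
//   const payload = { input: 'hi', model: undefined, voice_settings: { style: undefined } };
//   removeUndefined(payload);
//   // payload is now { input: 'hi' }: the undefined key and the emptied nested object are gone
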
/**
 * This function prepares the necessary data and headers for making a request to the OpenAI TTS
 * It uses the provided TTS schema, input text, and voice to create the request
 *
 * @param {TCustomConfig['tts']['openai']} ttsSchema - The TTS schema containing the OpenAI configuration
 * @param {string} input - The text to be converted to speech
 * @param {string} voice - The voice to be used for the speech
 *
 * @returns {Array} An array containing the URL for the API request, the data to be sent, and the headers for the request
 * @throws {Error} Throws an error if the selected voice is not available
 */
function openAIProvider(ttsSchema, input, voice) {
  const url = ttsSchema?.url || 'https://api.openai.com/v1/audio/speech';

  if (
    ttsSchema?.voices &&
    ttsSchema.voices.length > 0 &&
    !ttsSchema.voices.includes(voice) &&
    !ttsSchema.voices.includes('ALL')
  ) {
    throw new Error(`Voice ${voice} is not available.`);
  }

  let data = {
    input,
    model: ttsSchema?.model,
    voice: ttsSchema?.voices && ttsSchema.voices.length > 0 ? voice : undefined,
    backend: ttsSchema?.backend,
  };

  let headers = {
    'Content-Type': 'application/json',
    Authorization: 'Bearer ' + extractEnvVariable(ttsSchema?.apiKey),
  };

  [data, headers].forEach(removeUndefined);

  return [url, data, headers];
}

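// Sketch of the returned tuple for a hypothetical schema (values for illustration only):
//   openAIProvider({ apiKey: '${TTS_API_KEY}', model: 'tts-1', voices: ['alloy'] }, 'Hello', 'alloy')
//   // => ['https://api.openai.com/v1/audio/speech',
//   //     { input: 'Hello', model: 'tts-1', voice: 'alloy' },
//   //     { 'Content-Type': 'application/json', Authorization: 'Bearer <resolved TTS_API_KEY>' }]
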
/**
 * elevenLabsProvider function
 * This function prepares the necessary data and headers for making a request to the Eleven Labs TTS
 * It uses the provided TTS schema, input text, and voice to create the request
 *
 * @param {TCustomConfig['tts']['elevenLabs']} ttsSchema - The TTS schema containing the Eleven Labs configuration
 * @param {string} input - The text to be converted to speech
 * @param {string} voice - The voice to be used for the speech
 * @param {boolean} stream - Whether to stream the audio or not
 *
 * @returns {Array} An array containing the URL for the API request, the data to be sent, and the headers for the request
 * @throws {Error} Throws an error if the selected voice is not available
 */
function elevenLabsProvider(ttsSchema, input, voice, stream) {
  let url =
    ttsSchema?.url ||
    `https://api.elevenlabs.io/v1/text-to-speech/{voice_id}${stream ? '/stream' : ''}`;

  // Optional chaining on `includes` avoids a TypeError when no voices are configured
  if (!ttsSchema?.voices?.includes(voice) && !ttsSchema?.voices?.includes('ALL')) {
    throw new Error(`Voice ${voice} is not available.`);
  }

  url = url.replace('{voice_id}', voice);

  let data = {
    model_id: ttsSchema?.model,
    text: input,
    // voice_id: voice,
    voice_settings: {
      similarity_boost: ttsSchema?.voice_settings?.similarity_boost,
      stability: ttsSchema?.voice_settings?.stability,
      style: ttsSchema?.voice_settings?.style,
      use_speaker_boost: ttsSchema?.voice_settings?.use_speaker_boost || undefined,
    },
    pronunciation_dictionary_locators: ttsSchema?.pronunciation_dictionary_locators,
  };

  let headers = {
    'Content-Type': 'application/json',
    'xi-api-key': extractEnvVariable(ttsSchema?.apiKey),
    Accept: 'audio/mpeg',
  };

  [data, headers].forEach(removeUndefined);

  return [url, data, headers];
}

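// Illustration of the URL templating (hypothetical voice id): with voice 'myVoiceId' and
// stream=true, the default URL resolves to
//   'https://api.elevenlabs.io/v1/text-to-speech/myVoiceId/stream'
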
/**
 * localAIProvider function
 * This function prepares the necessary data and headers for making a request to the LocalAI TTS
 * It uses the provided TTS schema, input text, and voice to create the request
 *
 * @param {TCustomConfig['tts']['localai']} ttsSchema - The TTS schema containing the LocalAI configuration
 * @param {string} input - The text to be converted to speech
 * @param {string} voice - The voice to be used for the speech
 *
 * @returns {Array} An array containing the URL for the API request, the data to be sent, and the headers for the request
 * @throws {Error} Throws an error if the selected voice is not available
 */
function localAIProvider(ttsSchema, input, voice) {
  let url = ttsSchema?.url;

  if (
    ttsSchema?.voices &&
    ttsSchema.voices.length > 0 &&
    !ttsSchema.voices.includes(voice) &&
    !ttsSchema.voices.includes('ALL')
  ) {
    throw new Error(`Voice ${voice} is not available.`);
  }

  let data = {
    input,
    model: ttsSchema?.voices && ttsSchema.voices.length > 0 ? voice : undefined,
    backend: ttsSchema?.backend,
  };

  let headers = {
    'Content-Type': 'application/json',
    Authorization: 'Bearer ' + extractEnvVariable(ttsSchema?.apiKey),
  };

  [data, headers].forEach(removeUndefined);

  // LocalAI instances often run without auth; drop the header when no key resolves
  if (extractEnvVariable(ttsSchema?.apiKey) === '') {
    delete headers.Authorization;
  }

  return [url, data, headers];
}

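// A hypothetical librechat.yaml fragment that would route to this provider; the shape is
// inferred from the fields read above (url, apiKey, voices, backend), not a documented example:
//   speech:
//     tts:
//       localai:
//         url: 'http://localhost:8080/tts'
//         apiKey: '${LOCALAI_API_KEY}'
//         voices: ['en-us-amy-low']
//         backend: 'piper'
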
/**
 * Returns the provider name and its schema for use with TTS requests
 * @param {TCustomConfig} customConfig
 * @returns {Promise<[string, TProviderSchema]>}
 */
async function getProviderSchema(customConfig) {
  const provider = getProvider(customConfig.speech.tts);
  return [provider, customConfig.speech.tts[provider]];
}

/**
 * Resolves the voice to use for the TTS request, falling back to a random
 * configured voice when the requested one is missing or invalid
 * @param {TProviderSchema} providerSchema
 * @param {string} requestVoice
 * @returns {Promise<string>}
 */
async function getVoice(providerSchema, requestVoice) {
  // Guard against providers configured without a `voices` list
  const voices = (providerSchema?.voices ?? []).filter(
    (voice) => voice && voice.toUpperCase() !== 'ALL',
  );
  let voice = requestVoice;
  if (!voice || !voices.includes(voice) || (voice.toUpperCase() === 'ALL' && voices.length > 1)) {
    voice = getRandomVoiceId(voices);
  }

  return voice;
}

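// Resolution examples (illustrative voices), given providerSchema.voices = ['alloy', 'echo', 'ALL']:
//   await getVoice(schema, 'echo'); // => 'echo' (a valid request passes through)
//   await getVoice(schema, 'nova'); // => random of 'alloy'/'echo' (unknown voice falls back)
//   await getVoice(schema, 'ALL');  // => random of 'alloy'/'echo' ('ALL' is never used literally)
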
/**
 * Dispatches a TTS request to the configured provider and returns the axios response,
 * with `response.data` as a stream (the default) or an ArrayBuffer
 * @param {string} provider
 * @param {TProviderSchema} ttsSchema
 * @param {object} params
 * @param {string} params.voice
 * @param {string} params.input
 * @param {boolean} [params.stream]
 * @returns {Promise<import('axios').AxiosResponse>}
 */
async function ttsRequest(provider, ttsSchema, { input, voice, stream = true } = {}) {
  let url, data, headers;
  switch (provider) {
    case 'openai':
      [url, data, headers] = openAIProvider(ttsSchema, input, voice);
      break;
    case 'elevenlabs':
      [url, data, headers] = elevenLabsProvider(ttsSchema, input, voice, stream);
      break;
    case 'localai':
      [url, data, headers] = localAIProvider(ttsSchema, input, voice);
      break;
    default:
      throw new Error('Invalid provider');
  }

  if (stream) {
    return await axios.post(url, data, { headers, responseType: 'stream' });
  }

  return await axios.post(url, data, { headers, responseType: 'arraybuffer' });
}

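// Typical call, mirroring how the handlers below use it:
//   const response = await ttsRequest(provider, ttsSchema, { input, voice });
//   response.data.pipe(res); // stream mode (the default): response.data is a readable stream
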
/**
 * Handles a text-to-speech request. Extracts input and voice from the request, retrieves the TTS configuration,
 * and sends a request to the appropriate provider. The resulting audio data is sent in the response
 *
 * @param {Object} req - The request object, which should contain the input text and voice in its body
 * @param {Object} res - The response object, used to send the audio data or an error message
 *
 * @returns {Promise<void>} This function does not return a value. It sends the audio data or an error message in the response
 *
 * @throws {Error} Throws an error if the provider is invalid
 */
async function textToSpeech(req, res) {
  const { input } = req.body;

  if (!input) {
    return res.status(400).send('Missing text in request body');
  }

  const customConfig = await getCustomConfig();
  if (!customConfig) {
    // `return` prevents falling through to the TTS request after the 500 is sent
    return res.status(500).send('Custom config not found');
  }

  try {
    res.setHeader('Content-Type', 'audio/mpeg');
    const [provider, ttsSchema] = await getProviderSchema(customConfig);
    const voice = await getVoice(ttsSchema, req.body.voice);
    if (input.length < 4096) {
      const response = await ttsRequest(provider, ttsSchema, { input, voice });
      response.data.pipe(res);
      return;
    }

    const textChunks = splitTextIntoChunks(input, 1000);

    for (const chunk of textChunks) {
      try {
        const response = await ttsRequest(provider, ttsSchema, {
          voice,
          input: chunk.text,
          stream: true,
        });

        logger.debug(`[textToSpeech] user: ${req?.user?.id} | writing audio stream`);
        await new Promise((resolve) => {
          response.data.pipe(res, { end: chunk.isFinished });
          response.data.on('end', () => {
            resolve();
          });
        });

        if (chunk.isFinished) {
          break;
        }
      } catch (innerError) {
        logger.error('Error processing text chunk:', chunk, innerError);
        if (!res.headersSent) {
          res.status(500).end();
        }
        return;
      }
    }

    if (!res.headersSent) {
      res.end();
    }
  } catch (error) {
    logger.error(
      'Error creating the audio stream. Suggestion: check your provider quota. Error:',
      error,
    );
    res.status(500).send('An error occurred');
  }
}

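/**
 * Polls for message text updates via createChunkProcessor and streams the synthesized audio
 * to the client until the message is finished or the client disconnects
 *
 * @param {Object} req - The request object, expected to contain messageId (and optionally voice) in its body
 * @param {Object} res - The response object, used to stream the audio data or send an error
 * @returns {Promise<void>}
 */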
async function streamAudio(req, res) {
  res.setHeader('Content-Type', 'audio/mpeg');
  const customConfig = await getCustomConfig();
  if (!customConfig) {
    return res.status(500).send('Custom config not found');
  }

  const [provider, ttsSchema] = await getProviderSchema(customConfig);
  const voice = await getVoice(ttsSchema, req.body.voice);

  try {
    let shouldContinue = true;

    req.on('close', () => {
      logger.warn('[streamAudio] Audio Stream Request closed by client');
      shouldContinue = false;
    });

    const processChunks = createChunkProcessor(req.body.messageId);

    while (shouldContinue) {
      // example updates
      // const updates = [
      //   { text: 'This is a test.', isFinished: false },
      //   { text: 'This is only a test.', isFinished: false },
      //   { text: 'Your voice is like a combination of Fergie and Jesus!', isFinished: true },
      // ];

      const updates = await processChunks();
      if (typeof updates === 'string') {
        logger.error(`Error processing audio stream updates: ${JSON.stringify(updates)}`);
        res.status(500).end();
        return;
      }

      if (updates.length === 0) {
        await new Promise((resolve) => setTimeout(resolve, 1250));
        continue;
      }

      for (const update of updates) {
        try {
          const response = await ttsRequest(provider, ttsSchema, {
            voice,
            input: update.text,
            stream: true,
          });

          if (!shouldContinue) {
            break;
          }

          logger.debug(`[streamAudio] user: ${req?.user?.id} | writing audio stream`);
          await new Promise((resolve) => {
            response.data.pipe(res, { end: update.isFinished });
            response.data.on('end', () => {
              resolve();
            });
          });

          if (update.isFinished) {
            shouldContinue = false;
            break;
          }
        } catch (innerError) {
          logger.error('Error processing update:', update, innerError);
          if (!res.headersSent) {
            res.status(500).end();
          }
          return;
        }
      }

      if (!shouldContinue) {
        break;
      }
    }

    if (!res.headersSent) {
      res.end();
    }
  } catch (error) {
    logger.error('Failed to fetch audio:', error);
    if (!res.headersSent) {
      res.status(500).end();
    }
  }
}

module.exports = {
  textToSpeech,
  getProvider,
  streamAudio,
};