LibreChat/api/server/services/Files/Audio/speechToText.js
Marco Beretta 1aad315de6
🎤 feat: add custom speech config, browser TTS/STT features, and dynamic speech tab settings (#2921)
* feat: update useTextToSpeech and useSpeechToText hooks to support external audio endpoints

This commit updates the useTextToSpeech and useSpeechToText hooks in the Input directory to support external audio endpoints. It introduces the useGetExternalTextToSpeech and useGetExternalSpeechToText hooks, which determine whether the audio endpoints should be set to 'browser' or 'external' based on the value of the endpointTTS and endpointSTT Recoil states. The useTextToSpeech and useSpeechToText hooks now use these new hooks to determine whether to use external audio endpoints
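
A minimal sketch of what such a selection hook could look like (names follow the description above; the actual implementation may differ):

  import { useRecoilValue } from 'recoil';
  import store from '~/store';

  // Resolves to true when the user has selected an external STT provider in settings
  const useGetExternalSpeechToText = () => {
    const endpointSTT = useRecoilValue(store.endpointSTT);
    return endpointSTT === 'external';
  };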

* feat: add userSelect style to ConversationModeSwitch label

* fix: remove unused updateTokenWebsocket function and import

The updateTokenWebsocket function and its import are no longer used in the OpenAIClient module. This commit removes the function and import to clean up the codebase

* feat: support external audio endpoints in useTextToSpeech and useSpeechToText hooks

* feat: rename AutomaticPlayback component to AutomaticPlaybackSwitch; tests: add AutomaticPlaybackSwitch.spec

This commit renames the AutomaticPlayback component to AutomaticPlaybackSwitch in the Speech directory. The new name better reflects the purpose of the component and aligns with the naming convention used in the codebase.

* feat: update useSpeechToText hook to include interimTranscript

This commit updates the useSpeechToText hook in the client/src/components/Chat/Input/AudioRecorder.tsx file to include the interimTranscript state. This allows for real-time display of the speech-to-text transcription while the user is still speaking. The interimTranscript is now used to update the text area value during recording.
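
For reference, when the browser engine is selected, interim results come from the Web Speech API; a minimal browser-only sketch (element wiring omitted):

  const SpeechRecognition = window.SpeechRecognition || window.webkitSpeechRecognition;
  const recognition = new SpeechRecognition();
  recognition.interimResults = true; // emit partial hypotheses while the user is still speaking
  recognition.onresult = (event) => {
    let interimTranscript = '';
    for (let i = event.resultIndex; i < event.results.length; i++) {
      const { transcript } = event.results[i][0];
      if (!event.results[i].isFinal) {
        interimTranscript += transcript;
      }
    }
    // write interimTranscript into the text area for real-time feedback
  };
  recognition.start();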

* feat: Add customConfigSpeech API endpoint for retrieving custom speech configuration

This commit adds a new customConfigSpeech API endpoint responsible for retrieving the custom speech configuration.
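
A hedged sketch of such an endpoint (route path and wiring are illustrative, not taken from the PR):

  const express = require('express');
  const getCustomConfig = require('~/server/services/Config/getCustomConfig');

  const router = express.Router();

  router.get('/speech', async (req, res) => {
    try {
      const customConfig = await getCustomConfig();
      // Only the speech portion of the custom config is relevant to the client here
      res.json(customConfig?.speech ?? {});
    } catch (error) {
      res.status(500).send('Failed to retrieve the custom speech configuration');
    }
  });

  module.exports = router;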

* feat: update store variables; fix: getCustomConfigSpeech

* fix: client tests, removed unused import

* feat: Update useCustomConfigSpeechQuery to return an array of custom speech configurations

This commit modifies the useCustomConfigSpeechQuery function in the client/src/data-provider/queries.ts file to return an array of custom speech configurations instead of a single object. This change allows for better handling and manipulation of the data in the application
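
Sketched against @tanstack/react-query; the query key and fetcher below are stand-ins for whatever the data provider actually exposes:

  import { useQuery } from '@tanstack/react-query';

  // Hypothetical fetcher; the real call lives in the data provider
  const fetchCustomConfigSpeech = async () => {
    const res = await fetch('/api/config/speech');
    return res.json(); // now resolves to an array of custom speech configurations
  };

  export const useCustomConfigSpeechQuery = () =>
    useQuery(['customConfigSpeech'], fetchCustomConfigSpeech);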

* refactor: Update variable name in speechTab schema

* refactor: remove unused and nested code

* fix: using recoilState

* refactor: Update Speech component to use useCallback for setting settings

* fix: test

* fix: tests

* feat: ensure that the settings don't change after modifying them through the UI

* remove comment

* fix: Handle error gracefully in getCustomConfigSpeech and getVoices endpoints

* fix: Handle error

* fix: backend tests

* fix: invalid custom config logging

* chore: add back custom config info logging

* chore: revert loadCustomConfig spec

---------

Co-authored-by: Danny Avila <danny@librechat.ai>
2024-07-05 10:13:34 -04:00

const axios = require('axios');
const FormData = require('form-data');
const { Readable } = require('stream');
const { logger } = require('~/config');
const getCustomConfig = require('~/server/services/Config/getCustomConfig');
const { extractEnvVariable } = require('librechat-data-provider');

/**
 * Handle the response from the STT API
 * @param {Object} response - The response from the STT API
 *
 * @returns {string} The text from the response data
 *
 * @throws Will throw an error if the response status is not 200 or the response data is missing
 */
async function handleResponse(response) {
  if (response.status !== 200) {
    throw new Error('Invalid response from the STT API');
  }

  if (!response.data || !response.data.text) {
    throw new Error('Missing data in response from the STT API');
  }

  return response.data.text.trim();
}
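
// For reference, a successful Whisper-style response resolves like so (illustrative values):
//   handleResponse({ status: 200, data: { text: ' transcribed speech ' } }) -> 'transcribed speech'
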
/**
 * Given the speech-to-text schema from the custom config, returns the name of the
 * configured provider ('openai' is the only provider this schema currently supports)
 * @param {Object} sttSchema - The speech-to-text schema from the custom config
 * @returns {string} The provider name
 * @throws Will throw an error if no supported provider is configured
 */
function getProvider(sttSchema) {
  if (sttSchema?.openai) {
    return 'openai';
  }

  throw new Error('Invalid provider');
}

/**
 * Recursively deletes keys whose value is `undefined`, along with any nested
 * objects left empty as a result (mutates the object in place)
 * @param {Object} obj - The object to clean
 */
function removeUndefined(obj) {
  Object.keys(obj).forEach((key) => {
    if (obj[key] && typeof obj[key] === 'object') {
      removeUndefined(obj[key]);
      if (Object.keys(obj[key]).length === 0) {
        delete obj[key];
      }
    } else if (obj[key] === undefined) {
      delete obj[key];
    }
  });
}
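
// For reference (illustrative): removeUndefined({ a: 1, b: undefined, c: { d: undefined } })
// mutates the object into { a: 1 }; undefined leaves and then-empty nested objects are dropped.
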
/**
 * This function prepares the necessary data and headers for making a request to the OpenAI API
 * It uses the provided speech-to-text schema and audio stream to create the request
 *
 * @param {Object} sttSchema - The speech-to-text schema containing the OpenAI configuration
 * @param {Stream} audioReadStream - The audio data to be transcribed
 *
 * @returns {Array} An array containing the URL for the API request, the data to be sent, and the headers for the request
 * If an error occurs, it returns an array with three null values and logs the error with logger
 */
function openAIProvider(sttSchema, audioReadStream) {
  try {
    const url = sttSchema.openai?.url || 'https://api.openai.com/v1/audio/transcriptions';
    const apiKey = sttSchema.openai.apiKey ? extractEnvVariable(sttSchema.openai.apiKey) : '';

    let data = {
      file: audioReadStream,
      model: sttSchema.openai.model,
    };

    let headers = {
      'Content-Type': 'multipart/form-data',
    };

    // Drop any undefined fields (e.g. an unset model) so they are not serialized into the form
    [data, headers].forEach(removeUndefined);

    if (apiKey) {
      headers.Authorization = 'Bearer ' + apiKey;
    }

    return [url, data, headers];
  } catch (error) {
    logger.error('An error occurred while preparing the OpenAI API STT request: ', error);
    return [null, null, null];
  }
}
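
// For reference, a librechat.yaml speech block this provider can read might look like this
// (keys mirror the sttSchema accesses above; values are illustrative):
//
//   speech:
//     stt:
//       openai:
//         url: 'https://api.openai.com/v1/audio/transcriptions'
//         apiKey: '${STT_API_KEY}'
//         model: 'whisper-1'
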
/**
 * This function prepares the necessary data and headers for making a request to the Azure API
 * It uses the provided request and audio stream to create the request
 *
 * @param {Object} req - The request object, which should contain the endpoint in its body
 * @param {Stream} audioReadStream - The audio data to be transcribed
 *
 * @returns {Array} An array containing the URL for the API request, the data to be sent, and the headers for the request
 * If an error occurs, it returns an array with three null values and logs the error with logger
 */
function azureProvider(req, audioReadStream) {
  try {
    const { endpoint } = req.body;
    const azureConfig = req.app.locals[endpoint];

    if (!azureConfig) {
      throw new Error(`No configuration found for endpoint: ${endpoint}`);
    }

    // Find the first model group that exposes a whisper deployment
    const { apiKey, instanceName, whisperModel, apiVersion } =
      Object.entries(azureConfig.groupMap).reduce((acc, [, value]) => {
        if (acc) {
          return acc;
        }

        const whisperKey = Object.keys(value.models).find((modelKey) =>
          modelKey.startsWith('whisper'),
        );

        if (whisperKey) {
          return {
            apiVersion: value.version,
            apiKey: value.apiKey,
            instanceName: value.instanceName,
            whisperModel: value.models[whisperKey]['deploymentName'],
          };
        }

        return null;
      }, null) ?? {};

    if (!apiKey || !instanceName || !whisperModel || !apiVersion) {
      throw new Error('Required Azure configuration values are missing');
    }

    const baseURL = `https://${instanceName}.openai.azure.com`;
    const url = `${baseURL}/openai/deployments/${whisperModel}/audio/transcriptions?api-version=${apiVersion}`;

    // Build the multipart body with form-data so getHeaders() can supply the
    // correct Content-Type header, including the multipart boundary
    const data = new FormData();
    data.append('file', audioReadStream, {
      filename: 'audio.wav',
      contentType: 'audio/wav',
    });

    const headers = {
      ...data.getHeaders(),
      'api-key': apiKey,
    };

    return [url, data, headers];
  } catch (error) {
    logger.error('An error occurred while preparing the Azure API STT request: ', error);
    return [null, null, null];
  }
}
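
// For reference, the reduce above expects req.app.locals[endpoint].groupMap entries
// shaped roughly like this (illustrative values):
//
//   {
//     'my-group': {
//       apiKey: '...',
//       instanceName: 'my-instance',
//       version: '2024-02-01',
//       models: { 'whisper-1': { deploymentName: 'whisper' } },
//     },
//   }
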
/**
 * Convert speech to text
 * @param {Object} req - The request object
 * @param {Object} res - The response object
 *
 * @returns {Object} The response object with the text from the STT API
 *
 * @throws Will throw an error if an error occurs while processing the audio
 */
async function speechToText(req, res) {
  const customConfig = await getCustomConfig();
  if (!customConfig) {
    return res.status(500).send('Custom config not found');
  }

  if (!req.file || !req.file.buffer) {
    return res.status(400).json({ message: 'No audio file provided in the FormData' });
  }

  const audioBuffer = req.file.buffer;
  const audioReadStream = Readable.from(audioBuffer);
  audioReadStream.path = 'audio.wav';

  const provider = getProvider(customConfig.speech.stt);

  let url, data, headers;
  switch (provider) {
    case 'openai':
      [url, data, headers] = openAIProvider(customConfig.speech.stt, audioReadStream);
      break;
    case 'azure':
      [url, data, headers] = azureProvider(req, audioReadStream);
      break;
    default:
      throw new Error('Invalid provider');
  }

  try {
    const response = await axios.post(url, data, { headers: headers });
    const text = await handleResponse(response);
    res.json({ text });
  } catch (error) {
    logger.error('An error occurred while processing the audio:', error);
    res.sendStatus(500);
  }
}

module.exports = speechToText;
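
// A minimal sketch of wiring this handler into a route, assuming multer memory storage
// supplies req.file.buffer (path and field name are illustrative):
//
//   const multer = require('multer');
//   const upload = multer({ storage: multer.memoryStorage() });
//   router.post('/stt', upload.single('audio'), speechToText);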