🗣️ refactor: speech services; fix: OpenAI STT (#3431)

* fix: OpenAI STT

* refactor: STT and TTS services, slightly improve performance

* fix(DecibelSelector): update default value
This commit is contained in:
Marco Beretta 2024-07-30 09:18:52 -04:00 committed by GitHub
parent 4ffdefc2a8
commit 51cd847606
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
8 changed files with 737 additions and 714 deletions

View file

@ -0,0 +1,248 @@
const { Readable } = require('stream');
const axios = require('axios');
const { extractEnvVariable, STTProviders } = require('librechat-data-provider');
const getCustomConfig = require('~/server/services/Config/getCustomConfig');
const { genAzureEndpoint } = require('~/utils');
const { logger } = require('~/config');
/**
 * Service class for handling Speech-to-Text (STT) operations.
 *
 * Reads the `speech.stt` section of the custom config, selects the single
 * configured provider, builds the provider-specific HTTP request and returns
 * the transcribed text.
 * @class
 */
class STTService {
  /**
   * Creates an instance of STTService.
   * @param {Object} customConfig - The custom configuration object.
   */
  constructor(customConfig) {
    this.customConfig = customConfig;
    // Maps a provider name to the method that prepares its request.
    // Strategies are stored unbound and invoked with `.call(this, ...)`.
    this.providerStrategies = {
      [STTProviders.OPENAI]: this.openAIProvider,
      [STTProviders.AZURE_OPENAI]: this.azureOpenAIProvider,
    };
  }

  /**
   * Loads the custom config and builds a new STTService from it.
   * A fresh instance is created on every call; nothing is cached here.
   * @static
   * @async
   * @returns {Promise<STTService>} The STTService instance.
   * @throws {Error} If the custom config is not found.
   */
  static async getInstance() {
    const customConfig = await getCustomConfig();
    if (!customConfig) {
      throw new Error('Custom config not found');
    }
    return new STTService(customConfig);
  }

  /**
   * Retrieves the configured STT provider and its schema.
   * @returns {Promise<[string, Object]>} A promise that resolves to an array containing the provider name and its schema.
   * @throws {Error} If no STT schema is set, multiple providers are set, or no provider is set.
   */
  async getProviderSchema() {
    // Optional chaining so a config without a `speech` section yields the
    // descriptive error below instead of a TypeError.
    const sttSchema = this.customConfig?.speech?.stt;
    if (!sttSchema) {
      throw new Error(
        'No STT schema is set. Did you configure STT in the custom config (librechat.yaml)?',
      );
    }

    // A provider counts as configured only when it has at least one setting.
    // Null/undefined entries are skipped rather than crashing `Object.keys`.
    const providers = Object.entries(sttSchema).filter(
      ([, value]) => value != null && Object.keys(value).length > 0,
    );

    if (providers.length !== 1) {
      throw new Error(
        providers.length > 1
          ? 'Multiple providers are set. Please set only one provider.'
          : 'No provider is set. Please set a provider.',
      );
    }

    const [provider, schema] = providers[0];
    return [provider, schema];
  }

  /**
   * Recursively removes undefined properties from an object, in place.
   * Nested objects that end up empty are removed as well.
   * @param {Object} obj - The object to clean.
   * @returns {void}
   */
  removeUndefined(obj) {
    if (!obj || typeof obj !== 'object') {
      return;
    }
    Object.keys(obj).forEach((key) => {
      if (obj[key] && typeof obj[key] === 'object') {
        this.removeUndefined(obj[key]);
        if (Object.keys(obj[key]).length === 0) {
          delete obj[key];
        }
      } else if (obj[key] === undefined) {
        delete obj[key];
      }
    });
  }

  /**
   * Prepares the request for the OpenAI STT provider.
   * @param {Object} sttSchema - The STT schema for OpenAI.
   * @param {Stream} audioReadStream - The audio data to be transcribed.
   * @returns {Array} An array containing the URL, data, and headers for the request.
   */
  openAIProvider(sttSchema, audioReadStream) {
    const url = sttSchema?.url || 'https://api.openai.com/v1/audio/transcriptions';
    // Only resolve the key when one is configured, mirroring the Azure provider.
    const apiKey = (sttSchema?.apiKey && extractEnvVariable(sttSchema.apiKey)) || '';

    const data = {
      file: audioReadStream,
      model: sttSchema?.model,
    };

    const headers = {
      'Content-Type': 'multipart/form-data',
      ...(apiKey && { Authorization: `Bearer ${apiKey}` }),
    };

    this.removeUndefined(headers);

    return [url, data, headers];
  }

  /**
   * Prepares the request for the Azure OpenAI STT provider.
   * @param {Object} sttSchema - The STT schema for Azure OpenAI.
   * @param {Buffer} audioBuffer - The audio data to be transcribed.
   * @param {Object} audioFile - The audio file object containing originalname, mimetype, and size.
   * @returns {Array} An array containing the URL, data, and headers for the request.
   * @throws {Error} If the audio file size exceeds 25MB or the audio file format is not accepted.
   */
  azureOpenAIProvider(sttSchema, audioBuffer, audioFile) {
    const url = `${genAzureEndpoint({
      azureOpenAIApiInstanceName: sttSchema?.instanceName,
      azureOpenAIApiDeploymentName: sttSchema?.deploymentName,
    })}/audio/transcriptions?api-version=${sttSchema?.apiVersion}`;

    const apiKey = sttSchema?.apiKey ? extractEnvVariable(sttSchema.apiKey) : '';

    if (audioBuffer.byteLength > 25 * 1024 * 1024) {
      throw new Error('The audio file size exceeds the limit of 25MB');
    }

    const acceptedFormats = ['flac', 'mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'ogg', 'wav', 'webm'];
    // A MIME type may carry parameters (e.g. "audio/webm;codecs=opus");
    // only the bare subtype is compared against the accepted formats.
    const [, subtype = ''] = String(audioFile.mimetype).split('/');
    const fileFormat = subtype.split(';')[0].trim().toLowerCase();
    if (!acceptedFormats.includes(fileFormat)) {
      throw new Error(`The audio file format ${fileFormat} is not accepted`);
    }

    const formData = new FormData();
    const audioBlob = new Blob([audioBuffer], { type: audioFile.mimetype });
    formData.append('file', audioBlob, audioFile.originalname);

    const headers = {
      'Content-Type': 'multipart/form-data',
      ...(apiKey && { 'api-key': apiKey }),
    };

    this.removeUndefined(headers);

    return [url, formData, headers];
  }

  /**
   * Sends an STT request to the specified provider.
   * @async
   * @param {string} provider - The STT provider to use.
   * @param {Object} sttSchema - The STT schema for the provider.
   * @param {Object} requestData - The data required for the STT request.
   * @param {Buffer} requestData.audioBuffer - The audio data to be transcribed.
   * @param {Object} requestData.audioFile - The audio file object containing originalname, mimetype, and size.
   * @returns {Promise<string>} A promise that resolves to the transcribed text.
   * @throws {Error} If the provider is invalid, the response status is not 200, or the response data is missing.
   */
  async sttRequest(provider, sttSchema, { audioBuffer, audioFile }) {
    const strategy = this.providerStrategies[provider];
    if (!strategy) {
      throw new Error('Invalid provider');
    }

    // The Azure strategy measures the bytes and wraps them in a Blob, so it
    // must receive the raw buffer. Every other strategy receives a readable
    // stream with a `path` set on it, as before.
    let audioInput = audioBuffer;
    if (strategy !== this.azureOpenAIProvider) {
      audioInput = Readable.from(audioBuffer);
      audioInput.path = 'audio.wav';
    }

    const [url, data, headers] = strategy.call(this, sttSchema, audioInput, audioFile);

    try {
      const response = await axios.post(url, data, { headers });

      if (response.status !== 200) {
        throw new Error('Invalid response from the STT API');
      }

      if (!response.data || !response.data.text) {
        throw new Error('Missing data in response from the STT API');
      }

      return response.data.text.trim();
    } catch (error) {
      logger.error(`STT request failed for provider ${provider}:`, error);
      throw error;
    }
  }

  /**
   * Processes a speech-to-text request: reads the uploaded audio from
   * `req.file`, transcribes it with the configured provider and replies with
   * `{ text }`. Replies 400 when no file is present and 500 on any failure.
   *
   * NOTE: despite its name, this method performs speech-to-text; the name is
   * kept for compatibility with existing callers.
   * @async
   * @param {Object} req - The request object.
   * @param {Object} res - The response object.
   * @returns {Promise<void>}
   */
  async processTextToSpeech(req, res) {
    if (!req.file || !req.file.buffer) {
      return res.status(400).json({ message: 'No audio file provided in the FormData' });
    }

    const audioBuffer = req.file.buffer;
    const audioFile = {
      originalname: req.file.originalname,
      mimetype: req.file.mimetype,
      size: req.file.size,
    };

    try {
      const [provider, sttSchema] = await this.getProviderSchema();
      const text = await this.sttRequest(provider, sttSchema, { audioBuffer, audioFile });
      res.json({ text });
    } catch (error) {
      logger.error('An error occurred while processing the audio:', error);
      res.sendStatus(500);
    }
  }
}
/**
 * Builds an STTService by delegating to `STTService.getInstance()`.
 * @async
 * @returns {Promise<STTService>} Resolves to the STTService produced by `getInstance()`.
 */
async function createSTTService() {
  const service = await STTService.getInstance();
  return service;
}
/**
 * Wrapper function for speech-to-text processing.
 *
 * Creates an STT service and hands the request to it. A failure while
 * creating the service (or one that escapes processing) is logged and
 * answered with HTTP 500, so the returned promise does not reject and the
 * client still receives a response.
 * @async
 * @param {Object} req - The request object.
 * @param {Object} res - The response object.
 * @returns {Promise<void>}
 */
async function speechToText(req, res) {
  try {
    const sttService = await createSTTService();
    await sttService.processTextToSpeech(req, res);
  } catch (error) {
    logger.error('Failed to handle the speech-to-text request:', error);
    // Only reply if nothing has been sent yet.
    if (!res.headersSent) {
      res.sendStatus(500);
    }
  }
}
// The `speechToText(req, res)` wrapper is this module's only export.
module.exports = speechToText;