mirror of
https://github.com/danny-avila/LibreChat.git
synced 2025-09-22 06:00:56 +02:00
🗣️ feat: Language Support for OpenAI Speech-to-Text (#9470)
This commit is contained in:
parent
e95e0052da
commit
65c83317aa
2 changed files with 32 additions and 5 deletions
|
@ -159,9 +159,11 @@ class STTService {
|
||||||
* Prepares the request for the OpenAI STT provider.
|
* Prepares the request for the OpenAI STT provider.
|
||||||
* @param {Object} sttSchema - The STT schema for OpenAI.
|
* @param {Object} sttSchema - The STT schema for OpenAI.
|
||||||
* @param {Stream} audioReadStream - The audio data to be transcribed.
|
* @param {Stream} audioReadStream - The audio data to be transcribed.
|
||||||
|
* @param {Object} audioFile - The audio file object (unused in OpenAI provider).
|
||||||
|
* @param {string} language - The language code for the transcription.
|
||||||
* @returns {Array} An array containing the URL, data, and headers for the request.
|
* @returns {Array} An array containing the URL, data, and headers for the request.
|
||||||
*/
|
*/
|
||||||
openAIProvider(sttSchema, audioReadStream) {
|
openAIProvider(sttSchema, audioReadStream, audioFile, language) {
|
||||||
const url = sttSchema?.url || 'https://api.openai.com/v1/audio/transcriptions';
|
const url = sttSchema?.url || 'https://api.openai.com/v1/audio/transcriptions';
|
||||||
const apiKey = extractEnvVariable(sttSchema.apiKey) || '';
|
const apiKey = extractEnvVariable(sttSchema.apiKey) || '';
|
||||||
|
|
||||||
|
@ -170,6 +172,12 @@ class STTService {
|
||||||
model: sttSchema.model,
|
model: sttSchema.model,
|
||||||
};
|
};
|
||||||
|
|
||||||
|
if (language) {
|
||||||
|
/** Convert the locale code (e.g., "en-US") to ISO-639-1 format (e.g., "en") */
|
||||||
|
const isoLanguage = language.split('-')[0];
|
||||||
|
data.language = isoLanguage;
|
||||||
|
}
|
||||||
|
|
||||||
const headers = {
|
const headers = {
|
||||||
'Content-Type': 'multipart/form-data',
|
'Content-Type': 'multipart/form-data',
|
||||||
...(apiKey && { Authorization: `Bearer ${apiKey}` }),
|
...(apiKey && { Authorization: `Bearer ${apiKey}` }),
|
||||||
|
@ -184,10 +192,11 @@ class STTService {
|
||||||
* @param {Object} sttSchema - The STT schema for Azure OpenAI.
|
* @param {Object} sttSchema - The STT schema for Azure OpenAI.
|
||||||
* @param {Buffer} audioBuffer - The audio data to be transcribed.
|
* @param {Buffer} audioBuffer - The audio data to be transcribed.
|
||||||
* @param {Object} audioFile - The audio file object containing originalname, mimetype, and size.
|
* @param {Object} audioFile - The audio file object containing originalname, mimetype, and size.
|
||||||
|
* @param {string} language - The language code for the transcription.
|
||||||
* @returns {Array} An array containing the URL, data, and headers for the request.
|
* @returns {Array} An array containing the URL, data, and headers for the request.
|
||||||
* @throws {Error} If the audio file size exceeds 25MB or the audio file format is not accepted.
|
* @throws {Error} If the audio file size exceeds 25MB or the audio file format is not accepted.
|
||||||
*/
|
*/
|
||||||
azureOpenAIProvider(sttSchema, audioBuffer, audioFile) {
|
azureOpenAIProvider(sttSchema, audioBuffer, audioFile, language) {
|
||||||
const url = `${genAzureEndpoint({
|
const url = `${genAzureEndpoint({
|
||||||
azureOpenAIApiInstanceName: extractEnvVariable(sttSchema?.instanceName),
|
azureOpenAIApiInstanceName: extractEnvVariable(sttSchema?.instanceName),
|
||||||
azureOpenAIApiDeploymentName: extractEnvVariable(sttSchema?.deploymentName),
|
azureOpenAIApiDeploymentName: extractEnvVariable(sttSchema?.deploymentName),
|
||||||
|
@ -211,6 +220,12 @@ class STTService {
|
||||||
contentType: audioFile.mimetype,
|
contentType: audioFile.mimetype,
|
||||||
});
|
});
|
||||||
|
|
||||||
|
if (language) {
|
||||||
|
/** Convert the locale code (e.g., "en-US") to ISO-639-1 format (e.g., "en") */
|
||||||
|
const isoLanguage = language.split('-')[0];
|
||||||
|
formData.append('language', isoLanguage);
|
||||||
|
}
|
||||||
|
|
||||||
const headers = {
|
const headers = {
|
||||||
'Content-Type': 'multipart/form-data',
|
'Content-Type': 'multipart/form-data',
|
||||||
...(apiKey && { 'api-key': apiKey }),
|
...(apiKey && { 'api-key': apiKey }),
|
||||||
|
@ -229,10 +244,11 @@ class STTService {
|
||||||
* @param {Object} requestData - The data required for the STT request.
|
* @param {Object} requestData - The data required for the STT request.
|
||||||
* @param {Buffer} requestData.audioBuffer - The audio data to be transcribed.
|
* @param {Buffer} requestData.audioBuffer - The audio data to be transcribed.
|
||||||
* @param {Object} requestData.audioFile - The audio file object containing originalname, mimetype, and size.
|
* @param {Object} requestData.audioFile - The audio file object containing originalname, mimetype, and size.
|
||||||
|
* @param {string} requestData.language - The language code for the transcription.
|
||||||
* @returns {Promise<string>} A promise that resolves to the transcribed text.
|
* @returns {Promise<string>} A promise that resolves to the transcribed text.
|
||||||
* @throws {Error} If the provider is invalid, the response status is not 200, or the response data is missing.
|
* @throws {Error} If the provider is invalid, the response status is not 200, or the response data is missing.
|
||||||
*/
|
*/
|
||||||
async sttRequest(provider, sttSchema, { audioBuffer, audioFile }) {
|
async sttRequest(provider, sttSchema, { audioBuffer, audioFile, language }) {
|
||||||
const strategy = this.providerStrategies[provider];
|
const strategy = this.providerStrategies[provider];
|
||||||
if (!strategy) {
|
if (!strategy) {
|
||||||
throw new Error('Invalid provider');
|
throw new Error('Invalid provider');
|
||||||
|
@ -243,7 +259,13 @@ class STTService {
|
||||||
const audioReadStream = Readable.from(audioBuffer);
|
const audioReadStream = Readable.from(audioBuffer);
|
||||||
audioReadStream.path = `audio.${fileExtension}`;
|
audioReadStream.path = `audio.${fileExtension}`;
|
||||||
|
|
||||||
const [url, data, headers] = strategy.call(this, sttSchema, audioReadStream, audioFile);
|
const [url, data, headers] = strategy.call(
|
||||||
|
this,
|
||||||
|
sttSchema,
|
||||||
|
audioReadStream,
|
||||||
|
audioFile,
|
||||||
|
language,
|
||||||
|
);
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const response = await axios.post(url, data, { headers });
|
const response = await axios.post(url, data, { headers });
|
||||||
|
@ -284,7 +306,8 @@ class STTService {
|
||||||
|
|
||||||
try {
|
try {
|
||||||
const [provider, sttSchema] = await this.getProviderSchema(req);
|
const [provider, sttSchema] = await this.getProviderSchema(req);
|
||||||
const text = await this.sttRequest(provider, sttSchema, { audioBuffer, audioFile });
|
const language = req.body?.language || '';
|
||||||
|
const text = await this.sttRequest(provider, sttSchema, { audioBuffer, audioFile, language });
|
||||||
res.json({ text });
|
res.json({ text });
|
||||||
} catch (error) {
|
} catch (error) {
|
||||||
logger.error('An error occurred while processing the audio:', error);
|
logger.error('An error occurred while processing the audio:', error);
|
||||||
|
|
|
@ -25,6 +25,7 @@ const useSpeechToTextExternal = (
|
||||||
|
|
||||||
const [minDecibels] = useRecoilState(store.decibelValue);
|
const [minDecibels] = useRecoilState(store.decibelValue);
|
||||||
const [autoSendText] = useRecoilState(store.autoSendText);
|
const [autoSendText] = useRecoilState(store.autoSendText);
|
||||||
|
const [languageSTT] = useRecoilState<string>(store.languageSTT);
|
||||||
const [speechToText] = useRecoilState<boolean>(store.speechToText);
|
const [speechToText] = useRecoilState<boolean>(store.speechToText);
|
||||||
const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
|
const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
|
||||||
|
|
||||||
|
@ -121,6 +122,9 @@ const useSpeechToTextExternal = (
|
||||||
|
|
||||||
const formData = new FormData();
|
const formData = new FormData();
|
||||||
formData.append('audio', audioBlob, `audio.${fileExtension}`);
|
formData.append('audio', audioBlob, `audio.${fileExtension}`);
|
||||||
|
if (languageSTT) {
|
||||||
|
formData.append('language', languageSTT);
|
||||||
|
}
|
||||||
setIsRequestBeingMade(true);
|
setIsRequestBeingMade(true);
|
||||||
cleanup();
|
cleanup();
|
||||||
processAudio(formData);
|
processAudio(formData);
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue