From 65c83317aa5c692e4177b30429521c467ca12ab1 Mon Sep 17 00:00:00 2001 From: Danny Avila Date: Fri, 5 Sep 2025 12:01:00 -0400 Subject: [PATCH] =?UTF-8?q?=F0=9F=97=A3=EF=B8=8F=20feat:=20Language=20Supp?= =?UTF-8?q?ort=20for=20OpenAI=20Speech-to-Text=20(#9470)?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- api/server/services/Files/Audio/STTService.js | 33 ++++++++++++++++--- .../hooks/Input/useSpeechToTextExternal.ts | 4 +++ 2 files changed, 32 insertions(+), 5 deletions(-) diff --git a/api/server/services/Files/Audio/STTService.js b/api/server/services/Files/Audio/STTService.js index 95eea62ab..60d6a48a1 100644 --- a/api/server/services/Files/Audio/STTService.js +++ b/api/server/services/Files/Audio/STTService.js @@ -159,9 +159,11 @@ class STTService { * Prepares the request for the OpenAI STT provider. * @param {Object} sttSchema - The STT schema for OpenAI. * @param {Stream} audioReadStream - The audio data to be transcribed. + * @param {Object} audioFile - The audio file object (unused in OpenAI provider). + * @param {string} language - The language code for the transcription. * @returns {Array} An array containing the URL, data, and headers for the request. */ - openAIProvider(sttSchema, audioReadStream) { + openAIProvider(sttSchema, audioReadStream, audioFile, language) { const url = sttSchema?.url || 'https://api.openai.com/v1/audio/transcriptions'; const apiKey = extractEnvVariable(sttSchema.apiKey) || ''; @@ -170,6 +172,12 @@ class STTService { model: sttSchema.model, }; + if (language) { + /** Converted locale code (e.g., "en-US") to ISO-639-1 format (e.g., "en") */ + const isoLanguage = language.split('-')[0]; + data.language = isoLanguage; + } + const headers = { 'Content-Type': 'multipart/form-data', ...(apiKey && { Authorization: `Bearer ${apiKey}` }), @@ -184,10 +192,11 @@ class STTService { * @param {Object} sttSchema - The STT schema for Azure OpenAI. * @param {Buffer} audioBuffer - The audio data to be transcribed. * @param {Object} audioFile - The audio file object containing originalname, mimetype, and size. + * @param {string} language - The language code for the transcription. * @returns {Array} An array containing the URL, data, and headers for the request. * @throws {Error} If the audio file size exceeds 25MB or the audio file format is not accepted. */ - azureOpenAIProvider(sttSchema, audioBuffer, audioFile) { + azureOpenAIProvider(sttSchema, audioBuffer, audioFile, language) { const url = `${genAzureEndpoint({ azureOpenAIApiInstanceName: extractEnvVariable(sttSchema?.instanceName), azureOpenAIApiDeploymentName: extractEnvVariable(sttSchema?.deploymentName), @@ -211,6 +220,12 @@ class STTService { contentType: audioFile.mimetype, }); + if (language) { + /** Converted locale code (e.g., "en-US") to ISO-639-1 format (e.g., "en") */ + const isoLanguage = language.split('-')[0]; + formData.append('language', isoLanguage); + } + const headers = { 'Content-Type': 'multipart/form-data', ...(apiKey && { 'api-key': apiKey }), @@ -229,10 +244,11 @@ class STTService { * @param {Object} requestData - The data required for the STT request. * @param {Buffer} requestData.audioBuffer - The audio data to be transcribed. * @param {Object} requestData.audioFile - The audio file object containing originalname, mimetype, and size. + * @param {string} requestData.language - The language code for the transcription. * @returns {Promise} A promise that resolves to the transcribed text. * @throws {Error} If the provider is invalid, the response status is not 200, or the response data is missing. */ - async sttRequest(provider, sttSchema, { audioBuffer, audioFile }) { + async sttRequest(provider, sttSchema, { audioBuffer, audioFile, language }) { const strategy = this.providerStrategies[provider]; if (!strategy) { throw new Error('Invalid provider'); @@ -243,7 +259,13 @@ class STTService { const audioReadStream = Readable.from(audioBuffer); audioReadStream.path = `audio.${fileExtension}`; - const [url, data, headers] = strategy.call(this, sttSchema, audioReadStream, audioFile); + const [url, data, headers] = strategy.call( + this, + sttSchema, + audioReadStream, + audioFile, + language, + ); try { const response = await axios.post(url, data, { headers }); @@ -284,7 +306,8 @@ class STTService { try { const [provider, sttSchema] = await this.getProviderSchema(req); - const text = await this.sttRequest(provider, sttSchema, { audioBuffer, audioFile }); + const language = req.body?.language || ''; + const text = await this.sttRequest(provider, sttSchema, { audioBuffer, audioFile, language }); res.json({ text }); } catch (error) { logger.error('An error occurred while processing the audio:', error); diff --git a/client/src/hooks/Input/useSpeechToTextExternal.ts b/client/src/hooks/Input/useSpeechToTextExternal.ts index 6571a0911..0376279ce 100644 --- a/client/src/hooks/Input/useSpeechToTextExternal.ts +++ b/client/src/hooks/Input/useSpeechToTextExternal.ts @@ -25,6 +25,7 @@ const useSpeechToTextExternal = ( const [minDecibels] = useRecoilState(store.decibelValue); const [autoSendText] = useRecoilState(store.autoSendText); + const [languageSTT] = useRecoilState(store.languageSTT); const [speechToText] = useRecoilState(store.speechToText); const [autoTranscribeAudio] = useRecoilState(store.autoTranscribeAudio); @@ -121,6 +122,9 @@ const useSpeechToTextExternal = ( const formData = new FormData(); formData.append('audio', audioBlob, `audio.${fileExtension}`); + if (languageSTT) { + formData.append('language', languageSTT); + } setIsRequestBeingMade(true); cleanup(); processAudio(formData);