mirror of
https://github.com/danny-avila/LibreChat.git
synced 2026-01-03 17:18:51 +01:00
🎤 feat: add custom speech config, browser TTS/STT features, and dynamic speech tab settings (#2921)
* feat: update useTextToSpeech and useSpeechToText hooks to support external audio endpoints This commit updates the useTextToSpeech and useSpeechToText hooks in the Input directory to support external audio endpoints. It introduces the useGetExternalTextToSpeech and useGetExternalSpeechToText hooks, which determine whether the audio endpoints should be set to 'browser' or 'external' based on the value of the endpointTTS and endpointSTT Recoil states. The useTextToSpeech and useSpeechToText hooks now use these new hooks to determine whether to use external audio endpoints * feat: add userSelect style to ConversationModeSwitch label * fix: remove unused updateTokenWebsocket function and import The updateTokenWebsocket function and its import are no longer used in the OpenAIClient module. This commit removes the function and import to clean up the codebase * feat: support external audio endpoints in useTextToSpeech and useSpeechToText hooks This commit updates the useTextToSpeech and useSpeechToText hooks in the Input directory to support external audio endpoints. It introduces the useGetExternalTextToSpeech and useGetExternalSpeechToText hooks, which determine whether the audio endpoints should be set to 'browser' or 'external' based on the value of the endpointTTS and endpointSTT Recoil states. The useTextToSpeech and useSpeechToText hooks now use these new hooks to determine whether to use external audio endpoints * feat: update AutomaticPlayback component to AutomaticPlaybackSwitch; tests: added AutomaticPlaybackSwitch.spec > > This commit renames the AutomaticPlayback component to AutomaticPlaybackSwitch in the Speech directory. The new name better reflects the purpose of the component and aligns with the naming convention used in the codebase. * feat: update useSpeechToText hook to include interimTranscript This commit updates the useSpeechToText hook in the client/src/components/Chat/Input/AudioRecorder.tsx file to include the interimTranscript state. 
This allows for real-time display of the speech-to-text transcription while the user is still speaking. The interimTranscript is now used to update the text area value during recording. * feat: Add customConfigSpeech API endpoint for retrieving custom speech configuration This commit adds a new API endpoint in the file under the directory. This endpoint is responsible for retrieving the custom speech configuration using the function from the module * feat: update store var and ; fix: getCustomConfigSpeech * fix: client tests, removed unused import * feat: Update useCustomConfigSpeechQuery to return an array of custom speech configurations This commit modifies the useCustomConfigSpeechQuery function in the client/src/data-provider/queries.ts file to return an array of custom speech configurations instead of a single object. This change allows for better handling and manipulation of the data in the application * feat: Update useCustomConfigSpeechQuery to return an array of custom speech configurations * refactor: Update variable name in speechTab schema * refactor: removed unused and nested code * fix: using recoilState * refactor: Update Speech component to use useCallback for setting settings * fix: test * fix: tests * feature: ensure that the settings don't change after modifying them through the UI * remove comment * fix: Handle error gracefully in getCustomConfigSpeech and getVoices endpoints * fix: Handle error * fix: backend tests * fix: invalid custom config logging * chore: add back custom config info logging * chore: revert loadCustomConfig spec --------- Co-authored-by: Danny Avila <danny@librechat.ai>
This commit is contained in:
parent
5d985746cb
commit
1aad315de6
50 changed files with 598 additions and 179 deletions
|
|
@ -1,19 +1,11 @@
|
|||
const express = require('express');
|
||||
const {
|
||||
uaParser,
|
||||
checkBan,
|
||||
requireJwtAuth,
|
||||
createFileLimiters,
|
||||
createTTSLimiters,
|
||||
createSTTLimiters,
|
||||
} = require('~/server/middleware');
|
||||
const { uaParser, checkBan, requireJwtAuth, createFileLimiters } = require('~/server/middleware');
|
||||
const { createMulterInstance } = require('./multer');
|
||||
|
||||
const files = require('./files');
|
||||
const images = require('./images');
|
||||
const avatar = require('./avatar');
|
||||
const stt = require('./stt');
|
||||
const tts = require('./tts');
|
||||
const speech = require('./speech');
|
||||
|
||||
const initialize = async () => {
|
||||
const router = express.Router();
|
||||
|
|
@ -21,11 +13,8 @@ const initialize = async () => {
|
|||
router.use(checkBan);
|
||||
router.use(uaParser);
|
||||
|
||||
/* Important: stt/tts routes must be added before the upload limiters */
|
||||
const { sttIpLimiter, sttUserLimiter } = createSTTLimiters();
|
||||
const { ttsIpLimiter, ttsUserLimiter } = createTTSLimiters();
|
||||
router.use('/stt', sttIpLimiter, sttUserLimiter, stt);
|
||||
router.use('/tts', ttsIpLimiter, ttsUserLimiter, tts);
|
||||
/* Important: speech route must be added before the upload limiters */
|
||||
router.use('/speech', speech);
|
||||
|
||||
const upload = await createMulterInstance();
|
||||
const { fileUploadIpLimiter, fileUploadUserLimiter } = createFileLimiters();
|
||||
|
|
|
|||
10
api/server/routes/files/speech/customConfigSpeech.js
Normal file
10
api/server/routes/files/speech/customConfigSpeech.js
Normal file
|
|
@ -0,0 +1,10 @@
|
|||
const express = require('express');

const { getCustomConfigSpeech } = require('~/server/services/Files/Audio');

const router = express.Router();

/**
 * GET /get
 * Responds with the `speechTab` settings from the custom configuration.
 * `getCustomConfigSpeech` writes the response itself (including its own
 * error handling), so it can be mounted directly as the route handler —
 * the previous `async (req, res) => { await ... }` wrapper added nothing.
 */
router.get('/get', getCustomConfigSpeech);

module.exports = router;
|
||||
17
api/server/routes/files/speech/index.js
Normal file
17
api/server/routes/files/speech/index.js
Normal file
|
|
@ -0,0 +1,17 @@
|
|||
const express = require('express');
const { createTTSLimiters, createSTTLimiters } = require('~/server/middleware');

const stt = require('./stt');
const tts = require('./tts');
const customConfigSpeech = require('./customConfigSpeech');

const router = express.Router();

// Rate-limit speech-to-text and text-to-speech per IP and per user.
const { sttIpLimiter, sttUserLimiter } = createSTTLimiters();
const { ttsIpLimiter, ttsUserLimiter } = createTTSLimiters();

router.use('/stt', [sttIpLimiter, sttUserLimiter], stt);
router.use('/tts', [ttsIpLimiter, ttsUserLimiter], tts);

// Speech-tab configuration endpoint; no rate limiting applied.
router.use('/config', customConfigSpeech);

module.exports = router;
|
||||
|
|
@ -76,8 +76,28 @@ Please specify a correct \`imageOutputType\` value (case-sensitive).
|
|||
);
|
||||
}
|
||||
if (!result.success) {
|
||||
i === 0 && logger.error(`Invalid custom config file at ${configPath}`, result.error);
|
||||
i === 0 && i++;
|
||||
let errorMessage = `Invalid custom config file at ${configPath}:
|
||||
${JSON.stringify(result.error, null, 2)}`;
|
||||
|
||||
if (i === 0) {
|
||||
logger.error(errorMessage);
|
||||
const speechError = result.error.errors.find(
|
||||
(err) =>
|
||||
err.code === 'unrecognized_keys' &&
|
||||
(err.message?.includes('stt') || err.message?.includes('tts')),
|
||||
);
|
||||
|
||||
if (speechError) {
|
||||
logger.warn(`
|
||||
The Speech-to-text and Text-to-speech configuration format has recently changed.
|
||||
If you're getting this error, please refer to the latest documentation:
|
||||
|
||||
https://www.librechat.ai/docs/configuration/stt_tts`);
|
||||
}
|
||||
|
||||
i++;
|
||||
}
|
||||
|
||||
return null;
|
||||
} else {
|
||||
logger.info('Custom config file loaded:');
|
||||
|
|
|
|||
50
api/server/services/Files/Audio/getCustomConfigSpeech.js
Normal file
50
api/server/services/Files/Audio/getCustomConfigSpeech.js
Normal file
|
|
@ -0,0 +1,50 @@
|
|||
const getCustomConfig = require('~/server/services/Config/getCustomConfig');

/**
 * Copies every defined (non-`undefined`) value from `source` onto `target`.
 * Used to flatten the `speechToText` / `textToSpeech` sections into one
 * settings object without copying keys the config left unset.
 *
 * @param {Object} target - accumulator that receives the settings
 * @param {Object} [source] - section of the speechTab config (may be absent)
 */
function assignDefined(target, source) {
  if (!source) {
    return;
  }
  for (const [key, value] of Object.entries(source)) {
    if (value !== undefined) {
      target[key] = value;
    }
  }
}

/**
 * Retrieves the `speechTab` settings from the custom configuration and sends
 * them as a flat JSON object: `advancedMode` plus every defined key from the
 * `speechToText` and `textToSpeech` sections.
 *
 * If the custom configuration or the `speechTab` section is missing — or any
 * other error occurs — an empty 200 response is sent instead: the speech tab
 * is an optional feature, and the client treats "no settings" as defaults.
 *
 * @param {Object} req - The request object
 * @param {Object} res - The response object
 * @returns {Promise<void>}
 */
async function getCustomConfigSpeech(req, res) {
  try {
    const customConfig = await getCustomConfig();
    const speechTab = customConfig?.speech?.speechTab;

    if (!speechTab) {
      throw new Error('Configuration or speechTab schema is missing');
    }

    const settings = {};

    if (speechTab.advancedMode !== undefined) {
      settings.advancedMode = speechTab.advancedMode;
    }
    assignDefined(settings, speechTab.speechToText);
    assignDefined(settings, speechTab.textToSpeech);

    res.json(settings);
  } catch (error) {
    // Deliberate best-effort: a missing/invalid speech config is not an
    // error condition for the client, so respond with an empty 200 body.
    res.status(200).send();
  }
}

module.exports = getCustomConfigSpeech;
|
||||
|
|
@ -1,4 +1,3 @@
|
|||
const { logger } = require('~/config');
|
||||
const getCustomConfig = require('~/server/services/Config/getCustomConfig');
|
||||
const { getProvider } = require('./textToSpeech');
|
||||
|
||||
|
|
@ -16,11 +15,11 @@ async function getVoices(req, res) {
|
|||
try {
|
||||
const customConfig = await getCustomConfig();
|
||||
|
||||
if (!customConfig || !customConfig?.tts) {
|
||||
if (!customConfig || !customConfig?.speech?.tts) {
|
||||
throw new Error('Configuration or TTS schema is missing');
|
||||
}
|
||||
|
||||
const ttsSchema = customConfig?.tts;
|
||||
const ttsSchema = customConfig?.speech?.tts;
|
||||
const provider = getProvider(ttsSchema);
|
||||
let voices;
|
||||
|
||||
|
|
@ -40,8 +39,7 @@ async function getVoices(req, res) {
|
|||
|
||||
res.json(voices);
|
||||
} catch (error) {
|
||||
logger.error(`Failed to get voices: ${error.message}`);
|
||||
res.status(500).json({ error: 'Failed to get voices' });
|
||||
res.status(500).json({ error: `Failed to get voices: ${error.message}` });
|
||||
}
|
||||
}
|
||||
|
||||
|
|
|
|||
|
|
@ -1,11 +1,11 @@
|
|||
const getVoices = require('./getVoices');
|
||||
const getCustomConfigSpeech = require('./getCustomConfigSpeech');
|
||||
const textToSpeech = require('./textToSpeech');
|
||||
const speechToText = require('./speechToText');
|
||||
const { updateTokenWebsocket } = require('./webSocket');
|
||||
|
||||
module.exports = {
|
||||
getVoices,
|
||||
getCustomConfigSpeech,
|
||||
speechToText,
|
||||
...textToSpeech,
|
||||
updateTokenWebsocket,
|
||||
};
|
||||
|
|
|
|||
|
|
@ -25,7 +25,7 @@ async function handleResponse(response) {
|
|||
}
|
||||
|
||||
function getProvider(sttSchema) {
|
||||
if (sttSchema.openai) {
|
||||
if (sttSchema?.openai) {
|
||||
return 'openai';
|
||||
}
|
||||
|
||||
|
|
@ -176,7 +176,7 @@ async function speechToText(req, res) {
|
|||
const audioReadStream = Readable.from(audioBuffer);
|
||||
audioReadStream.path = 'audio.wav';
|
||||
|
||||
const provider = getProvider(customConfig.stt);
|
||||
const provider = getProvider(customConfig.speech.stt);
|
||||
|
||||
let [url, data, headers] = [];
|
||||
|
||||
|
|
|
|||
|
|
@ -191,8 +191,8 @@ function localAIProvider(ttsSchema, input, voice) {
|
|||
* @returns {Promise<[string, TProviderSchema]>}
|
||||
*/
|
||||
async function getProviderSchema(customConfig) {
|
||||
const provider = getProvider(customConfig.tts);
|
||||
return [provider, customConfig.tts[provider]];
|
||||
const provider = getProvider(customConfig.speech.tts);
|
||||
return [provider, customConfig.speech.tts[provider]];
|
||||
}
|
||||
|
||||
/**
|
||||
|
|
|
|||
|
|
@ -1,31 +0,0 @@
|
|||
let token = '';
|
||||
|
||||
function updateTokenWebsocket(newToken) {
|
||||
console.log('Token:', newToken);
|
||||
token = newToken;
|
||||
}
|
||||
|
||||
function sendTextToWebsocket(ws, onDataReceived) {
|
||||
if (token === '[DONE]') {
|
||||
ws.send(' ');
|
||||
return;
|
||||
}
|
||||
|
||||
if (ws.readyState === WebSocket.OPEN) {
|
||||
ws.send(token);
|
||||
|
||||
ws.onmessage = function (event) {
|
||||
console.log('Received:', event.data);
|
||||
if (onDataReceived) {
|
||||
onDataReceived(event.data); // Pass the received data to the callback function
|
||||
}
|
||||
};
|
||||
} else {
|
||||
console.error('WebSocket is not open. Ready state is: ' + ws.readyState);
|
||||
}
|
||||
}
|
||||
|
||||
module.exports = {
|
||||
updateTokenWebsocket,
|
||||
sendTextToWebsocket,
|
||||
};
|
||||
Loading…
Add table
Add a link
Reference in a new issue