From 1aad315de65f6316d5aba705a1fe6769410035db Mon Sep 17 00:00:00 2001
From: Marco Beretta <81851188+berry-13@users.noreply.github.com>
Date: Fri, 5 Jul 2024 17:13:34 +0300
Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=A4=20feat:=20add=20custom=20speech=20?=
 =?UTF-8?q?config,=20browser=20TTS/STT=20features,=20and=20dynamic=20speec?=
 =?UTF-8?q?h=20tab=20settings=20(#2921)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat: update useTextToSpeech and useSpeechToText hooks to support external audio endpoints

This commit updates the useTextToSpeech and useSpeechToText hooks in the Input directory to support external audio endpoints. It introduces the useGetExternalTextToSpeech and useGetExternalSpeechToText hooks, which determine whether the audio endpoints should be set to 'browser' or 'external' based on the value of the endpointTTS and endpointSTT Recoil states. The useTextToSpeech and useSpeechToText hooks now use these new hooks to determine whether to use external audio endpoints.

* feat: add userSelect style to ConversationModeSwitch label

* fix: remove unused updateTokenWebsocket function and import

The updateTokenWebsocket function and its import are no longer used in the OpenAIClient module. This commit removes the function and import to clean up the codebase.

* feat: support external audio endpoints in useTextToSpeech and useSpeechToText hooks

This commit updates the useTextToSpeech and useSpeechToText hooks in the Input directory to support external audio endpoints. It introduces the useGetExternalTextToSpeech and useGetExternalSpeechToText hooks, which determine whether the audio endpoints should be set to 'browser' or 'external' based on the value of the endpointTTS and endpointSTT Recoil states. The useTextToSpeech and useSpeechToText hooks now use these new hooks to determine whether to use external audio endpoints.

* feat: update AutomaticPlayback component to AutomaticPlaybackSwitch; tests: added AutomaticPlaybackSwitch.spec

This commit renames the AutomaticPlayback component to AutomaticPlaybackSwitch in the Speech directory. The new name better reflects the purpose of the component and aligns with the naming convention used in the codebase.

* feat: update useSpeechToText hook to include interimTranscript

This commit updates the useSpeechToText usage in client/src/components/Chat/Input/AudioRecorder.tsx to include the interimTranscript state. This allows the speech-to-text transcription to be displayed in real time while the user is still speaking. The interimTranscript is now used to update the text area value during recording.

* feat: Add customConfigSpeech API endpoint for retrieving custom speech configuration

This commit adds a new API endpoint in customConfigSpeech.js under the api/server/routes/files/speech directory. This endpoint is responsible for retrieving the custom speech configuration using the getCustomConfigSpeech function from the ~/server/services/Files/Audio module.

* feat: update store var and ; fix: getCustomConfigSpeech

* fix: client tests, removed unused import

* feat: Update useCustomConfigSpeechQuery to return an array of custom speech configurations

This commit modifies the useCustomConfigSpeechQuery function in client/src/data-provider/queries.ts to return an array of custom speech configurations instead of a single object. This change allows for better handling and manipulation of the data in the application.
* feat: Update useCustomConfigSpeechQuery to return an array of custom speech configurations

* refactor: Update variable name in speechTab schema

* refactor: removed unused and nested code

* fix: using recoilState

* refactor: Update Speech component to use useCallback for setting settings

* fix: test

* fix: tests

* feature: ensure that the settings don't change after modifying them through the UI

* remove comment

* fix: Handle error gracefully in getCustomConfigSpeech and getVoices endpoints

* fix: Handle error

* fix: backend tests

* fix: invalid custom config logging

* chore: add back custom config info logging

* chore: revert loadCustomConfig spec

---------

Co-authored-by: Danny Avila
---
 api/app/clients/OpenAIClient.js | 2 -
 api/server/routes/files/index.js | 19 +---
 .../routes/files/speech/customConfigSpeech.js | 10 ++
 api/server/routes/files/speech/index.js | 17 +++
 api/server/routes/files/{ => speech}/stt.js | 0
 api/server/routes/files/{ => speech}/tts.js | 0
 .../services/Config/loadCustomConfig.js | 24 +++-
 .../Files/Audio/getCustomConfigSpeech.js | 50 ++++++++
 api/server/services/Files/Audio/getVoices.js | 8 +-
 api/server/services/Files/Audio/index.js | 4 +-
 .../services/Files/Audio/speechToText.js | 4 +-
 .../services/Files/Audio/textToSpeech.js | 4 +-
 api/server/services/Files/Audio/webSocket.js | 31 -----
 .../components/Chat/Input/AudioRecorder.tsx | 19 +++-
 client/src/components/Chat/Input/ChatForm.tsx | 4 +-
 .../Chat/Input/Files/Table/DataTable.tsx | 2 +-
 .../components/Chat/Messages/HoverButtons.tsx | 2 +-
 .../Files/FileList/DataTableFile.tsx | 2 +-
 .../Files/FileList/FileTableColumns.tsx | 25 ++--
 .../components/Messages/ScrollToBottom.tsx | 2 +-
 .../Speech/ConversationModeSwitch.tsx | 14 +--
 .../Speech/STT/AutoSendTextSwitch.tsx | 2 +-
 .../Speech/STT/AutoTranscribeAudioSwitch.tsx | 2 +-
 .../Speech/STT/DecibelSelector.tsx | 2 +-
 .../Speech/STT/EngineSTTDropdown.tsx | 6 +-
 .../Speech/STT/LanguageSTTDropdown.tsx | 107 ++++++++++++++++++
 .../Speech/STT/SpeechToTextSwitch.tsx | 2 +-
 .../Nav/SettingsTabs/Speech/STT/index.ts | 1 +
 .../Nav/SettingsTabs/Speech/Speech.tsx | 105 +++++++++++++++--
 ...ayback.tsx => AutomaticPlaybackSwitch.tsx} | 2 +-
 .../Speech/TTS/CacheTTSSwitch.tsx | 2 +-
 .../Speech/TTS/EngineTTSDropdown.tsx | 6 +-
 .../SettingsTabs/Speech/TTS/PlaybackRate.tsx | 2 +-
 .../Speech/TTS/TextToSpeechSwitch.tsx | 2 +-
 .../AutomaticPlaybackSwitch.spec.tsx | 38 +++++++
 .../Nav/SettingsTabs/Speech/TTS/index.ts | 2 +-
 client/src/data-provider/queries.ts | 7 ++
 client/src/hooks/Input/index.ts | 1 +
 .../src/hooks/Input/useGetAudioSettings.tsx | 19 ++++
 client/src/hooks/Input/useSpeechToText.ts | 28 ++---
 .../src/hooks/Input/useSpeechToTextBrowser.ts | 63 ++++++++---
 .../hooks/Input/useSpeechToTextExternal.ts | 7 +-
 client/src/hooks/Input/useTextToSpeech.ts | 12 +-
 client/src/localization/languages/Eng.ts | 2 +-
 client/src/store/settings.ts | 49 +++++---
 packages/data-provider/src/api-endpoints.ts | 8 +-
 packages/data-provider/src/config.ts | 44 ++++++-
 packages/data-provider/src/data-service.ts | 4 +
 packages/data-provider/src/keys.ts | 1 +
 packages/data-provider/src/types/files.ts | 8 ++
 50 files changed, 598 insertions(+), 179 deletions(-)
 create mode 100644 api/server/routes/files/speech/customConfigSpeech.js
 create mode 100644 api/server/routes/files/speech/index.js
 rename api/server/routes/files/{ => speech}/stt.js (100%)
 rename api/server/routes/files/{ => speech}/tts.js (100%)
 create mode
100644 api/server/services/Files/Audio/getCustomConfigSpeech.js delete mode 100644 api/server/services/Files/Audio/webSocket.js create mode 100644 client/src/components/Nav/SettingsTabs/Speech/STT/LanguageSTTDropdown.tsx rename client/src/components/Nav/SettingsTabs/Speech/TTS/{AutomaticPlayback.tsx => AutomaticPlaybackSwitch.tsx} (94%) create mode 100644 client/src/components/Nav/SettingsTabs/Speech/TTS/__tests__/AutomaticPlaybackSwitch.spec.tsx create mode 100644 client/src/hooks/Input/useGetAudioSettings.tsx diff --git a/api/app/clients/OpenAIClient.js b/api/app/clients/OpenAIClient.js index ced2387bd5..7520cbb897 100644 --- a/api/app/clients/OpenAIClient.js +++ b/api/app/clients/OpenAIClient.js @@ -27,7 +27,6 @@ const { createContextHandlers, } = require('./prompts'); const { encodeAndFormat } = require('~/server/services/Files/images/encode'); -const { updateTokenWebsocket } = require('~/server/services/Files/Audio'); const { isEnabled, sleep } = require('~/server/utils'); const { handleOpenAIErrors } = require('./tools/util'); const spendTokens = require('~/models/spendTokens'); @@ -595,7 +594,6 @@ class OpenAIClient extends BaseClient { payload, (progressMessage) => { if (progressMessage === '[DONE]') { - updateTokenWebsocket('[DONE]'); return; } diff --git a/api/server/routes/files/index.js b/api/server/routes/files/index.js index 2911ecb0b3..6317f4495f 100644 --- a/api/server/routes/files/index.js +++ b/api/server/routes/files/index.js @@ -1,19 +1,11 @@ const express = require('express'); -const { - uaParser, - checkBan, - requireJwtAuth, - createFileLimiters, - createTTSLimiters, - createSTTLimiters, -} = require('~/server/middleware'); +const { uaParser, checkBan, requireJwtAuth, createFileLimiters } = require('~/server/middleware'); const { createMulterInstance } = require('./multer'); const files = require('./files'); const images = require('./images'); const avatar = require('./avatar'); -const stt = require('./stt'); -const tts = require('./tts'); +const speech = require('./speech'); const initialize = async () => { const router = express.Router(); @@ -21,11 +13,8 @@ const initialize = async () => { router.use(checkBan); router.use(uaParser); - /* Important: stt/tts routes must be added before the upload limiters */ - const { sttIpLimiter, sttUserLimiter } = createSTTLimiters(); - const { ttsIpLimiter, ttsUserLimiter } = createTTSLimiters(); - router.use('/stt', sttIpLimiter, sttUserLimiter, stt); - router.use('/tts', ttsIpLimiter, ttsUserLimiter, tts); + /* Important: speech route must be added before the upload limiters */ + router.use('/speech', speech); const upload = await createMulterInstance(); const { fileUploadIpLimiter, fileUploadUserLimiter } = createFileLimiters(); diff --git a/api/server/routes/files/speech/customConfigSpeech.js b/api/server/routes/files/speech/customConfigSpeech.js new file mode 100644 index 0000000000..c3b1e2eb47 --- /dev/null +++ b/api/server/routes/files/speech/customConfigSpeech.js @@ -0,0 +1,10 @@ +const express = require('express'); +const router = express.Router(); + +const { getCustomConfigSpeech } = require('~/server/services/Files/Audio'); + +router.get('/get', async (req, res) => { + await getCustomConfigSpeech(req, res); +}); + +module.exports = router; diff --git a/api/server/routes/files/speech/index.js b/api/server/routes/files/speech/index.js new file mode 100644 index 0000000000..074ed553c9 --- /dev/null +++ b/api/server/routes/files/speech/index.js @@ -0,0 +1,17 @@ +const express = require('express'); +const { 
createTTSLimiters, createSTTLimiters } = require('~/server/middleware'); + +const stt = require('./stt'); +const tts = require('./tts'); +const customConfigSpeech = require('./customConfigSpeech'); + +const router = express.Router(); + +const { sttIpLimiter, sttUserLimiter } = createSTTLimiters(); +const { ttsIpLimiter, ttsUserLimiter } = createTTSLimiters(); +router.use('/stt', sttIpLimiter, sttUserLimiter, stt); +router.use('/tts', ttsIpLimiter, ttsUserLimiter, tts); + +router.use('/config', customConfigSpeech); + +module.exports = router; diff --git a/api/server/routes/files/stt.js b/api/server/routes/files/speech/stt.js similarity index 100% rename from api/server/routes/files/stt.js rename to api/server/routes/files/speech/stt.js diff --git a/api/server/routes/files/tts.js b/api/server/routes/files/speech/tts.js similarity index 100% rename from api/server/routes/files/tts.js rename to api/server/routes/files/speech/tts.js diff --git a/api/server/services/Config/loadCustomConfig.js b/api/server/services/Config/loadCustomConfig.js index 1b5b287066..2127ec239e 100644 --- a/api/server/services/Config/loadCustomConfig.js +++ b/api/server/services/Config/loadCustomConfig.js @@ -76,8 +76,28 @@ Please specify a correct \`imageOutputType\` value (case-sensitive). ); } if (!result.success) { - i === 0 && logger.error(`Invalid custom config file at ${configPath}`, result.error); - i === 0 && i++; + let errorMessage = `Invalid custom config file at ${configPath}: +${JSON.stringify(result.error, null, 2)}`; + + if (i === 0) { + logger.error(errorMessage); + const speechError = result.error.errors.find( + (err) => + err.code === 'unrecognized_keys' && + (err.message?.includes('stt') || err.message?.includes('tts')), + ); + + if (speechError) { + logger.warn(` +The Speech-to-text and Text-to-speech configuration format has recently changed. 
+If you're getting this error, please refer to the latest documentation: + +https://www.librechat.ai/docs/configuration/stt_tts`); + } + + i++; + } + return null; } else { logger.info('Custom config file loaded:'); diff --git a/api/server/services/Files/Audio/getCustomConfigSpeech.js b/api/server/services/Files/Audio/getCustomConfigSpeech.js new file mode 100644 index 0000000000..e9d185af2e --- /dev/null +++ b/api/server/services/Files/Audio/getCustomConfigSpeech.js @@ -0,0 +1,50 @@ +const getCustomConfig = require('~/server/services/Config/getCustomConfig'); + +/** + * This function retrieves the speechTab settings from the custom configuration + * It first fetches the custom configuration + * Then, it checks if the custom configuration and the speechTab schema exist + * If they do, it sends the speechTab settings as a JSON response + * If they don't, it throws an error + * + * @param {Object} req - The request object + * @param {Object} res - The response object + * @returns {Promise} + * @throws {Error} - If the custom configuration or the speechTab schema is missing, an error is thrown + */ +async function getCustomConfigSpeech(req, res) { + try { + const customConfig = await getCustomConfig(); + + if (!customConfig || !customConfig.speech?.speechTab) { + throw new Error('Configuration or speechTab schema is missing'); + } + + const ttsSchema = customConfig.speech?.speechTab; + let settings = {}; + + if (ttsSchema.advancedMode !== undefined) { + settings.advancedMode = ttsSchema.advancedMode; + } + if (ttsSchema.speechToText) { + for (const key in ttsSchema.speechToText) { + if (ttsSchema.speechToText[key] !== undefined) { + settings[key] = ttsSchema.speechToText[key]; + } + } + } + if (ttsSchema.textToSpeech) { + for (const key in ttsSchema.textToSpeech) { + if (ttsSchema.textToSpeech[key] !== undefined) { + settings[key] = ttsSchema.textToSpeech[key]; + } + } + } + + res.json(settings); + } catch (error) { + res.status(200).send(); + } +} + +module.exports = getCustomConfigSpeech; diff --git a/api/server/services/Files/Audio/getVoices.js b/api/server/services/Files/Audio/getVoices.js index b87cd363b2..56341cd2b0 100644 --- a/api/server/services/Files/Audio/getVoices.js +++ b/api/server/services/Files/Audio/getVoices.js @@ -1,4 +1,3 @@ -const { logger } = require('~/config'); const getCustomConfig = require('~/server/services/Config/getCustomConfig'); const { getProvider } = require('./textToSpeech'); @@ -16,11 +15,11 @@ async function getVoices(req, res) { try { const customConfig = await getCustomConfig(); - if (!customConfig || !customConfig?.tts) { + if (!customConfig || !customConfig?.speech?.tts) { throw new Error('Configuration or TTS schema is missing'); } - const ttsSchema = customConfig?.tts; + const ttsSchema = customConfig?.speech?.tts; const provider = getProvider(ttsSchema); let voices; @@ -40,8 +39,7 @@ async function getVoices(req, res) { res.json(voices); } catch (error) { - logger.error(`Failed to get voices: ${error.message}`); - res.status(500).json({ error: 'Failed to get voices' }); + res.status(500).json({ error: `Failed to get voices: ${error.message}` }); } } diff --git a/api/server/services/Files/Audio/index.js b/api/server/services/Files/Audio/index.js index a201ea556c..75882f2397 100644 --- a/api/server/services/Files/Audio/index.js +++ b/api/server/services/Files/Audio/index.js @@ -1,11 +1,11 @@ const getVoices = require('./getVoices'); +const getCustomConfigSpeech = require('./getCustomConfigSpeech'); const textToSpeech = require('./textToSpeech'); const 
speechToText = require('./speechToText'); -const { updateTokenWebsocket } = require('./webSocket'); module.exports = { getVoices, + getCustomConfigSpeech, speechToText, ...textToSpeech, - updateTokenWebsocket, }; diff --git a/api/server/services/Files/Audio/speechToText.js b/api/server/services/Files/Audio/speechToText.js index 96e70b76fe..7e0d2a2145 100644 --- a/api/server/services/Files/Audio/speechToText.js +++ b/api/server/services/Files/Audio/speechToText.js @@ -25,7 +25,7 @@ async function handleResponse(response) { } function getProvider(sttSchema) { - if (sttSchema.openai) { + if (sttSchema?.openai) { return 'openai'; } @@ -176,7 +176,7 @@ async function speechToText(req, res) { const audioReadStream = Readable.from(audioBuffer); audioReadStream.path = 'audio.wav'; - const provider = getProvider(customConfig.stt); + const provider = getProvider(customConfig.speech.stt); let [url, data, headers] = []; diff --git a/api/server/services/Files/Audio/textToSpeech.js b/api/server/services/Files/Audio/textToSpeech.js index 7778faabeb..49a0d4e2e7 100644 --- a/api/server/services/Files/Audio/textToSpeech.js +++ b/api/server/services/Files/Audio/textToSpeech.js @@ -191,8 +191,8 @@ function localAIProvider(ttsSchema, input, voice) { * @returns {Promise<[string, TProviderSchema]>} */ async function getProviderSchema(customConfig) { - const provider = getProvider(customConfig.tts); - return [provider, customConfig.tts[provider]]; + const provider = getProvider(customConfig.speech.tts); + return [provider, customConfig.speech.tts[provider]]; } /** diff --git a/api/server/services/Files/Audio/webSocket.js b/api/server/services/Files/Audio/webSocket.js deleted file mode 100644 index f2d96c7941..0000000000 --- a/api/server/services/Files/Audio/webSocket.js +++ /dev/null @@ -1,31 +0,0 @@ -let token = ''; - -function updateTokenWebsocket(newToken) { - console.log('Token:', newToken); - token = newToken; -} - -function sendTextToWebsocket(ws, onDataReceived) { - if (token === '[DONE]') { - ws.send(' '); - return; - } - - if (ws.readyState === WebSocket.OPEN) { - ws.send(token); - - ws.onmessage = function (event) { - console.log('Received:', event.data); - if (onDataReceived) { - onDataReceived(event.data); // Pass the received data to the callback function - } - }; - } else { - console.error('WebSocket is not open. 
Ready state is: ' + ws.readyState); - } -} - -module.exports = { - updateTokenWebsocket, - sendTextToWebsocket, -}; diff --git a/client/src/components/Chat/Input/AudioRecorder.tsx b/client/src/components/Chat/Input/AudioRecorder.tsx index 48d89c2c3f..dd088ea3c8 100644 --- a/client/src/components/Chat/Input/AudioRecorder.tsx +++ b/client/src/components/Chat/Input/AudioRecorder.tsx @@ -31,15 +31,26 @@ export default function AudioRecorder({ } }; - const { isListening, isLoading, startRecording, stopRecording, speechText, clearText } = - useSpeechToText(handleTranscriptionComplete); + const { + isListening, + isLoading, + startRecording, + stopRecording, + interimTranscript, + speechText, + clearText, + } = useSpeechToText(handleTranscriptionComplete); useEffect(() => { - if (textAreaRef.current) { + if (isListening && textAreaRef.current) { + methods.setValue('text', interimTranscript, { + shouldValidate: true, + }); + } else if (textAreaRef.current) { textAreaRef.current.value = speechText; methods.setValue('text', speechText, { shouldValidate: true }); } - }, [speechText, methods, textAreaRef]); + }, [interimTranscript, speechText, methods, textAreaRef]); const handleStartRecording = async () => { await startRecording(); diff --git a/client/src/components/Chat/Input/ChatForm.tsx b/client/src/components/Chat/Input/ChatForm.tsx index 63ac15b01b..2ad4580eea 100644 --- a/client/src/components/Chat/Input/ChatForm.tsx +++ b/client/src/components/Chat/Input/ChatForm.tsx @@ -38,8 +38,8 @@ const ChatForm = ({ index = 0 }) => { const submitButtonRef = useRef(null); const textAreaRef = useRef(null); - const SpeechToText = useRecoilValue(store.SpeechToText); - const TextToSpeech = useRecoilValue(store.TextToSpeech); + const SpeechToText = useRecoilValue(store.speechToText); + const TextToSpeech = useRecoilValue(store.textToSpeech); const automaticPlayback = useRecoilValue(store.automaticPlayback); const [showStopButton, setShowStopButton] = useRecoilState(store.showStopButtonByIndex(index)); diff --git a/client/src/components/Chat/Input/Files/Table/DataTable.tsx b/client/src/components/Chat/Input/Files/Table/DataTable.tsx index a61a41ed0e..c80cb33191 100644 --- a/client/src/components/Chat/Input/Files/Table/DataTable.tsx +++ b/client/src/components/Chat/Input/Files/Table/DataTable.tsx @@ -96,7 +96,7 @@ export default function DataTable({ columns, data }: DataTablePro deleteFiles({ files: filesToDelete as TFile[] }); setRowSelection({}); }} - className="dark:hover:bg-gray-850/25 ml-1 gap-2 sm:ml-0" + className="ml-1 gap-2 dark:hover:bg-gray-850/25 sm:ml-0" disabled={!table.getFilteredSelectedRowModel().rows.length || isDeleting} > {isDeleting ? ( diff --git a/client/src/components/Chat/Messages/HoverButtons.tsx b/client/src/components/Chat/Messages/HoverButtons.tsx index 3c3ad97890..163d5e9765 100644 --- a/client/src/components/Chat/Messages/HoverButtons.tsx +++ b/client/src/components/Chat/Messages/HoverButtons.tsx @@ -39,7 +39,7 @@ export default function HoverButtons({ const { endpoint: _endpoint, endpointType } = conversation ?? {}; const endpoint = endpointType ?? 
_endpoint; const [isCopied, setIsCopied] = useState(false); - const [TextToSpeech] = useRecoilState(store.TextToSpeech); + const [TextToSpeech] = useRecoilState(store.textToSpeech); const { hideEditButton, diff --git a/client/src/components/Files/FileList/DataTableFile.tsx b/client/src/components/Files/FileList/DataTableFile.tsx index 50cb855ae9..92e454016d 100644 --- a/client/src/components/Files/FileList/DataTableFile.tsx +++ b/client/src/components/Files/FileList/DataTableFile.tsx @@ -106,7 +106,7 @@ export default function DataTableFile({ deleteFiles({ files: filesToDelete as TFile[] }); setRowSelection({}); }} - className="dark:hover:bg-gray-850/25 ml-1 gap-2 sm:ml-0" + className="ml-1 gap-2 dark:hover:bg-gray-850/25 sm:ml-0" disabled={!table.getFilteredSelectedRowModel().rows.length || isDeleting} > {isDeleting ? ( diff --git a/client/src/components/Files/FileList/FileTableColumns.tsx b/client/src/components/Files/FileList/FileTableColumns.tsx index 6421cd8ec6..8e670aa805 100644 --- a/client/src/components/Files/FileList/FileTableColumns.tsx +++ b/client/src/components/Files/FileList/FileTableColumns.tsx @@ -75,18 +75,21 @@ export const fileTableColumns: ColumnDef[] = [ return ( <> {attachedVectorStores.map((vectorStore, index) => { - if (index === 4) - {return ( - - + if (index === 4) { + return ( + +   - {attachedVectorStores.length - index} more - - );} - if (index > 4) {return null;} + {attachedVectorStores.length - index} more + + ); + } + if (index > 4) { + return null; + } return ( {vectorStore.name} diff --git a/client/src/components/Messages/ScrollToBottom.tsx b/client/src/components/Messages/ScrollToBottom.tsx index 01434f5ecb..b5eef83735 100644 --- a/client/src/components/Messages/ScrollToBottom.tsx +++ b/client/src/components/Messages/ScrollToBottom.tsx @@ -8,7 +8,7 @@ export default function ScrollToBottom({ scrollHandler }: Props) { return (
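
For reference, a minimal sketch of how a client could call the new speech settings route added by this patch, assuming the files router is mounted at /api/files (so the GET route in api/server/routes/files/speech/customConfigSpeech.js would resolve to /api/files/speech/config/get). The fetchSpeechSettings helper below is hypothetical and only illustrates the endpoint's shape; the actual client consumes it through the data-provider's useCustomConfigSpeechQuery described in the commit message.

// Hypothetical helper, not part of the patch. Assumes the files router is mounted
// at `/api/files`, so the route added above resolves to GET /api/files/speech/config/get.
export async function fetchSpeechSettings(): Promise<Record<string, unknown>> {
  const res = await fetch('/api/files/speech/config/get', { credentials: 'include' });
  if (!res.ok) {
    throw new Error(`Failed to fetch speech settings: ${res.status}`);
  }
  // getCustomConfigSpeech responds with a flat object of speechTab settings,
  // or an empty 200 body when no custom speech configuration is present.
  const text = await res.text();
  return text ? (JSON.parse(text) as Record<string, unknown>) : {};
}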