🎤 feat: add custom speech config, browser TTS/STT features, and dynamic speech tab settings (#2921)

* feat: update useTextToSpeech and useSpeechToText hooks to support external audio endpoints

This commit updates the useTextToSpeech and useSpeechToText hooks in the Input directory to support external audio endpoints. It introduces the useGetExternalTextToSpeech and useGetExternalSpeechToText hooks, which determine whether the audio endpoints should be set to 'browser' or 'external' based on the values of the endpointTTS and endpointSTT Recoil states. The useTextToSpeech and useSpeechToText hooks now use these new hooks to determine whether to use external audio endpoints.
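
A minimal sketch of the pattern described above (at this point in the branch the Recoil atoms were still named endpointSTT/endpointTTS; later commits in this PR rename them to engineSTT/engineTTS and consolidate this logic into useGetAudioSettings, shown further down):

import { useRecoilValue } from 'recoil';
import store from '~/store';

// Sketch only: report whether speech-to-text should use an
// external endpoint instead of the browser's built-in engine.
const useGetExternalSpeechToText = (): boolean => {
  const endpointSTT = useRecoilValue<string>(store.endpointSTT);
  return endpointSTT === 'external';
};

export default useGetExternalSpeechToText;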

* feat: add userSelect style to ConversationModeSwitch label

* fix: remove unused updateTokenWebsocket function and import

The updateTokenWebsocket function and its import are no longer used in the OpenAIClient module. This commit removes the function and import to clean up the codebase.

* feat: support external audio endpoints in useTextToSpeech and useSpeechToText hooks

* feat: update AutomaticPlayback component to AutomaticPlaybackSwitch; tests: added AutomaticPlaybackSwitch.spec

This commit renames the AutomaticPlayback component to AutomaticPlaybackSwitch in the Speech directory. The new name better reflects the purpose of the component and aligns with the naming convention used in the codebase.

* feat: update useSpeechToText hook to include interimTranscript

This commit updates the useSpeechToText hook usage in client/src/components/Chat/Input/AudioRecorder.tsx to include the interimTranscript state, allowing real-time display of the speech-to-text transcription while the user is still speaking. The interimTranscript now updates the text area value during recording.
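
A condensed sketch of the interim/final split this enables (not the actual AudioRecorder code, which appears in the diffs below): the text area mirrors interimTranscript while listening and commits the final transcript once recording stops.

import React, { useEffect, useState } from 'react';

// Illustrative component: live partial results while speaking,
// final text after recording ends.
function TranscriptPreview({
  isListening,
  interimTranscript,
  speechText,
}: {
  isListening: boolean;
  interimTranscript: string;
  speechText: string;
}) {
  const [text, setText] = useState('');
  useEffect(() => {
    setText(isListening ? interimTranscript : speechText);
  }, [isListening, interimTranscript, speechText]);
  return <textarea value={text} readOnly />;
}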

* feat: Add customConfigSpeech API endpoint for retrieving custom speech configuration

This commit adds a new API endpoint in the customConfigSpeech route file under the speech routes directory. The endpoint retrieves the custom speech configuration using the getCustomConfigSpeech function from the ~/server/services/Files/Audio module.
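
Combining the route mounts below ('/speech' on the files router, '/config' on the speech router, '/get' on this route), the endpoint resolves to GET /api/files/speech/config/get (the /api/files prefix is an assumption based on where the files router is typically mounted). A client-side sketch:

// Hypothetical fetch helper. The handler replies with an empty
// 200 body when no speechTab configuration exists, so guard the parse.
async function fetchSpeechTabSettings(): Promise<Record<string, unknown>> {
  const res = await fetch('/api/files/speech/config/get');
  const body = await res.text();
  return body ? JSON.parse(body) : {};
}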

* feat: update speech-related store variable names; fix: getCustomConfigSpeech

* fix: client tests, removed unused import

* feat: Update useCustomConfigSpeechQuery to return an array of custom speech configurations

This commit modifies the useCustomConfigSpeechQuery function in client/src/data-provider/queries.ts to return an array of custom speech configurations instead of a single object, allowing for better handling and manipulation of the data in the application.

* refactor: Update variable name in speechTab schema

* refactor: removed unused and nested code

* fix: using recoilState

* refactor: Update Speech component to use useCallback for setting settings

* fix: test

* fix: tests

* feat: ensure that the settings don't change after modifying them through the UI

* remove comment

* fix: Handle error gracefully in getCustomConfigSpeech and getVoices endpoints

* fix: Handle error

* fix: backend tests

* fix: invalid custom config logging

* chore: add back custom config info logging

* chore: revert loadCustomConfig spec

---------

Co-authored-by: Danny Avila <danny@librechat.ai>
Marco Beretta 2024-07-05 17:13:34 +03:00 committed by GitHub
parent 5d985746cb
commit 1aad315de6
50 changed files with 598 additions and 179 deletions

View file

@@ -27,7 +27,6 @@ const {
createContextHandlers,
} = require('./prompts');
const { encodeAndFormat } = require('~/server/services/Files/images/encode');
const { updateTokenWebsocket } = require('~/server/services/Files/Audio');
const { isEnabled, sleep } = require('~/server/utils');
const { handleOpenAIErrors } = require('./tools/util');
const spendTokens = require('~/models/spendTokens');
@@ -595,7 +594,6 @@ class OpenAIClient extends BaseClient {
payload,
(progressMessage) => {
if (progressMessage === '[DONE]') {
updateTokenWebsocket('[DONE]');
return;
}

View file

@@ -1,19 +1,11 @@
const express = require('express');
const {
uaParser,
checkBan,
requireJwtAuth,
createFileLimiters,
createTTSLimiters,
createSTTLimiters,
} = require('~/server/middleware');
const { uaParser, checkBan, requireJwtAuth, createFileLimiters } = require('~/server/middleware');
const { createMulterInstance } = require('./multer');
const files = require('./files');
const images = require('./images');
const avatar = require('./avatar');
const stt = require('./stt');
const tts = require('./tts');
const speech = require('./speech');
const initialize = async () => {
const router = express.Router();
@@ -21,11 +13,8 @@ const initialize = async () => {
router.use(checkBan);
router.use(uaParser);
/* Important: stt/tts routes must be added before the upload limiters */
const { sttIpLimiter, sttUserLimiter } = createSTTLimiters();
const { ttsIpLimiter, ttsUserLimiter } = createTTSLimiters();
router.use('/stt', sttIpLimiter, sttUserLimiter, stt);
router.use('/tts', ttsIpLimiter, ttsUserLimiter, tts);
/* Important: speech route must be added before the upload limiters */
router.use('/speech', speech);
const upload = await createMulterInstance();
const { fileUploadIpLimiter, fileUploadUserLimiter } = createFileLimiters();

View file

@@ -0,0 +1,10 @@
const express = require('express');
const router = express.Router();
const { getCustomConfigSpeech } = require('~/server/services/Files/Audio');
router.get('/get', async (req, res) => {
await getCustomConfigSpeech(req, res);
});
module.exports = router;

View file

@@ -0,0 +1,17 @@
const express = require('express');
const { createTTSLimiters, createSTTLimiters } = require('~/server/middleware');
const stt = require('./stt');
const tts = require('./tts');
const customConfigSpeech = require('./customConfigSpeech');
const router = express.Router();
const { sttIpLimiter, sttUserLimiter } = createSTTLimiters();
const { ttsIpLimiter, ttsUserLimiter } = createTTSLimiters();
router.use('/stt', sttIpLimiter, sttUserLimiter, stt);
router.use('/tts', ttsIpLimiter, ttsUserLimiter, tts);
router.use('/config', customConfigSpeech);
module.exports = router;

View file

@@ -76,8 +76,28 @@ Please specify a correct \`imageOutputType\` value (case-sensitive).
);
}
if (!result.success) {
i === 0 && logger.error(`Invalid custom config file at ${configPath}`, result.error);
i === 0 && i++;
let errorMessage = `Invalid custom config file at ${configPath}:
${JSON.stringify(result.error, null, 2)}`;
if (i === 0) {
logger.error(errorMessage);
const speechError = result.error.errors.find(
(err) =>
err.code === 'unrecognized_keys' &&
(err.message?.includes('stt') || err.message?.includes('tts')),
);
if (speechError) {
logger.warn(`
The Speech-to-text and Text-to-speech configuration format has recently changed.
If you're getting this error, please refer to the latest documentation:
https://www.librechat.ai/docs/configuration/stt_tts`);
}
i++;
}
return null;
} else {
logger.info('Custom config file loaded:');
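
The warning above exists because the stt/tts blocks moved from the top level of the custom config file to a new speech section. A sketch of the new shape as a TypeScript object, mirroring the configSchema changes later in this commit (the openai provider key and all values are illustrative):

// Old: top-level `tts`/`stt` keys.
// New (this PR): everything nested under `speech`.
const speechConfig = {
  speech: {
    tts: { openai: { /* provider settings */ } },
    stt: { openai: { /* provider settings */ } },
    speechTab: {
      advancedMode: false,
      speechToText: { engineSTT: 'external', autoSendText: true },
      textToSpeech: { engineTTS: 'browser', automaticPlayback: false },
    },
  },
};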

View file

@@ -0,0 +1,50 @@
const getCustomConfig = require('~/server/services/Config/getCustomConfig');
/**
* This function retrieves the speechTab settings from the custom configuration
* It first fetches the custom configuration
* Then, it checks if the custom configuration and the speechTab schema exist
* If they do, it sends the speechTab settings as a JSON response
* If they don't, it throws an error
*
* @param {Object} req - The request object
* @param {Object} res - The response object
* @returns {Promise<void>}
* @throws {Error} - If the custom configuration or the speechTab schema is missing, an error is thrown
*/
async function getCustomConfigSpeech(req, res) {
try {
const customConfig = await getCustomConfig();
if (!customConfig || !customConfig.speech?.speechTab) {
throw new Error('Configuration or speechTab schema is missing');
}
const ttsSchema = customConfig.speech?.speechTab;
let settings = {};
if (ttsSchema.advancedMode !== undefined) {
settings.advancedMode = ttsSchema.advancedMode;
}
if (ttsSchema.speechToText) {
for (const key in ttsSchema.speechToText) {
if (ttsSchema.speechToText[key] !== undefined) {
settings[key] = ttsSchema.speechToText[key];
}
}
}
if (ttsSchema.textToSpeech) {
for (const key in ttsSchema.textToSpeech) {
if (ttsSchema.textToSpeech[key] !== undefined) {
settings[key] = ttsSchema.textToSpeech[key];
}
}
}
res.json(settings);
} catch (error) {
res.status(200).send();
}
}
module.exports = getCustomConfigSpeech;
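
Note how the handler flattens speechTab.speechToText and speechTab.textToSpeech into a single level alongside advancedMode, so a typical response looks like this (values illustrative):

// Example flattened payload from /config/get:
const exampleSettings = {
  advancedMode: false,
  engineSTT: 'external',    // from speechTab.speechToText
  autoSendText: true,       // from speechTab.speechToText
  engineTTS: 'browser',     // from speechTab.textToSpeech
  automaticPlayback: false, // from speechTab.textToSpeech
};

Also note that the catch block replies with an empty 200 rather than an error status, so clients can treat a missing speechTab configuration as "no overrides".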

View file

@@ -1,4 +1,3 @@
const { logger } = require('~/config');
const getCustomConfig = require('~/server/services/Config/getCustomConfig');
const { getProvider } = require('./textToSpeech');
@@ -16,11 +15,11 @@ async function getVoices(req, res) {
try {
const customConfig = await getCustomConfig();
if (!customConfig || !customConfig?.tts) {
if (!customConfig || !customConfig?.speech?.tts) {
throw new Error('Configuration or TTS schema is missing');
}
const ttsSchema = customConfig?.tts;
const ttsSchema = customConfig?.speech?.tts;
const provider = getProvider(ttsSchema);
let voices;
@@ -40,8 +39,7 @@ async function getVoices(req, res) {
res.json(voices);
} catch (error) {
logger.error(`Failed to get voices: ${error.message}`);
res.status(500).json({ error: 'Failed to get voices' });
res.status(500).json({ error: `Failed to get voices: ${error.message}` });
}
}

View file

@@ -1,11 +1,11 @@
const getVoices = require('./getVoices');
const getCustomConfigSpeech = require('./getCustomConfigSpeech');
const textToSpeech = require('./textToSpeech');
const speechToText = require('./speechToText');
const { updateTokenWebsocket } = require('./webSocket');
module.exports = {
getVoices,
getCustomConfigSpeech,
speechToText,
...textToSpeech,
updateTokenWebsocket,
};

View file

@@ -25,7 +25,7 @@ async function handleResponse(response) {
}
function getProvider(sttSchema) {
if (sttSchema.openai) {
if (sttSchema?.openai) {
return 'openai';
}
@@ -176,7 +176,7 @@ async function speechToText(req, res) {
const audioReadStream = Readable.from(audioBuffer);
audioReadStream.path = 'audio.wav';
const provider = getProvider(customConfig.stt);
const provider = getProvider(customConfig.speech.stt);
let [url, data, headers] = [];

View file

@@ -191,8 +191,8 @@ function localAIProvider(ttsSchema, input, voice) {
* @returns {Promise<[string, TProviderSchema]>}
*/
async function getProviderSchema(customConfig) {
const provider = getProvider(customConfig.tts);
return [provider, customConfig.tts[provider]];
const provider = getProvider(customConfig.speech.tts);
return [provider, customConfig.speech.tts[provider]];
}
/**

View file

@@ -1,31 +0,0 @@
let token = '';
function updateTokenWebsocket(newToken) {
console.log('Token:', newToken);
token = newToken;
}
function sendTextToWebsocket(ws, onDataReceived) {
if (token === '[DONE]') {
ws.send(' ');
return;
}
if (ws.readyState === WebSocket.OPEN) {
ws.send(token);
ws.onmessage = function (event) {
console.log('Received:', event.data);
if (onDataReceived) {
onDataReceived(event.data); // Pass the received data to the callback function
}
};
} else {
console.error('WebSocket is not open. Ready state is: ' + ws.readyState);
}
}
module.exports = {
updateTokenWebsocket,
sendTextToWebsocket,
};

View file

@@ -31,15 +31,26 @@ export default function AudioRecorder({
}
};
const { isListening, isLoading, startRecording, stopRecording, speechText, clearText } =
useSpeechToText(handleTranscriptionComplete);
const {
isListening,
isLoading,
startRecording,
stopRecording,
interimTranscript,
speechText,
clearText,
} = useSpeechToText(handleTranscriptionComplete);
useEffect(() => {
if (textAreaRef.current) {
if (isListening && textAreaRef.current) {
methods.setValue('text', interimTranscript, {
shouldValidate: true,
});
} else if (textAreaRef.current) {
textAreaRef.current.value = speechText;
methods.setValue('text', speechText, { shouldValidate: true });
}
}, [speechText, methods, textAreaRef]);
}, [interimTranscript, speechText, methods, textAreaRef]);
const handleStartRecording = async () => {
await startRecording();

View file

@@ -38,8 +38,8 @@ const ChatForm = ({ index = 0 }) => {
const submitButtonRef = useRef<HTMLButtonElement>(null);
const textAreaRef = useRef<HTMLTextAreaElement | null>(null);
const SpeechToText = useRecoilValue(store.SpeechToText);
const TextToSpeech = useRecoilValue(store.TextToSpeech);
const SpeechToText = useRecoilValue(store.speechToText);
const TextToSpeech = useRecoilValue(store.textToSpeech);
const automaticPlayback = useRecoilValue(store.automaticPlayback);
const [showStopButton, setShowStopButton] = useRecoilState(store.showStopButtonByIndex(index));

View file

@@ -96,7 +96,7 @@ export default function DataTable<TData, TValue>({ columns, data }: DataTablePro
deleteFiles({ files: filesToDelete as TFile[] });
setRowSelection({});
}}
className="dark:hover:bg-gray-850/25 ml-1 gap-2 sm:ml-0"
className="ml-1 gap-2 dark:hover:bg-gray-850/25 sm:ml-0"
disabled={!table.getFilteredSelectedRowModel().rows.length || isDeleting}
>
{isDeleting ? (

View file

@@ -39,7 +39,7 @@ export default function HoverButtons({
const { endpoint: _endpoint, endpointType } = conversation ?? {};
const endpoint = endpointType ?? _endpoint;
const [isCopied, setIsCopied] = useState(false);
const [TextToSpeech] = useRecoilState<boolean>(store.TextToSpeech);
const [TextToSpeech] = useRecoilState<boolean>(store.textToSpeech);
const {
hideEditButton,

View file

@@ -106,7 +106,7 @@ export default function DataTableFile<TData, TValue>({
deleteFiles({ files: filesToDelete as TFile[] });
setRowSelection({});
}}
className="dark:hover:bg-gray-850/25 ml-1 gap-2 sm:ml-0"
className="ml-1 gap-2 dark:hover:bg-gray-850/25 sm:ml-0"
disabled={!table.getFilteredSelectedRowModel().rows.length || isDeleting}
>
{isDeleting ? (

View file

@@ -75,8 +75,8 @@ export const fileTableColumns: ColumnDef<TFile>[] = [
return (
<>
{attachedVectorStores.map((vectorStore, index) => {
if (index === 4)
{return (
if (index === 4) {
return (
<span
key={index}
className="ml-2 mt-2 flex w-fit flex-row items-center rounded-full bg-[#f5f5f5] px-2 text-gray-500"
@@ -85,8 +85,11 @@ export const fileTableColumns: ColumnDef<TFile>[] = [
&nbsp;
{attachedVectorStores.length - index} more
</span>
);}
if (index > 4) {return null;}
);
}
if (index > 4) {
return null;
}
return (
<span key={index} className="ml-2 mt-2 rounded-full bg-[#f2f8ec] px-2 text-[#91c561]">
{vectorStore.name}

View file

@@ -8,7 +8,7 @@ export default function ScrollToBottom({ scrollHandler }: Props) {
return (
<button
onClick={scrollHandler}
className="dark:bg-gray-850/90 absolute bottom-5 right-1/2 cursor-pointer rounded-full border border-gray-200 bg-white bg-clip-padding text-gray-600 dark:border-white/10 dark:text-gray-200"
className="absolute bottom-5 right-1/2 cursor-pointer rounded-full border border-gray-200 bg-white bg-clip-padding text-gray-600 dark:border-white/10 dark:bg-gray-850/90 dark:text-gray-200"
>
<svg
width="24"

View file

@@ -10,18 +10,16 @@ export default function ConversationModeSwitch({
}) {
const localize = useLocalize();
const [conversationMode, setConversationMode] = useRecoilState<boolean>(store.conversationMode);
const [advancedMode] = useRecoilState<boolean>(store.advancedMode);
const [textToSpeech] = useRecoilState<boolean>(store.TextToSpeech);
const [speechToText] = useRecoilState<boolean>(store.speechToText);
const [textToSpeech] = useRecoilState<boolean>(store.textToSpeech);
const [, setAutoSendText] = useRecoilState<boolean>(store.autoSendText);
const [, setDecibelValue] = useRecoilState(store.decibelValue);
const [, setAutoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
const handleCheckedChange = (value: boolean) => {
if (!advancedMode) {
setAutoTranscribeAudio(value);
setAutoSendText(value);
setDecibelValue(-45);
}
setConversationMode(value);
if (onCheckedChange) {
onCheckedChange(value);
@@ -40,7 +38,7 @@ export default function ConversationModeSwitch({
onCheckedChange={handleCheckedChange}
className="ml-4"
data-testid="ConversationMode"
disabled={!textToSpeech}
disabled={!textToSpeech || !speechToText}
/>
</div>
</div>

View file

@@ -10,7 +10,7 @@ export default function AutoSendTextSwitch({
}) {
const localize = useLocalize();
const [autoSendText, setAutoSendText] = useRecoilState<boolean>(store.autoSendText);
const [SpeechToText] = useRecoilState<boolean>(store.SpeechToText);
const [SpeechToText] = useRecoilState<boolean>(store.speechToText);
const handleCheckedChange = (value: boolean) => {
setAutoSendText(value);

View file

@@ -12,7 +12,7 @@ export default function AutoTranscribeAudioSwitch({
const [autoTranscribeAudio, setAutoTranscribeAudio] = useRecoilState<boolean>(
store.autoTranscribeAudio,
);
const [speechToText] = useRecoilState<boolean>(store.SpeechToText);
const [speechToText] = useRecoilState<boolean>(store.speechToText);
const handleCheckedChange = (value: boolean) => {
setAutoTranscribeAudio(value);

View file

@@ -7,7 +7,7 @@ import { cn, defaultTextProps, optionText } from '~/utils/';
export default function DecibelSelector() {
const localize = useLocalize();
const speechToText = useRecoilValue(store.SpeechToText);
const speechToText = useRecoilValue(store.speechToText);
const [decibelValue, setDecibelValue] = useRecoilState(store.decibelValue);
return (

View file

@@ -5,21 +5,21 @@ import store from '~/store';
export default function EngineSTTDropdown() {
const localize = useLocalize();
const [endpointSTT, setEndpointSTT] = useRecoilState<string>(store.endpointSTT);
const [engineSTT, setEngineSTT] = useRecoilState<string>(store.engineSTT);
const endpointOptions = [
{ value: 'browser', display: localize('com_nav_browser') },
{ value: 'external', display: localize('com_nav_external') },
];
const handleSelect = (value: string) => {
setEndpointSTT(value);
setEngineSTT(value);
};
return (
<div className="flex items-center justify-between">
<div>{localize('com_nav_engine')}</div>
<Dropdown
value={endpointSTT}
value={engineSTT}
onChange={handleSelect}
options={endpointOptions}
width={180}

View file

@@ -0,0 +1,107 @@
import { useRecoilState } from 'recoil';
import { Dropdown } from '~/components/ui';
import { useLocalize } from '~/hooks';
import store from '~/store';
export default function LanguageSTTDropdown() {
const localize = useLocalize();
const [languageSTT, setLanguageSTT] = useRecoilState<string>(store.languageSTT);
const languageOptions = [
{ value: 'af', display: 'Afrikaans' },
{ value: 'eu', display: 'Basque' },
{ value: 'bg', display: 'Bulgarian' },
{ value: 'ca', display: 'Catalan' },
{ value: 'ar-EG', display: 'Arabic (Egypt)' },
{ value: 'ar-JO', display: 'Arabic (Jordan)' },
{ value: 'ar-KW', display: 'Arabic (Kuwait)' },
{ value: 'ar-LB', display: 'Arabic (Lebanon)' },
{ value: 'ar-QA', display: 'Arabic (Qatar)' },
{ value: 'ar-AE', display: 'Arabic (UAE)' },
{ value: 'ar-MA', display: 'Arabic (Morocco)' },
{ value: 'ar-IQ', display: 'Arabic (Iraq)' },
{ value: 'ar-DZ', display: 'Arabic (Algeria)' },
{ value: 'ar-BH', display: 'Arabic (Bahrain)' },
{ value: 'ar-LY', display: 'Arabic (Libya)' },
{ value: 'ar-OM', display: 'Arabic (Oman)' },
{ value: 'ar-SA', display: 'Arabic (Saudi Arabia)' },
{ value: 'ar-TN', display: 'Arabic (Tunisia)' },
{ value: 'ar-YE', display: 'Arabic (Yemen)' },
{ value: 'cs', display: 'Czech' },
{ value: 'nl-NL', display: 'Dutch' },
{ value: 'en-AU', display: 'English (Australia)' },
{ value: 'en-CA', display: 'English (Canada)' },
{ value: 'en-IN', display: 'English (India)' },
{ value: 'en-NZ', display: 'English (New Zealand)' },
{ value: 'en-ZA', display: 'English (South Africa)' },
{ value: 'en-GB', display: 'English (UK)' },
{ value: 'en-US', display: 'English (US)' },
{ value: 'fi', display: 'Finnish' },
{ value: 'fr-FR', display: 'French' },
{ value: 'gl', display: 'Galician' },
{ value: 'de-DE', display: 'German' },
{ value: 'el-GR', display: 'Greek' },
{ value: 'he', display: 'Hebrew' },
{ value: 'hu', display: 'Hungarian' },
{ value: 'is', display: 'Icelandic' },
{ value: 'it-IT', display: 'Italian' },
{ value: 'id', display: 'Indonesian' },
{ value: 'ja', display: 'Japanese' },
{ value: 'ko', display: 'Korean' },
{ value: 'la', display: 'Latin' },
{ value: 'zh-CN', display: 'Mandarin Chinese' },
{ value: 'zh-TW', display: 'Taiwanese' },
{ value: 'zh-HK', display: 'Cantonese' },
{ value: 'ms-MY', display: 'Malaysian' },
{ value: 'no-NO', display: 'Norwegian' },
{ value: 'pl', display: 'Polish' },
{ value: 'xx-piglatin', display: 'Pig Latin' },
{ value: 'pt-PT', display: 'Portuguese' },
{ value: 'pt-br', display: 'Portuguese (Brasil)' },
{ value: 'ro-RO', display: 'Romanian' },
{ value: 'ru', display: 'Russian' },
{ value: 'sr-SP', display: 'Serbian' },
{ value: 'sk', display: 'Slovak' },
{ value: 'es-AR', display: 'Spanish (Argentina)' },
{ value: 'es-BO', display: 'Spanish (Bolivia)' },
{ value: 'es-CL', display: 'Spanish (Chile)' },
{ value: 'es-CO', display: 'Spanish (Colombia)' },
{ value: 'es-CR', display: 'Spanish (Costa Rica)' },
{ value: 'es-DO', display: 'Spanish (Dominican Republic)' },
{ value: 'es-EC', display: 'Spanish (Ecuador)' },
{ value: 'es-SV', display: 'Spanish (El Salvador)' },
{ value: 'es-GT', display: 'Spanish (Guatemala)' },
{ value: 'es-HN', display: 'Spanish (Honduras)' },
{ value: 'es-MX', display: 'Spanish (Mexico)' },
{ value: 'es-NI', display: 'Spanish (Nicaragua)' },
{ value: 'es-PA', display: 'Spanish (Panama)' },
{ value: 'es-PY', display: 'Spanish (Paraguay)' },
{ value: 'es-PE', display: 'Spanish (Peru)' },
{ value: 'es-PR', display: 'Spanish (Puerto Rico)' },
{ value: 'es-ES', display: 'Spanish (Spain)' },
{ value: 'es-US', display: 'Spanish (US)' },
{ value: 'es-UY', display: 'Spanish (Uruguay)' },
{ value: 'es-VE', display: 'Spanish (Venezuela)' },
{ value: 'sv-SE', display: 'Swedish' },
{ value: 'tr', display: 'Turkish' },
{ value: 'zu', display: 'Zulu' },
];
const handleSelect = (value: string) => {
setLanguageSTT(value);
};
return (
<div className="flex items-center justify-between">
<div>{localize('com_nav_language')}</div>
<Dropdown
value={languageSTT}
onChange={handleSelect}
options={languageOptions}
width={220}
position={'left'}
testId="LanguageSTTDropdown"
/>
</div>
);
}

View file

@@ -9,7 +9,7 @@ export default function SpeechToTextSwitch({
onCheckedChange?: (value: boolean) => void;
}) {
const localize = useLocalize();
const [speechToText, setSpeechToText] = useRecoilState<boolean>(store.SpeechToText);
const [speechToText, setSpeechToText] = useRecoilState<boolean>(store.speechToText);
const handleCheckedChange = (value: boolean) => {
setSpeechToText(value);

View file

@@ -3,3 +3,4 @@ export { default as SpeechToTextSwitch } from './SpeechToTextSwitch';
export { default as EngineSTTDropdown } from './EngineSTTDropdown';
export { default as DecibelSelector } from './DecibelSelector';
export { default as AutoTranscribeAudioSwitch } from './AutoTranscribeAudioSwitch';
export { default as LanguageSTTDropdown } from './LanguageSTTDropdown';

View file

@@ -1,6 +1,6 @@
import * as Tabs from '@radix-ui/react-tabs';
import { SettingsTabValues } from 'librechat-data-provider';
import React, { useState, useRef } from 'react';
import React, { useState, useRef, useEffect, useCallback } from 'react';
import { useRecoilState } from 'recoil';
import { Lightbulb, Cog } from 'lucide-react';
import { useOnClickOutside, useMediaQuery } from '~/hooks';
@@ -10,7 +10,7 @@ import ConversationModeSwitch from './ConversationModeSwitch';
import {
TextToSpeechSwitch,
EngineTTSDropdown,
AutomaticPlayback,
AutomaticPlaybackSwitch,
CacheTTSSwitch,
VoiceDropdown,
PlaybackRate,
@@ -18,16 +18,100 @@ import {
import {
DecibelSelector,
EngineSTTDropdown,
LanguageSTTDropdown,
SpeechToTextSwitch,
AutoSendTextSwitch,
AutoTranscribeAudioSwitch,
} from './STT';
import { useCustomConfigSpeechQuery } from '~/data-provider';
function Speech() {
const isSmallScreen = useMediaQuery('(max-width: 767px)');
const [advancedMode, setAdvancedMode] = useRecoilState<boolean>(store.advancedMode);
const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
const [confirmClear, setConfirmClear] = useState(false);
const { data } = useCustomConfigSpeechQuery();
const isSmallScreen = useMediaQuery('(max-width: 767px)');
const [advancedMode, setAdvancedMode] = useRecoilState(store.advancedMode);
const [autoTranscribeAudio, setAutoTranscribeAudio] = useRecoilState(store.autoTranscribeAudio);
const [conversationMode, setConversationMode] = useRecoilState(store.conversationMode);
const [speechToText, setSpeechToText] = useRecoilState(store.speechToText);
const [textToSpeech, setTextToSpeech] = useRecoilState(store.textToSpeech);
const [cacheTTS, setCacheTTS] = useRecoilState(store.cacheTTS);
const [engineSTT, setEngineSTT] = useRecoilState<string>(store.engineSTT);
const [languageSTT, setLanguageSTT] = useRecoilState<string>(store.languageSTT);
const [decibelValue, setDecibelValue] = useRecoilState(store.decibelValue);
const [autoSendText, setAutoSendText] = useRecoilState(store.autoSendText);
const [engineTTS, setEngineTTS] = useRecoilState<string>(store.engineTTS);
const [voice, setVoice] = useRecoilState<string>(store.voice);
const [languageTTS, setLanguageTTS] = useRecoilState<string>(store.languageTTS);
const [automaticPlayback, setAutomaticPlayback] = useRecoilState(store.automaticPlayback);
const [playbackRate, setPlaybackRate] = useRecoilState(store.playbackRate);
const updateSetting = useCallback(
(key, newValue) => {
const settings = {
conversationMode: { value: conversationMode, setFunc: setConversationMode },
advancedMode: { value: advancedMode, setFunc: setAdvancedMode },
speechToText: { value: speechToText, setFunc: setSpeechToText },
textToSpeech: { value: textToSpeech, setFunc: setTextToSpeech },
cacheTTS: { value: cacheTTS, setFunc: setCacheTTS },
engineSTT: { value: engineSTT, setFunc: setEngineSTT },
languageSTT: { value: languageSTT, setFunc: setLanguageSTT },
autoTranscribeAudio: { value: autoTranscribeAudio, setFunc: setAutoTranscribeAudio },
decibelValue: { value: decibelValue, setFunc: setDecibelValue },
autoSendText: { value: autoSendText, setFunc: setAutoSendText },
engineTTS: { value: engineTTS, setFunc: setEngineTTS },
voice: { value: voice, setFunc: setVoice },
languageTTS: { value: languageTTS, setFunc: setLanguageTTS },
automaticPlayback: { value: automaticPlayback, setFunc: setAutomaticPlayback },
playbackRate: { value: playbackRate, setFunc: setPlaybackRate },
};
if (settings[key]) {
const setting = settings[key];
setting.setFunc(newValue);
}
},
[
conversationMode,
advancedMode,
speechToText,
textToSpeech,
cacheTTS,
engineSTT,
languageSTT,
autoTranscribeAudio,
decibelValue,
autoSendText,
engineTTS,
voice,
languageTTS,
automaticPlayback,
playbackRate,
setConversationMode,
setAdvancedMode,
setSpeechToText,
setTextToSpeech,
setCacheTTS,
setEngineSTT,
setLanguageSTT,
setAutoTranscribeAudio,
setDecibelValue,
setAutoSendText,
setEngineTTS,
setVoice,
setLanguageTTS,
setAutomaticPlayback,
setPlaybackRate,
],
);
useEffect(() => {
if (data) {
Object.entries(data).forEach(([key, value]) => {
updateSetting(key, value);
});
}
}, []);
const contentRef = useRef(null);
useOnClickOutside(contentRef, () => confirmClear && setConfirmClear(false), []);
@@ -91,13 +175,13 @@ function Speech() {
<div className="border-b pb-3 last-of-type:border-b-0 dark:border-gray-700">
<EngineSTTDropdown />
</div>
<div className="border-b pb-3 last-of-type:border-b-0 dark:border-gray-700">
<LanguageSTTDropdown />
</div>
<div className="h-px bg-black/20 bg-white/20" role="none" />
<div className="border-b pb-3 last-of-type:border-b-0 dark:border-gray-700">
<TextToSpeechSwitch />
</div>
<div className="border-b pb-3 last-of-type:border-b-0 dark:border-gray-700">
<AutomaticPlayback />
</div>
<div className="border-b pb-3 last-of-type:border-b-0 dark:border-gray-700">
<EngineTTSDropdown />
</div>
@@ -119,6 +203,9 @@ function Speech() {
<div className="border-b pb-3 last-of-type:border-b-0 dark:border-gray-700">
<EngineSTTDropdown />
</div>
<div className="border-b pb-3 last-of-type:border-b-0 dark:border-gray-700">
<LanguageSTTDropdown />
</div>
<div className="border-b pb-3 last-of-type:border-b-0 dark:border-gray-700">
<AutoTranscribeAudioSwitch />
</div>
@@ -135,7 +222,7 @@ function Speech() {
<TextToSpeechSwitch />
</div>
<div className="border-b pb-3 last-of-type:border-b-0 dark:border-gray-700">
<AutomaticPlayback />
<AutomaticPlaybackSwitch />
</div>
<div className="border-b pb-3 last-of-type:border-b-0 dark:border-gray-700">
<EngineTTSDropdown />

View file

@@ -3,7 +3,7 @@ import { Switch } from '~/components/ui';
import { useLocalize } from '~/hooks';
import store from '~/store';
export default function AutomaticPlayback({
export default function AutomaticPlaybackSwitch({
onCheckedChange,
}: {
onCheckedChange?: (value: boolean) => void;

View file

@@ -10,7 +10,7 @@ export default function CacheTTSSwitch({
}) {
const localize = useLocalize();
const [cacheTTS, setCacheTTS] = useRecoilState<boolean>(store.cacheTTS);
const [textToSpeech] = useRecoilState<boolean>(store.TextToSpeech);
const [textToSpeech] = useRecoilState<boolean>(store.textToSpeech);
const handleCheckedChange = (value: boolean) => {
setCacheTTS(value);

View file

@@ -5,21 +5,21 @@ import store from '~/store';
export default function EngineTTSDropdown() {
const localize = useLocalize();
const [endpointTTS, setEndpointTTS] = useRecoilState<string>(store.endpointTTS);
const [engineTTS, setEngineTTS] = useRecoilState<string>(store.engineTTS);
const endpointOptions = [
{ value: 'browser', display: localize('com_nav_browser') },
{ value: 'external', display: localize('com_nav_external') },
];
const handleSelect = (value: string) => {
setEndpointTTS(value);
setEngineTTS(value);
};
return (
<div className="flex items-center justify-between">
<div>{localize('com_nav_engine')}</div>
<Dropdown
value={endpointTTS}
value={engineTTS}
onChange={handleSelect}
options={endpointOptions}
width={180}

View file

@@ -7,7 +7,7 @@ import { cn, defaultTextProps, optionText } from '~/utils/';
export default function DecibelSelector() {
const localize = useLocalize();
const textToSpeech = useRecoilValue(store.TextToSpeech);
const textToSpeech = useRecoilValue(store.textToSpeech);
const [playbackRate, setPlaybackRate] = useRecoilState(store.playbackRate);
return (

View file

@@ -9,7 +9,7 @@ export default function TextToSpeechSwitch({
onCheckedChange?: (value: boolean) => void;
}) {
const localize = useLocalize();
const [TextToSpeech, setTextToSpeech] = useRecoilState<boolean>(store.TextToSpeech);
const [TextToSpeech, setTextToSpeech] = useRecoilState<boolean>(store.textToSpeech);
const handleCheckedChange = (value: boolean) => {
setTextToSpeech(value);

View file

@@ -0,0 +1,38 @@
import React from 'react';
import '@testing-library/jest-dom/extend-expect';
import { render, fireEvent } from 'test/layout-test-utils';
import AutomaticPlaybackSwitch from '../AutomaticPlaybackSwitch';
import { RecoilRoot } from 'recoil';
describe('AutomaticPlaybackSwitch', () => {
/**
* Mock function to set the text-to-speech state.
*/
let mockSetAutomaticPlayback: jest.Mock<void, [boolean]> | ((value: boolean) => void) | undefined;
beforeEach(() => {
mockSetAutomaticPlayback = jest.fn();
});
it('renders correctly', () => {
const { getByTestId } = render(
<RecoilRoot>
<AutomaticPlaybackSwitch />
</RecoilRoot>,
);
expect(getByTestId('AutomaticPlayback')).toBeInTheDocument();
});
it('calls onCheckedChange when the switch is toggled', () => {
const { getByTestId } = render(
<RecoilRoot>
<AutomaticPlaybackSwitch onCheckedChange={mockSetAutomaticPlayback} />
</RecoilRoot>,
);
const switchElement = getByTestId('AutomaticPlayback');
fireEvent.click(switchElement);
expect(mockSetAutomaticPlayback).toHaveBeenCalledWith(true);
});
});

View file

@@ -1,4 +1,4 @@
export { default as AutomaticPlayback } from './AutomaticPlayback';
export { default as AutomaticPlaybackSwitch } from './AutomaticPlaybackSwitch';
export { default as CacheTTSSwitch } from './CacheTTSSwitch';
export { default as EngineTTSDropdown } from './EngineTTSDropdown';
export { default as PlaybackRate } from './PlaybackRate';

View file

@@ -423,6 +423,13 @@ export const useVoicesQuery = (): UseQueryResult<t.VoiceResponse> => {
return useQuery([QueryKeys.voices], () => dataService.getVoices());
};
/* Custom config speech */
export const useCustomConfigSpeechQuery = (): UseQueryResult<t.getCustomConfigSpeechResponse> => {
return useQuery([QueryKeys.customConfigSpeech], () => dataService.getCustomConfigSpeech());
};
/** Prompt */
export const usePromptGroupsInfiniteQuery = (
params?: t.TPromptGroupsWithFilterRequest,
config?: UseInfiniteQueryOptions<t.PromptGroupListResponse, unknown>,

View file

@@ -8,3 +8,4 @@ export { default as useRequiresKey } from './useRequiresKey';
export { default as useMultipleKeys } from './useMultipleKeys';
export { default as useSpeechToText } from './useSpeechToText';
export { default as useTextToSpeech } from './useTextToSpeech';
export { default as useGetAudioSettings } from './useGetAudioSettings';

View file

@@ -0,0 +1,19 @@
import { useRecoilState } from 'recoil';
import store from '~/store';
export enum AudioEndpoints {
browser = 'browser',
external = 'external',
}
const useGetAudioSettings = () => {
const [engineSTT] = useRecoilState<string>(store.engineSTT);
const [engineTTS] = useRecoilState<string>(store.engineTTS);
const externalSpeechToText = engineSTT === AudioEndpoints.external;
const externalTextToSpeech = engineTTS === AudioEndpoints.external;
return { externalSpeechToText, externalTextToSpeech };
};
export default useGetAudioSettings;
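
Consumers then branch on these booleans, as the useSpeechToText and useTextToSpeech diffs below show; a condensed usage sketch:

import useGetAudioSettings from './useGetAudioSettings';

// Condensed from the selection pattern used in the hooks below.
const useAudioModes = () => {
  const { externalSpeechToText, externalTextToSpeech } = useGetAudioSettings();
  return {
    sttMode: externalSpeechToText ? 'external' : 'browser',
    ttsMode: externalTextToSpeech ? 'external' : 'browser',
  };
};

export default useAudioModes;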

View file

@@ -1,17 +1,16 @@
import { useState, useEffect } from 'react';
import useSpeechToTextBrowser from './useSpeechToTextBrowser';
import useSpeechToTextExternal from './useSpeechToTextExternal';
import { useRecoilState } from 'recoil';
import store from '~/store';
import useGetAudioSettings from './useGetAudioSettings';
const useSpeechToText = (handleTranscriptionComplete: (text: string) => void) => {
const [endpointSTT] = useRecoilState<string>(store.endpointSTT);
const useExternalSpeechToText = endpointSTT === 'external';
const { externalSpeechToText } = useGetAudioSettings();
const [animatedText, setAnimatedText] = useState('');
const {
isListening: speechIsListeningBrowser,
isLoading: speechIsLoadingBrowser,
interimTranscript: interimTranscriptBrowser,
text: speechTextBrowser,
startRecording: startSpeechRecordingBrowser,
stopRecording: stopSpeechRecordingBrowser,
@@ -26,21 +25,21 @@ const useSpeechToText = (handleTranscriptionComplete: (text: string) => void) =>
clearText,
} = useSpeechToTextExternal(handleTranscriptionComplete);
const isListening = useExternalSpeechToText
? speechIsListeningExternal
: speechIsListeningBrowser;
const isLoading = useExternalSpeechToText ? speechIsLoadingExternal : speechIsLoadingBrowser;
const speechTextForm = useExternalSpeechToText ? speechTextExternal : speechTextBrowser;
const startRecording = useExternalSpeechToText
const isListening = externalSpeechToText ? speechIsListeningExternal : speechIsListeningBrowser;
const isLoading = externalSpeechToText ? speechIsLoadingExternal : speechIsLoadingBrowser;
const speechTextForm = externalSpeechToText ? speechTextExternal : speechTextBrowser;
const startRecording = externalSpeechToText
? startSpeechRecordingExternal
: startSpeechRecordingBrowser;
const stopRecording = useExternalSpeechToText
const stopRecording = externalSpeechToText
? stopSpeechRecordingExternal
: stopSpeechRecordingBrowser;
const speechText =
isListening || (speechTextExternal && speechTextExternal.length > 0)
? speechTextExternal
: speechTextForm || '';
// for a future real-time STT external
const interimTranscript = externalSpeechToText ? '' : interimTranscriptBrowser;
const animateTextTyping = (text: string) => {
const totalDuration = 2000;
@@ -65,17 +64,18 @@ const useSpeechToText = (handleTranscriptionComplete: (text: string) => void) =
};
useEffect(() => {
if (speechText) {
if (speechText && externalSpeechToText) {
animateTextTyping(speechText);
}
}, [speechText]);
}, [speechText, externalSpeechToText]);
return {
isListening,
isLoading,
startRecording,
stopRecording,
speechText: animatedText,
interimTranscript,
speechText: externalSpeechToText ? animatedText : speechText,
clearText,
};
};

View file

@@ -1,34 +1,57 @@
import { useEffect } from 'react';
import { useEffect, useState } from 'react';
import { useRecoilState } from 'recoil';
import { useToastContext } from '~/Providers';
import store from '~/store';
import SpeechRecognition, { useSpeechRecognition } from 'react-speech-recognition';
import useGetAudioSettings from './useGetAudioSettings';
const useSpeechToTextBrowser = () => {
const { showToast } = useToastContext();
const [endpointSTT] = useRecoilState<string>(store.endpointSTT);
const [languageSTT] = useRecoilState<string>(store.languageSTT);
const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
const { externalSpeechToText } = useGetAudioSettings();
const [isListening, setIsListening] = useState(false);
const { transcript, listening, resetTranscript, browserSupportsSpeechRecognition } =
useSpeechRecognition();
const {
interimTranscript,
finalTranscript,
listening,
browserSupportsSpeechRecognition,
isMicrophoneAvailable,
} = useSpeechRecognition();
const toggleListening = () => {
if (browserSupportsSpeechRecognition) {
if (listening) {
SpeechRecognition.stopListening();
} else {
SpeechRecognition.startListening();
}
} else {
if (!browserSupportsSpeechRecognition) {
showToast({
message: 'Browser does not support SpeechRecognition',
status: 'error',
});
return;
}
if (!isMicrophoneAvailable) {
showToast({
message: 'Microphone is not available',
status: 'error',
});
return;
}
if (listening) {
setIsListening(false);
SpeechRecognition.stopListening();
} else {
setIsListening(true);
SpeechRecognition.startListening({
language: languageSTT,
continuous: autoTranscribeAudio,
});
}
};
useEffect(() => {
const handleKeyDown = (e: KeyboardEvent) => {
if (e.shiftKey && e.altKey && e.code === 'KeyL' && endpointSTT === 'browser') {
if (e.shiftKey && e.altKey && e.code === 'KeyL' && !externalSpeechToText) {
toggleListening();
}
};
@@ -37,15 +60,19 @@ const useSpeechToTextBrowser = () => {
return () => window.removeEventListener('keydown', handleKeyDown);
}, []);
useEffect(() => {
if (!listening) {
setIsListening(false);
}
}, [listening]);
return {
isListening: listening,
isListening,
isLoading: false,
text: transcript,
interimTranscript,
text: finalTranscript,
startRecording: toggleListening,
stopRecording: () => {
SpeechRecognition.stopListening();
resetTranscript();
},
stopRecording: toggleListening,
};
};

View file

@@ -3,11 +3,12 @@ import { useRecoilState } from 'recoil';
import { useSpeechToTextMutation } from '~/data-provider';
import { useToastContext } from '~/Providers';
import store from '~/store';
import useGetAudioSettings from './useGetAudioSettings';
const useSpeechToTextExternal = (onTranscriptionComplete: (text: string) => void) => {
const { showToast } = useToastContext();
const [endpointSTT] = useRecoilState<string>(store.endpointSTT);
const [speechToText] = useRecoilState<boolean>(store.SpeechToText);
const { externalSpeechToText } = useGetAudioSettings();
const [speechToText] = useRecoilState<boolean>(store.speechToText);
const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
const [autoSendText] = useRecoilState<boolean>(store.autoSendText);
const [text, setText] = useState<string>('');
@@ -196,7 +197,7 @@ const useSpeechToTextExternal = (onTranscriptionComplete: (text: string) => void
};
const handleKeyDown = async (e: KeyboardEvent) => {
if (e.shiftKey && e.altKey && e.code === 'KeyL' && endpointSTT !== 'browser') {
if (e.shiftKey && e.altKey && e.code === 'KeyL' && !externalSpeechToText) {
if (!window.MediaRecorder) {
showToast({ message: 'MediaRecorder is not supported in this browser', status: 'error' });
return;

View file

@@ -4,12 +4,10 @@ import type { TMessage } from 'librechat-data-provider';
import useTextToSpeechExternal from './useTextToSpeechExternal';
import useTextToSpeechBrowser from './useTextToSpeechBrowser';
import { usePauseGlobalAudio } from '../Audio';
import { useRecoilState } from 'recoil';
import store from '~/store';
import useGetAudioSettings from './useGetAudioSettings';
const useTextToSpeech = (message: TMessage, isLast: boolean, index = 0) => {
const [endpointTTS] = useRecoilState<string>(store.endpointTTS);
const useExternalTextToSpeech = endpointTTS === 'external';
const { externalTextToSpeech } = useGetAudioSettings();
const {
generateSpeechLocal: generateSpeechLocal,
@@ -26,9 +24,9 @@ const useTextToSpeech = (message: TMessage, isLast: boolean, index = 0) => {
} = useTextToSpeechExternal(message.messageId, isLast, index);
const { pauseGlobalAudio } = usePauseGlobalAudio(index);
const generateSpeech = useExternalTextToSpeech ? generateSpeechExternal : generateSpeechLocal;
const cancelSpeech = useExternalTextToSpeech ? cancelSpeechExternal : cancelSpeechLocal;
const isSpeaking = useExternalTextToSpeech ? isSpeakingExternal : isSpeakingLocal;
const generateSpeech = externalTextToSpeech ? generateSpeechExternal : generateSpeechLocal;
const cancelSpeech = externalTextToSpeech ? cancelSpeechExternal : cancelSpeechLocal;
const isSpeaking = externalTextToSpeech ? isSpeakingExternal : isSpeakingLocal;
const isMouseDownRef = useRef(false);
const timerRef = useRef<number | undefined>(undefined);

View file

@@ -539,7 +539,7 @@ export default {
com_nav_modular_chat: 'Enable switching Endpoints mid-conversation',
com_nav_latex_parsing: 'Parsing LaTeX in messages (may affect performance)',
com_nav_text_to_speech: 'Text to Speech',
com_nav_automatic_playback: 'Autoplay Latest Message (external only)',
com_nav_automatic_playback: 'Autoplay Latest Message',
com_nav_speech_to_text: 'Speech to Text',
com_nav_profile_picture: 'Profile Picture',
com_nav_change_picture: 'Change picture',

View file

@@ -18,32 +18,45 @@ const staticAtoms = {
showPopover: atom<boolean>({ key: 'showPopover', default: false }),
};
// Atoms with localStorage
const localStorageAtoms = {
// General settings
autoScroll: atomWithLocalStorage('autoScroll', false),
showCode: atomWithLocalStorage('showCode', false),
hideSidePanel: atomWithLocalStorage('hideSidePanel', false),
modularChat: atomWithLocalStorage('modularChat', true),
LaTeXParsing: atomWithLocalStorage('LaTeXParsing', true),
UsernameDisplay: atomWithLocalStorage('UsernameDisplay', true),
TextToSpeech: atomWithLocalStorage('textToSpeech', true),
automaticPlayback: atomWithLocalStorage('automaticPlayback', false),
// Messages settings
enterToSend: atomWithLocalStorage('enterToSend', true),
SpeechToText: atomWithLocalStorage('speechToText', true),
conversationMode: atomWithLocalStorage('conversationMode', false),
advancedMode: atomWithLocalStorage('advancedMode', false),
autoSendText: atomWithLocalStorage('autoSendText', false),
autoTranscribeAudio: atomWithLocalStorage('autoTranscribeAudio', false),
decibelValue: atomWithLocalStorage('decibelValue', -45),
endpointSTT: atomWithLocalStorage('endpointSTT', 'browser'),
endpointTTS: atomWithLocalStorage('endpointTTS', 'browser'),
cacheTTS: atomWithLocalStorage('cacheTTS', true),
voice: atomWithLocalStorage('voice', ''),
showCode: atomWithLocalStorage('showCode', false),
saveDrafts: atomWithLocalStorage('saveDrafts', false),
forkSetting: atomWithLocalStorage('forkSetting', ''),
splitAtTarget: atomWithLocalStorage('splitAtTarget', false),
rememberForkOption: atomWithLocalStorage('rememberForkOption', true),
// Beta features settings
modularChat: atomWithLocalStorage('modularChat', true),
LaTeXParsing: atomWithLocalStorage('LaTeXParsing', true),
// Speech settings
conversationMode: atomWithLocalStorage('conversationMode', false),
advancedMode: atomWithLocalStorage('advancedMode', false),
speechToText: atomWithLocalStorage('speechToText', true),
engineSTT: atomWithLocalStorage('engineSTT', 'browser'),
languageSTT: atomWithLocalStorage('languageSTT', ''),
autoTranscribeAudio: atomWithLocalStorage('autoTranscribeAudio', false),
decibelValue: atomWithLocalStorage('decibelValue', -45),
autoSendText: atomWithLocalStorage('autoSendText', false),
textToSpeech: atomWithLocalStorage('textToSpeech', true),
engineTTS: atomWithLocalStorage('engineTTS', 'browser'),
voice: atomWithLocalStorage('voice', ''),
languageTTS: atomWithLocalStorage('languageTTS', ''),
automaticPlayback: atomWithLocalStorage('automaticPlayback', false),
playbackRate: atomWithLocalStorage<number | null>('playbackRate', null),
saveDrafts: atomWithLocalStorage('saveDrafts', false),
cacheTTS: atomWithLocalStorage('cacheTTS', true),
// Account settings
UsernameDisplay: atomWithLocalStorage('UsernameDisplay', true),
};
export default { ...staticAtoms, ...localStorageAtoms };
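
For readers unfamiliar with the atomWithLocalStorage helper used throughout this file, a minimal sketch of what such a helper typically looks like (an assumption; the project's actual implementation may differ):

import { atom, AtomEffect } from 'recoil';

// Hypothetical: hydrate the atom from localStorage on init and
// persist every update back under the same key.
const localStorageEffect =
  <T,>(key: string): AtomEffect<T> =>
    ({ setSelf, onSet }) => {
      const saved = localStorage.getItem(key);
      if (saved != null) {
        setSelf(JSON.parse(saved) as T);
      }
      onSet((newValue) => {
        localStorage.setItem(key, JSON.stringify(newValue));
      });
    };

const atomWithLocalStorage = <T,>(key: string, defaultValue: T) =>
  atom<T>({ key, default: defaultValue, effects: [localStorageEffect<T>(key)] });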

View file

@@ -128,14 +128,18 @@ export const images = () => `${files()}/images`;
export const avatar = () => `${images()}/avatar`;
export const speechToText = () => `${files()}/stt`;
export const speech = () => `${files()}/speech`;
export const textToSpeech = () => `${files()}/tts`;
export const speechToText = () => `${speech()}/stt`;
export const textToSpeech = () => `${speech()}/tts`;
export const textToSpeechManual = () => `${textToSpeech()}/manual`;
export const textToSpeechVoices = () => `${textToSpeech()}/voices`;
export const getCustomConfigSpeech = () => `${speech()}/config/get`;
export const getPromptGroup = (_id: string) => `${prompts()}/groups/${_id}`;
export const getPromptGroupsWithFilters = (filter: object) => {

View file

@@ -6,6 +6,7 @@ import { fileConfigSchema } from './file-config';
import { specsConfigSchema } from './models';
import { FileSources } from './types/files';
import { TModelsConfig } from './types';
import { speech } from './api-endpoints';
export const defaultSocialLogins = ['google', 'facebook', 'openid', 'github', 'discord'];
@@ -273,6 +274,40 @@ const sttSchema = z.object({
.optional(),
});
const speechTab = z
.object({
conversationMode: z.boolean().optional(),
advancedMode: z.boolean().optional(),
speechToText: z
.boolean()
.optional()
.or(
z.object({
engineSTT: z.string().optional(),
languageSTT: z.string().optional(),
autoTranscribeAudio: z.boolean().optional(),
decibelValue: z.number().optional(),
autoSendText: z.boolean().optional(),
}),
)
.optional(),
textToSpeech: z
.boolean()
.optional()
.or(
z.object({
engineTTS: z.string().optional(),
voice: z.string().optional(),
languageTTS: z.string().optional(),
automaticPlayback: z.boolean().optional(),
playbackRate: z.number().optional(),
cacheTTS: z.boolean().optional(),
}),
)
.optional(),
})
.optional();
export enum RateLimitPrefix {
FILE_UPLOAD = 'FILE_UPLOAD',
IMPORT = 'IMPORT',
@@ -362,8 +397,13 @@ export const configSchema = z.object({
allowedDomains: z.array(z.string()).optional(),
})
.default({ socialLogins: defaultSocialLogins }),
speech: z
.object({
tts: ttsSchema.optional(),
stt: sttSchema.optional(),
speechTab: speechTab.optional(),
})
.optional(),
rateLimits: rateLimitSchema.optional(),
fileConfig: fileConfigSchema.optional(),
modelSpecs: specsConfigSchema.optional(),
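
A quick sketch of how an old-style config trips the new schema, consistent with the result.success and unrecognized_keys checks in loadCustomConfig earlier in this commit (this assumes configSchema is strict about unknown keys, which that check implies; configSchema is exported from this module):

// Illustrative: a config still using top-level `tts` instead of
// `speech.tts` fails validation with an `unrecognized_keys` issue,
// which is what triggers the STT/TTS migration warning.
const result = configSchema.safeParse({ tts: {} });
if (!result.success) {
  const speechError = result.error.errors.find(
    (err) => err.code === 'unrecognized_keys' && err.message.includes('tts'),
  );
  // speechError !== undefined -> log the migration hint.
}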

View file

@@ -355,6 +355,10 @@ export const getVoices = (): Promise<f.VoiceResponse> => {
return request.get(endpoints.textToSpeechVoices());
};
export const getCustomConfigSpeech = (): Promise<f.getCustomConfigSpeechResponse[]> => {
return request.get(endpoints.getCustomConfigSpeech());
};
/* actions */
export const updateAction = (data: m.UpdateActionVariables): Promise<m.UpdateActionResponse> => {

View file

@@ -27,6 +27,7 @@ export enum QueryKeys {
assistantDocs = 'assistantDocs',
fileDownload = 'fileDownload',
voices = 'voices',
customConfigSpeech = 'customConfigSpeech',
prompts = 'prompts',
prompt = 'prompt',
promptGroups = 'promptGroups',

View file

@@ -83,6 +83,8 @@ export type SpeechToTextResponse = {
export type VoiceResponse = string[];
export type getCustomConfigSpeechResponse = { [key: string]: string };
export type UploadMutationOptions = {
onSuccess?: (data: TFileUpload, variables: FormData, context?: unknown) => void;
onMutate?: (variables: FormData) => void | Promise<unknown>;
@@ -113,6 +115,12 @@ export type VoiceOptions = {
onError?: (error: unknown, variables: unknown, context?: unknown) => void;
};
export type getCustomConfigSpeechOptions = {
onSuccess?: (data: getCustomConfigSpeechResponse, variables: unknown, context?: unknown) => void;
onMutate?: () => void | Promise<unknown>;
onError?: (error: unknown, variables: unknown, context?: unknown) => void;
};
export type DeleteFilesResponse = {
message: string;
result: Record<string, unknown>;