mirror of
https://github.com/danny-avila/LibreChat.git
synced 2025-09-22 06:00:56 +02:00
🔀 refactor: Modularize TTS Logic for Improved Browser support (#3657)
* WIP: message audio refactor
* WIP: use MessageAudio by provider
* fix: update MessageAudio component to use the TTSEndpoints enum
* feat: update useTextToSpeechBrowser hook to handle errors and improve error logging
* feat: add voice dropdown components for each TTS engine
* docs: correct the `voices` example (changed `voice: ''` to `voices: ['alloy']`)
* feat: add browser support check to the Edge TTS engine component, with an error toast when unsupported

---------

Co-authored-by: Marco Beretta <81851188+berry-13@users.noreply.github.com>
parent bcde0beb47
commit dba704079c
18 changed files with 784 additions and 187 deletions
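
The shape of the refactor, condensed from the diffs below: the shared types gain `STTEndpoints`/`TTSEndpoints` enums and a `TMessageAudio` prop type, each TTS engine (browser, Edge, external) gets its own component and hook, and `MessageAudio` shrinks to a thin dispatcher keyed on the user's `engineTTS` setting. A sketch of that dispatcher, mirroring the new `MessageAudio.tsx` shown further down (`TTSEndpoints` is the new enum with values 'browser' | 'edge' | 'external'):

// Sketch of the new dispatcher (mirrors client/src/components/Chat/Messages/MessageAudio.tsx below).
import { memo } from 'react';
import { useRecoilValue } from 'recoil';
import type { TMessageAudio } from '~/common';
import { BrowserTTS, EdgeTTS, ExternalTTS } from '~/components/Audio/TTS';
import { TTSEndpoints } from '~/common';
import store from '~/store';

function MessageAudio(props: TMessageAudio) {
  // The user's selected TTS engine: 'browser', 'edge', or 'external'.
  const engineTTS = useRecoilValue<string>(store.engineTTS);

  // One component per engine; each wraps its own hook (useTTSBrowser / useTTSEdge / useTTSExternal).
  const TTSComponents = {
    [TTSEndpoints.edge]: EdgeTTS,
    [TTSEndpoints.browser]: BrowserTTS,
    [TTSEndpoints.external]: ExternalTTS,
  };

  const SelectedTTS = TTSComponents[engineTTS];
  return <SelectedTTS {...props} />;
}

export default memo(MessageAudio);
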
@@ -19,6 +19,7 @@ import type {
   TStartupConfig,
   EModelEndpoint,
   AssistantsEndpoint,
+  TMessageContentParts,
   AuthorizationTypeEnum,
   TSetOption as SetOption,
   TokenExchangeMethodEnum,
@@ -31,6 +32,17 @@ export enum PromptsEditorMode {
   ADVANCED = 'advanced',
 }
+
+export enum STTEndpoints {
+  browser = 'browser',
+  external = 'external',
+}
+
+export enum TTSEndpoints {
+  browser = 'browser',
+  edge = 'edge',
+  external = 'external',
+}
 
 export type AudioChunk = {
   audio: string;
   isFinal: boolean;
@@ -374,6 +386,19 @@ export type Option = Record<string, unknown> & {
   value: string | number | null;
 };
+
+export type VoiceOption = {
+  value: string;
+  label: string;
+};
+
+export type TMessageAudio = {
+  messageId?: string;
+  content?: TMessageContentParts[] | string;
+  className?: string;
+  isLast: boolean;
+  index: number;
+};
 
 export type OptionWithIcon = Option & { icon?: React.ReactNode };
 export type MentionOption = OptionWithIcon & {
   type: string;
client/src/components/Audio/TTS.tsx (new file, 256 lines)

import { useEffect, useMemo } from 'react';
import { useRecoilValue } from 'recoil';
import type { TMessageAudio } from '~/common';
import { useLocalize, useTTSBrowser, useTTSEdge, useTTSExternal } from '~/hooks';
import { VolumeIcon, VolumeMuteIcon, Spinner } from '~/components/svg';
import { useToastContext } from '~/Providers/ToastContext';
import { logger } from '~/utils';
import store from '~/store';

export function BrowserTTS({ isLast, index, messageId, content, className }: TMessageAudio) {
  const localize = useLocalize();
  const playbackRate = useRecoilValue(store.playbackRate);

  const { toggleSpeech, isSpeaking, isLoading, audioRef } = useTTSBrowser({
    isLast,
    index,
    messageId,
    content,
  });

  const renderIcon = (size: string) => {
    if (isLoading === true) {
      return <Spinner size={size} />;
    }

    if (isSpeaking === true) {
      return <VolumeMuteIcon size={size} />;
    }

    return <VolumeIcon size={size} />;
  };

  useEffect(() => {
    const messageAudio = document.getElementById(`audio-${messageId}`) as HTMLAudioElement | null;
    if (!messageAudio) {
      return;
    }
    if (playbackRate != null && playbackRate > 0 && messageAudio.playbackRate !== playbackRate) {
      messageAudio.playbackRate = playbackRate;
    }
  }, [audioRef, isSpeaking, playbackRate, messageId]);

  logger.log(
    'MessageAudio: audioRef.current?.src, audioRef.current',
    audioRef.current?.src,
    audioRef.current,
  );

  return (
    <>
      <button
        className={className}
        onClickCapture={() => {
          if (audioRef.current) {
            audioRef.current.muted = false;
          }
          toggleSpeech();
        }}
        type="button"
        title={isSpeaking === true ? localize('com_ui_stop') : localize('com_ui_read_aloud')}
      >
        {renderIcon('19')}
      </button>
      <audio
        ref={audioRef}
        controls
        preload="none"
        controlsList="nodownload nofullscreen noremoteplayback"
        style={{
          position: 'absolute',
          overflow: 'hidden',
          display: 'none',
          height: '0px',
          width: '0px',
        }}
        src={audioRef.current?.src}
        onError={(error) => {
          console.error('Error fetching audio:', error);
        }}
        id={`audio-${messageId}`}
        muted
        autoPlay
      />
    </>
  );
}

export function EdgeTTS({ isLast, index, messageId, content, className }: TMessageAudio) {
  const localize = useLocalize();
  const playbackRate = useRecoilValue(store.playbackRate);
  const isBrowserSupported = useMemo(
    () => typeof MediaSource !== 'undefined' && MediaSource.isTypeSupported('audio/mpeg'),
    [],
  );

  const { showToast } = useToastContext();
  const { toggleSpeech, isSpeaking, isLoading, audioRef } = useTTSEdge({
    isLast,
    index,
    messageId,
    content,
  });

  const renderIcon = (size: string) => {
    if (isLoading === true) {
      return <Spinner size={size} />;
    }

    if (isSpeaking === true) {
      return <VolumeMuteIcon size={size} />;
    }

    return <VolumeIcon size={size} />;
  };

  useEffect(() => {
    const messageAudio = document.getElementById(`audio-${messageId}`) as HTMLAudioElement | null;
    if (!messageAudio) {
      return;
    }
    if (playbackRate != null && playbackRate > 0 && messageAudio.playbackRate !== playbackRate) {
      messageAudio.playbackRate = playbackRate;
    }
  }, [audioRef, isSpeaking, playbackRate, messageId]);

  logger.log(
    'MessageAudio: audioRef.current?.src, audioRef.current',
    audioRef.current?.src,
    audioRef.current,
  );

  return (
    <>
      <button
        className={className}
        onClickCapture={() => {
          if (!isBrowserSupported) {
            showToast({
              message: localize('com_nav_tts_unsupported_error'),
              status: 'error',
            });
            return;
          }
          if (audioRef.current) {
            audioRef.current.muted = false;
          }
          toggleSpeech();
        }}
        type="button"
        title={isSpeaking === true ? localize('com_ui_stop') : localize('com_ui_read_aloud')}
      >
        {renderIcon('19')}
      </button>
      {isBrowserSupported ? (
        <audio
          ref={audioRef}
          controls
          preload="none"
          controlsList="nodownload nofullscreen noremoteplayback"
          style={{
            position: 'absolute',
            overflow: 'hidden',
            display: 'none',
            height: '0px',
            width: '0px',
          }}
          src={audioRef.current?.src}
          onError={(error) => {
            console.error('Error fetching audio:', error);
          }}
          id={`audio-${messageId}`}
          muted
          autoPlay
        />
      ) : null}
    </>
  );
}

export function ExternalTTS({ isLast, index, messageId, content, className }: TMessageAudio) {
  const localize = useLocalize();
  const playbackRate = useRecoilValue(store.playbackRate);

  const { toggleSpeech, isSpeaking, isLoading, audioRef } = useTTSExternal({
    isLast,
    index,
    messageId,
    content,
  });

  const renderIcon = (size: string) => {
    if (isLoading === true) {
      return <Spinner size={size} />;
    }

    if (isSpeaking === true) {
      return <VolumeMuteIcon size={size} />;
    }

    return <VolumeIcon size={size} />;
  };

  useEffect(() => {
    const messageAudio = document.getElementById(`audio-${messageId}`) as HTMLAudioElement | null;
    if (!messageAudio) {
      return;
    }
    if (playbackRate != null && playbackRate > 0 && messageAudio.playbackRate !== playbackRate) {
      messageAudio.playbackRate = playbackRate;
    }
  }, [audioRef, isSpeaking, playbackRate, messageId]);

  logger.log(
    'MessageAudio: audioRef.current?.src, audioRef.current',
    audioRef.current?.src,
    audioRef.current,
  );

  return (
    <>
      <button
        className={className}
        onClickCapture={() => {
          if (audioRef.current) {
            audioRef.current.muted = false;
          }
          toggleSpeech();
        }}
        type="button"
        title={isSpeaking === true ? localize('com_ui_stop') : localize('com_ui_read_aloud')}
      >
        {renderIcon('19')}
      </button>
      <audio
        ref={audioRef}
        controls
        preload="none"
        controlsList="nodownload nofullscreen noremoteplayback"
        style={{
          position: 'absolute',
          overflow: 'hidden',
          display: 'none',
          height: '0px',
          width: '0px',
        }}
        src={audioRef.current?.src}
        onError={(error) => {
          console.error('Error fetching audio:', error);
        }}
        id={`audio-${messageId}`}
        muted
        autoPlay
      />
    </>
  );
}
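
These three components share the `TMessageAudio` props added to the common types above. A hypothetical call site for the dispatcher that wraps them (the real one is the `HoverButtons` change further down; the prop values here are placeholders, not taken from the diff):

// Hypothetical usage sketch; values are placeholders.
import MessageAudio from '~/components/Chat/Messages/MessageAudio';

type ExampleMessage = { messageId: string; text: string };

export function ExampleMessageActions({ message, isLast }: { message: ExampleMessage; isLast: boolean }) {
  return (
    <MessageAudio
      messageId={message.messageId}
      content={message.text}
      isLast={isLast}
      index={0}
      className="hover-button rounded-md p-1 pl-0"
    />
  );
}
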
client/src/components/Audio/Voices.tsx (new file, 94 lines)

import React from 'react';
import { useRecoilState } from 'recoil';
import type { Option } from '~/common';
import DropdownNoState from '~/components/ui/DropdownNoState';
import { useLocalize, useTTSBrowser, useTTSEdge, useTTSExternal } from '~/hooks';
import { logger } from '~/utils';
import store from '~/store';

export function EdgeVoiceDropdown() {
  const localize = useLocalize();
  const { voices = [] } = useTTSEdge();
  const [voice, setVoice] = useRecoilState(store.voice);

  const handleVoiceChange = (newValue?: string | Option) => {
    logger.log('Edge Voice changed:', newValue);
    const newVoice = typeof newValue === 'string' ? newValue : newValue?.value;
    if (newVoice != null) {
      return setVoice(newVoice.toString());
    }
  };

  return (
    <div className="flex items-center justify-between">
      <div>{localize('com_nav_voice_select')}</div>
      <DropdownNoState
        key={`edge-voice-dropdown-${voices.length}`}
        value={voice}
        options={voices}
        onChange={handleVoiceChange}
        sizeClasses="min-w-[200px] !max-w-[400px] [--anchor-max-width:400px]"
        anchor="bottom start"
        testId="EdgeVoiceDropdown"
      />
    </div>
  );
}

export function BrowserVoiceDropdown() {
  const localize = useLocalize();
  const { voices = [] } = useTTSBrowser();
  const [voice, setVoice] = useRecoilState(store.voice);

  const handleVoiceChange = (newValue?: string | Option) => {
    logger.log('Browser Voice changed:', newValue);
    const newVoice = typeof newValue === 'string' ? newValue : newValue?.value;
    if (newVoice != null) {
      return setVoice(newVoice.toString());
    }
  };

  return (
    <div className="flex items-center justify-between">
      <div>{localize('com_nav_voice_select')}</div>
      <DropdownNoState
        key={`browser-voice-dropdown-${voices.length}`}
        value={voice}
        options={voices}
        onChange={handleVoiceChange}
        sizeClasses="min-w-[200px] !max-w-[400px] [--anchor-max-width:400px]"
        anchor="bottom start"
        testId="BrowserVoiceDropdown"
      />
    </div>
  );
}

export function ExternalVoiceDropdown() {
  const localize = useLocalize();
  const { voices = [] } = useTTSExternal();
  const [voice, setVoice] = useRecoilState(store.voice);

  const handleVoiceChange = (newValue?: string | Option) => {
    logger.log('External Voice changed:', newValue);
    const newVoice = typeof newValue === 'string' ? newValue : newValue?.value;
    if (newVoice != null) {
      return setVoice(newVoice.toString());
    }
  };

  return (
    <div className="flex items-center justify-between">
      <div>{localize('com_nav_voice_select')}</div>
      <DropdownNoState
        key={`external-voice-dropdown-${voices.length}`}
        value={voice}
        options={voices}
        onChange={handleVoiceChange}
        sizeClasses="min-w-[200px] !max-w-[400px] [--anchor-max-width:400px]"
        anchor="bottom start"
        testId="ExternalVoiceDropdown"
      />
    </div>
  );
}
@@ -79,6 +79,7 @@ export default function HoverButtons({
           messageId={message.messageId}
           content={message.content ?? message.text}
           isLast={isLast}
+          className="hover-button rounded-md p-1 pl-0 text-gray-500 hover:bg-gray-100 hover:text-gray-500 dark:text-gray-400/70 dark:hover:bg-gray-700 dark:hover:text-gray-200 disabled:dark:hover:text-gray-400 md:group-hover:visible md:group-[.final-completion]:visible"
         />
       )}
       {isEditableEndpoint && (
@@ -1,104 +1,22 @@
-import { useEffect } from 'react';
+// client/src/components/Chat/Messages/MessageAudio.tsx
+import { memo } from 'react';
 import { useRecoilValue } from 'recoil';
-import type { TMessageContentParts } from 'librechat-data-provider';
-import { VolumeIcon, VolumeMuteIcon, Spinner } from '~/components/svg';
-import { useLocalize, useTextToSpeech } from '~/hooks';
-import { logger } from '~/utils';
+import type { TMessageAudio } from '~/common';
+import { BrowserTTS, EdgeTTS, ExternalTTS } from '~/components/Audio/TTS';
+import { TTSEndpoints } from '~/common';
 import store from '~/store';
 
-type THoverButtons = {
-  messageId?: string;
-  content?: TMessageContentParts[] | string;
-  isLast: boolean;
-  index: number;
-};
-
-export default function MessageAudio({ isLast, index, messageId, content }: THoverButtons) {
-  const localize = useLocalize();
-  const playbackRate = useRecoilValue(store.playbackRate);
-
-  const { toggleSpeech, isSpeaking, isLoading, audioRef } = useTextToSpeech({
-    isLast,
-    index,
-    messageId,
-    content,
-  });
-
-  const renderIcon = (size: string) => {
-    if (isLoading === true) {
-      return <Spinner size={size} />;
-    }
-
-    if (isSpeaking === true) {
-      return <VolumeMuteIcon size={size} />;
-    }
-
-    return <VolumeIcon size={size} />;
-  };
-
-  useEffect(() => {
-    const messageAudio = document.getElementById(`audio-${messageId}`) as HTMLAudioElement | null;
-    if (!messageAudio) {
-      return;
-    }
-    if (playbackRate != null && playbackRate > 0 && messageAudio.playbackRate !== playbackRate) {
-      messageAudio.playbackRate = playbackRate;
-    }
-  }, [audioRef, isSpeaking, playbackRate, messageId]);
-
-  logger.log(
-    'MessageAudio: audioRef.current?.src, audioRef.current',
-    audioRef.current?.src,
-    audioRef.current,
-  );
-
-  return (
-    <>
-      <button
-        className="hover-button rounded-md p-1 pl-0 text-gray-500 hover:bg-gray-100 hover:text-gray-500 dark:text-gray-400/70 dark:hover:bg-gray-700 dark:hover:text-gray-200 disabled:dark:hover:text-gray-400 md:group-hover:visible md:group-[.final-completion]:visible"
-        // onMouseDownCapture={() => {
-        //   if (audioRef.current) {
-        //     audioRef.current.muted = false;
-        //   }
-        //   handleMouseDown();
-        // }}
-        // onMouseUpCapture={() => {
-        //   if (audioRef.current) {
-        //     audioRef.current.muted = false;
-        //   }
-        //   handleMouseUp();
-        // }}
-        onClickCapture={() => {
-          if (audioRef.current) {
-            audioRef.current.muted = false;
-          }
-          toggleSpeech();
-        }}
-        type="button"
-        title={isSpeaking === true ? localize('com_ui_stop') : localize('com_ui_read_aloud')}
-      >
-        {renderIcon('19')}
-      </button>
-      <audio
-        ref={audioRef}
-        controls
-        preload="none"
-        controlsList="nodownload nofullscreen noremoteplayback"
-        style={{
-          position: 'absolute',
-          overflow: 'hidden',
-          display: 'none',
-          height: '0px',
-          width: '0px',
-        }}
-        src={audioRef.current?.src}
-        onError={(error) => {
-          console.error('Error fetching audio:', error);
-        }}
-        id={`audio-${messageId}`}
-        muted
-        autoPlay
-      />
-    </>
-  );
-}
+
+function MessageAudio(props: TMessageAudio) {
+  const engineTTS = useRecoilValue<string>(store.engineTTS);
+
+  const TTSComponents = {
+    [TTSEndpoints.edge]: EdgeTTS,
+    [TTSEndpoints.browser]: BrowserTTS,
+    [TTSEndpoints.external]: ExternalTTS,
+  };
+
+  const SelectedTTS = TTSComponents[engineTTS];
+  return <SelectedTTS {...props} />;
+}
+
+export default memo(MessageAudio);
@@ -1,37 +1,21 @@
-import React from 'react';
-import { useRecoilState, useRecoilValue } from 'recoil';
-import type { Option } from '~/common';
-import DropdownNoState from '~/components/ui/DropdownNoState';
-import { useLocalize, useTextToSpeech } from '~/hooks';
-import { logger } from '~/utils';
+import { useRecoilValue } from 'recoil';
+import {
+  EdgeVoiceDropdown,
+  BrowserVoiceDropdown,
+  ExternalVoiceDropdown,
+} from '~/components/Audio/Voices';
 import store from '~/store';
+import { TTSEndpoints } from '~/common';
+
+const voiceDropdownComponentsMap = {
+  [TTSEndpoints.edge]: EdgeVoiceDropdown,
+  [TTSEndpoints.browser]: BrowserVoiceDropdown,
+  [TTSEndpoints.external]: ExternalVoiceDropdown,
+};
 
 export default function VoiceDropdown() {
-  const localize = useLocalize();
-  const { voices = [] } = useTextToSpeech();
-  const [voice, setVoice] = useRecoilState(store.voice);
   const engineTTS = useRecoilValue<string>(store.engineTTS);
+  const VoiceDropdownComponent = voiceDropdownComponentsMap[engineTTS];
 
-  const handleVoiceChange = (newValue?: string | Option) => {
-    logger.log('Voice changed:', newValue);
-    const newVoice = typeof newValue === 'string' ? newValue : newValue?.value;
-    if (newVoice != null) {
-      return setVoice(newVoice.toString());
-    }
-  };
-
-  return (
-    <div className="flex items-center justify-between">
-      <div>{localize('com_nav_voice_select')}</div>
-      <DropdownNoState
-        key={`voice-dropdown-${engineTTS}-${voices.length}`}
-        value={voice}
-        options={voices}
-        onChange={handleVoiceChange}
-        sizeClasses="min-w-[200px] !max-w-[400px] [--anchor-max-width:400px]"
-        anchor="bottom start"
-        testId="VoiceDropdown"
-      />
-    </div>
-  );
+  return <VoiceDropdownComponent />;
 }
@@ -1,3 +1,6 @@
 export * from './MediaSourceAppender';
 export { default as useCustomAudioRef } from './useCustomAudioRef';
 export { default as usePauseGlobalAudio } from './usePauseGlobalAudio';
+export { default as useTTSExternal } from './useTTSExternal';
+export { default as useTTSBrowser } from './useTTSBrowser';
+export { default as useTTSEdge } from './useTTSEdge';
client/src/hooks/Audio/useTTSBrowser.ts (new file, 100 lines)

// client/src/hooks/Audio/useTTSBrowser.ts
import { useRef, useEffect, useState } from 'react';
import { useRecoilState, useRecoilValue } from 'recoil';
import { parseTextParts } from 'librechat-data-provider';
import type { TMessageContentParts } from 'librechat-data-provider';
import useTextToSpeechBrowser from '~/hooks/Input/useTextToSpeechBrowser';
import usePauseGlobalAudio from '~/hooks/Audio/usePauseGlobalAudio';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import { logger } from '~/utils';
import store from '~/store';

type TUseTextToSpeech = {
  messageId?: string;
  content?: TMessageContentParts[] | string;
  isLast?: boolean;
  index?: number;
};

const useTTSBrowser = (props?: TUseTextToSpeech) => {
  const { content, isLast = false, index = 0 } = props ?? {};

  const isMouseDownRef = useRef(false);
  const timerRef = useRef<number | undefined>(undefined);
  const [isSpeakingState, setIsSpeaking] = useState(false);
  const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });

  const { pauseGlobalAudio } = usePauseGlobalAudio(index);
  const [voice, setVoice] = useRecoilState(store.voice);
  const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));

  const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);

  const {
    generateSpeechLocal: generateSpeech,
    cancelSpeechLocal: cancelSpeech,
    voices,
  } = useTextToSpeechBrowser({ setIsSpeaking });

  useEffect(() => {
    const firstVoice = voices[0];
    if (voices.length && typeof firstVoice === 'object') {
      const lastSelectedVoice = voices.find((v) =>
        typeof v === 'object' ? v.value === voice : v === voice,
      );
      if (lastSelectedVoice != null) {
        const currentVoice =
          typeof lastSelectedVoice === 'object' ? lastSelectedVoice.value : lastSelectedVoice;
        logger.log('useTextToSpeech.ts - Effect:', { voices, voice: currentVoice });
        setVoice(currentVoice);
        return;
      }

      logger.log('useTextToSpeech.ts - Effect:', { voices, voice: firstVoice.value });
      setVoice(firstVoice.value);
    }
  }, [setVoice, voice, voices]);

  const handleMouseDown = () => {
    isMouseDownRef.current = true;
    timerRef.current = window.setTimeout(() => {
      if (isMouseDownRef.current) {
        const messageContent = content ?? '';
        const parsedMessage =
          typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
        generateSpeech(parsedMessage);
      }
    }, 1000);
  };

  const handleMouseUp = () => {
    isMouseDownRef.current = false;
    if (timerRef.current != null) {
      window.clearTimeout(timerRef.current);
    }
  };

  const toggleSpeech = () => {
    if (isSpeaking === true) {
      cancelSpeech();
      pauseGlobalAudio();
    } else {
      const messageContent = content ?? '';
      const parsedMessage =
        typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
      generateSpeech(parsedMessage);
    }
  };

  return {
    handleMouseDown,
    handleMouseUp,
    toggleSpeech,
    isSpeaking,
    isLoading: false,
    audioRef,
    voices,
  };
};

export default useTTSBrowser;
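
A minimal consumer sketch for this hook, assuming only the return shape shown above (the real consumer is `BrowserTTS` in `client/src/components/Audio/TTS.tsx`; the component and prop names here are illustrative):

// Hypothetical consumer; BrowserTTS above is the real one.
import useTTSBrowser from '~/hooks/Audio/useTTSBrowser';

export function ReadAloudButton({ messageId, text }: { messageId: string; text: string }) {
  const { toggleSpeech, isSpeaking, isLoading, audioRef } = useTTSBrowser({
    messageId,
    content: text,
    isLast: true,
    index: 0,
  });

  return (
    <>
      <button type="button" onClick={toggleSpeech} disabled={isLoading}>
        {isSpeaking ? 'Stop' : 'Read aloud'}
      </button>
      {/* Hidden element kept for parity with the other engines; browser TTS itself speaks via window.speechSynthesis. */}
      <audio ref={audioRef} id={`audio-${messageId}`} muted autoPlay style={{ display: 'none' }} />
    </>
  );
}
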
client/src/hooks/Audio/useTTSEdge.ts (new file, 100 lines)

// client/src/hooks/Audio/useTTSEdge.ts
import { useRef, useEffect, useState } from 'react';
import { useRecoilState, useRecoilValue } from 'recoil';
import { parseTextParts } from 'librechat-data-provider';
import type { TMessageContentParts } from 'librechat-data-provider';
import usePauseGlobalAudio from '~/hooks/Audio/usePauseGlobalAudio';
import useTextToSpeechEdge from '~/hooks/Input/useTextToSpeechEdge';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import { logger } from '~/utils';
import store from '~/store';

type TUseTextToSpeech = {
  messageId?: string;
  content?: TMessageContentParts[] | string;
  isLast?: boolean;
  index?: number;
};

const useTTSEdge = (props?: TUseTextToSpeech) => {
  const { content, isLast = false, index = 0 } = props ?? {};

  const isMouseDownRef = useRef(false);
  const timerRef = useRef<number | undefined>(undefined);
  const [isSpeakingState, setIsSpeaking] = useState(false);
  const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });

  const { pauseGlobalAudio } = usePauseGlobalAudio(index);
  const [voice, setVoice] = useRecoilState(store.voice);
  const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));

  const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);

  const {
    generateSpeechEdge: generateSpeech,
    cancelSpeechEdge: cancelSpeech,
    voices,
  } = useTextToSpeechEdge({ setIsSpeaking });

  useEffect(() => {
    const firstVoice = voices[0];
    if (voices.length && typeof firstVoice === 'object') {
      const lastSelectedVoice = voices.find((v) =>
        typeof v === 'object' ? v.value === voice : v === voice,
      );
      if (lastSelectedVoice != null) {
        const currentVoice =
          typeof lastSelectedVoice === 'object' ? lastSelectedVoice.value : lastSelectedVoice;
        logger.log('useTextToSpeech.ts - Effect:', { voices, voice: currentVoice });
        setVoice(currentVoice);
        return;
      }

      logger.log('useTextToSpeech.ts - Effect:', { voices, voice: firstVoice.value });
      setVoice(firstVoice.value);
    }
  }, [setVoice, voice, voices]);

  const handleMouseDown = () => {
    isMouseDownRef.current = true;
    timerRef.current = window.setTimeout(() => {
      if (isMouseDownRef.current) {
        const messageContent = content ?? '';
        const parsedMessage =
          typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
        generateSpeech(parsedMessage);
      }
    }, 1000);
  };

  const handleMouseUp = () => {
    isMouseDownRef.current = false;
    if (timerRef.current != null) {
      window.clearTimeout(timerRef.current);
    }
  };

  const toggleSpeech = () => {
    if (isSpeaking === true) {
      cancelSpeech();
      pauseGlobalAudio();
    } else {
      const messageContent = content ?? '';
      const parsedMessage =
        typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
      generateSpeech(parsedMessage);
    }
  };

  return {
    handleMouseDown,
    handleMouseUp,
    toggleSpeech,
    isSpeaking,
    isLoading: false,
    audioRef,
    voices,
  };
};

export default useTTSEdge;
client/src/hooks/Audio/useTTSExternal.ts (new file, 101 lines)

// client/src/hooks/Audio/useTTSExternal.ts
import { useRef, useEffect, useState } from 'react';
import { useRecoilState, useRecoilValue } from 'recoil';
import { parseTextParts } from 'librechat-data-provider';
import type { TMessageContentParts } from 'librechat-data-provider';
import useTextToSpeechExternal from '~/hooks/Input/useTextToSpeechExternal';
import usePauseGlobalAudio from '~/hooks/Audio/usePauseGlobalAudio';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import { logger } from '~/utils';
import store from '~/store';

type TUseTextToSpeech = {
  messageId?: string;
  content?: TMessageContentParts[] | string;
  isLast?: boolean;
  index?: number;
};

const useTTSExternal = (props?: TUseTextToSpeech) => {
  const { messageId, content, isLast = false, index = 0 } = props ?? {};

  const isMouseDownRef = useRef(false);
  const timerRef = useRef<number | undefined>(undefined);
  const [isSpeakingState, setIsSpeaking] = useState(false);
  const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });

  const { pauseGlobalAudio } = usePauseGlobalAudio(index);
  const [voice, setVoice] = useRecoilState(store.voice);
  const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));

  const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);
  const {
    cancelSpeech,
    generateSpeechExternal: generateSpeech,
    isLoading,
    voices,
  } = useTextToSpeechExternal({
    setIsSpeaking,
    audioRef,
    messageId,
    isLast,
    index,
  });

  useEffect(() => {
    const firstVoice = voices[0];
    if (voices.length) {
      const lastSelectedVoice = voices.find((v) => v === voice);
      if (lastSelectedVoice != null) {
        logger.log('useTextToSpeech.ts - Effect:', { voices, voice: lastSelectedVoice });
        setVoice(lastSelectedVoice.toString());
        return;
      }
      logger.log('useTextToSpeech.ts - Effect:', { voices, voice: firstVoice });
      setVoice(firstVoice.toString());
    }
  }, [setVoice, voice, voices]);

  const handleMouseDown = () => {
    isMouseDownRef.current = true;
    timerRef.current = window.setTimeout(() => {
      if (isMouseDownRef.current) {
        const messageContent = content ?? '';
        const parsedMessage =
          typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
        generateSpeech(parsedMessage, false);
      }
    }, 1000);
  };

  const handleMouseUp = () => {
    isMouseDownRef.current = false;
    if (timerRef.current != null) {
      window.clearTimeout(timerRef.current);
    }
  };

  const toggleSpeech = () => {
    if (isSpeaking === true) {
      cancelSpeech();
      pauseGlobalAudio();
    } else {
      const messageContent = content ?? '';
      const parsedMessage =
        typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
      generateSpeech(parsedMessage, false);
    }
  };

  return {
    handleMouseDown,
    handleMouseUp,
    toggleSpeech,
    isSpeaking,
    isLoading,
    audioRef,
    voices,
  };
};

export default useTTSExternal;
@@ -2,17 +2,6 @@ import { useMemo } from 'react';
 import { useRecoilValue } from 'recoil';
 import store from '~/store';
 
-export enum STTEndpoints {
-  browser = 'browser',
-  external = 'external',
-}
-
-export enum TTSEndpoints {
-  browser = 'browser',
-  edge = 'edge',
-  external = 'external',
-}
-
 const useGetAudioSettings = () => {
   const engineSTT = useRecoilValue<string>(store.engineSTT);
   const engineTTS = useRecoilValue<string>(store.engineTTS);
@@ -3,10 +3,10 @@ import { useRef, useMemo, useEffect, useState } from 'react';
 import { parseTextParts } from 'librechat-data-provider';
 import type { TMessageContentParts } from 'librechat-data-provider';
 import type { Option } from '~/common';
-import useTextToSpeechExternal from './useTextToSpeechExternal';
-import useTextToSpeechBrowser from './useTextToSpeechBrowser';
-import useGetAudioSettings from './useGetAudioSettings';
-import useTextToSpeechEdge from './useTextToSpeechEdge';
+import useTextToSpeechExternal from '~/hooks/Input/useTextToSpeechExternal';
+import useTextToSpeechBrowser from '~/hooks/Input/useTextToSpeechBrowser';
+import useGetAudioSettings from '~/hooks/Input/useGetAudioSettings';
+import useTextToSpeechEdge from '~/hooks/Input/useTextToSpeechEdge';
 import useAudioRef from '~/hooks/Audio/useAudioRef';
 import { usePauseGlobalAudio } from '../Audio';
 import { logger } from '~/utils';
@@ -1,43 +1,54 @@
 import { useRecoilState } from 'recoil';
 import { useState, useEffect, useCallback } from 'react';
+import type { VoiceOption } from '~/common';
 import store from '~/store';
 
-interface VoiceOption {
-  value: string;
-  label: string;
-}
-
 function useTextToSpeechBrowser({
   setIsSpeaking,
 }: {
-  setIsSpeaking: (isSpeaking: boolean) => void;
+  setIsSpeaking: React.Dispatch<React.SetStateAction<boolean>>;
 }) {
   const [cloudBrowserVoices] = useRecoilState(store.cloudBrowserVoices);
   const [voiceName] = useRecoilState(store.voice);
   const [voices, setVoices] = useState<VoiceOption[]>([]);
 
   const updateVoices = useCallback(() => {
-    const availableVoices = window.speechSynthesis
-      .getVoices()
-      .filter((v) => cloudBrowserVoices || v.localService === true);
-
-    const voiceOptions: VoiceOption[] = availableVoices.map((v) => ({
-      value: v.name,
-      label: v.name,
-    }));
-
-    setVoices(voiceOptions);
+    try {
+      const availableVoices = window.speechSynthesis.getVoices();
+      if (!Array.isArray(availableVoices)) {
+        console.error('getVoices() did not return an array');
+        return;
+      }
+
+      const filteredVoices = availableVoices.filter(
+        (v) => cloudBrowserVoices || v.localService === true,
+      );
+      const voiceOptions: VoiceOption[] = filteredVoices.map((v) => ({
+        value: v.name,
+        label: v.name,
+      }));
+
+      setVoices(voiceOptions);
+    } catch (error) {
+      console.error('Error updating voices:', error);
+    }
   }, [cloudBrowserVoices]);
 
   useEffect(() => {
-    if (window.speechSynthesis.getVoices().length) {
-      updateVoices();
-    } else {
-      window.speechSynthesis.onvoiceschanged = updateVoices;
-    }
+    const synth = window.speechSynthesis;
+
+    try {
+      if (synth.getVoices().length) {
+        updateVoices();
+      } else {
+        synth.onvoiceschanged = updateVoices;
+      }
+    } catch (error) {
+      console.error('Error in useEffect:', error);
+    }
 
     return () => {
-      window.speechSynthesis.onvoiceschanged = null;
+      synth.onvoiceschanged = null;
     };
   }, [updateVoices]);
@@ -46,22 +57,37 @@ function useTextToSpeechBrowser({
     const voice = voices.find((v) => v.value === voiceName);
 
     if (!voice) {
+      console.warn('Selected voice not found');
       return;
     }
 
-    synth.cancel();
-    const utterance = new SpeechSynthesisUtterance(text);
-    utterance.voice = synth.getVoices().find((v) => v.name === voice.value) || null;
-    utterance.onend = () => {
-      setIsSpeaking(false);
-    };
-    setIsSpeaking(true);
-    synth.speak(utterance);
+    try {
+      synth.cancel();
+      const utterance = new SpeechSynthesisUtterance(text);
+      utterance.voice = synth.getVoices().find((v) => v.name === voice.value) || null;
+      utterance.onend = () => {
+        setIsSpeaking(false);
+      };
+      utterance.onerror = (event) => {
+        console.error('Speech synthesis error:', event);
+        setIsSpeaking(false);
+      };
+      setIsSpeaking(true);
+      synth.speak(utterance);
+    } catch (error) {
+      console.error('Error generating speech:', error);
+      setIsSpeaking(false);
+    }
   };
 
   const cancelSpeechLocal = () => {
-    window.speechSynthesis.cancel();
-    setIsSpeaking(false);
+    try {
+      window.speechSynthesis.cancel();
+    } catch (error) {
+      console.error('Error cancelling speech:', error);
+    } finally {
+      setIsSpeaking(false);
+    }
   };
 
   return { generateSpeechLocal, cancelSpeechLocal, voices };
@@ -1,28 +1,24 @@
 import { useRecoilValue } from 'recoil';
-import { useState, useCallback, useRef, useEffect, useMemo } from 'react';
 import { MsEdgeTTS, OUTPUT_FORMAT } from 'msedge-tts';
-import { useToastContext } from '~/Providers';
+import { useState, useCallback, useRef, useEffect, useMemo } from 'react';
+import type { VoiceOption } from '~/common';
+import { useToastContext } from '~/Providers/ToastContext';
 import useLocalize from '~/hooks/useLocalize';
 import store from '~/store';
 
-interface Voice {
-  value: string;
-  label: string;
-}
-
 interface UseTextToSpeechEdgeReturn {
   generateSpeechEdge: (text: string) => void;
   cancelSpeechEdge: () => void;
-  voices: Voice[];
+  voices: VoiceOption[];
 }
 
 function useTextToSpeechEdge({
   setIsSpeaking,
 }: {
-  setIsSpeaking: (isSpeaking: boolean) => void;
+  setIsSpeaking: React.Dispatch<React.SetStateAction<boolean>>;
 }): UseTextToSpeechEdgeReturn {
   const localize = useLocalize();
-  const [voices, setVoices] = useState<Voice[]>([]);
+  const [voices, setVoices] = useState<VoiceOption[]>([]);
   const voiceName = useRecoilValue(store.voice);
   const ttsRef = useRef<MsEdgeTTS | null>(null);
   const audioElementRef = useRef<HTMLAudioElement | null>(null);
@@ -63,7 +59,7 @@ function useTextToSpeechEdge({
     if (!ttsRef.current) {
      ttsRef.current = new MsEdgeTTS();
     }
-    const availableVoice: Voice | undefined = voices.find((v) => v.value === voiceName);
+    const availableVoice: VoiceOption | undefined = voices.find((v) => v.value === voiceName);
 
     if (availableVoice) {
       ttsRef.current
@@ -181,7 +177,7 @@ function useTextToSpeechEdge({
 
       generate();
     },
-    [appendNextBuffer, showToast, localize],
+    [setIsSpeaking, appendNextBuffer, showToast, localize],
   );
 
   const cancelSpeechEdge = useCallback(() => {
@@ -202,7 +198,7 @@ function useTextToSpeechEdge({
         status: 'error',
       });
     }
-  }, [showToast, localize]);
+  }, [setIsSpeaking, showToast, localize]);
 
   useEffect(() => {
     if (!isBrowserSupported) {
@@ -1,8 +1,8 @@
 import { useRecoilValue } from 'recoil';
 import { useState, useMemo, useRef, useCallback, useEffect } from 'react';
 import { useTextToSpeechMutation, useVoicesQuery } from '~/data-provider';
+import { useToastContext } from '~/Providers/ToastContext';
 import useLocalize from '~/hooks/useLocalize';
-import { useToastContext } from '~/Providers';
 import store from '~/store';
 
 const createFormData = (text: string, voice: string) => {
@@ -13,7 +13,7 @@ const createFormData = (text: string, voice: string) => {
 };
 
 type TUseTTSExternal = {
-  setIsSpeaking: (isSpeaking: boolean) => void;
+  setIsSpeaking: React.Dispatch<React.SetStateAction<boolean>>;
   audioRef: React.MutableRefObject<HTMLAudioElement | null>;
   messageId?: string;
   isLast: boolean;
@@ -1,3 +1,4 @@
+export * from './Audio';
 export * from './Assistants';
 export * from './Chat';
 export * from './Config';
@@ -664,6 +664,8 @@
   com_nav_audio_process_error: 'Error processing audio: {0}',
   com_nav_long_audio_warning: 'Longer texts will take longer to process.',
   com_nav_tts_init_error: 'Failed to initialize text-to-speech: {0}',
+  com_nav_tts_unsupported_error:
+    'Text-to-speech for the selected engine is not supported in this browser.',
   com_nav_source_buffer_error: 'Error setting up audio playback. Please refresh the page.',
   com_nav_media_source_init_error:
     'Unable to prepare audio player. Please check your browser settings.',
@@ -31,7 +31,8 @@ registration:
 #       url: ''
 #       apiKey: '${TTS_API_KEY}'
 #       model: ''
-#       voice: ''
+#       voices: ['']
+
 #
 #  stt:
 #    openai: