🔀 refactor: Modularize TTS Logic for Improved Browser Support (#3657)

* WIP: message audio refactor

* WIP: use MessageAudio by provider

* fix: Update MessageAudio component to use TTSEndpoints enum
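
For context, a minimal sketch of the provider-dispatch idea behind this change, assuming a simplified MessageAudio: pick a per-engine audio component by TTSEndpoints value (the enum values match those shown in the useGetAudioSettings diff below). The placeholder components and props are illustrative, not the actual LibreChat implementation.

```tsx
// Illustrative sketch only; not the actual MessageAudio component.
// Each TTS engine gets its own component so every provider-specific hook
// (useTTSBrowser / useTTSEdge / useTTSExternal) is called unconditionally,
// keeping React's rules of hooks intact when the engine setting changes.
import React from 'react';

enum TTSEndpoints {
  browser = 'browser',
  edge = 'edge',
  external = 'external',
}

type AudioProps = { content: string };

// Placeholder engine components; the real ones would wrap the hooks added below.
const BrowserTTS: React.FC<AudioProps> = ({ content }) => <span>{`browser: ${content}`}</span>;
const EdgeTTS: React.FC<AudioProps> = ({ content }) => <span>{`edge: ${content}`}</span>;
const ExternalTTS: React.FC<AudioProps> = ({ content }) => <span>{`external: ${content}`}</span>;

const audioByEndpoint: Record<TTSEndpoints, React.FC<AudioProps>> = {
  [TTSEndpoints.browser]: BrowserTTS,
  [TTSEndpoints.edge]: EdgeTTS,
  [TTSEndpoints.external]: ExternalTTS,
};

export default function MessageAudio({ engine, content }: AudioProps & { engine: TTSEndpoints }) {
  const AudioComponent = audioByEndpoint[engine];
  return <AudioComponent content={content} />;
}
```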

* feat: Update useTextToSpeechBrowser hook to handle errors and improve error logging

* feat: Add voice dropdown components for different TTS engines
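
A rough sketch of what one of these dropdowns might look like, assuming a plain select element: it renders the VoiceOption list returned by an engine's hook and writes the selection to the shared voice atom. The markup and handler are illustrative only.

```tsx
// Hypothetical voice dropdown; the real components differ per TTS engine.
import React from 'react';
import { useRecoilState } from 'recoil';
import type { VoiceOption } from '~/common';
import store from '~/store';

export function VoiceDropdownSketch({ voices }: { voices: VoiceOption[] }) {
  // Selection is shared through the store.voice atom, as in the hooks below.
  const [voice, setVoice] = useRecoilState(store.voice);

  return (
    <select value={voice ?? ''} onChange={(e) => setVoice(e.target.value)}>
      {voices.map((v) => (
        <option key={v.value} value={v.value}>
          {v.label}
        </option>
      ))}
    </select>
  );
}
```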

* docs: update incorrect `voices` example

changed `voice: ''` to `voices: ['alloy']`

* feat: Add browser support check for Edge TTS engine component, with error toast if not supported
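
A sketch, under assumptions, of the kind of guard this adds: if the browser lacks the APIs the Edge streaming path needs, surface an error toast instead of exposing the engine. The MediaSource capability check and the message text are placeholders, not the exact implementation.

```tsx
// Hedged sketch of a browser-support guard for the Edge TTS engine component.
import { useEffect, useMemo } from 'react';
import { useToastContext } from '~/Providers';

export default function useEdgeTTSSupportGuard() {
  const { showToast } = useToastContext();

  // Assumed capability check; the real component may test different APIs.
  const isBrowserSupported = useMemo(
    () => typeof window !== 'undefined' && 'MediaSource' in window,
    [],
  );

  useEffect(() => {
    if (!isBrowserSupported) {
      showToast({
        // The real component localizes this message; a plain string keeps the sketch simple.
        message: 'Text-to-speech is not supported in this browser.',
        status: 'error',
      });
    }
  }, [isBrowserSupported, showToast]);

  return isBrowserSupported;
}
```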

---------

Co-authored-by: Marco Beretta <81851188+berry-13@users.noreply.github.com>
Danny Avila 2024-08-15 11:34:25 -04:00 committed by GitHub
parent bcde0beb47
commit dba704079c
18 changed files with 784 additions and 187 deletions


@@ -1,3 +1,6 @@
export * from './MediaSourceAppender';
export { default as useCustomAudioRef } from './useCustomAudioRef';
export { default as usePauseGlobalAudio } from './usePauseGlobalAudio';
export { default as useTTSExternal } from './useTTSExternal';
export { default as useTTSBrowser } from './useTTSBrowser';
export { default as useTTSEdge } from './useTTSEdge';


@@ -0,0 +1,100 @@
// client/src/hooks/Audio/useTTSBrowser.ts
import { useRef, useEffect, useState } from 'react';
import { useRecoilState, useRecoilValue } from 'recoil';
import { parseTextParts } from 'librechat-data-provider';
import type { TMessageContentParts } from 'librechat-data-provider';
import useTextToSpeechBrowser from '~/hooks/Input/useTextToSpeechBrowser';
import usePauseGlobalAudio from '~/hooks/Audio/usePauseGlobalAudio';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import { logger } from '~/utils';
import store from '~/store';
type TUseTextToSpeech = {
messageId?: string;
content?: TMessageContentParts[] | string;
isLast?: boolean;
index?: number;
};
const useTTSBrowser = (props?: TUseTextToSpeech) => {
const { content, isLast = false, index = 0 } = props ?? {};
const isMouseDownRef = useRef(false);
const timerRef = useRef<number | undefined>(undefined);
const [isSpeakingState, setIsSpeaking] = useState(false);
const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
const { pauseGlobalAudio } = usePauseGlobalAudio(index);
const [voice, setVoice] = useRecoilState(store.voice);
const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));
const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);
const {
generateSpeechLocal: generateSpeech,
cancelSpeechLocal: cancelSpeech,
voices,
} = useTextToSpeechBrowser({ setIsSpeaking });
useEffect(() => {
const firstVoice = voices[0];
if (voices.length && typeof firstVoice === 'object') {
const lastSelectedVoice = voices.find((v) =>
typeof v === 'object' ? v.value === voice : v === voice,
);
if (lastSelectedVoice != null) {
const currentVoice =
typeof lastSelectedVoice === 'object' ? lastSelectedVoice.value : lastSelectedVoice;
logger.log('useTextToSpeech.ts - Effect:', { voices, voice: currentVoice });
setVoice(currentVoice);
return;
}
logger.log('useTextToSpeech.ts - Effect:', { voices, voice: firstVoice.value });
setVoice(firstVoice.value);
}
}, [setVoice, voice, voices]);
const handleMouseDown = () => {
isMouseDownRef.current = true;
timerRef.current = window.setTimeout(() => {
if (isMouseDownRef.current) {
const messageContent = content ?? '';
const parsedMessage =
typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
generateSpeech(parsedMessage);
}
}, 1000);
};
const handleMouseUp = () => {
isMouseDownRef.current = false;
if (timerRef.current != null) {
window.clearTimeout(timerRef.current);
}
};
const toggleSpeech = () => {
if (isSpeaking === true) {
cancelSpeech();
pauseGlobalAudio();
} else {
const messageContent = content ?? '';
const parsedMessage =
typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
generateSpeech(parsedMessage);
}
};
return {
handleMouseDown,
handleMouseUp,
toggleSpeech,
isSpeaking,
isLoading: false,
audioRef,
voices,
};
};
export default useTTSBrowser;
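
The hook above (and its Edge and external siblings below) returns a uniform shape. A usage sketch follows (assumed markup, not the real message action button): handleMouseDown starts the one-second hold-to-speak timer, handleMouseUp cancels it, and toggleSpeech handles a plain click.

```tsx
// Hypothetical consumer of useTTSBrowser; the element structure is illustrative.
import React from 'react';
import { useTTSBrowser } from '~/hooks/Audio';

export function SpeakButtonSketch({ content }: { content: string }) {
  const { handleMouseDown, handleMouseUp, toggleSpeech, isSpeaking, audioRef } = useTTSBrowser({
    content,
    isLast: true,
    index: 0,
  });

  return (
    <>
      <button
        onMouseDown={handleMouseDown}
        onMouseUp={handleMouseUp}
        onClick={toggleSpeech}
        aria-label={isSpeaking ? 'Stop speech' : 'Read message aloud'}
      >
        {isSpeaking ? 'Stop' : 'Speak'}
      </button>
      {/* The browser engine speaks via SpeechSynthesis, so audioRef may go unused here;
          the shared return shape keeps the three TTS hooks interchangeable. */}
      <audio ref={audioRef} hidden />
    </>
  );
}
```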


@@ -0,0 +1,100 @@
// client/src/hooks/Audio/useTTSEdge.ts
import { useRef, useEffect, useState } from 'react';
import { useRecoilState, useRecoilValue } from 'recoil';
import { parseTextParts } from 'librechat-data-provider';
import type { TMessageContentParts } from 'librechat-data-provider';
import usePauseGlobalAudio from '~/hooks/Audio/usePauseGlobalAudio';
import useTextToSpeechEdge from '~/hooks/Input/useTextToSpeechEdge';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import { logger } from '~/utils';
import store from '~/store';
type TUseTextToSpeech = {
messageId?: string;
content?: TMessageContentParts[] | string;
isLast?: boolean;
index?: number;
};
const useTTSEdge = (props?: TUseTextToSpeech) => {
const { content, isLast = false, index = 0 } = props ?? {};
const isMouseDownRef = useRef(false);
const timerRef = useRef<number | undefined>(undefined);
const [isSpeakingState, setIsSpeaking] = useState(false);
const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
const { pauseGlobalAudio } = usePauseGlobalAudio(index);
const [voice, setVoice] = useRecoilState(store.voice);
const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));
const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);
const {
generateSpeechEdge: generateSpeech,
cancelSpeechEdge: cancelSpeech,
voices,
} = useTextToSpeechEdge({ setIsSpeaking });
useEffect(() => {
const firstVoice = voices[0];
if (voices.length && typeof firstVoice === 'object') {
const lastSelectedVoice = voices.find((v) =>
typeof v === 'object' ? v.value === voice : v === voice,
);
if (lastSelectedVoice != null) {
const currentVoice =
typeof lastSelectedVoice === 'object' ? lastSelectedVoice.value : lastSelectedVoice;
logger.log('useTextToSpeech.ts - Effect:', { voices, voice: currentVoice });
setVoice(currentVoice);
return;
}
logger.log('useTextToSpeech.ts - Effect:', { voices, voice: firstVoice.value });
setVoice(firstVoice.value);
}
}, [setVoice, voice, voices]);
const handleMouseDown = () => {
isMouseDownRef.current = true;
timerRef.current = window.setTimeout(() => {
if (isMouseDownRef.current) {
const messageContent = content ?? '';
const parsedMessage =
typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
generateSpeech(parsedMessage);
}
}, 1000);
};
const handleMouseUp = () => {
isMouseDownRef.current = false;
if (timerRef.current != null) {
window.clearTimeout(timerRef.current);
}
};
const toggleSpeech = () => {
if (isSpeaking === true) {
cancelSpeech();
pauseGlobalAudio();
} else {
const messageContent = content ?? '';
const parsedMessage =
typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
generateSpeech(parsedMessage);
}
};
return {
handleMouseDown,
handleMouseUp,
toggleSpeech,
isSpeaking,
isLoading: false,
audioRef,
voices,
};
};
export default useTTSEdge;


@@ -0,0 +1,101 @@
// client/src/hooks/Audio/useTTSExternal.ts
import { useRef, useEffect, useState } from 'react';
import { useRecoilState, useRecoilValue } from 'recoil';
import { parseTextParts } from 'librechat-data-provider';
import type { TMessageContentParts } from 'librechat-data-provider';
import useTextToSpeechExternal from '~/hooks/Input/useTextToSpeechExternal';
import usePauseGlobalAudio from '~/hooks/Audio/usePauseGlobalAudio';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import { logger } from '~/utils';
import store from '~/store';
type TUseTextToSpeech = {
messageId?: string;
content?: TMessageContentParts[] | string;
isLast?: boolean;
index?: number;
};
const useTTSExternal = (props?: TUseTextToSpeech) => {
const { messageId, content, isLast = false, index = 0 } = props ?? {};
const isMouseDownRef = useRef(false);
const timerRef = useRef<number | undefined>(undefined);
const [isSpeakingState, setIsSpeaking] = useState(false);
const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
const { pauseGlobalAudio } = usePauseGlobalAudio(index);
const [voice, setVoice] = useRecoilState(store.voice);
const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));
const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);
const {
cancelSpeech,
generateSpeechExternal: generateSpeech,
isLoading,
voices,
} = useTextToSpeechExternal({
setIsSpeaking,
audioRef,
messageId,
isLast,
index,
});
useEffect(() => {
const firstVoice = voices[0];
if (voices.length) {
const lastSelectedVoice = voices.find((v) => v === voice);
if (lastSelectedVoice != null) {
logger.log('useTextToSpeech.ts - Effect:', { voices, voice: lastSelectedVoice });
setVoice(lastSelectedVoice.toString());
return;
}
logger.log('useTextToSpeech.ts - Effect:', { voices, voice: firstVoice });
setVoice(firstVoice.toString());
}
}, [setVoice, voice, voices]);
const handleMouseDown = () => {
isMouseDownRef.current = true;
timerRef.current = window.setTimeout(() => {
if (isMouseDownRef.current) {
const messageContent = content ?? '';
const parsedMessage =
typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
generateSpeech(parsedMessage, false);
}
}, 1000);
};
const handleMouseUp = () => {
isMouseDownRef.current = false;
if (timerRef.current != null) {
window.clearTimeout(timerRef.current);
}
};
const toggleSpeech = () => {
if (isSpeaking === true) {
cancelSpeech();
pauseGlobalAudio();
} else {
const messageContent = content ?? '';
const parsedMessage =
typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
generateSpeech(parsedMessage, false);
}
};
return {
handleMouseDown,
handleMouseUp,
toggleSpeech,
isSpeaking,
isLoading,
audioRef,
voices,
};
};
export default useTTSExternal;


@@ -2,17 +2,6 @@ import { useMemo } from 'react';
import { useRecoilValue } from 'recoil';
import store from '~/store';
export enum STTEndpoints {
browser = 'browser',
external = 'external',
}
export enum TTSEndpoints {
browser = 'browser',
edge = 'edge',
external = 'external',
}
const useGetAudioSettings = () => {
const engineSTT = useRecoilValue<string>(store.engineSTT);
const engineTTS = useRecoilValue<string>(store.engineTTS);


@@ -3,10 +3,10 @@ import { useRef, useMemo, useEffect, useState } from 'react';
import { parseTextParts } from 'librechat-data-provider';
import type { TMessageContentParts } from 'librechat-data-provider';
import type { Option } from '~/common';
import useTextToSpeechExternal from './useTextToSpeechExternal';
import useTextToSpeechBrowser from './useTextToSpeechBrowser';
import useGetAudioSettings from './useGetAudioSettings';
import useTextToSpeechEdge from './useTextToSpeechEdge';
import useTextToSpeechExternal from '~/hooks/Input/useTextToSpeechExternal';
import useTextToSpeechBrowser from '~/hooks/Input/useTextToSpeechBrowser';
import useGetAudioSettings from '~/hooks/Input/useGetAudioSettings';
import useTextToSpeechEdge from '~/hooks/Input/useTextToSpeechEdge';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import { usePauseGlobalAudio } from '../Audio';
import { logger } from '~/utils';


@@ -1,43 +1,54 @@
import { useRecoilState } from 'recoil';
import { useState, useEffect, useCallback } from 'react';
import type { VoiceOption } from '~/common';
import store from '~/store';
interface VoiceOption {
value: string;
label: string;
}
function useTextToSpeechBrowser({
setIsSpeaking,
}: {
setIsSpeaking: (isSpeaking: boolean) => void;
setIsSpeaking: React.Dispatch<React.SetStateAction<boolean>>;
}) {
const [cloudBrowserVoices] = useRecoilState(store.cloudBrowserVoices);
const [voiceName] = useRecoilState(store.voice);
const [voices, setVoices] = useState<VoiceOption[]>([]);
const updateVoices = useCallback(() => {
const availableVoices = window.speechSynthesis
.getVoices()
.filter((v) => cloudBrowserVoices || v.localService === true);
try {
const availableVoices = window.speechSynthesis.getVoices();
if (!Array.isArray(availableVoices)) {
console.error('getVoices() did not return an array');
return;
}
const voiceOptions: VoiceOption[] = availableVoices.map((v) => ({
value: v.name,
label: v.name,
}));
const filteredVoices = availableVoices.filter(
(v) => cloudBrowserVoices || v.localService === true,
);
const voiceOptions: VoiceOption[] = filteredVoices.map((v) => ({
value: v.name,
label: v.name,
}));
setVoices(voiceOptions);
setVoices(voiceOptions);
} catch (error) {
console.error('Error updating voices:', error);
}
}, [cloudBrowserVoices]);
useEffect(() => {
if (window.speechSynthesis.getVoices().length) {
updateVoices();
} else {
window.speechSynthesis.onvoiceschanged = updateVoices;
const synth = window.speechSynthesis;
try {
if (synth.getVoices().length) {
updateVoices();
} else {
synth.onvoiceschanged = updateVoices;
}
} catch (error) {
console.error('Error in useEffect:', error);
}
return () => {
window.speechSynthesis.onvoiceschanged = null;
synth.onvoiceschanged = null;
};
}, [updateVoices]);
@@ -46,22 +57,37 @@ function useTextToSpeechBrowser({
const voice = voices.find((v) => v.value === voiceName);
if (!voice) {
console.warn('Selected voice not found');
return;
}
synth.cancel();
const utterance = new SpeechSynthesisUtterance(text);
utterance.voice = synth.getVoices().find((v) => v.name === voice.value) || null;
utterance.onend = () => {
try {
synth.cancel();
const utterance = new SpeechSynthesisUtterance(text);
utterance.voice = synth.getVoices().find((v) => v.name === voice.value) || null;
utterance.onend = () => {
setIsSpeaking(false);
};
utterance.onerror = (event) => {
console.error('Speech synthesis error:', event);
setIsSpeaking(false);
};
setIsSpeaking(true);
synth.speak(utterance);
} catch (error) {
console.error('Error generating speech:', error);
setIsSpeaking(false);
};
setIsSpeaking(true);
synth.speak(utterance);
}
};
const cancelSpeechLocal = () => {
window.speechSynthesis.cancel();
setIsSpeaking(false);
try {
window.speechSynthesis.cancel();
} catch (error) {
console.error('Error cancelling speech:', error);
} finally {
setIsSpeaking(false);
}
};
return { generateSpeechLocal, cancelSpeechLocal, voices };


@@ -1,28 +1,24 @@
import { useRecoilValue } from 'recoil';
import { useState, useCallback, useRef, useEffect, useMemo } from 'react';
import { MsEdgeTTS, OUTPUT_FORMAT } from 'msedge-tts';
import { useToastContext } from '~/Providers';
import { useState, useCallback, useRef, useEffect, useMemo } from 'react';
import type { VoiceOption } from '~/common';
import { useToastContext } from '~/Providers/ToastContext';
import useLocalize from '~/hooks/useLocalize';
import store from '~/store';
interface Voice {
value: string;
label: string;
}
interface UseTextToSpeechEdgeReturn {
generateSpeechEdge: (text: string) => void;
cancelSpeechEdge: () => void;
voices: Voice[];
voices: VoiceOption[];
}
function useTextToSpeechEdge({
setIsSpeaking,
}: {
setIsSpeaking: (isSpeaking: boolean) => void;
setIsSpeaking: React.Dispatch<React.SetStateAction<boolean>>;
}): UseTextToSpeechEdgeReturn {
const localize = useLocalize();
const [voices, setVoices] = useState<Voice[]>([]);
const [voices, setVoices] = useState<VoiceOption[]>([]);
const voiceName = useRecoilValue(store.voice);
const ttsRef = useRef<MsEdgeTTS | null>(null);
const audioElementRef = useRef<HTMLAudioElement | null>(null);
@@ -63,7 +59,7 @@ function useTextToSpeechEdge({
if (!ttsRef.current) {
ttsRef.current = new MsEdgeTTS();
}
const availableVoice: Voice | undefined = voices.find((v) => v.value === voiceName);
const availableVoice: VoiceOption | undefined = voices.find((v) => v.value === voiceName);
if (availableVoice) {
ttsRef.current
@@ -181,7 +177,7 @@ function useTextToSpeechEdge({
generate();
},
[appendNextBuffer, showToast, localize],
[setIsSpeaking, appendNextBuffer, showToast, localize],
);
const cancelSpeechEdge = useCallback(() => {
@@ -202,7 +198,7 @@ function useTextToSpeechEdge({
status: 'error',
});
}
}, [showToast, localize]);
}, [setIsSpeaking, showToast, localize]);
useEffect(() => {
if (!isBrowserSupported) {


@@ -1,8 +1,8 @@
import { useRecoilValue } from 'recoil';
import { useState, useMemo, useRef, useCallback, useEffect } from 'react';
import { useTextToSpeechMutation, useVoicesQuery } from '~/data-provider';
import { useToastContext } from '~/Providers/ToastContext';
import useLocalize from '~/hooks/useLocalize';
import { useToastContext } from '~/Providers';
import store from '~/store';
const createFormData = (text: string, voice: string) => {
@@ -13,7 +13,7 @@ const createFormData = (text: string, voice: string) => {
};
type TUseTTSExternal = {
setIsSpeaking: (isSpeaking: boolean) => void;
setIsSpeaking: React.Dispatch<React.SetStateAction<boolean>>;
audioRef: React.MutableRefObject<HTMLAudioElement | null>;
messageId?: string;
isLast: boolean;


@@ -1,3 +1,4 @@
export * from './Audio';
export * from './Assistants';
export * from './Chat';
export * from './Config';