🔀 refactor: Modularize TTS Logic for Improved Browser support (#3657)

* WIP: message audio refactor * WIP: use MessageAudio by provider * fix: Update MessageAudio component to use TTSEndpoints enum * feat: Update useTextToSpeechBrowser hook to handle errors and improve error logging * feat: Add voice dropdown components for different TTS engines * docs: update incorrect `voices` example changed `voice: ''` to `voices: ['alloy']` * feat: Add browser support check for Edge TTS engine component with error toast if not supported --------- Co-authored-by: Marco Beretta <81851188+berry-13@users.noreply.github.com>
2025-12-20 18:30:15 +01:00 · 2024-08-15 11:34:25 -04:00 · 2024-08-15 11:34:25 -04:00 · dba704079c
commit dba704079c
parent bcde0beb47
18 changed files with 784 additions and 187 deletions
--- a/client/src/hooks/Audio/useTTSBrowser.ts
+++ b/client/src/hooks/Audio/useTTSBrowser.ts
@ -0,0 +1,100 @@
+// client/src/hooks/Audio/useTTSBrowser.ts
+import { useRef, useEffect, useState } from 'react';
+import { useRecoilState, useRecoilValue } from 'recoil';
+import { parseTextParts } from 'librechat-data-provider';
+import type { TMessageContentParts } from 'librechat-data-provider';
+import useTextToSpeechBrowser from '~/hooks/Input/useTextToSpeechBrowser';
+import usePauseGlobalAudio from '~/hooks/Audio/usePauseGlobalAudio';
+import useAudioRef from '~/hooks/Audio/useAudioRef';
+import { logger } from '~/utils';
+import store from '~/store';
+
+type TUseTextToSpeech = { // Optional inputs accepted by the useTTSBrowser hook.
+  messageId?: string; // Accepted for callers; not read by useTTSBrowser itself.
+  content?: TMessageContentParts[] | string; // Message to speak: plain text, or content parts flattened with parseTextParts.
+  isLast?: boolean; // When true, global audio playing at `index` also counts as "speaking" (hook defaults this to false).
+  index?: number; // Key passed to usePauseGlobalAudio and globalAudioPlayingFamily (hook defaults this to 0).
+};
+
+/** How long (in ms) the pointer must stay pressed before speech starts. */
+const LONG_PRESS_DELAY_MS = 1000;
+
+/**
+ * Text-to-speech hook that delegates speech generation and cancellation to
+ * `useTextToSpeechBrowser`.
+ *
+ * Responsibilities visible here:
+ * - keeps the selected voice (Recoil `store.voice`) valid against the list of
+ *   voices reported by `useTextToSpeechBrowser`;
+ * - exposes long-press handlers (`handleMouseDown` / `handleMouseUp`) that start
+ *   speech after the pointer has been held for `LONG_PRESS_DELAY_MS`;
+ * - exposes `toggleSpeech` to start speaking, or to cancel speech and pause
+ *   global audio when already speaking.
+ *
+ * @param props Optional message data; `isLast` defaults to `false` and `index` to `0`.
+ * @returns Handlers, the combined `isSpeaking` flag, a constant `isLoading: false`,
+ *          the `audioRef` from `useAudioRef`, and the available `voices`.
+ */
+const useTTSBrowser = (props?: TUseTextToSpeech) => {
+  const { content, isLast = false, index = 0 } = props ?? {};
+
+  const isMouseDownRef = useRef(false);
+  const timerRef = useRef<number | undefined>(undefined);
+  const [isSpeakingState, setIsSpeaking] = useState(false);
+  const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
+
+  const { pauseGlobalAudio } = usePauseGlobalAudio(index);
+  const [voice, setVoice] = useRecoilState(store.voice);
+  const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));
+
+  // The last message also reflects the global audio player's state.
+  const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);
+
+  const {
+    generateSpeechLocal: generateSpeech,
+    cancelSpeechLocal: cancelSpeech,
+    voices,
+  } = useTextToSpeechBrowser({ setIsSpeaking });
+
+  useEffect(() => {
+    const firstVoice = voices[0];
+    // Only act once voices are loaded and expose `{ value }` options
+    // (`typeof null === 'object'`, hence the explicit null check).
+    if (firstVoice == null || typeof firstVoice !== 'object') {
+      return;
+    }
+
+    const isStoredVoiceAvailable = voices.some((v) =>
+      typeof v === 'object' ? v.value === voice : v === voice,
+    );
+    if (isStoredVoiceAvailable) {
+      // The stored voice is still offered; re-setting the same value would be a no-op.
+      return;
+    }
+
+    // Stored voice is missing or unknown: fall back to the first available voice.
+    logger.log('useTTSBrowser.ts - Effect:', { voices, voice: firstVoice.value });
+    setVoice(firstVoice.value);
+  }, [setVoice, voice, voices]);
+
+  /** Cancels a pending long-press timer, if any, and forgets its id. */
+  const clearLongPressTimer = () => {
+    if (timerRef.current != null) {
+      window.clearTimeout(timerRef.current);
+      timerRef.current = undefined;
+    }
+  };
+
+  useEffect(() => {
+    // On unmount, make sure a pending long-press cannot trigger speech afterwards.
+    return () => {
+      isMouseDownRef.current = false;
+      if (timerRef.current != null) {
+        window.clearTimeout(timerRef.current);
+        timerRef.current = undefined;
+      }
+    };
+  }, []);
+
+  /** Flattens the message content into the plain text handed to the speech engine. */
+  const getMessageText = (): string => {
+    const messageContent = content ?? '';
+    return typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
+  };
+
+  const handleMouseDown = () => {
+    isMouseDownRef.current = true;
+    // Drop any timer left over from a press that never received its mouse-up.
+    clearLongPressTimer();
+    timerRef.current = window.setTimeout(() => {
+      timerRef.current = undefined;
+      if (isMouseDownRef.current) {
+        generateSpeech(getMessageText());
+      }
+    }, LONG_PRESS_DELAY_MS);
+  };
+
+  const handleMouseUp = () => {
+    isMouseDownRef.current = false;
+    clearLongPressTimer();
+  };
+
+  const toggleSpeech = () => {
+    if (isSpeaking) {
+      cancelSpeech();
+      pauseGlobalAudio();
+      return;
+    }
+    generateSpeech(getMessageText());
+  };
+
+  return {
+    handleMouseDown,
+    handleMouseUp,
+    toggleSpeech,
+    isSpeaking,
+    isLoading: false,
+    audioRef,
+    voices,
+  };
+};
+
+export default useTTSBrowser;