🗣️ feat: Edge TTS engine (#3358)

* feat: MS Edge TTS
* feat: Edge TTS; fix: STT hook
2025-12-18 09:20:15 +01:00 · 2024-08-07 20:15:41 +02:00 · 2024-08-07 20:15:41 +02:00 · b390ba781f
commit b390ba781f
parent 01a88991ab
14 changed files with 379 additions and 129 deletions
--- a/client/src/hooks/Input/useTextToSpeech.ts
+++ b/client/src/hooks/Input/useTextToSpeech.ts
@@ -3,30 +3,67 @@ import { parseTextParts } from 'librechat-data-provider';
 import type { TMessage } from 'librechat-data-provider';
 import useTextToSpeechExternal from './useTextToSpeechExternal';
 import useTextToSpeechBrowser from './useTextToSpeechBrowser';
-import { usePauseGlobalAudio } from '../Audio';
 import useGetAudioSettings from './useGetAudioSettings';
+import useTextToSpeechEdge from './useTextToSpeechEdge';
+import { usePauseGlobalAudio } from '../Audio';

-const useTextToSpeech = (message: TMessage, isLast: boolean, index = 0) => {
-  const { externalTextToSpeech } = useGetAudioSettings();
+const useTextToSpeech = (message?: TMessage, isLast = false, index = 0) => {
+  const { textToSpeechEndpoint } = useGetAudioSettings();
+  const { pauseGlobalAudio } = usePauseGlobalAudio(index);
+  const audioRef = useRef<HTMLAudioElement | null>(null);

  const {
-    generateSpeechLocal: generateSpeechLocal,
-    cancelSpeechLocal: cancelSpeechLocal,
+    generateSpeechLocal,
+    cancelSpeechLocal,
    isSpeaking: isSpeakingLocal,
+    voices: voicesLocal,
  } = useTextToSpeechBrowser();

  const {
-    generateSpeechExternal: generateSpeechExternal,
+    generateSpeechEdge,
+    cancelSpeechEdge,
+    isSpeaking: isSpeakingEdge,
+    voices: voicesEdge,
+  } = useTextToSpeechEdge();
+
+  const {
+    generateSpeechExternal,
    cancelSpeech: cancelSpeechExternal,
    isSpeaking: isSpeakingExternal,
-    isLoading: isLoading,
-    audioRef,
-  } = useTextToSpeechExternal(message.messageId, isLast, index);
-  const { pauseGlobalAudio } = usePauseGlobalAudio(index);
+    isLoading: isLoadingExternal,
+    audioRef: audioRefExternal,
+    voices: voicesExternal,
+  } = useTextToSpeechExternal(message?.messageId || '', isLast, index);

-  const generateSpeech = externalTextToSpeech ? generateSpeechExternal : generateSpeechLocal;
-  const cancelSpeech = externalTextToSpeech ? cancelSpeechExternal : cancelSpeechLocal;
-  const isSpeaking = externalTextToSpeech ? isSpeakingExternal : isSpeakingLocal;
+  let generateSpeech, cancelSpeech, isSpeaking, isLoading, voices;
+
+  switch (textToSpeechEndpoint) {
+    case 'external':
+      generateSpeech = generateSpeechExternal;
+      cancelSpeech = cancelSpeechExternal;
+      isSpeaking = isSpeakingExternal;
+      isLoading = isLoadingExternal;
+      if (audioRefExternal) {
+        audioRef.current = audioRefExternal.current;
+      }
+      voices = voicesExternal;
+      break;
+    case 'edge':
+      generateSpeech = generateSpeechEdge;
+      cancelSpeech = cancelSpeechEdge;
+      isSpeaking = isSpeakingEdge;
+      isLoading = false;
+      voices = voicesEdge;
+      break;
+    case 'browser':
+    default:
+      generateSpeech = generateSpeechLocal;
+      cancelSpeech = cancelSpeechLocal;
+      isSpeaking = isSpeakingLocal;
+      isLoading = false;
+      voices = voicesLocal;
+      break;
+  }

  const isMouseDownRef = useRef(false);
  const timerRef = useRef<number | undefined>(undefined);
@@ -52,7 +89,6 @@ const useTextToSpeech = (message: TMessage, isLast: boolean, index = 0) => {

  const toggleSpeech = () => {
    if (isSpeaking) {
-      console.log('canceling message audio speech');
      cancelSpeech();
      pauseGlobalAudio();
    } else {
@@ -69,6 +105,7 @@ const useTextToSpeech = (message: TMessage, isLast: boolean, index = 0) => {
    toggleSpeech,
    isSpeaking,
    isLoading,
+    voices,
    audioRef,
  };
 };