🗣️ feat: Edge TTS engine (#3358)

* feat: MS Edge TTS

* feat: Edge TTS; fix: STT hook
This commit is contained in:
Marco Beretta 2024-08-07 20:15:41 +02:00 committed by GitHub
parent 01a88991ab
commit b390ba781f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 379 additions and 129 deletions

View file

@ -1,19 +1,25 @@
import { useRecoilState } from 'recoil';
import store from '~/store';
export enum AudioEndpoints {
export enum STTEndpoints {
browser = 'browser',
external = 'external',
}
export enum TTSEndpoints {
browser = 'browser',
edge = 'edge',
external = 'external',
}
const useGetAudioSettings = () => {
const [engineSTT] = useRecoilState<string>(store.engineSTT);
const [engineTTS] = useRecoilState<string>(store.engineTTS);
const externalSpeechToText = engineSTT === AudioEndpoints.external;
const externalTextToSpeech = engineTTS === AudioEndpoints.external;
const speechToTextEndpoint: STTEndpoints = engineSTT as STTEndpoints;
const textToSpeechEndpoint: TTSEndpoints = engineTTS as TTSEndpoints;
return { externalSpeechToText, externalTextToSpeech };
return { speechToTextEndpoint, textToSpeechEndpoint };
};
export default useGetAudioSettings;

View file

@ -4,8 +4,9 @@ import useSpeechToTextExternal from './useSpeechToTextExternal';
import useGetAudioSettings from './useGetAudioSettings';
const useSpeechToText = (handleTranscriptionComplete: (text: string) => void) => {
const { externalSpeechToText } = useGetAudioSettings();
const { speechToTextEndpoint } = useGetAudioSettings();
const [animatedText, setAnimatedText] = useState('');
const externalSpeechToText = speechToTextEndpoint === 'external';
const {
isListening: speechIsListeningBrowser,

View file

@ -9,7 +9,8 @@ const useSpeechToTextBrowser = () => {
const { showToast } = useToastContext();
const [languageSTT] = useRecoilState<string>(store.languageSTT);
const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
const { externalSpeechToText } = useGetAudioSettings();
const { speechToTextEndpoint } = useGetAudioSettings();
const isBrowserSTTEnabled = speechToTextEndpoint === 'browser';
const [isListening, setIsListening] = useState(false);
const {
@ -51,7 +52,7 @@ const useSpeechToTextBrowser = () => {
useEffect(() => {
const handleKeyDown = (e: KeyboardEvent) => {
if (e.shiftKey && e.altKey && e.code === 'KeyL' && !externalSpeechToText) {
if (e.shiftKey && e.altKey && e.code === 'KeyL' && !isBrowserSTTEnabled) {
toggleListening();
}
};

View file

@ -7,7 +7,8 @@ import useGetAudioSettings from './useGetAudioSettings';
const useSpeechToTextExternal = (onTranscriptionComplete: (text: string) => void) => {
const { showToast } = useToastContext();
const { externalSpeechToText } = useGetAudioSettings();
const { speechToTextEndpoint } = useGetAudioSettings();
const isExternalSTTEnabled = speechToTextEndpoint === 'external';
const [speechToText] = useRecoilState<boolean>(store.speechToText);
const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
const [autoSendText] = useRecoilState(store.autoSendText);
@ -194,7 +195,7 @@ const useSpeechToTextExternal = (onTranscriptionComplete: (text: string) => void
};
const handleKeyDown = async (e: KeyboardEvent) => {
if (e.shiftKey && e.altKey && e.code === 'KeyL' && !externalSpeechToText) {
if (e.shiftKey && e.altKey && e.code === 'KeyL' && isExternalSTTEnabled) {
if (!window.MediaRecorder) {
showToast({ message: 'MediaRecorder is not supported in this browser', status: 'error' });
return;

View file

@ -3,30 +3,67 @@ import { parseTextParts } from 'librechat-data-provider';
import type { TMessage } from 'librechat-data-provider';
import useTextToSpeechExternal from './useTextToSpeechExternal';
import useTextToSpeechBrowser from './useTextToSpeechBrowser';
import { usePauseGlobalAudio } from '../Audio';
import useGetAudioSettings from './useGetAudioSettings';
import useTextToSpeechEdge from './useTextToSpeechEdge';
import { usePauseGlobalAudio } from '../Audio';
const useTextToSpeech = (message: TMessage, isLast: boolean, index = 0) => {
const { externalTextToSpeech } = useGetAudioSettings();
const useTextToSpeech = (message?: TMessage, isLast = false, index = 0) => {
const { textToSpeechEndpoint } = useGetAudioSettings();
const { pauseGlobalAudio } = usePauseGlobalAudio(index);
const audioRef = useRef<HTMLAudioElement | null>(null);
const {
generateSpeechLocal: generateSpeechLocal,
cancelSpeechLocal: cancelSpeechLocal,
generateSpeechLocal,
cancelSpeechLocal,
isSpeaking: isSpeakingLocal,
voices: voicesLocal,
} = useTextToSpeechBrowser();
const {
generateSpeechExternal: generateSpeechExternal,
generateSpeechEdge,
cancelSpeechEdge,
isSpeaking: isSpeakingEdge,
voices: voicesEdge,
} = useTextToSpeechEdge();
const {
generateSpeechExternal,
cancelSpeech: cancelSpeechExternal,
isSpeaking: isSpeakingExternal,
isLoading: isLoading,
audioRef,
} = useTextToSpeechExternal(message.messageId, isLast, index);
const { pauseGlobalAudio } = usePauseGlobalAudio(index);
isLoading: isLoadingExternal,
audioRef: audioRefExternal,
voices: voicesExternal,
} = useTextToSpeechExternal(message?.messageId || '', isLast, index);
const generateSpeech = externalTextToSpeech ? generateSpeechExternal : generateSpeechLocal;
const cancelSpeech = externalTextToSpeech ? cancelSpeechExternal : cancelSpeechLocal;
const isSpeaking = externalTextToSpeech ? isSpeakingExternal : isSpeakingLocal;
let generateSpeech, cancelSpeech, isSpeaking, isLoading, voices;
switch (textToSpeechEndpoint) {
case 'external':
generateSpeech = generateSpeechExternal;
cancelSpeech = cancelSpeechExternal;
isSpeaking = isSpeakingExternal;
isLoading = isLoadingExternal;
if (audioRefExternal) {
audioRef.current = audioRefExternal.current;
}
voices = voicesExternal;
break;
case 'edge':
generateSpeech = generateSpeechEdge;
cancelSpeech = cancelSpeechEdge;
isSpeaking = isSpeakingEdge;
isLoading = false;
voices = voicesEdge;
break;
case 'browser':
default:
generateSpeech = generateSpeechLocal;
cancelSpeech = cancelSpeechLocal;
isSpeaking = isSpeakingLocal;
isLoading = false;
voices = voicesLocal;
break;
}
const isMouseDownRef = useRef(false);
const timerRef = useRef<number | undefined>(undefined);
@ -52,7 +89,6 @@ const useTextToSpeech = (message: TMessage, isLast: boolean, index = 0) => {
const toggleSpeech = () => {
if (isSpeaking) {
console.log('canceling message audio speech');
cancelSpeech();
pauseGlobalAudio();
} else {
@ -69,6 +105,7 @@ const useTextToSpeech = (message: TMessage, isLast: boolean, index = 0) => {
toggleSpeech,
isSpeaking,
isLoading,
voices,
audioRef,
};
};

View file

@ -2,6 +2,11 @@ import { useRecoilState } from 'recoil';
import { useState } from 'react';
import store from '~/store';
interface VoiceOption {
value: string;
display: string;
}
function useTextToSpeechBrowser() {
const [cloudBrowserVoices] = useRecoilState(store.cloudBrowserVoices);
const [isSpeaking, setIsSpeaking] = useState(false);
@ -32,7 +37,30 @@ function useTextToSpeechBrowser() {
setIsSpeaking(false);
};
return { generateSpeechLocal, cancelSpeechLocal, isSpeaking };
const voices = (): Promise<VoiceOption[]> => {
return new Promise((resolve) => {
const getAndMapVoices = () => {
const availableVoices = speechSynthesis
.getVoices()
.filter((v) => cloudBrowserVoices || v.localService === true);
const voiceOptions: VoiceOption[] = availableVoices.map((v) => ({
value: v.name,
display: v.name,
}));
resolve(voiceOptions);
};
if (speechSynthesis.getVoices().length) {
getAndMapVoices();
} else {
speechSynthesis.onvoiceschanged = getAndMapVoices;
}
});
};
return { generateSpeechLocal, cancelSpeechLocal, isSpeaking, voices };
}
export default useTextToSpeechBrowser;

View file

@ -0,0 +1,201 @@
import { useRecoilState } from 'recoil';
import { useState, useCallback, useRef, useEffect } from 'react';
import { MsEdgeTTS, OUTPUT_FORMAT } from 'msedge-tts';
import { useToastContext } from '~/Providers';
import useLocalize from '~/hooks/useLocalize';
import store from '~/store';
/** A selectable TTS voice shaped for option lists (value = voice ID, display = label shown to the user). */
interface Voice {
  value: string;
  display: string;
}
/** Public contract returned by the Edge TTS hook. */
interface UseTextToSpeechEdgeReturn {
  // Streams synthesized speech for the given text and starts playback.
  generateSpeechEdge: (text: string) => Promise<void>;
  // Stops playback immediately and discards queued audio.
  cancelSpeechEdge: () => void;
  // True while audio is playing.
  isSpeaking: boolean;
  // Fetches the list of available Edge voices.
  voices: () => Promise<Voice[]>;
}
/**
 * Text-to-speech via the MS Edge TTS service, streamed into an <audio> element
 * through the MediaSource Extensions API.
 *
 * Audio chunks arrive on a Node-style readable stream and are queued in
 * `pendingBuffers`, then appended to a SourceBuffer one at a time
 * (appendBuffer throws while a previous append is still updating).
 */
function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
  const localize = useLocalize();
  const [isSpeaking, setIsSpeaking] = useState<boolean>(false);
  const [voiceName] = useRecoilState<string>(store.voice);
  const ttsRef = useRef<MsEdgeTTS | null>(null);
  const audioElementRef = useRef<HTMLAudioElement | null>(null);
  const mediaSourceRef = useRef<MediaSource | null>(null);
  const sourceBufferRef = useRef<SourceBuffer | null>(null);
  // Chunks received from the TTS stream that have not yet been appended.
  const pendingBuffers = useRef<Uint8Array[]>([]);
  const { showToast } = useToastContext();
  /** Lazily creates the MsEdgeTTS client and applies the selected voice + MP3 output format. */
  const initializeTTS = useCallback(async (): Promise<void> => {
    if (!ttsRef.current) {
      ttsRef.current = new MsEdgeTTS();
    }
    try {
      await ttsRef.current.setMetadata(voiceName, OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3);
    } catch (error) {
      console.error('Error initializing TTS:', error);
      showToast({
        message: localize('com_nav_tts_init_error', (error as Error).message),
        status: 'error',
      });
    }
  }, [voiceName, showToast, localize]);
  /**
   * Appends the next queued chunk to the SourceBuffer, strictly one at a time.
   * Invoked on every stream 'data' event and again on each SourceBuffer
   * 'updateend'. A failed chunk is pushed back so no audio data is lost.
   * NOTE: declared before `onSourceOpen` so it can be listed as a real
   * dependency (the original declared it after and suppressed the lint rule,
   * risking a stale closure).
   */
  const appendNextBuffer = useCallback((): void => {
    if (
      sourceBufferRef.current &&
      !sourceBufferRef.current.updating &&
      pendingBuffers.current.length > 0
    ) {
      const nextBuffer = pendingBuffers.current.shift();
      if (nextBuffer) {
        try {
          sourceBufferRef.current.appendBuffer(nextBuffer);
        } catch (error) {
          console.error('Error appending buffer:', error);
          showToast({
            message: localize('com_nav_buffer_append_error'),
            status: 'error',
          });
          // Re-queue so the chunk can be retried on the next 'updateend'.
          pendingBuffers.current.unshift(nextBuffer);
        }
      }
    }
  }, [showToast, localize]);
  /** Attaches the 'audio/mpeg' SourceBuffer once the MediaSource reports 'open'. */
  const onSourceOpen = useCallback((): void => {
    if (!sourceBufferRef.current && mediaSourceRef.current) {
      try {
        sourceBufferRef.current = mediaSourceRef.current.addSourceBuffer('audio/mpeg');
        // Drain queued chunks each time a previous append completes.
        sourceBufferRef.current.addEventListener('updateend', appendNextBuffer);
      } catch (error) {
        console.error('Error adding source buffer:', error);
        showToast({
          message: localize('com_nav_source_buffer_error'),
          status: 'error',
        });
      }
    }
  }, [appendNextBuffer, showToast, localize]);
  /**
   * Ensures an Audio element backed by an OPEN MediaSource is ready, resolving
   * once the SourceBuffer can accept data.
   * A MediaSource that has already ended (after endOfStream) never fires
   * 'sourceopen' again, so it is recreated here — the original kept the ended
   * instance, which made every playback after the first hang forever.
   */
  const initializeMediaSource = useCallback(async (): Promise<void> => {
    return new Promise<void>((resolve) => {
      if (mediaSourceRef.current && mediaSourceRef.current.readyState === 'ended') {
        if (audioElementRef.current?.src) {
          URL.revokeObjectURL(audioElementRef.current.src);
        }
        mediaSourceRef.current = null;
        sourceBufferRef.current = null;
      }
      if (!mediaSourceRef.current) {
        mediaSourceRef.current = new MediaSource();
        // Reuse the existing audio element across replays; only the source URL changes.
        audioElementRef.current = audioElementRef.current ?? new Audio();
        audioElementRef.current.src = URL.createObjectURL(mediaSourceRef.current);
      }
      const mediaSource = mediaSourceRef.current;
      if (mediaSource.readyState === 'open') {
        onSourceOpen();
        resolve();
      } else {
        const onSourceOpenWrapper = (): void => {
          onSourceOpen();
          resolve();
          mediaSource.removeEventListener('sourceopen', onSourceOpenWrapper);
        };
        mediaSource.addEventListener('sourceopen', onSourceOpenWrapper);
      }
    });
  }, [onSourceOpen]);
  /**
   * Streams synthesized speech for `text` and starts playback.
   * `isSpeaking` returns to false when the audio ends, errors, or is cancelled.
   */
  const generateSpeechEdge = useCallback(
    async (text: string): Promise<void> => {
      try {
        await initializeTTS();
        await initializeMediaSource();
        if (!ttsRef.current || !audioElementRef.current) {
          throw new Error('TTS or Audio element not initialized');
        }
        setIsSpeaking(true);
        pendingBuffers.current = [];
        const readable = await ttsRef.current.toStream(text);
        readable.on('data', (chunk: Buffer) => {
          pendingBuffers.current.push(new Uint8Array(chunk));
          appendNextBuffer();
        });
        readable.on('end', () => {
          if (mediaSourceRef.current && mediaSourceRef.current.readyState === 'open') {
            mediaSourceRef.current.endOfStream();
          }
        });
        // Without this handler a stream failure would leave isSpeaking stuck at true.
        readable.on('error', (error: Error) => {
          console.error('Error streaming speech:', error);
          showToast({
            message: localize('com_nav_audio_play_error', error.message),
            status: 'error',
          });
          setIsSpeaking(false);
        });
        audioElementRef.current.onended = () => {
          setIsSpeaking(false);
        };
        await audioElementRef.current.play();
      } catch (error) {
        console.error('Error generating speech:', error);
        showToast({
          message: localize('com_nav_audio_play_error', (error as Error).message),
          status: 'error',
        });
        setIsSpeaking(false);
      }
    },
    [initializeTTS, initializeMediaSource, appendNextBuffer, showToast, localize],
  );
  /** Stops playback immediately, rewinds, ends the stream, and drops queued audio. */
  const cancelSpeechEdge = useCallback((): void => {
    try {
      if (audioElementRef.current) {
        audioElementRef.current.pause();
        audioElementRef.current.currentTime = 0;
      }
      if (mediaSourceRef.current && mediaSourceRef.current.readyState === 'open') {
        mediaSourceRef.current.endOfStream();
      }
      pendingBuffers.current = [];
      setIsSpeaking(false);
    } catch (error) {
      console.error('Error cancelling speech:', error);
      showToast({
        message: localize('com_nav_speech_cancel_error'),
        status: 'error',
      });
    }
  }, [showToast, localize]);
  /** Fetches the available Edge voices, mapped for option lists; [] on failure. */
  const voices = useCallback(async (): Promise<Voice[]> => {
    if (!ttsRef.current) {
      ttsRef.current = new MsEdgeTTS();
    }
    try {
      const voicesList = await ttsRef.current.getVoices();
      return voicesList.map((v) => ({
        value: v.ShortName,
        display: v.FriendlyName,
      }));
    } catch (error) {
      console.error('Error fetching voices:', error);
      showToast({
        message: localize('com_nav_voices_fetch_error'),
        status: 'error',
      });
      return [];
    }
  }, [showToast, localize]);
  // Release the object URL backing the audio element on unmount.
  useEffect(() => {
    return () => {
      if (mediaSourceRef.current) {
        URL.revokeObjectURL(audioElementRef.current?.src || '');
      }
    };
  }, []);
  return { generateSpeechEdge, cancelSpeechEdge, isSpeaking, voices };
}
export default useTextToSpeechEdge;

View file

@ -1,6 +1,6 @@
import { useRecoilValue } from 'recoil';
import { useState, useMemo, useRef, useCallback, useEffect } from 'react';
import { useTextToSpeechMutation } from '~/data-provider';
import { useTextToSpeechMutation, useVoicesQuery } from '~/data-provider';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import useLocalize from '~/hooks/useLocalize';
import { useToastContext } from '~/Providers';
@ -178,7 +178,18 @@ function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0)
return isLocalSpeaking || (isLast && globalIsPlaying);
}, [isLocalSpeaking, globalIsPlaying, isLast]);
return { generateSpeechExternal, cancelSpeech, isLoading, isSpeaking, audioRef };
const useVoices = () => {
return useVoicesQuery().data ?? [];
};
return {
generateSpeechExternal,
cancelSpeech,
isLoading,
isSpeaking,
audioRef,
voices: useVoices,
};
}
export default useTextToSpeechExternal;