🎧 fix(TTS): Improve state of audio playback and hook patterns, and fix undefined MediaSource (#3632)

2026-03-29 20:07:19 +02:00 · 2024-08-13 12:08:55 -04:00 · 2024-08-13 12:08:55 -04:00 · dc8d30ad90
commit dc8d30ad90
parent e3ebcfd2b1
6 changed files with 108 additions and 72 deletions
--- a/client/src/hooks/Input/useTextToSpeech.ts
+++ b/client/src/hooks/Input/useTextToSpeech.ts
@ -1,5 +1,5 @@
-import { useRecoilState } from 'recoil';
-import { useRef, useMemo, useEffect } from 'react';
+import { useRecoilState, useRecoilValue } from 'recoil';
+import { useRef, useMemo, useEffect, useState } from 'react';
 import { parseTextParts } from 'librechat-data-provider';
 import type { TMessageContentParts } from 'librechat-data-provider';
 import type { Option } from '~/common';
@ -7,6 +7,7 @@ import useTextToSpeechExternal from './useTextToSpeechExternal';
 import useTextToSpeechBrowser from './useTextToSpeechBrowser';
 import useGetAudioSettings from './useGetAudioSettings';
 import useTextToSpeechEdge from './useTextToSpeechEdge';
+import useAudioRef from '~/hooks/Audio/useAudioRef';
 import { usePauseGlobalAudio } from '../Audio';
 import { logger } from '~/utils';
 import store from '~/store';
@ -20,41 +21,77 @@ type TUseTextToSpeech = {

 const useTextToSpeech = (props?: TUseTextToSpeech) => {
  const { messageId, content, isLast = false, index = 0 } = props ?? {};
-  const [voice, setVoice] = useRecoilState(store.voice);
+
+  const isMouseDownRef = useRef(false);
+  const timerRef = useRef<number | undefined>(undefined);
+  const [isSpeakingState, setIsSpeaking] = useState(false);
+  const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
+
  const { textToSpeechEndpoint } = useGetAudioSettings();
  const { pauseGlobalAudio } = usePauseGlobalAudio(index);
-  const audioRef = useRef<HTMLAudioElement | null>(null);
+  const [voice, setVoice] = useRecoilState(store.voice);
+  const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));
+
+  const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);

  const {
    generateSpeechLocal,
    cancelSpeechLocal,
-    isSpeaking: isSpeakingLocal,
    voices: voicesLocal,
-  } = useTextToSpeechBrowser();
+  } = useTextToSpeechBrowser({ setIsSpeaking });

  const {
    generateSpeechEdge,
    cancelSpeechEdge,
-    isSpeaking: isSpeakingEdge,
    voices: voicesEdge,
-  } = useTextToSpeechEdge();
+  } = useTextToSpeechEdge({ setIsSpeaking });

  const {
    generateSpeechExternal,
    cancelSpeech: cancelSpeechExternal,
-    isSpeaking: isSpeakingExternal,
    isLoading: isLoadingExternal,
-    audioRef: audioRefExternal,
    voices: voicesExternal,
-  } = useTextToSpeechExternal(messageId ?? '', isLast, index);
+  } = useTextToSpeechExternal({
+    setIsSpeaking,
+    audioRef,
+    messageId,
+    isLast,
+    index,
+  });

-  let generateSpeech, cancelSpeech, isSpeaking, isLoading;
+  const generateSpeech = useMemo(() => {
+    const map = {
+      edge: generateSpeechEdge,
+      browser: generateSpeechLocal,
+      external: generateSpeechExternal,
+    };
+
+    return map[textToSpeechEndpoint];
+  }, [generateSpeechEdge, generateSpeechExternal, generateSpeechLocal, textToSpeechEndpoint]);
+
+  const cancelSpeech = useMemo(() => {
+    const map = {
+      edge: cancelSpeechEdge,
+      browser: cancelSpeechLocal,
+      external: cancelSpeechExternal,
+    };
+    return map[textToSpeechEndpoint];
+  }, [cancelSpeechEdge, cancelSpeechExternal, cancelSpeechLocal, textToSpeechEndpoint]);
+
+  const isLoading = useMemo(() => {
+    const map = {
+      edge: false,
+      browser: false,
+      external: isLoadingExternal,
+    };
+    return map[textToSpeechEndpoint];
+  }, [isLoadingExternal, textToSpeechEndpoint]);

  const voices: Option[] | string[] = useMemo(() => {
    const voiceMap = {
-      external: voicesExternal,
      edge: voicesEdge,
      browser: voicesLocal,
+      external: voicesExternal,
    };

    return voiceMap[textToSpeechEndpoint];
@ -88,34 +125,6 @@ const useTextToSpeech = (props?: TUseTextToSpeech) => {
    }
  }, [setVoice, textToSpeechEndpoint, voice, voices]);

-  switch (textToSpeechEndpoint) {
-    case 'external':
-      generateSpeech = generateSpeechExternal;
-      cancelSpeech = cancelSpeechExternal;
-      isSpeaking = isSpeakingExternal;
-      isLoading = isLoadingExternal;
-      if (audioRefExternal.current) {
-        audioRef.current = audioRefExternal.current;
-      }
-      break;
-    case 'edge':
-      generateSpeech = generateSpeechEdge;
-      cancelSpeech = cancelSpeechEdge;
-      isSpeaking = isSpeakingEdge;
-      isLoading = false;
-      break;
-    case 'browser':
-    default:
-      generateSpeech = generateSpeechLocal;
-      cancelSpeech = cancelSpeechLocal;
-      isSpeaking = isSpeakingLocal;
-      isLoading = false;
-      break;
-  }
-
-  const isMouseDownRef = useRef(false);
-  const timerRef = useRef<number | undefined>(undefined);
-
  const handleMouseDown = () => {
    isMouseDownRef.current = true;
    timerRef.current = window.setTimeout(() => {
--- a/client/src/hooks/Input/useTextToSpeechBrowser.ts
+++ b/client/src/hooks/Input/useTextToSpeechBrowser.ts
@ -7,9 +7,12 @@ interface VoiceOption {
  label: string;
 }

-function useTextToSpeechBrowser() {
+function useTextToSpeechBrowser({
+  setIsSpeaking,
+}: {
+  setIsSpeaking: (isSpeaking: boolean) => void;
+}) {
  const [cloudBrowserVoices] = useRecoilState(store.cloudBrowserVoices);
-  const [isSpeaking, setIsSpeaking] = useState(false);
  const [voiceName] = useRecoilState(store.voice);
  const [voices, setVoices] = useState<VoiceOption[]>([]);

@ -61,7 +64,7 @@ function useTextToSpeechBrowser() {
    setIsSpeaking(false);
  };

-  return { generateSpeechLocal, cancelSpeechLocal, isSpeaking, voices };
+  return { generateSpeechLocal, cancelSpeechLocal, voices };
 }

 export default useTextToSpeechBrowser;
--- a/client/src/hooks/Input/useTextToSpeechEdge.ts
+++ b/client/src/hooks/Input/useTextToSpeechEdge.ts
@ -13,14 +13,16 @@ interface Voice {
 interface UseTextToSpeechEdgeReturn {
  generateSpeechEdge: (text: string) => void;
  cancelSpeechEdge: () => void;
-  isSpeaking: boolean;
  voices: Voice[];
 }

-function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
+function useTextToSpeechEdge({
+  setIsSpeaking,
+}: {
+  setIsSpeaking: (isSpeaking: boolean) => void;
+}): UseTextToSpeechEdgeReturn {
  const localize = useLocalize();
  const [voices, setVoices] = useState<Voice[]>([]);
-  const [isSpeaking, setIsSpeaking] = useState<boolean>(false);
  const voiceName = useRecoilValue(store.voice);
  const ttsRef = useRef<MsEdgeTTS | null>(null);
  const audioElementRef = useRef<HTMLAudioElement | null>(null);
@ -29,7 +31,10 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
  const pendingBuffers = useRef<Uint8Array[]>([]);
  const { showToast } = useToastContext();

-  const isBrowserSupported = useMemo(() => MediaSource.isTypeSupported('audio/mpeg'), []);
+  const isBrowserSupported = useMemo(
+    () => typeof MediaSource !== 'undefined' && MediaSource.isTypeSupported('audio/mpeg'),
+    [],
+  );

  const fetchVoices = useCallback(() => {
    if (!ttsRef.current) {
@ -146,7 +151,7 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
          setIsSpeaking(true);
          pendingBuffers.current = [];

-          const readable = await ttsRef.current.toStream(text);
+          const readable = ttsRef.current.toStream(text);

          readable.on('data', (chunk: Buffer) => {
            pendingBuffers.current.push(new Uint8Array(chunk));
@ -200,21 +205,21 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
  }, [showToast, localize]);

  useEffect(() => {
-    if (!MediaSource.isTypeSupported('audio/mpeg')) {
+    if (!isBrowserSupported) {
      return;
    }
    fetchVoices();
-  }, [fetchVoices]);
+  }, [fetchVoices, isBrowserSupported]);

  useEffect(() => {
-    if (!MediaSource.isTypeSupported('audio/mpeg')) {
+    if (!isBrowserSupported) {
      return;
    }
    initializeTTS();
-  }, [voiceName, initializeTTS]);
+  }, [voiceName, initializeTTS, isBrowserSupported]);

  useEffect(() => {
-    if (!MediaSource.isTypeSupported('audio/mpeg')) {
+    if (!isBrowserSupported) {
      return;
    }
    initializeMediaSource();
@ -223,18 +228,17 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
        URL.revokeObjectURL(audioElementRef.current?.src ?? '');
      }
    };
-  }, [initializeMediaSource]);
+  }, [initializeMediaSource, isBrowserSupported]);

  if (!isBrowserSupported) {
    return {
      generateSpeechEdge: () => ({}),
      cancelSpeechEdge: () => ({}),
-      isSpeaking: false,
      voices: [],
    };
  }

-  return { generateSpeechEdge, cancelSpeechEdge, isSpeaking, voices };
+  return { generateSpeechEdge, cancelSpeechEdge, voices };
 }

 export default useTextToSpeechEdge;
--- a/client/src/hooks/Input/useTextToSpeechExternal.ts
+++ b/client/src/hooks/Input/useTextToSpeechExternal.ts
@ -1,7 +1,6 @@
 import { useRecoilValue } from 'recoil';
 import { useState, useMemo, useRef, useCallback, useEffect } from 'react';
 import { useTextToSpeechMutation, useVoicesQuery } from '~/data-provider';
-import useAudioRef from '~/hooks/Audio/useAudioRef';
 import useLocalize from '~/hooks/useLocalize';
 import { useToastContext } from '~/Providers';
 import store from '~/store';
@ -13,7 +12,21 @@ const createFormData = (text: string, voice: string) => {
  return formData;
 };

-function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0) {
+type TUseTTSExternal = {
+  setIsSpeaking: (isSpeaking: boolean) => void;
+  audioRef: React.MutableRefObject<HTMLAudioElement | null>;
+  messageId?: string;
+  isLast: boolean;
+  index?: number;
+};
+
+function useTextToSpeechExternal({
+  setIsSpeaking,
+  audioRef,
+  messageId,
+  isLast,
+  index = 0,
+}: TUseTTSExternal) {
  const localize = useLocalize();
  const { showToast } = useToastContext();
  const voice = useRecoilValue(store.voice);
@ -21,8 +34,7 @@ function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0)
  const playbackRate = useRecoilValue(store.playbackRate);

  const [downloadFile, setDownloadFile] = useState(false);
-  const [isLocalSpeaking, setIsSpeaking] = useState(false);
-  const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
+
  const promiseAudioRef = useRef<HTMLAudioElement | null>(null);

  /* Global Audio Variables */
@ -174,17 +186,12 @@ function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0)
    return isProcessing || (isLast && globalIsFetching && !globalIsPlaying);
  }, [isProcessing, globalIsFetching, globalIsPlaying, isLast]);

-  const isSpeaking = useMemo(() => {
-    return isLocalSpeaking || (isLast && globalIsPlaying);
-  }, [isLocalSpeaking, globalIsPlaying, isLast]);
-
  const { data: voicesData = [] } = useVoicesQuery();

  return {
    generateSpeechExternal,
    cancelSpeech,
    isLoading,
-    isSpeaking,
    audioRef,
    voices: voicesData,
  };