🎧 fix(TTS): Improve State of audio playback, hook patterns, and fix undefined MediaSource (#3632)

2025-12-17 08:50:15 +01:00 · 2024-08-13 12:08:55 -04:00 · 2024-08-13 12:08:55 -04:00 · dc8d30ad90
commit dc8d30ad90
parent e3ebcfd2b1
6 changed files with 108 additions and 72 deletions
--- a/client/src/components/Chat/Input/StreamAudio.tsx
+++ b/client/src/components/Chat/Input/StreamAudio.tsx
@ -5,9 +5,9 @@ import { useQueryClient } from '@tanstack/react-query';
 import { useRecoilState, useRecoilValue, useSetRecoilState } from 'recoil';
 import type { TMessage } from 'librechat-data-provider';
 import { useCustomAudioRef, MediaSourceAppender, usePauseGlobalAudio } from '~/hooks/Audio';
 import { getLatestText, logger } from '~/utils';
 import { useAuthContext } from '~/hooks';
 import { globalAudioId } from '~/common';
 import { getLatestText } from '~/utils';
 import store from '~/store';
 function timeoutPromise(ms: number, message?: string) {
@ -51,7 +51,7 @@ export default function StreamAudio({ index = 0 }) {
    const latestText = getLatestText(latestMessage);
    const shouldFetch = !!(
-      token &&
+      token != null &&
      automaticPlayback &&
      isSubmitting &&
      latestMessage &&
@ -60,7 +60,7 @@ export default function StreamAudio({ index = 0 }) {
      latestMessage.messageId &&
      !latestMessage.messageId.includes('_') &&
      !isFetching &&
-      activeRunId &&
+      activeRunId != null &&
      activeRunId !== audioRunId
    );
@ -109,7 +109,8 @@ export default function StreamAudio({ index = 0 }) {
        const reader = response.body.getReader();
        const type = 'audio/mpeg';
-        const browserSupportsType = MediaSource.isTypeSupported(type);
+        const browserSupportsType =
          typeof MediaSource !== 'undefined' && MediaSource.isTypeSupported(type);
        let mediaSource: MediaSourceAppender | undefined;
        if (browserSupportsType) {
          mediaSource = new MediaSourceAppender(type);
@ -210,6 +211,7 @@ export default function StreamAudio({ index = 0 }) {
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [paramId]);
  logger.log('StreamAudio.tsx - globalAudioURL:', globalAudioURL);
  return (
    <audio
      ref={audioRef}
@ -222,7 +224,7 @@ export default function StreamAudio({ index = 0 }) {
        height: '0px',
        width: '0px',
      }}
-      src={globalAudioURL || undefined}
+      src={globalAudioURL ?? undefined}
      id={globalAudioId}
      muted
      autoPlay
--- a/client/src/components/Chat/Messages/MessageAudio.tsx
+++ b/client/src/components/Chat/Messages/MessageAudio.tsx
@ -3,6 +3,7 @@ import { useRecoilValue } from 'recoil';
 import type { TMessageContentParts } from 'librechat-data-provider';
 import { VolumeIcon, VolumeMuteIcon, Spinner } from '~/components/svg';
 import { useLocalize, useTextToSpeech } from '~/hooks';
 import { logger } from '~/utils';
 import store from '~/store';
 type THoverButtons = {
@ -45,6 +46,12 @@ export default function MessageAudio({ isLast, index, messageId, content }: THov
    }
  }, [audioRef, isSpeaking, playbackRate, messageId]);
  logger.log(
    'MessageAudio: audioRef.current?.src, audioRef.current',
    audioRef.current?.src,
    audioRef.current,
  );
  return (
    <>
      <button
@ -75,6 +82,7 @@ export default function MessageAudio({ isLast, index, messageId, content }: THov
      <audio
        ref={audioRef}
        controls
        preload="none"
        controlsList="nodownload nofullscreen noremoteplayback"
        style={{
          position: 'absolute',
@ -83,7 +91,10 @@ export default function MessageAudio({ isLast, index, messageId, content }: THov
          height: '0px',
          width: '0px',
        }}
-        src={audioRef.current?.src ?? undefined}
+        src={audioRef.current?.src}
        onError={(error) => {
          console.error('Error fetching audio:', error);
        }}
        id={`audio-${messageId}`}
        muted
        autoPlay
--- a/client/src/hooks/Input/useTextToSpeech.ts
+++ b/client/src/hooks/Input/useTextToSpeech.ts
@ -1,5 +1,5 @@
-import { useRecoilState } from 'recoil';
+import { useRecoilState, useRecoilValue } from 'recoil';
-import { useRef, useMemo, useEffect } from 'react';
+import { useRef, useMemo, useEffect, useState } from 'react';
 import { parseTextParts } from 'librechat-data-provider';
 import type { TMessageContentParts } from 'librechat-data-provider';
 import type { Option } from '~/common';
@ -7,6 +7,7 @@ import useTextToSpeechExternal from './useTextToSpeechExternal';
 import useTextToSpeechBrowser from './useTextToSpeechBrowser';
 import useGetAudioSettings from './useGetAudioSettings';
 import useTextToSpeechEdge from './useTextToSpeechEdge';
 import useAudioRef from '~/hooks/Audio/useAudioRef';
 import { usePauseGlobalAudio } from '../Audio';
 import { logger } from '~/utils';
 import store from '~/store';
@ -20,41 +21,77 @@ type TUseTextToSpeech = {
 const useTextToSpeech = (props?: TUseTextToSpeech) => {
  const { messageId, content, isLast = false, index = 0 } = props ?? {};
-  const [voice, setVoice] = useRecoilState(store.voice);
+
  const isMouseDownRef = useRef(false);
  const timerRef = useRef<number | undefined>(undefined);
  const [isSpeakingState, setIsSpeaking] = useState(false);
  const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
  const { textToSpeechEndpoint } = useGetAudioSettings();
  const { pauseGlobalAudio } = usePauseGlobalAudio(index);
-  const audioRef = useRef<HTMLAudioElement | null>(null);
+  const [voice, setVoice] = useRecoilState(store.voice);
  const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));
  const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);
  const {
    generateSpeechLocal,
    cancelSpeechLocal,
    isSpeaking: isSpeakingLocal,
    voices: voicesLocal,
-  } = useTextToSpeechBrowser();
+  } = useTextToSpeechBrowser({ setIsSpeaking });
  const {
    generateSpeechEdge,
    cancelSpeechEdge,
    isSpeaking: isSpeakingEdge,
    voices: voicesEdge,
-  } = useTextToSpeechEdge();
+  } = useTextToSpeechEdge({ setIsSpeaking });
  const {
    generateSpeechExternal,
    cancelSpeech: cancelSpeechExternal,
    isSpeaking: isSpeakingExternal,
    isLoading: isLoadingExternal,
    audioRef: audioRefExternal,
    voices: voicesExternal,
-  } = useTextToSpeechExternal(messageId ?? '', isLast, index);
+  } = useTextToSpeechExternal({
    setIsSpeaking,
    audioRef,
    messageId,
    isLast,
    index,
  });
-  let generateSpeech, cancelSpeech, isSpeaking, isLoading;
+  const generateSpeech = useMemo(() => {
    const map = {
      edge: generateSpeechEdge,
      browser: generateSpeechLocal,
      external: generateSpeechExternal,
    };
    return map[textToSpeechEndpoint];
  }, [generateSpeechEdge, generateSpeechExternal, generateSpeechLocal, textToSpeechEndpoint]);
  const cancelSpeech = useMemo(() => {
    const map = {
      edge: cancelSpeechEdge,
      browser: cancelSpeechLocal,
      external: cancelSpeechExternal,
    };
    return map[textToSpeechEndpoint];
  }, [cancelSpeechEdge, cancelSpeechExternal, cancelSpeechLocal, textToSpeechEndpoint]);
  const isLoading = useMemo(() => {
    const map = {
      edge: false,
      browser: false,
      external: isLoadingExternal,
    };
    return map[textToSpeechEndpoint];
  }, [isLoadingExternal, textToSpeechEndpoint]);
  const voices: Option[] | string[] = useMemo(() => {
    const voiceMap = {
      external: voicesExternal,
      edge: voicesEdge,
      browser: voicesLocal,
      external: voicesExternal,
    };
    return voiceMap[textToSpeechEndpoint];
@ -88,34 +125,6 @@ const useTextToSpeech = (props?: TUseTextToSpeech) => {
    }
  }, [setVoice, textToSpeechEndpoint, voice, voices]);
  switch (textToSpeechEndpoint) {
    case 'external':
      generateSpeech = generateSpeechExternal;
      cancelSpeech = cancelSpeechExternal;
      isSpeaking = isSpeakingExternal;
      isLoading = isLoadingExternal;
      if (audioRefExternal.current) {
        audioRef.current = audioRefExternal.current;
      }
      break;
    case 'edge':
      generateSpeech = generateSpeechEdge;
      cancelSpeech = cancelSpeechEdge;
      isSpeaking = isSpeakingEdge;
      isLoading = false;
      break;
    case 'browser':
    default:
      generateSpeech = generateSpeechLocal;
      cancelSpeech = cancelSpeechLocal;
      isSpeaking = isSpeakingLocal;
      isLoading = false;
      break;
  }
  const isMouseDownRef = useRef(false);
  const timerRef = useRef<number | undefined>(undefined);
  const handleMouseDown = () => {
    isMouseDownRef.current = true;
    timerRef.current = window.setTimeout(() => {
--- a/client/src/hooks/Input/useTextToSpeechBrowser.ts
+++ b/client/src/hooks/Input/useTextToSpeechBrowser.ts
@ -7,9 +7,12 @@ interface VoiceOption {
  label: string;
 }
-function useTextToSpeechBrowser() {
+function useTextToSpeechBrowser({
  setIsSpeaking,
 }: {
  setIsSpeaking: (isSpeaking: boolean) => void;
 }) {
  const [cloudBrowserVoices] = useRecoilState(store.cloudBrowserVoices);
  const [isSpeaking, setIsSpeaking] = useState(false);
  const [voiceName] = useRecoilState(store.voice);
  const [voices, setVoices] = useState<VoiceOption[]>([]);
@ -61,7 +64,7 @@ function useTextToSpeechBrowser() {
    setIsSpeaking(false);
  };
-  return { generateSpeechLocal, cancelSpeechLocal, isSpeaking, voices };
+  return { generateSpeechLocal, cancelSpeechLocal, voices };
 }
 export default useTextToSpeechBrowser;
--- a/client/src/hooks/Input/useTextToSpeechEdge.ts
+++ b/client/src/hooks/Input/useTextToSpeechEdge.ts
@ -13,14 +13,16 @@ interface Voice {
 interface UseTextToSpeechEdgeReturn {
  generateSpeechEdge: (text: string) => void;
  cancelSpeechEdge: () => void;
  isSpeaking: boolean;
  voices: Voice[];
 }
-function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
+function useTextToSpeechEdge({
  setIsSpeaking,
 }: {
  setIsSpeaking: (isSpeaking: boolean) => void;
 }): UseTextToSpeechEdgeReturn {
  const localize = useLocalize();
  const [voices, setVoices] = useState<Voice[]>([]);
  const [isSpeaking, setIsSpeaking] = useState<boolean>(false);
  const voiceName = useRecoilValue(store.voice);
  const ttsRef = useRef<MsEdgeTTS | null>(null);
  const audioElementRef = useRef<HTMLAudioElement | null>(null);
@ -29,7 +31,10 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
  const pendingBuffers = useRef<Uint8Array[]>([]);
  const { showToast } = useToastContext();
-  const isBrowserSupported = useMemo(() => MediaSource.isTypeSupported('audio/mpeg'), []);
+  const isBrowserSupported = useMemo(
    () => typeof MediaSource !== 'undefined' && MediaSource.isTypeSupported('audio/mpeg'),
    [],
  );
  const fetchVoices = useCallback(() => {
    if (!ttsRef.current) {
@ -146,7 +151,7 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
          setIsSpeaking(true);
          pendingBuffers.current = [];
-          const readable = await ttsRef.current.toStream(text);
+          const readable = ttsRef.current.toStream(text);
          readable.on('data', (chunk: Buffer) => {
            pendingBuffers.current.push(new Uint8Array(chunk));
@ -200,21 +205,21 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
  }, [showToast, localize]);
  useEffect(() => {
-    if (!MediaSource.isTypeSupported('audio/mpeg')) {
+    if (!isBrowserSupported) {
      return;
    }
    fetchVoices();
-  }, [fetchVoices]);
+  }, [fetchVoices, isBrowserSupported]);
  useEffect(() => {
-    if (!MediaSource.isTypeSupported('audio/mpeg')) {
+    if (!isBrowserSupported) {
      return;
    }
    initializeTTS();
-  }, [voiceName, initializeTTS]);
+  }, [voiceName, initializeTTS, isBrowserSupported]);
  useEffect(() => {
-    if (!MediaSource.isTypeSupported('audio/mpeg')) {
+    if (!isBrowserSupported) {
      return;
    }
    initializeMediaSource();
@ -223,18 +228,17 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
        URL.revokeObjectURL(audioElementRef.current?.src ?? '');
      }
    };
-  }, [initializeMediaSource]);
+  }, [initializeMediaSource, isBrowserSupported]);
  if (!isBrowserSupported) {
    return {
      generateSpeechEdge: () => ({}),
      cancelSpeechEdge: () => ({}),
      isSpeaking: false,
      voices: [],
    };
  }
-  return { generateSpeechEdge, cancelSpeechEdge, isSpeaking, voices };
+  return { generateSpeechEdge, cancelSpeechEdge, voices };
 }
 export default useTextToSpeechEdge;
--- a/client/src/hooks/Input/useTextToSpeechExternal.ts
+++ b/client/src/hooks/Input/useTextToSpeechExternal.ts
@ -1,7 +1,6 @@
 import { useRecoilValue } from 'recoil';
 import { useState, useMemo, useRef, useCallback, useEffect } from 'react';
 import { useTextToSpeechMutation, useVoicesQuery } from '~/data-provider';
 import useAudioRef from '~/hooks/Audio/useAudioRef';
 import useLocalize from '~/hooks/useLocalize';
 import { useToastContext } from '~/Providers';
 import store from '~/store';
@ -13,7 +12,21 @@ const createFormData = (text: string, voice: string) => {
  return formData;
 };
-function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0) {
+type TUseTTSExternal = {
  /** Callback that receives the new speaking state. */
  setIsSpeaking: (isSpeaking: boolean) => void;
  /** Mutable ref whose `current` is an `HTMLAudioElement` or `null`. */
  audioRef: React.MutableRefObject<HTMLAudioElement | null>;
  /** Optional message id. */
  messageId?: string;
  /** NOTE(review): presumably flags the most recent message — confirm against callers. */
  isLast: boolean;
  /** Optional index; the hook's parameter destructuring defaults it to 0. */
  index?: number;
 };
 function useTextToSpeechExternal({
  setIsSpeaking,
  audioRef,
  messageId,
  isLast,
  index = 0,
 }: TUseTTSExternal) {
  const localize = useLocalize();
  const { showToast } = useToastContext();
  const voice = useRecoilValue(store.voice);
@ -21,8 +34,7 @@ function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0)
  const playbackRate = useRecoilValue(store.playbackRate);
  const [downloadFile, setDownloadFile] = useState(false);
-  const [isLocalSpeaking, setIsSpeaking] = useState(false);
+
  const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
  const promiseAudioRef = useRef<HTMLAudioElement | null>(null);
  /* Global Audio Variables */
@ -174,17 +186,12 @@ function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0)
    return isProcessing || (isLast && globalIsFetching && !globalIsPlaying);
  }, [isProcessing, globalIsFetching, globalIsPlaying, isLast]);
  const isSpeaking = useMemo(() => {
    return isLocalSpeaking || (isLast && globalIsPlaying);
  }, [isLocalSpeaking, globalIsPlaying, isLast]);
  const { data: voicesData = [] } = useVoicesQuery();
  return {
    generateSpeechExternal,
    cancelSpeech,
    isLoading,
    isSpeaking,
    audioRef,
    voices: voicesData,
  };