🎧 fix(TTS): Improve state of audio playback, hook patterns, and fix undefined MediaSource (#3632)

2025-12-17 00:40:14 +01:00 · 2024-08-13 12:08:55 -04:00 · 2024-08-13 12:08:55 -04:00 · dc8d30ad90
commit dc8d30ad90
parent e3ebcfd2b1
6 changed files with 108 additions and 72 deletions
--- a/client/src/components/Chat/Input/StreamAudio.tsx
+++ b/client/src/components/Chat/Input/StreamAudio.tsx
@ -5,9 +5,9 @@ import { useQueryClient } from '@tanstack/react-query';
 import { useRecoilState, useRecoilValue, useSetRecoilState } from 'recoil';
 import type { TMessage } from 'librechat-data-provider';
 import { useCustomAudioRef, MediaSourceAppender, usePauseGlobalAudio } from '~/hooks/Audio';
+import { getLatestText, logger } from '~/utils';
 import { useAuthContext } from '~/hooks';
 import { globalAudioId } from '~/common';
-import { getLatestText } from '~/utils';
 import store from '~/store';

 function timeoutPromise(ms: number, message?: string) {
@ -51,7 +51,7 @@ export default function StreamAudio({ index = 0 }) {
    const latestText = getLatestText(latestMessage);

    const shouldFetch = !!(
-      token &&
+      token != null &&
      automaticPlayback &&
      isSubmitting &&
      latestMessage &&
@ -60,7 +60,7 @@ export default function StreamAudio({ index = 0 }) {
      latestMessage.messageId &&
      !latestMessage.messageId.includes('_') &&
      !isFetching &&
-      activeRunId &&
+      activeRunId != null &&
      activeRunId !== audioRunId
    );

@ -109,7 +109,8 @@ export default function StreamAudio({ index = 0 }) {
        const reader = response.body.getReader();

        const type = 'audio/mpeg';
-        const browserSupportsType = MediaSource.isTypeSupported(type);
+        const browserSupportsType =
+          typeof MediaSource !== 'undefined' && MediaSource.isTypeSupported(type);
        let mediaSource: MediaSourceAppender | undefined;
        if (browserSupportsType) {
          mediaSource = new MediaSourceAppender(type);
@ -210,6 +211,7 @@ export default function StreamAudio({ index = 0 }) {
    // eslint-disable-next-line react-hooks/exhaustive-deps
  }, [paramId]);

+  logger.log('StreamAudio.tsx - globalAudioURL:', globalAudioURL);
  return (
    <audio
      ref={audioRef}
@ -222,7 +224,7 @@ export default function StreamAudio({ index = 0 }) {
        height: '0px',
        width: '0px',
      }}
-      src={globalAudioURL || undefined}
+      src={globalAudioURL ?? undefined}
      id={globalAudioId}
      muted
      autoPlay
--- a/client/src/components/Chat/Messages/MessageAudio.tsx
+++ b/client/src/components/Chat/Messages/MessageAudio.tsx
@ -3,6 +3,7 @@ import { useRecoilValue } from 'recoil';
 import type { TMessageContentParts } from 'librechat-data-provider';
 import { VolumeIcon, VolumeMuteIcon, Spinner } from '~/components/svg';
 import { useLocalize, useTextToSpeech } from '~/hooks';
+import { logger } from '~/utils';
 import store from '~/store';

 type THoverButtons = {
@ -45,6 +46,12 @@ export default function MessageAudio({ isLast, index, messageId, content }: THov
    }
  }, [audioRef, isSpeaking, playbackRate, messageId]);

+  logger.log(
+    'MessageAudio: audioRef.current?.src, audioRef.current',
+    audioRef.current?.src,
+    audioRef.current,
+  );
+
  return (
    <>
      <button
@ -75,6 +82,7 @@ export default function MessageAudio({ isLast, index, messageId, content }: THov
      <audio
        ref={audioRef}
        controls
+        preload="none"
        controlsList="nodownload nofullscreen noremoteplayback"
        style={{
          position: 'absolute',
@ -83,7 +91,10 @@ export default function MessageAudio({ isLast, index, messageId, content }: THov
          height: '0px',
          width: '0px',
        }}
-        src={audioRef.current?.src ?? undefined}
+        src={audioRef.current?.src}
+        onError={(error) => {
+          console.error('Error fetching audio:', error);
+        }}
        id={`audio-${messageId}`}
        muted
        autoPlay
--- a/client/src/hooks/Input/useTextToSpeech.ts
+++ b/client/src/hooks/Input/useTextToSpeech.ts
@ -1,5 +1,5 @@
-import { useRecoilState } from 'recoil';
-import { useRef, useMemo, useEffect } from 'react';
+import { useRecoilState, useRecoilValue } from 'recoil';
+import { useRef, useMemo, useEffect, useState } from 'react';
 import { parseTextParts } from 'librechat-data-provider';
 import type { TMessageContentParts } from 'librechat-data-provider';
 import type { Option } from '~/common';
@ -7,6 +7,7 @@ import useTextToSpeechExternal from './useTextToSpeechExternal';
 import useTextToSpeechBrowser from './useTextToSpeechBrowser';
 import useGetAudioSettings from './useGetAudioSettings';
 import useTextToSpeechEdge from './useTextToSpeechEdge';
+import useAudioRef from '~/hooks/Audio/useAudioRef';
 import { usePauseGlobalAudio } from '../Audio';
 import { logger } from '~/utils';
 import store from '~/store';
@ -20,41 +21,77 @@ type TUseTextToSpeech = {

 const useTextToSpeech = (props?: TUseTextToSpeech) => {
  const { messageId, content, isLast = false, index = 0 } = props ?? {};
-  const [voice, setVoice] = useRecoilState(store.voice);
+
+  const isMouseDownRef = useRef(false);
+  const timerRef = useRef<number | undefined>(undefined);
+  const [isSpeakingState, setIsSpeaking] = useState(false);
+  const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
+
  const { textToSpeechEndpoint } = useGetAudioSettings();
  const { pauseGlobalAudio } = usePauseGlobalAudio(index);
-  const audioRef = useRef<HTMLAudioElement | null>(null);
+  const [voice, setVoice] = useRecoilState(store.voice);
+  const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));
+
+  const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);

  const {
    generateSpeechLocal,
    cancelSpeechLocal,
-    isSpeaking: isSpeakingLocal,
    voices: voicesLocal,
-  } = useTextToSpeechBrowser();
+  } = useTextToSpeechBrowser({ setIsSpeaking });

  const {
    generateSpeechEdge,
    cancelSpeechEdge,
-    isSpeaking: isSpeakingEdge,
    voices: voicesEdge,
-  } = useTextToSpeechEdge();
+  } = useTextToSpeechEdge({ setIsSpeaking });

  const {
    generateSpeechExternal,
    cancelSpeech: cancelSpeechExternal,
-    isSpeaking: isSpeakingExternal,
    isLoading: isLoadingExternal,
-    audioRef: audioRefExternal,
    voices: voicesExternal,
-  } = useTextToSpeechExternal(messageId ?? '', isLast, index);
+  } = useTextToSpeechExternal({
+    setIsSpeaking,
+    audioRef,
+    messageId,
+    isLast,
+    index,
+  });

-  let generateSpeech, cancelSpeech, isSpeaking, isLoading;
+  const generateSpeech = useMemo(() => {
+    const map = {
+      edge: generateSpeechEdge,
+      browser: generateSpeechLocal,
+      external: generateSpeechExternal,
+    };
+
+    return map[textToSpeechEndpoint];
+  }, [generateSpeechEdge, generateSpeechExternal, generateSpeechLocal, textToSpeechEndpoint]);
+
+  const cancelSpeech = useMemo(() => {
+    const map = {
+      edge: cancelSpeechEdge,
+      browser: cancelSpeechLocal,
+      external: cancelSpeechExternal,
+    };
+    return map[textToSpeechEndpoint];
+  }, [cancelSpeechEdge, cancelSpeechExternal, cancelSpeechLocal, textToSpeechEndpoint]);
+
+  const isLoading = useMemo(() => {
+    const map = {
+      edge: false,
+      browser: false,
+      external: isLoadingExternal,
+    };
+    return map[textToSpeechEndpoint];
+  }, [isLoadingExternal, textToSpeechEndpoint]);

  const voices: Option[] | string[] = useMemo(() => {
    const voiceMap = {
-      external: voicesExternal,
      edge: voicesEdge,
      browser: voicesLocal,
+      external: voicesExternal,
    };

    return voiceMap[textToSpeechEndpoint];
@ -88,34 +125,6 @@ const useTextToSpeech = (props?: TUseTextToSpeech) => {
    }
  }, [setVoice, textToSpeechEndpoint, voice, voices]);

-  switch (textToSpeechEndpoint) {
-    case 'external':
-      generateSpeech = generateSpeechExternal;
-      cancelSpeech = cancelSpeechExternal;
-      isSpeaking = isSpeakingExternal;
-      isLoading = isLoadingExternal;
-      if (audioRefExternal.current) {
-        audioRef.current = audioRefExternal.current;
-      }
-      break;
-    case 'edge':
-      generateSpeech = generateSpeechEdge;
-      cancelSpeech = cancelSpeechEdge;
-      isSpeaking = isSpeakingEdge;
-      isLoading = false;
-      break;
-    case 'browser':
-    default:
-      generateSpeech = generateSpeechLocal;
-      cancelSpeech = cancelSpeechLocal;
-      isSpeaking = isSpeakingLocal;
-      isLoading = false;
-      break;
-  }
-
-  const isMouseDownRef = useRef(false);
-  const timerRef = useRef<number | undefined>(undefined);
-
  const handleMouseDown = () => {
    isMouseDownRef.current = true;
    timerRef.current = window.setTimeout(() => {
--- a/client/src/hooks/Input/useTextToSpeechBrowser.ts
+++ b/client/src/hooks/Input/useTextToSpeechBrowser.ts
@ -7,9 +7,12 @@ interface VoiceOption {
  label: string;
 }

-function useTextToSpeechBrowser() {
+function useTextToSpeechBrowser({
+  setIsSpeaking,
+}: {
+  setIsSpeaking: (isSpeaking: boolean) => void;
+}) {
  const [cloudBrowserVoices] = useRecoilState(store.cloudBrowserVoices);
-  const [isSpeaking, setIsSpeaking] = useState(false);
  const [voiceName] = useRecoilState(store.voice);
  const [voices, setVoices] = useState<VoiceOption[]>([]);

@ -61,7 +64,7 @@ function useTextToSpeechBrowser() {
    setIsSpeaking(false);
  };

-  return { generateSpeechLocal, cancelSpeechLocal, isSpeaking, voices };
+  return { generateSpeechLocal, cancelSpeechLocal, voices };
 }

 export default useTextToSpeechBrowser;
--- a/client/src/hooks/Input/useTextToSpeechEdge.ts
+++ b/client/src/hooks/Input/useTextToSpeechEdge.ts
@ -13,14 +13,16 @@ interface Voice {
 interface UseTextToSpeechEdgeReturn {
  generateSpeechEdge: (text: string) => void;
  cancelSpeechEdge: () => void;
-  isSpeaking: boolean;
  voices: Voice[];
 }

-function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
+function useTextToSpeechEdge({
+  setIsSpeaking,
+}: {
+  setIsSpeaking: (isSpeaking: boolean) => void;
+}): UseTextToSpeechEdgeReturn {
  const localize = useLocalize();
  const [voices, setVoices] = useState<Voice[]>([]);
-  const [isSpeaking, setIsSpeaking] = useState<boolean>(false);
  const voiceName = useRecoilValue(store.voice);
  const ttsRef = useRef<MsEdgeTTS | null>(null);
  const audioElementRef = useRef<HTMLAudioElement | null>(null);
@ -29,7 +31,10 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
  const pendingBuffers = useRef<Uint8Array[]>([]);
  const { showToast } = useToastContext();

-  const isBrowserSupported = useMemo(() => MediaSource.isTypeSupported('audio/mpeg'), []);
+  const isBrowserSupported = useMemo(
+    () => typeof MediaSource !== 'undefined' && MediaSource.isTypeSupported('audio/mpeg'),
+    [],
+  );

  const fetchVoices = useCallback(() => {
    if (!ttsRef.current) {
@ -146,7 +151,7 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
          setIsSpeaking(true);
          pendingBuffers.current = [];

-          const readable = await ttsRef.current.toStream(text);
+          const readable = ttsRef.current.toStream(text);

          readable.on('data', (chunk: Buffer) => {
            pendingBuffers.current.push(new Uint8Array(chunk));
@ -200,21 +205,21 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
  }, [showToast, localize]);

  useEffect(() => {
-    if (!MediaSource.isTypeSupported('audio/mpeg')) {
+    if (!isBrowserSupported) {
      return;
    }
    fetchVoices();
-  }, [fetchVoices]);
+  }, [fetchVoices, isBrowserSupported]);

  useEffect(() => {
-    if (!MediaSource.isTypeSupported('audio/mpeg')) {
+    if (!isBrowserSupported) {
      return;
    }
    initializeTTS();
-  }, [voiceName, initializeTTS]);
+  }, [voiceName, initializeTTS, isBrowserSupported]);

  useEffect(() => {
-    if (!MediaSource.isTypeSupported('audio/mpeg')) {
+    if (!isBrowserSupported) {
      return;
    }
    initializeMediaSource();
@ -223,18 +228,17 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
        URL.revokeObjectURL(audioElementRef.current?.src ?? '');
      }
    };
-  }, [initializeMediaSource]);
+  }, [initializeMediaSource, isBrowserSupported]);

  if (!isBrowserSupported) {
    return {
      generateSpeechEdge: () => ({}),
      cancelSpeechEdge: () => ({}),
-      isSpeaking: false,
      voices: [],
    };
  }

-  return { generateSpeechEdge, cancelSpeechEdge, isSpeaking, voices };
+  return { generateSpeechEdge, cancelSpeechEdge, voices };
 }

 export default useTextToSpeechEdge;
--- a/client/src/hooks/Input/useTextToSpeechExternal.ts
+++ b/client/src/hooks/Input/useTextToSpeechExternal.ts
@ -1,7 +1,6 @@
 import { useRecoilValue } from 'recoil';
 import { useState, useMemo, useRef, useCallback, useEffect } from 'react';
 import { useTextToSpeechMutation, useVoicesQuery } from '~/data-provider';
-import useAudioRef from '~/hooks/Audio/useAudioRef';
 import useLocalize from '~/hooks/useLocalize';
 import { useToastContext } from '~/Providers';
 import store from '~/store';
@ -13,7 +12,21 @@ const createFormData = (text: string, voice: string) => {
  return formData;
 };

-function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0) {
+type TUseTTSExternal = {
+  setIsSpeaking: (isSpeaking: boolean) => void;
+  audioRef: React.MutableRefObject<HTMLAudioElement | null>;
+  messageId?: string;
+  isLast: boolean;
+  index?: number;
+};
+
+function useTextToSpeechExternal({
+  setIsSpeaking,
+  audioRef,
+  messageId,
+  isLast,
+  index = 0,
+}: TUseTTSExternal) {
  const localize = useLocalize();
  const { showToast } = useToastContext();
  const voice = useRecoilValue(store.voice);
@ -21,8 +34,7 @@ function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0)
  const playbackRate = useRecoilValue(store.playbackRate);

  const [downloadFile, setDownloadFile] = useState(false);
-  const [isLocalSpeaking, setIsSpeaking] = useState(false);
-  const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
+
  const promiseAudioRef = useRef<HTMLAudioElement | null>(null);

  /* Global Audio Variables */
@ -174,17 +186,12 @@ function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0)
    return isProcessing || (isLast && globalIsFetching && !globalIsPlaying);
  }, [isProcessing, globalIsFetching, globalIsPlaying, isLast]);

-  const isSpeaking = useMemo(() => {
-    return isLocalSpeaking || (isLast && globalIsPlaying);
-  }, [isLocalSpeaking, globalIsPlaying, isLast]);
-
  const { data: voicesData = [] } = useVoicesQuery();

  return {
    generateSpeechExternal,
    cancelSpeech,
    isLoading,
-    isSpeaking,
    audioRef,
    voices: voicesData,
  };