⏯️ fix(tts): Resolve Voice Selection and Manual Playback Issues (#2845)

* fix: voice setting for autoplayback TTS

* fix(useTextToSpeechExternal): resolve stateful playback issues and consolidate state logic

* refactor: initialize tts voice and provider schema once per request

* fix(tts): edge case, longer text inputs. TODO: use continuous stream for longer text inputs

* fix(tts): pause global audio on conversation change

* refactor: keyvMongo ban cache to allow db updates for unbanning, to prevent server restart

* chore: eslint fix

* refactor: make ban cache exclusively keyvMongo
This commit is contained in:
Danny Avila 2024-05-23 16:27:36 -04:00 committed by GitHub
parent 8e66683577
commit 514a502b9c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 332 additions and 178 deletions

View file

@@ -1,10 +1,10 @@
import { useParams } from 'react-router-dom';
import { useEffect, useCallback } from 'react';
import { QueryKeys } from 'librechat-data-provider';
import { useQueryClient } from '@tanstack/react-query';
import { useEffect, useCallback } from 'react';
import { useRecoilState, useRecoilValue, useSetRecoilState } from 'recoil';
import type { TMessage } from 'librechat-data-provider';
import { useCustomAudioRef, MediaSourceAppender } from '~/hooks/Audio';
import { useCustomAudioRef, MediaSourceAppender, usePauseGlobalAudio } from '~/hooks/Audio';
import { useAuthContext } from '~/hooks';
import { globalAudioId } from '~/common';
import store from '~/store';
@@ -24,6 +24,7 @@ export default function StreamAudio({ index = 0 }) {
const cacheTTS = useRecoilValue(store.cacheTTS);
const playbackRate = useRecoilValue(store.playbackRate);
const voice = useRecoilValue(store.voice);
const activeRunId = useRecoilValue(store.activeRunFamily(index));
const automaticPlayback = useRecoilValue(store.automaticPlayback);
const isSubmitting = useRecoilValue(store.isSubmittingFamily(index));
@@ -34,6 +35,7 @@ export default function StreamAudio({ index = 0 }) {
const [globalAudioURL, setGlobalAudioURL] = useRecoilState(store.globalAudioURLFamily(index));
const { audioRef } = useCustomAudioRef({ setIsPlaying });
const { pauseGlobalAudio } = usePauseGlobalAudio();
const { conversationId: paramId } = useParams();
const queryParam = paramId === 'new' ? paramId : latestMessage?.conversationId ?? paramId ?? '';
@@ -90,7 +92,7 @@ export default function StreamAudio({ index = 0 }) {
const response = await fetch('/api/files/tts', {
method: 'POST',
headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${token}` },
body: JSON.stringify({ messageId: latestMessage?.messageId, runId: activeRunId }),
body: JSON.stringify({ messageId: latestMessage?.messageId, runId: activeRunId, voice }),
});
if (!response.ok) {
@@ -166,6 +168,7 @@ export default function StreamAudio({ index = 0 }) {
audioRunId,
cacheTTS,
audioRef,
voice,
token,
]);
@@ -180,6 +183,12 @@ export default function StreamAudio({ index = 0 }) {
}
}, [audioRef, globalAudioURL, playbackRate]);
useEffect(() => {
pauseGlobalAudio();
// We only want the effect to run when the paramId changes
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [paramId]);
return (
<audio
ref={audioRef}

View file

@@ -1,6 +1,7 @@
import { useRecoilValue } from 'recoil';
import { useCallback, useEffect, useState, useMemo } from 'react';
import { useCallback, useEffect, useState, useMemo, useRef } from 'react';
import { useTextToSpeechMutation } from '~/data-provider';
import useLocalize from '~/hooks/useLocalize';
import { useToastContext } from '~/Providers';
import store from '~/store';
@@ -12,16 +13,16 @@ const createFormData = (text: string, voice: string) => {
};
function useTextToSpeechExternal(isLast: boolean, index = 0) {
const localize = useLocalize();
const { showToast } = useToastContext();
const voice = useRecoilValue(store.voice);
const cacheTTS = useRecoilValue(store.cacheTTS);
const playbackRate = useRecoilValue(store.playbackRate);
const [text, setText] = useState<string | null>(null);
const audioRef = useRef<HTMLAudioElement | null>(null);
const [downloadFile, setDownloadFile] = useState(false);
const [isLocalSpeaking, setIsSpeaking] = useState(false);
const [blobUrl, setBlobUrl] = useState<string | null>(null);
const [audio, setAudio] = useState<HTMLAudioElement | null>(null);
/* Global Audio Variables */
const globalIsFetching = useRecoilValue(store.globalAudioFetchingFamily(index));
@@ -29,10 +30,13 @@ function useTextToSpeechExternal(isLast: boolean, index = 0) {
const playAudio = (blobUrl: string) => {
const newAudio = new Audio(blobUrl);
if (playbackRate && playbackRate !== 1) {
newAudio.playbackRate = playbackRate;
}
const initializeAudio = () => {
if (playbackRate && playbackRate !== 1) {
newAudio.playbackRate = playbackRate;
}
};
initializeAudio();
const playPromise = () => newAudio.play().then(() => setIsSpeaking(true));
playPromise().catch((error: Error) => {
@@ -40,10 +44,12 @@ function useTextToSpeechExternal(isLast: boolean, index = 0) {
error?.message &&
error.message.includes('The play() request was interrupted by a call to pause()')
) {
console.log('Play request was interrupted by a call to pause()');
initializeAudio();
return playPromise().catch(console.error);
}
console.error(error);
showToast({ message: `Error playing audio: ${error.message}`, status: 'error' });
showToast({ message: localize('com_nav_audio_play_error', error.message), status: 'error' });
});
newAudio.onended = () => {
@@ -52,8 +58,7 @@ function useTextToSpeechExternal(isLast: boolean, index = 0) {
setIsSpeaking(false);
};
setAudio(newAudio);
setBlobUrl(blobUrl);
audioRef.current = newAudio;
};
const downloadAudio = (blobUrl: string) => {
@@ -65,35 +70,32 @@ function useTextToSpeechExternal(isLast: boolean, index = 0) {
};
const { mutate: processAudio, isLoading: isProcessing } = useTextToSpeechMutation({
onSuccess: async (data: ArrayBuffer) => {
onMutate: (variables) => {
const inputText = (variables.get('input') ?? '') as string;
if (inputText.length >= 4096) {
showToast({
message: localize('com_nav_long_audio_warning'),
status: 'warning',
});
}
},
onSuccess: async (data: ArrayBuffer, variables) => {
try {
const mediaSource = new MediaSource();
const audio = new Audio();
audio.src = URL.createObjectURL(mediaSource);
audio.autoplay = true;
const inputText = (variables.get('input') ?? '') as string;
const audioBlob = new Blob([data], { type: 'audio/mpeg' });
mediaSource.onsourceopen = () => {
const sourceBuffer = mediaSource.addSourceBuffer('audio/mpeg');
sourceBuffer.appendBuffer(data);
};
audio.onended = () => {
URL.revokeObjectURL(audio.src);
setIsSpeaking(false);
};
setAudio(audio);
if (cacheTTS) {
if (cacheTTS && inputText) {
const cache = await caches.open('tts-responses');
const request = new Request(text!);
const response = new Response(new Blob([data], { type: 'audio/mpeg' }));
const request = new Request(inputText!);
const response = new Response(audioBlob);
cache.put(request, response);
}
const blobUrl = URL.createObjectURL(audioBlob);
if (downloadFile) {
downloadAudio(audio.src);
downloadAudio(blobUrl);
}
playAudio(blobUrl);
} catch (error) {
showToast({
message: `Error processing audio: ${(error as Error).message}`,
@@ -102,13 +104,15 @@ function useTextToSpeechExternal(isLast: boolean, index = 0) {
}
},
onError: (error: unknown) => {
showToast({ message: `Error: ${(error as Error).message}`, status: 'error' });
showToast({
message: localize('com_nav_audio_process_error', (error as Error).message),
status: 'error',
});
},
});
const generateSpeechExternal = async (text: string, download: boolean) => {
setText(text);
const cachedResponse = await getCachedResponse(text);
const cachedResponse = await caches.match(text);
if (cachedResponse && cacheTTS) {
handleCachedResponse(cachedResponse, download);
@@ -119,8 +123,6 @@ function useTextToSpeechExternal(isLast: boolean, index = 0) {
}
};
const getCachedResponse = async (text: string) => await caches.match(text);
const handleCachedResponse = async (cachedResponse: Response, download: boolean) => {
const audioBlob = await cachedResponse.blob();
const blobUrl = URL.createObjectURL(audioBlob);
@@ -132,12 +134,13 @@ function useTextToSpeechExternal(isLast: boolean, index = 0) {
};
const cancelSpeech = useCallback(() => {
if (audio) {
audio.pause();
blobUrl && URL.revokeObjectURL(blobUrl);
if (audioRef.current) {
audioRef.current.pause();
audioRef.current.src && URL.revokeObjectURL(audioRef.current.src);
audioRef.current = null;
setIsSpeaking(false);
}
}, [audio, blobUrl]);
}, []);
useEffect(() => cancelSpeech, [cancelSpeech]);

View file

@@ -550,6 +550,9 @@ export default {
com_nav_auto_transcribe_audio: 'Auto transcribe audio',
com_nav_db_sensitivity: 'Decibel sensitivity',
com_nav_playback_rate: 'Audio Playback Rate',
com_nav_audio_play_error: 'Error playing audio: {0}',
com_nav_audio_process_error: 'Error processing audio: {0}',
com_nav_long_audio_warning: 'Longer texts will take longer to process.',
com_nav_engine: 'Engine',
com_nav_browser: 'Browser',
com_nav_external: 'External',