🎛️ fix: Improve Frontend Practices for Audio Settings (#3624)

* refactor: do not call await inside useCallbacks, rely on updates for dropdown

* fix: remember last selected voice

* refactor: Update Speech component to use TypeScript in useCallback

* refactor: Update Dropdown component styles to match header theme
Danny Avila, 2024-08-13 02:42:49 -04:00, committed by GitHub
commit 05696233a9 (parent 8cbb6ba166)
20 changed files with 436 additions and 367 deletions
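The first bullet is the recurring pattern in this commit: a handler produced by useCallback should not be async itself, because event-handler callers ignore the returned Promise and rejections go unhandled. A minimal sketch of the pattern, with hypothetical names (usePlayText, synthesize) that are not from the diff:

import { useCallback } from 'react';

// Sketch: keep the callback synchronous and run async work in an inner
// function, handling errors there instead of leaking a Promise to the caller.
function usePlayText(synthesize: (text: string) => Promise<void>) {
  return useCallback(
    (text: string) => {
      const run = async () => {
        try {
          await synthesize(text);
        } catch (error) {
          console.error('Speech synthesis failed:', error);
        }
      };
      run();
    },
    [synthesize],
  );
}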

useGetAudioSettings.ts

@@ -1,4 +1,5 @@
-import { useRecoilState } from 'recoil';
+import { useMemo } from 'react';
+import { useRecoilValue } from 'recoil';
 import store from '~/store';
 
 export enum STTEndpoints {
@@ -13,13 +14,16 @@ export enum TTSEndpoints {
 }
 
 const useGetAudioSettings = () => {
-  const [engineSTT] = useRecoilState<string>(store.engineSTT);
-  const [engineTTS] = useRecoilState<string>(store.engineTTS);
+  const engineSTT = useRecoilValue<string>(store.engineSTT);
+  const engineTTS = useRecoilValue<string>(store.engineTTS);
 
-  const speechToTextEndpoint: STTEndpoints = engineSTT as STTEndpoints;
-  const textToSpeechEndpoint: TTSEndpoints = engineTTS as TTSEndpoints;
+  const speechToTextEndpoint = engineSTT;
+  const textToSpeechEndpoint = engineTTS;
 
-  return { speechToTextEndpoint, textToSpeechEndpoint };
+  return useMemo(
+    () => ({ speechToTextEndpoint, textToSpeechEndpoint }),
+    [speechToTextEndpoint, textToSpeechEndpoint],
+  );
 };
 
 export default useGetAudioSettings;
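A note on the useMemo above: without it the hook would return a fresh object identity on every render, retriggering any downstream hook that lists the result in its dependency array. A hypothetical consumer (not part of the commit) inside some component illustrates the effect:

// Because the hook memoizes its return value, this effect re-runs only when
// an endpoint string actually changes, not on every parent render.
const audioSettings = useGetAudioSettings();
useEffect(() => {
  console.log('audio engines changed:', audioSettings);
}, [audioSettings]);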

useTextToSpeech.ts

@@ -1,13 +1,18 @@
-import { useRef } from 'react';
+import { useRecoilState } from 'recoil';
+import { useRef, useMemo, useEffect } from 'react';
 import { parseTextParts } from 'librechat-data-provider';
 import type { TMessage } from 'librechat-data-provider';
+import type { Option } from '~/common';
 import useTextToSpeechExternal from './useTextToSpeechExternal';
 import useTextToSpeechBrowser from './useTextToSpeechBrowser';
 import useGetAudioSettings from './useGetAudioSettings';
 import useTextToSpeechEdge from './useTextToSpeechEdge';
 import { usePauseGlobalAudio } from '../Audio';
+import { logger } from '~/utils';
+import store from '~/store';
 
 const useTextToSpeech = (message?: TMessage, isLast = false, index = 0) => {
+  const [voice, setVoice] = useRecoilState(store.voice);
   const { textToSpeechEndpoint } = useGetAudioSettings();
   const { pauseGlobalAudio } = usePauseGlobalAudio(index);
   const audioRef = useRef<HTMLAudioElement | null>(null);
@@ -33,9 +38,47 @@ const useTextToSpeech = (message?: TMessage, isLast = false, index = 0) => {
     isLoading: isLoadingExternal,
     audioRef: audioRefExternal,
     voices: voicesExternal,
-  } = useTextToSpeechExternal(message?.messageId || '', isLast, index);
+  } = useTextToSpeechExternal(message?.messageId ?? '', isLast, index);
 
-  let generateSpeech, cancelSpeech, isSpeaking, isLoading, voices;
+  let generateSpeech, cancelSpeech, isSpeaking, isLoading;
 
+  const voices: Option[] | string[] = useMemo(() => {
+    const voiceMap = {
+      external: voicesExternal,
+      edge: voicesEdge,
+      browser: voicesLocal,
+    };
+
+    return voiceMap[textToSpeechEndpoint];
+  }, [textToSpeechEndpoint, voicesEdge, voicesExternal, voicesLocal]);
+
+  useEffect(() => {
+    const firstVoice = voices[0];
+    if (voices.length && typeof firstVoice === 'object') {
+      const lastSelectedVoice = voices.find((v) =>
+        typeof v === 'object' ? v.value === voice : v === voice,
+      );
+      if (lastSelectedVoice != null) {
+        const currentVoice =
+          typeof lastSelectedVoice === 'object' ? lastSelectedVoice.value : lastSelectedVoice;
+        logger.log('useTextToSpeech.ts - Effect:', { voices, voice: currentVoice });
+        setVoice(currentVoice?.toString() ?? undefined);
+        return;
+      }
+
+      logger.log('useTextToSpeech.ts - Effect:', { voices, voice: firstVoice.value });
+      setVoice(firstVoice.value?.toString() ?? undefined);
+    } else if (voices.length) {
+      const lastSelectedVoice = voices.find((v) => v === voice);
+      if (lastSelectedVoice != null) {
+        logger.log('useTextToSpeech.ts - Effect:', { voices, voice: lastSelectedVoice });
+        setVoice(lastSelectedVoice.toString());
+        return;
+      }
+
+      logger.log('useTextToSpeech.ts - Effect:', { voices, voice: firstVoice });
+      setVoice(firstVoice.toString());
+    }
+  }, [setVoice, textToSpeechEndpoint, voice, voices]);
 
   switch (textToSpeechEndpoint) {
     case 'external':
@@ -43,17 +86,15 @@ const useTextToSpeech = (message?: TMessage, isLast = false, index = 0) => {
       cancelSpeech = cancelSpeechExternal;
       isSpeaking = isSpeakingExternal;
       isLoading = isLoadingExternal;
-      if (audioRefExternal) {
+      if (audioRefExternal.current) {
         audioRef.current = audioRefExternal.current;
       }
-      voices = voicesExternal;
       break;
     case 'edge':
       generateSpeech = generateSpeechEdge;
       cancelSpeech = cancelSpeechEdge;
       isSpeaking = isSpeakingEdge;
       isLoading = false;
-      voices = voicesEdge;
       break;
     case 'browser':
     default:
@@ -61,7 +102,6 @@ const useTextToSpeech = (message?: TMessage, isLast = false, index = 0) => {
       cancelSpeech = cancelSpeechLocal;
       isSpeaking = isSpeakingLocal;
       isLoading = false;
-      voices = voicesLocal;
       break;
   }
 
@@ -82,7 +122,7 @@ const useTextToSpeech = (message?: TMessage, isLast = false, index = 0) => {
   const handleMouseUp = () => {
     isMouseDownRef.current = false;
-    if (timerRef.current) {
+    if (timerRef.current != null) {
       window.clearTimeout(timerRef.current);
     }
   };
 
@@ -105,8 +145,8 @@ const useTextToSpeech = (message?: TMessage, isLast = false, index = 0) => {
     toggleSpeech,
     isSpeaking,
     isLoading,
-    voices,
     audioRef,
+    voices,
   };
 };
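The effect added above implements the "remember last selected voice" bullet: reuse the stored voice if the active engine still offers it, otherwise fall back to the engine's first voice. Isolated as a pure function, the selection rule looks roughly like this (a sketch; resolveVoice and VoiceLike are hypothetical names):

type VoiceLike = string | { value?: string | number };

// Prefer the previously selected voice when the current engine still lists
// it; otherwise fall back to the engine's first available voice.
function resolveVoice(voices: VoiceLike[], lastVoice?: string): string | undefined {
  const match = voices.find((v) =>
    typeof v === 'object' ? v.value === lastVoice : v === lastVoice,
  );
  const chosen = match ?? voices[0];
  if (chosen == null) {
    return undefined;
  }
  return typeof chosen === 'object' ? chosen.value?.toString() : chosen.toString();
}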

useTextToSpeechBrowser.ts

@@ -1,21 +1,46 @@
 import { useRecoilState } from 'recoil';
-import { useState } from 'react';
+import { useState, useEffect, useCallback } from 'react';
 import store from '~/store';
 
 interface VoiceOption {
   value: string;
-  display: string;
+  label: string;
 }
 
 function useTextToSpeechBrowser() {
   const [cloudBrowserVoices] = useRecoilState(store.cloudBrowserVoices);
   const [isSpeaking, setIsSpeaking] = useState(false);
   const [voiceName] = useRecoilState(store.voice);
+  const [voices, setVoices] = useState<VoiceOption[]>([]);
+
+  const updateVoices = useCallback(() => {
+    const availableVoices = window.speechSynthesis
+      .getVoices()
+      .filter((v) => cloudBrowserVoices || v.localService === true);
+    const voiceOptions: VoiceOption[] = availableVoices.map((v) => ({
+      value: v.name,
+      label: v.name,
+    }));
+    setVoices(voiceOptions);
+  }, [cloudBrowserVoices]);
+
+  useEffect(() => {
+    if (window.speechSynthesis.getVoices().length) {
+      updateVoices();
+    } else {
+      window.speechSynthesis.onvoiceschanged = updateVoices;
+    }
+    return () => {
+      window.speechSynthesis.onvoiceschanged = null;
+    };
+  }, [updateVoices]);
 
   const generateSpeechLocal = (text: string) => {
     const synth = window.speechSynthesis;
-    const voices = synth.getVoices().filter((v) => cloudBrowserVoices || v.localService === true);
-    const voice = voices.find((v) => v.name === voiceName);
+    const voice = voices.find((v) => v.value === voiceName);
 
     if (!voice) {
       return;
@@ -23,7 +48,7 @@ function useTextToSpeechBrowser() {
     synth.cancel();
     const utterance = new SpeechSynthesisUtterance(text);
-    utterance.voice = voice;
+    utterance.voice = synth.getVoices().find((v) => v.name === voice.value) || null;
     utterance.onend = () => {
       setIsSpeaking(false);
     };
@@ -32,34 +57,10 @@ function useTextToSpeechBrowser() {
   };
 
   const cancelSpeechLocal = () => {
-    const synth = window.speechSynthesis;
-    synth.cancel();
+    window.speechSynthesis.cancel();
     setIsSpeaking(false);
   };
 
-  const voices = (): Promise<VoiceOption[]> => {
-    return new Promise((resolve) => {
-      const getAndMapVoices = () => {
-        const availableVoices = speechSynthesis
-          .getVoices()
-          .filter((v) => cloudBrowserVoices || v.localService === true);
-        const voiceOptions: VoiceOption[] = availableVoices.map((v) => ({
-          value: v.name,
-          display: v.name,
-        }));
-        resolve(voiceOptions);
-      };
-
-      if (speechSynthesis.getVoices().length) {
-        getAndMapVoices();
-      } else {
-        speechSynthesis.onvoiceschanged = getAndMapVoices;
-      }
-    });
-  };
-
   return { generateSpeechLocal, cancelSpeechLocal, isSpeaking, voices };
 }
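Background for the useEffect above: in some browsers (notably Chromium-based ones) speechSynthesis.getVoices() returns an empty array until the voiceschanged event fires, so the hook subscribes once and clears the handler on unmount. The same pattern outside React, as a framework-free sketch (whenVoicesReady is a hypothetical name):

// Invoke the callback once the browser's voice list is actually available.
function whenVoicesReady(cb: (voices: SpeechSynthesisVoice[]) => void): void {
  const voices = window.speechSynthesis.getVoices();
  if (voices.length) {
    cb(voices);
    return;
  }
  window.speechSynthesis.onvoiceschanged = () => {
    cb(window.speechSynthesis.getVoices());
  };
}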

useTextToSpeechEdge.ts

@@ -1,4 +1,4 @@
-import { useRecoilState } from 'recoil';
+import { useRecoilValue } from 'recoil';
 import { useState, useCallback, useRef, useEffect } from 'react';
 import { MsEdgeTTS, OUTPUT_FORMAT } from 'msedge-tts';
 import { useToastContext } from '~/Providers';
@@ -7,20 +7,21 @@ import store from '~/store';
 interface Voice {
   value: string;
-  display: string;
+  label: string;
 }
 
 interface UseTextToSpeechEdgeReturn {
-  generateSpeechEdge: (text: string) => Promise<void>;
+  generateSpeechEdge: (text: string) => void;
   cancelSpeechEdge: () => void;
   isSpeaking: boolean;
-  voices: () => Promise<Voice[]>;
+  voices: Voice[];
 }
 
 function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
   const localize = useLocalize();
+  const [voices, setVoices] = useState<Voice[]>([]);
   const [isSpeaking, setIsSpeaking] = useState<boolean>(false);
-  const [voiceName] = useRecoilState<string>(store.voice);
+  const voiceName = useRecoilValue(store.voice);
   const ttsRef = useRef<MsEdgeTTS | null>(null);
   const audioElementRef = useRef<HTMLAudioElement | null>(null);
   const mediaSourceRef = useRef<MediaSource | null>(null);
@@ -28,61 +29,59 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
   const pendingBuffers = useRef<Uint8Array[]>([]);
   const { showToast } = useToastContext();
 
-  const initializeTTS = useCallback(async (): Promise<void> => {
+  const fetchVoices = useCallback(() => {
     if (!ttsRef.current) {
       ttsRef.current = new MsEdgeTTS();
     }
-    try {
-      await ttsRef.current.setMetadata(voiceName, OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3);
-    } catch (error) {
-      console.error('Error initializing TTS:', error);
-      showToast({
-        message: localize('com_nav_tts_init_error', (error as Error).message),
-        status: 'error',
-      });
-    }
-  }, [voiceName, showToast, localize]);
-
-  const onSourceOpen = useCallback((): void => {
-    if (!sourceBufferRef.current && mediaSourceRef.current) {
-      try {
-        sourceBufferRef.current = mediaSourceRef.current.addSourceBuffer('audio/mpeg');
-        sourceBufferRef.current.addEventListener('updateend', appendNextBuffer);
-      } catch (error) {
-        console.error('Error adding source buffer:', error);
+    ttsRef.current
+      .getVoices()
+      .then((voicesList) => {
+        setVoices(
+          voicesList.map((v) => ({
+            value: v.ShortName,
+            label: v.FriendlyName,
+          })),
+        );
+      })
+      .catch((error) => {
+        console.error('Error fetching voices:', error);
         showToast({
-          message: localize('com_nav_source_buffer_error'),
+          message: localize('com_nav_voices_fetch_error'),
           status: 'error',
         });
-      }
-    }
-    // eslint-disable-next-line react-hooks/exhaustive-deps
+      });
   }, [showToast, localize]);
 
-  const initializeMediaSource = useCallback(async (): Promise<void> => {
-    return new Promise<void>((resolve) => {
-      if (!mediaSourceRef.current) {
-        mediaSourceRef.current = new MediaSource();
-        audioElementRef.current = new Audio();
-        audioElementRef.current.src = URL.createObjectURL(mediaSourceRef.current);
-      }
+  const initializeTTS = useCallback(() => {
+    if (!ttsRef.current) {
+      ttsRef.current = new MsEdgeTTS();
+    }
+
+    const availableVoice: Voice | undefined = voices.find((v) => v.value === voiceName);
 
-      const mediaSource = mediaSourceRef.current;
-      if (mediaSource.readyState === 'open') {
-        onSourceOpen();
-        resolve();
-      } else {
-        const onSourceOpenWrapper = (): void => {
-          onSourceOpen();
-          resolve();
-          mediaSource.removeEventListener('sourceopen', onSourceOpenWrapper);
-        };
-        mediaSource.addEventListener('sourceopen', onSourceOpenWrapper);
-      }
-    });
-  }, [onSourceOpen]);
+    if (availableVoice) {
+      ttsRef.current
+        .setMetadata(availableVoice.value, OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3)
+        .catch((error) => {
+          console.error('Error initializing TTS:', error);
+          showToast({
+            message: localize('com_nav_tts_init_error', (error as Error).message),
+            status: 'error',
+          });
+        });
+    } else if (voices.length > 0) {
+      ttsRef.current
+        .setMetadata(voices[0].value, OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3)
+        .catch((error) => {
+          console.error('Error initializing TTS:', error);
+          showToast({
+            message: localize('com_nav_tts_init_error', (error as Error).message),
+            status: 'error',
+          });
+        });
+    }
+  }, [voiceName, showToast, localize, voices]);
 
-  const appendNextBuffer = useCallback((): void => {
+  const appendNextBuffer = useCallback(() => {
     if (
       sourceBufferRef.current &&
       !sourceBufferRef.current.updating &&
@@ -104,50 +103,81 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
     }
   }, [showToast, localize]);
 
-  const generateSpeechEdge = useCallback(
-    async (text: string): Promise<void> => {
+  const onSourceOpen = useCallback(() => {
+    if (!sourceBufferRef.current && mediaSourceRef.current) {
       try {
-        await initializeTTS();
-        await initializeMediaSource();
-        if (!ttsRef.current || !audioElementRef.current) {
-          throw new Error('TTS or Audio element not initialized');
-        }
-        setIsSpeaking(true);
-        pendingBuffers.current = [];
-        const readable = await ttsRef.current.toStream(text);
-        readable.on('data', (chunk: Buffer) => {
-          pendingBuffers.current.push(new Uint8Array(chunk));
-          appendNextBuffer();
-        });
-        readable.on('end', () => {
-          if (mediaSourceRef.current && mediaSourceRef.current.readyState === 'open') {
-            mediaSourceRef.current.endOfStream();
-          }
-        });
-        audioElementRef.current.onended = () => {
-          setIsSpeaking(false);
-        };
-        await audioElementRef.current.play();
+        sourceBufferRef.current = mediaSourceRef.current.addSourceBuffer('audio/mpeg');
+        sourceBufferRef.current.addEventListener('updateend', appendNextBuffer);
       } catch (error) {
-        console.error('Error generating speech:', error);
+        console.error('Error adding source buffer:', error);
         showToast({
-          message: localize('com_nav_audio_play_error', (error as Error).message),
+          message: localize('com_nav_source_buffer_error'),
           status: 'error',
        });
-        setIsSpeaking(false);
       }
+    }
+  }, [showToast, localize, appendNextBuffer]);
+
+  const initializeMediaSource = useCallback(() => {
+    if (!mediaSourceRef.current) {
+      mediaSourceRef.current = new MediaSource();
+      audioElementRef.current = new Audio();
+      audioElementRef.current.src = URL.createObjectURL(mediaSourceRef.current);
+    }
+
+    const mediaSource = mediaSourceRef.current;
+    if (mediaSource.readyState === 'open') {
+      onSourceOpen();
+    } else {
+      mediaSource.addEventListener('sourceopen', onSourceOpen);
+    }
+  }, [onSourceOpen]);
+
+  const generateSpeechEdge = useCallback(
+    (text: string) => {
+      const generate = async () => {
+        try {
+          if (!ttsRef.current || !audioElementRef.current) {
+            throw new Error('TTS or Audio element not initialized');
+          }
+
+          setIsSpeaking(true);
+          pendingBuffers.current = [];
+
+          const readable = await ttsRef.current.toStream(text);
+          readable.on('data', (chunk: Buffer) => {
+            pendingBuffers.current.push(new Uint8Array(chunk));
+            appendNextBuffer();
+          });
+
+          readable.on('end', () => {
+            if (mediaSourceRef.current && mediaSourceRef.current.readyState === 'open') {
+              mediaSourceRef.current.endOfStream();
+            }
+          });
+
+          audioElementRef.current.onended = () => {
+            setIsSpeaking(false);
+          };
+
+          await audioElementRef.current.play();
+        } catch (error) {
+          console.error('Error generating speech:', error);
+          showToast({
+            message: localize('com_nav_audio_play_error', (error as Error).message),
+            status: 'error',
+          });
+          setIsSpeaking(false);
+        }
+      };
+
+      generate();
     },
-    [initializeTTS, initializeMediaSource, appendNextBuffer, showToast, localize],
+    [appendNextBuffer, showToast, localize],
   );
 
-  const cancelSpeechEdge = useCallback((): void => {
+  const cancelSpeechEdge = useCallback(() => {
     try {
       if (audioElementRef.current) {
         audioElementRef.current.pause();
@@ -167,33 +197,22 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
     }
   }, [showToast, localize]);
 
-  const voices = useCallback(async (): Promise<Voice[]> => {
-    if (!ttsRef.current) {
-      ttsRef.current = new MsEdgeTTS();
-    }
-    try {
-      const voicesList = await ttsRef.current.getVoices();
-      return voicesList.map((v) => ({
-        value: v.ShortName,
-        display: v.FriendlyName,
-      }));
-    } catch (error) {
-      console.error('Error fetching voices:', error);
-      showToast({
-        message: localize('com_nav_voices_fetch_error'),
-        status: 'error',
-      });
-      return [];
-    }
-  }, [showToast, localize]);
+  useEffect(() => {
+    fetchVoices();
+  }, [fetchVoices]);
+
+  useEffect(() => {
+    initializeTTS();
+  }, [voiceName, initializeTTS]);
 
   useEffect(() => {
     initializeMediaSource();
     return () => {
       if (mediaSourceRef.current) {
-        URL.revokeObjectURL(audioElementRef.current?.src || '');
+        URL.revokeObjectURL(audioElementRef.current?.src ?? '');
       }
     };
-  }, []);
+  }, [initializeMediaSource]);
 
   return { generateSpeechEdge, cancelSpeechEdge, isSpeaking, voices };
 }
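The appendNextBuffer/pendingBuffers pairing in this file exists because a SourceBuffer accepts only one appendBuffer() at a time; chunks that arrive while an append is in flight must be queued and drained on the 'updateend' event. Reduced to its essentials, with illustrative names (pending, appendNext, enqueue):

const pending: Uint8Array[] = [];

// Drain one chunk if the SourceBuffer is idle; 'updateend' re-invokes this.
function appendNext(sourceBuffer: SourceBuffer): void {
  if (!sourceBuffer.updating && pending.length > 0) {
    const chunk = pending.shift();
    if (chunk) {
      sourceBuffer.appendBuffer(chunk);
    }
  }
}

// Producer side: queue the chunk, then try to append immediately.
function enqueue(sourceBuffer: SourceBuffer, chunk: Uint8Array): void {
  pending.push(chunk);
  appendNext(sourceBuffer);
}

// Wiring: sourceBuffer.addEventListener('updateend', () => appendNext(sourceBuffer));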

useTextToSpeechExternal.ts

@@ -37,7 +37,7 @@ function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0)
   const playAudioPromise = (blobUrl: string) => {
     const newAudio = new Audio(blobUrl);
     const initializeAudio = () => {
-      if (playbackRate && playbackRate !== 1 && playbackRate > 0) {
+      if (playbackRate != null && playbackRate !== 1 && playbackRate > 0) {
        newAudio.playbackRate = playbackRate;
      }
    };
@@ -47,7 +47,7 @@ function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0)
     playPromise().catch((error: Error) => {
       if (
-        error?.message &&
+        error.message &&
         error.message.includes('The play() request was interrupted by a call to pause()')
       ) {
         console.log('Play request was interrupted by a call to pause()');
@@ -92,7 +92,7 @@ function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0)
     if (cacheTTS && inputText) {
       const cache = await caches.open('tts-responses');
-      const request = new Request(inputText!);
+      const request = new Request(inputText);
       const response = new Response(audioBlob);
       cache.put(request, response);
     }
@@ -118,7 +118,7 @@ function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0)
   });
 
   const startMutation = (text: string, download: boolean) => {
-    const formData = createFormData(text, voice);
+    const formData = createFormData(text, voice ?? '');
     setDownloadFile(download);
     processAudio(formData);
   };
@@ -178,9 +178,7 @@ function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0)
     return isLocalSpeaking || (isLast && globalIsPlaying);
   }, [isLocalSpeaking, globalIsPlaying, isLast]);
 
-  const useVoices = () => {
-    return useVoicesQuery().data ?? [];
-  };
+  const { data: voicesData = [] } = useVoicesQuery();
 
   return {
     generateSpeechExternal,
@@ -188,7 +186,7 @@ function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0)
     isLoading,
     isSpeaking,
     audioRef,
-    voices: useVoices,
+    voices: voicesData,
   };
 }
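The last two hunks replace a locally defined useVoices function (previously handed back to callers as `voices`) with the query's data itself. Returning a hook from a hook invites Rules-of-Hooks violations at the call site, since consumers can easily invoke it conditionally; returning plain data sidesteps that. A sketch of the resulting shape, assuming useVoicesQuery is an ordinary data-fetching hook:

// Before (problematic): the returned function was itself a hook in disguise.
// const useVoices = () => useVoicesQuery().data ?? [];
// return { ..., voices: useVoices };

// After: call the query once at the top level and return plain data.
const { data: voicesData = [] } = useVoicesQuery();
// return { generateSpeechExternal, /* ... */ audioRef, voices: voicesData };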