👋 feat: remove Edge TTS (#6885)

* feat: remove Edge TTS

* remove the remaining edge code

* chore: cleanup

* chore: cleanup package-lock
This commit is contained in:
Marco Beretta 2025-04-15 04:39:01 +02:00 committed by GitHub
parent c49f883e1a
commit 5d56f48879
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 63 additions and 547 deletions

View file

@@ -3,4 +3,3 @@ export { default as useCustomAudioRef } from './useCustomAudioRef';
export { default as usePauseGlobalAudio } from './usePauseGlobalAudio';
export { default as useTTSExternal } from './useTTSExternal';
export { default as useTTSBrowser } from './useTTSBrowser';
export { default as useTTSEdge } from './useTTSEdge';

View file

@@ -1,100 +0,0 @@
// client/src/hooks/Audio/useTTSEdge.ts
import { useRef, useEffect, useState } from 'react';
import { useRecoilState, useRecoilValue } from 'recoil';
import { parseTextParts } from 'librechat-data-provider';
import type { TMessageContentParts } from 'librechat-data-provider';
import usePauseGlobalAudio from '~/hooks/Audio/usePauseGlobalAudio';
import useTextToSpeechEdge from '~/hooks/Input/useTextToSpeechEdge';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import { logger } from '~/utils';
import store from '~/store';
/** Props accepted by the Edge TTS message hook; all fields are optional. */
type TUseTextToSpeech = {
  // Not read by this hook's implementation (useTTSEdge below destructures only
  // content/isLast/index) — presumably kept for parity with sibling TTS hooks.
  messageId?: string;
  // Message content to speak: a plain string or structured content parts
  // (flattened via parseTextParts before synthesis).
  content?: TMessageContentParts[] | string;
  // Whether this is the last message; ties local speaking state to the
  // global audio playing state.
  isLast?: boolean;
  // Message index used to key the global-audio Recoil family (default 0).
  index?: number;
};
/**
 * Message-level Edge TTS hook: exposes press-and-hold and click-to-toggle
 * playback controls for a single message's text content.
 *
 * Returns mouse handlers, a toggle, the derived speaking flag, the audio
 * element ref, and the available Edge voices.
 */
const useTTSEdge = (props?: TUseTextToSpeech) => {
  const { content, isLast = false, index = 0 } = props ?? {};
  const pressedRef = useRef(false);
  const holdTimerRef = useRef<number | undefined>(undefined);
  const [localSpeaking, setIsSpeaking] = useState(false);
  const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
  const { pauseGlobalAudio } = usePauseGlobalAudio(index);
  const [voice, setVoice] = useRecoilState(store.voice);
  const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));
  // The last message also counts as speaking while global audio for this index plays.
  const isSpeaking = localSpeaking || (isLast && globalIsPlaying);

  const {
    generateSpeechEdge: generateSpeech,
    cancelSpeechEdge: cancelSpeech,
    voices,
  } = useTextToSpeechEdge({ setIsSpeaking });

  // Keep the persisted voice selection in sync with the available Edge voices.
  useEffect(() => {
    const [firstVoice] = voices;
    if (!voices.length || typeof firstVoice !== 'object') {
      return;
    }
    const selected = voices.find((entry) =>
      typeof entry === 'object' ? entry.value === voice : entry === voice,
    );
    if (selected != null) {
      const resolved = typeof selected === 'object' ? selected.value : selected;
      logger.log('useTextToSpeech.ts - Effect:', { voices, voice: resolved });
      setVoice(resolved);
      return;
    }
    // Previously selected voice is unavailable — fall back to the first voice.
    logger.log('useTextToSpeech.ts - Effect:', { voices, voice: firstVoice.value });
    setVoice(firstVoice.value);
  }, [setVoice, voice, voices]);

  // Flatten the message content into plain text for synthesis.
  const textToSpeak = () => {
    const raw = content ?? '';
    return typeof raw === 'string' ? raw : parseTextParts(raw);
  };

  // Long-press (1s) starts playback; releasing earlier cancels the pending timer.
  const handleMouseDown = () => {
    pressedRef.current = true;
    holdTimerRef.current = window.setTimeout(() => {
      if (pressedRef.current) {
        generateSpeech(textToSpeak());
      }
    }, 1000);
  };

  const handleMouseUp = () => {
    pressedRef.current = false;
    if (holdTimerRef.current != null) {
      window.clearTimeout(holdTimerRef.current);
    }
  };

  // Click toggle: stop all audio when speaking, otherwise speak this message.
  const toggleSpeech = () => {
    if (isSpeaking === true) {
      cancelSpeech();
      pauseGlobalAudio();
      return;
    }
    generateSpeech(textToSpeak());
  };

  return {
    handleMouseDown,
    handleMouseUp,
    toggleSpeech,
    isSpeaking,
    isLoading: false,
    audioRef,
    voices,
  };
};
export default useTTSEdge;

View file

@@ -6,7 +6,6 @@ import type { Option } from '~/common';
import useTextToSpeechExternal from '~/hooks/Input/useTextToSpeechExternal';
import useTextToSpeechBrowser from '~/hooks/Input/useTextToSpeechBrowser';
import useGetAudioSettings from '~/hooks/Input/useGetAudioSettings';
import useTextToSpeechEdge from '~/hooks/Input/useTextToSpeechEdge';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import { usePauseGlobalAudio } from '../Audio';
import { logger } from '~/utils';
@@ -40,12 +39,6 @@ const useTextToSpeech = (props?: TUseTextToSpeech) => {
voices: voicesLocal,
} = useTextToSpeechBrowser({ setIsSpeaking });
const {
generateSpeechEdge,
cancelSpeechEdge,
voices: voicesEdge,
} = useTextToSpeechEdge({ setIsSpeaking });
const {
generateSpeechExternal,
cancelSpeech: cancelSpeechExternal,
@@ -61,26 +54,23 @@ const useTextToSpeech = (props?: TUseTextToSpeech) => {
const generateSpeech = useMemo(() => {
const map = {
edge: generateSpeechEdge,
browser: generateSpeechLocal,
external: generateSpeechExternal,
};
return map[textToSpeechEndpoint];
}, [generateSpeechEdge, generateSpeechExternal, generateSpeechLocal, textToSpeechEndpoint]);
}, [generateSpeechExternal, generateSpeechLocal, textToSpeechEndpoint]);
const cancelSpeech = useMemo(() => {
const map = {
edge: cancelSpeechEdge,
browser: cancelSpeechLocal,
external: cancelSpeechExternal,
};
return map[textToSpeechEndpoint];
}, [cancelSpeechEdge, cancelSpeechExternal, cancelSpeechLocal, textToSpeechEndpoint]);
}, [cancelSpeechExternal, cancelSpeechLocal, textToSpeechEndpoint]);
const isLoading = useMemo(() => {
const map = {
edge: false,
browser: false,
external: isLoadingExternal,
};
@@ -89,13 +79,12 @@ const useTextToSpeech = (props?: TUseTextToSpeech) => {
const voices: Option[] | string[] = useMemo(() => {
const voiceMap = {
edge: voicesEdge,
browser: voicesLocal,
external: voicesExternal,
};
return voiceMap[textToSpeechEndpoint];
}, [textToSpeechEndpoint, voicesEdge, voicesExternal, voicesLocal]);
}, [textToSpeechEndpoint, voicesExternal, voicesLocal]);
useEffect(() => {
const firstVoice = voices[0];

View file

@@ -1,249 +0,0 @@
import { useRecoilValue } from 'recoil';
import { MsEdgeTTS, OUTPUT_FORMAT } from 'msedge-tts';
import { useState, useCallback, useRef, useEffect, useMemo } from 'react';
import type { VoiceOption } from '~/common';
import { useToastContext } from '~/Providers/ToastContext';
import useLocalize from '~/hooks/useLocalize';
import store from '~/store';
/** Return shape of useTextToSpeechEdge. */
interface UseTextToSpeechEdgeReturn {
  // Starts streaming synthesized speech for the given text.
  generateSpeechEdge: (text: string) => void;
  // Stops playback and discards any queued audio.
  cancelSpeechEdge: () => void;
  // Voices reported by the Edge TTS service (empty until fetched, or on
  // unsupported browsers).
  voices: VoiceOption[];
}
/**
 * Edge TTS hook: synthesizes speech with the `msedge-tts` client and streams
 * the resulting MP3 chunks into a detached HTMLAudioElement through the
 * MediaSource API.
 *
 * When the browser cannot stream 'audio/mpeg' via MediaSource, the hook
 * degrades to no-op speech functions and an empty voice list.
 */
function useTextToSpeechEdge({
  setIsSpeaking,
}: {
  setIsSpeaking: React.Dispatch<React.SetStateAction<boolean>>;
}): UseTextToSpeechEdgeReturn {
  const localize = useLocalize();
  const [voices, setVoices] = useState<VoiceOption[]>([]);
  // Selected voice (ShortName value) persisted in Recoil state.
  const voiceName = useRecoilValue(store.voice);
  const ttsRef = useRef<MsEdgeTTS | null>(null);
  const audioElementRef = useRef<HTMLAudioElement | null>(null);
  const mediaSourceRef = useRef<MediaSource | null>(null);
  const sourceBufferRef = useRef<SourceBuffer | null>(null);
  // Chunks waiting to enter the SourceBuffer; appendBuffer only accepts one
  // append at a time, so the queue is drained from 'updateend' (appendNextBuffer).
  const pendingBuffers = useRef<Uint8Array[]>([]);
  const { showToast } = useToastContext();
  // Counts failed setMetadata calls so initialization stops retrying (capped in
  // the selected-voice branch of initializeTTS).
  const initAttempts = useRef(0);
  const isBrowserSupported = useMemo(
    () => typeof MediaSource !== 'undefined' && MediaSource.isTypeSupported('audio/mpeg'),
    [],
  );
  // Lazily construct the TTS client and load the voice list once.
  // NOTE(review): this constructor omits the `enableLogger` option passed in
  // initializeTTS — whichever callback runs first decides the configuration.
  const fetchVoices = useCallback(() => {
    if (!ttsRef.current) {
      ttsRef.current = new MsEdgeTTS();
    }
    ttsRef.current
      .getVoices()
      .then((voicesList) => {
        // Map service voices to options; `value` is the ShortName consumed by
        // setMetadata, `label` is the human-readable name.
        setVoices(
          voicesList.map((v) => ({
            value: v.ShortName,
            label: v.FriendlyName,
          })),
        );
      })
      .catch((error) => {
        console.error('Error fetching voices:', error);
        showToast({
          message: localize('com_nav_voices_fetch_error'),
          status: 'error',
        });
      });
  }, [showToast, localize]);
  // Point the TTS client at the selected voice (or the first available one),
  // requesting 24 kHz / 48 kbit mono MP3 output.
  const initializeTTS = useCallback(() => {
    if (!ttsRef.current) {
      ttsRef.current = new MsEdgeTTS({
        enableLogger: true,
      });
    }
    const availableVoice: VoiceOption | undefined = voices.find((v) => v.value === voiceName);
    if (availableVoice) {
      // Give up after repeated failures to avoid an endless error-toast loop.
      if (initAttempts.current > 3) {
        return;
      }
      ttsRef.current
        .setMetadata(availableVoice.value, OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3, {})
        .catch((error) => {
          initAttempts.current += 1;
          console.error('Error initializing TTS:', error);
          showToast({
            message: localize('com_nav_tts_init_error', { 0: (error as Error).message }),
            status: 'error',
          });
        });
    } else if (voices.length > 0) {
      // Selected voice not in the list — fall back to the first voice.
      // NOTE(review): unlike the branch above, this path does not consult the
      // initAttempts cap before retrying.
      ttsRef.current
        .setMetadata(voices[0].value, OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3, {})
        .catch((error) => {
          initAttempts.current += 1;
          console.error('Error initializing TTS:', error);
          showToast({
            message: localize('com_nav_tts_init_error', { 0: (error as Error).message }),
            status: 'error',
          });
        });
    }
  }, [voiceName, showToast, localize, voices]);
  // Append the next queued chunk when the SourceBuffer is idle. Invoked both
  // when chunks arrive and from the buffer's 'updateend' event, so the queue
  // drains one chunk at a time.
  const appendNextBuffer = useCallback(() => {
    if (
      sourceBufferRef.current &&
      !sourceBufferRef.current.updating &&
      pendingBuffers.current.length > 0
    ) {
      const nextBuffer = pendingBuffers.current.shift();
      if (nextBuffer) {
        try {
          sourceBufferRef.current.appendBuffer(nextBuffer);
        } catch (error) {
          console.error('Error appending buffer:', error);
          showToast({
            message: localize('com_nav_buffer_append_error'),
            status: 'error',
          });
          // Put the chunk back at the head so it can be retried later.
          pendingBuffers.current.unshift(nextBuffer);
        }
      }
    }
  }, [showToast, localize]);
  // Attach the single 'audio/mpeg' SourceBuffer (once) and chain queue
  // draining off its 'updateend' event.
  const onSourceOpen = useCallback(() => {
    if (!sourceBufferRef.current && mediaSourceRef.current) {
      try {
        sourceBufferRef.current = mediaSourceRef.current.addSourceBuffer('audio/mpeg');
        sourceBufferRef.current.addEventListener('updateend', appendNextBuffer);
      } catch (error) {
        console.error('Error adding source buffer:', error);
        showToast({
          message: localize('com_nav_source_buffer_error'),
          status: 'error',
        });
      }
    }
  }, [showToast, localize, appendNextBuffer]);
  // Create the MediaSource and a detached <audio> element once, wire the
  // element to the source via an object URL, then set up the SourceBuffer as
  // soon as the source is (or becomes) open.
  const initializeMediaSource = useCallback(() => {
    if (!mediaSourceRef.current) {
      mediaSourceRef.current = new MediaSource();
      audioElementRef.current = new Audio();
      audioElementRef.current.src = URL.createObjectURL(mediaSourceRef.current);
    }
    const mediaSource = mediaSourceRef.current;
    if (mediaSource.readyState === 'open') {
      onSourceOpen();
    } else {
      mediaSource.addEventListener('sourceopen', onSourceOpen);
    }
  }, [onSourceOpen]);
  // Start synthesis for `text` and begin playback. Audio chunks arrive on the
  // stream's 'data' events and are queued/appended; 'end' closes the stream.
  const generateSpeechEdge = useCallback(
    (text: string) => {
      const generate = async () => {
        try {
          if (!ttsRef.current || !audioElementRef.current) {
            throw new Error('TTS or Audio element not initialized');
          }
          setIsSpeaking(true);
          // Drop any leftovers from a previous utterance.
          pendingBuffers.current = [];
          const result = await ttsRef.current.toStream(text);
          const readable = result.audioStream;
          readable.on('data', (chunk: Buffer) => {
            pendingBuffers.current.push(new Uint8Array(chunk));
            appendNextBuffer();
          });
          readable.on('end', () => {
            if (mediaSourceRef.current && mediaSourceRef.current.readyState === 'open') {
              mediaSourceRef.current.endOfStream();
            }
          });
          audioElementRef.current.onended = () => {
            setIsSpeaking(false);
          };
          await audioElementRef.current.play();
        } catch (error) {
          console.error('Error generating speech:', error);
          showToast({
            message: localize('com_nav_audio_play_error', { 0: (error as Error).message }),
            status: 'error',
          });
          setIsSpeaking(false);
        }
      };
      // Fire-and-forget; failures are handled (toast + state reset) inside.
      generate();
    },
    [setIsSpeaking, appendNextBuffer, showToast, localize],
  );
  // Stop playback: pause and rewind the element, close the stream if open, and
  // discard all queued chunks.
  const cancelSpeechEdge = useCallback(() => {
    try {
      if (audioElementRef.current) {
        audioElementRef.current.pause();
        audioElementRef.current.currentTime = 0;
      }
      if (mediaSourceRef.current && mediaSourceRef.current.readyState === 'open') {
        mediaSourceRef.current.endOfStream();
      }
      pendingBuffers.current = [];
      setIsSpeaking(false);
    } catch (error) {
      console.error('Error cancelling speech:', error);
      showToast({
        message: localize('com_nav_speech_cancel_error'),
        status: 'error',
      });
    }
  }, [setIsSpeaking, showToast, localize]);
  // Load the voice list (skipped entirely on unsupported browsers).
  useEffect(() => {
    if (!isBrowserSupported) {
      return;
    }
    fetchVoices();
  }, [fetchVoices, isBrowserSupported]);
  // Re-point the client at the newly selected voice whenever it changes.
  useEffect(() => {
    if (!isBrowserSupported) {
      return;
    }
    initializeTTS();
  }, [voiceName, initializeTTS, isBrowserSupported]);
  // Set up the MediaSource pipeline; revoke the object URL on teardown.
  // NOTE(review): the 'sourceopen' listener added in initializeMediaSource is
  // not removed by this cleanup.
  useEffect(() => {
    if (!isBrowserSupported) {
      return;
    }
    initializeMediaSource();
    return () => {
      if (mediaSourceRef.current) {
        URL.revokeObjectURL(audioElementRef.current?.src ?? '');
      }
    };
  }, [initializeMediaSource, isBrowserSupported]);
  // Unsupported browser: return inert functions. Every hook above has already
  // run, so this conditional return keeps the hook-call order stable.
  if (!isBrowserSupported) {
    return {
      generateSpeechEdge: () => ({}),
      cancelSpeechEdge: () => ({}),
      voices: [],
    };
  }
  return { generateSpeechEdge, cancelSpeechEdge, voices };
}
export default useTextToSpeechEdge;