👋 feat: remove Edge TTS (#6885)

* feat: remove Edge TTS

* remove the remaining edge code

* chore: cleanup

* chore: cleanup package-lock
This commit is contained in:
Marco Beretta 2025-04-15 04:39:01 +02:00 committed by GitHub
parent c49f883e1a
commit 5d56f48879
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
13 changed files with 63 additions and 547 deletions

View file

@@ -3,4 +3,3 @@ export { default as useCustomAudioRef } from './useCustomAudioRef';
export { default as usePauseGlobalAudio } from './usePauseGlobalAudio';
export { default as useTTSExternal } from './useTTSExternal';
export { default as useTTSBrowser } from './useTTSBrowser';
export { default as useTTSEdge } from './useTTSEdge';

View file

@@ -1,100 +0,0 @@
// client/src/hooks/Audio/useTTSEdge.ts
import { useRef, useEffect, useState } from 'react';
import { useRecoilState, useRecoilValue } from 'recoil';
import { parseTextParts } from 'librechat-data-provider';
import type { TMessageContentParts } from 'librechat-data-provider';
import usePauseGlobalAudio from '~/hooks/Audio/usePauseGlobalAudio';
import useTextToSpeechEdge from '~/hooks/Input/useTextToSpeechEdge';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import { logger } from '~/utils';
import store from '~/store';
/** Props accepted by the Edge TTS message hook; all fields are optional. */
type TUseTextToSpeech = {
  // Not read by this hook's implementation (useTTSEdge below destructures only
  // content/isLast/index) — presumably kept for parity with sibling TTS hooks.
  messageId?: string;
  // Message content to speak: a plain string or structured content parts
  // (flattened via parseTextParts before synthesis).
  content?: TMessageContentParts[] | string;
  // Whether this is the last message; ties local speaking state to the
  // global audio playing state.
  isLast?: boolean;
  // Message index used to key the global-audio Recoil family (default 0).
  index?: number;
};
/**
 * Message-level Edge TTS hook: exposes press-and-hold and click-to-toggle
 * playback controls for a single message's text content.
 *
 * Returns mouse handlers, a toggle, the derived speaking flag, the audio
 * element ref, and the available Edge voices.
 */
const useTTSEdge = (props?: TUseTextToSpeech) => {
  const { content, isLast = false, index = 0 } = props ?? {};
  const pressedRef = useRef(false);
  const holdTimerRef = useRef<number | undefined>(undefined);
  const [localSpeaking, setIsSpeaking] = useState(false);
  const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
  const { pauseGlobalAudio } = usePauseGlobalAudio(index);
  const [voice, setVoice] = useRecoilState(store.voice);
  const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));
  // The last message also counts as speaking while global audio for this index plays.
  const isSpeaking = localSpeaking || (isLast && globalIsPlaying);

  const {
    generateSpeechEdge: generateSpeech,
    cancelSpeechEdge: cancelSpeech,
    voices,
  } = useTextToSpeechEdge({ setIsSpeaking });

  // Keep the persisted voice selection in sync with the available Edge voices.
  useEffect(() => {
    const [firstVoice] = voices;
    if (!voices.length || typeof firstVoice !== 'object') {
      return;
    }
    const selected = voices.find((entry) =>
      typeof entry === 'object' ? entry.value === voice : entry === voice,
    );
    if (selected != null) {
      const resolved = typeof selected === 'object' ? selected.value : selected;
      logger.log('useTextToSpeech.ts - Effect:', { voices, voice: resolved });
      setVoice(resolved);
      return;
    }
    // Previously selected voice is unavailable — fall back to the first voice.
    logger.log('useTextToSpeech.ts - Effect:', { voices, voice: firstVoice.value });
    setVoice(firstVoice.value);
  }, [setVoice, voice, voices]);

  // Flatten the message content into plain text for synthesis.
  const textToSpeak = () => {
    const raw = content ?? '';
    return typeof raw === 'string' ? raw : parseTextParts(raw);
  };

  // Long-press (1s) starts playback; releasing earlier cancels the pending timer.
  const handleMouseDown = () => {
    pressedRef.current = true;
    holdTimerRef.current = window.setTimeout(() => {
      if (pressedRef.current) {
        generateSpeech(textToSpeak());
      }
    }, 1000);
  };

  const handleMouseUp = () => {
    pressedRef.current = false;
    if (holdTimerRef.current != null) {
      window.clearTimeout(holdTimerRef.current);
    }
  };

  // Click toggle: stop all audio when speaking, otherwise speak this message.
  const toggleSpeech = () => {
    if (isSpeaking === true) {
      cancelSpeech();
      pauseGlobalAudio();
      return;
    }
    generateSpeech(textToSpeak());
  };

  return {
    handleMouseDown,
    handleMouseUp,
    toggleSpeech,
    isSpeaking,
    isLoading: false,
    audioRef,
    voices,
  };
};
export default useTTSEdge;

View file

@@ -6,7 +6,6 @@ import type { Option } from '~/common';
import useTextToSpeechExternal from '~/hooks/Input/useTextToSpeechExternal';
import useTextToSpeechBrowser from '~/hooks/Input/useTextToSpeechBrowser';
import useGetAudioSettings from '~/hooks/Input/useGetAudioSettings';
import useTextToSpeechEdge from '~/hooks/Input/useTextToSpeechEdge';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import { usePauseGlobalAudio } from '../Audio';
import { logger } from '~/utils';
@@ -40,12 +39,6 @@ const useTextToSpeech = (props?: TUseTextToSpeech) => {
voices: voicesLocal,
} = useTextToSpeechBrowser({ setIsSpeaking });
const {
generateSpeechEdge,
cancelSpeechEdge,
voices: voicesEdge,
} = useTextToSpeechEdge({ setIsSpeaking });
const {
generateSpeechExternal,
cancelSpeech: cancelSpeechExternal,
@@ -61,26 +54,23 @@ const useTextToSpeech = (props?: TUseTextToSpeech) => {
const generateSpeech = useMemo(() => {
const map = {
edge: generateSpeechEdge,
browser: generateSpeechLocal,
external: generateSpeechExternal,
};
return map[textToSpeechEndpoint];
}, [generateSpeechEdge, generateSpeechExternal, generateSpeechLocal, textToSpeechEndpoint]);
}, [generateSpeechExternal, generateSpeechLocal, textToSpeechEndpoint]);
const cancelSpeech = useMemo(() => {
const map = {
edge: cancelSpeechEdge,
browser: cancelSpeechLocal,
external: cancelSpeechExternal,
};
return map[textToSpeechEndpoint];
}, [cancelSpeechEdge, cancelSpeechExternal, cancelSpeechLocal, textToSpeechEndpoint]);
}, [cancelSpeechExternal, cancelSpeechLocal, textToSpeechEndpoint]);
const isLoading = useMemo(() => {
const map = {
edge: false,
browser: false,
external: isLoadingExternal,
};
@@ -89,13 +79,12 @@ const useTextToSpeech = (props?: TUseTextToSpeech) => {
const voices: Option[] | string[] = useMemo(() => {
const voiceMap = {
edge: voicesEdge,
browser: voicesLocal,
external: voicesExternal,
};
return voiceMap[textToSpeechEndpoint];
}, [textToSpeechEndpoint, voicesEdge, voicesExternal, voicesLocal]);
}, [textToSpeechEndpoint, voicesExternal, voicesLocal]);
useEffect(() => {
const firstVoice = voices[0];

View file

@@ -1,249 +0,0 @@
import { useRecoilValue } from 'recoil';
import { MsEdgeTTS, OUTPUT_FORMAT } from 'msedge-tts';
import { useState, useCallback, useRef, useEffect, useMemo } from 'react';
import type { VoiceOption } from '~/common';
import { useToastContext } from '~/Providers/ToastContext';
import useLocalize from '~/hooks/useLocalize';
import store from '~/store';
/** Return shape of useTextToSpeechEdge. */
interface UseTextToSpeechEdgeReturn {
  // Starts streaming synthesized speech for the given text.
  generateSpeechEdge: (text: string) => void;
  // Stops playback and discards any queued audio.
  cancelSpeechEdge: () => void;
  // Voices reported by the Edge TTS service (empty until fetched, or on
  // unsupported browsers).
  voices: VoiceOption[];
}
/**
 * Edge TTS hook: synthesizes speech with the `msedge-tts` client and streams
 * the resulting MP3 chunks into a detached HTMLAudioElement through the
 * MediaSource API.
 *
 * When the browser cannot stream 'audio/mpeg' via MediaSource, the hook
 * degrades to no-op speech functions and an empty voice list.
 */
function useTextToSpeechEdge({
  setIsSpeaking,
}: {
  setIsSpeaking: React.Dispatch<React.SetStateAction<boolean>>;
}): UseTextToSpeechEdgeReturn {
  const localize = useLocalize();
  const [voices, setVoices] = useState<VoiceOption[]>([]);
  // Selected voice (ShortName value) persisted in Recoil state.
  const voiceName = useRecoilValue(store.voice);
  const ttsRef = useRef<MsEdgeTTS | null>(null);
  const audioElementRef = useRef<HTMLAudioElement | null>(null);
  const mediaSourceRef = useRef<MediaSource | null>(null);
  const sourceBufferRef = useRef<SourceBuffer | null>(null);
  // Chunks waiting to enter the SourceBuffer; appendBuffer only accepts one
  // append at a time, so the queue is drained from 'updateend' (appendNextBuffer).
  const pendingBuffers = useRef<Uint8Array[]>([]);
  const { showToast } = useToastContext();
  // Counts failed setMetadata calls so initialization stops retrying (capped in
  // the selected-voice branch of initializeTTS).
  const initAttempts = useRef(0);
  const isBrowserSupported = useMemo(
    () => typeof MediaSource !== 'undefined' && MediaSource.isTypeSupported('audio/mpeg'),
    [],
  );
  // Lazily construct the TTS client and load the voice list once.
  // NOTE(review): this constructor omits the `enableLogger` option passed in
  // initializeTTS — whichever callback runs first decides the configuration.
  const fetchVoices = useCallback(() => {
    if (!ttsRef.current) {
      ttsRef.current = new MsEdgeTTS();
    }
    ttsRef.current
      .getVoices()
      .then((voicesList) => {
        // Map service voices to options; `value` is the ShortName consumed by
        // setMetadata, `label` is the human-readable name.
        setVoices(
          voicesList.map((v) => ({
            value: v.ShortName,
            label: v.FriendlyName,
          })),
        );
      })
      .catch((error) => {
        console.error('Error fetching voices:', error);
        showToast({
          message: localize('com_nav_voices_fetch_error'),
          status: 'error',
        });
      });
  }, [showToast, localize]);
  // Point the TTS client at the selected voice (or the first available one),
  // requesting 24 kHz / 48 kbit mono MP3 output.
  const initializeTTS = useCallback(() => {
    if (!ttsRef.current) {
      ttsRef.current = new MsEdgeTTS({
        enableLogger: true,
      });
    }
    const availableVoice: VoiceOption | undefined = voices.find((v) => v.value === voiceName);
    if (availableVoice) {
      // Give up after repeated failures to avoid an endless error-toast loop.
      if (initAttempts.current > 3) {
        return;
      }
      ttsRef.current
        .setMetadata(availableVoice.value, OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3, {})
        .catch((error) => {
          initAttempts.current += 1;
          console.error('Error initializing TTS:', error);
          showToast({
            message: localize('com_nav_tts_init_error', { 0: (error as Error).message }),
            status: 'error',
          });
        });
    } else if (voices.length > 0) {
      // Selected voice not in the list — fall back to the first voice.
      // NOTE(review): unlike the branch above, this path does not consult the
      // initAttempts cap before retrying.
      ttsRef.current
        .setMetadata(voices[0].value, OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3, {})
        .catch((error) => {
          initAttempts.current += 1;
          console.error('Error initializing TTS:', error);
          showToast({
            message: localize('com_nav_tts_init_error', { 0: (error as Error).message }),
            status: 'error',
          });
        });
    }
  }, [voiceName, showToast, localize, voices]);
  // Append the next queued chunk when the SourceBuffer is idle. Invoked both
  // when chunks arrive and from the buffer's 'updateend' event, so the queue
  // drains one chunk at a time.
  const appendNextBuffer = useCallback(() => {
    if (
      sourceBufferRef.current &&
      !sourceBufferRef.current.updating &&
      pendingBuffers.current.length > 0
    ) {
      const nextBuffer = pendingBuffers.current.shift();
      if (nextBuffer) {
        try {
          sourceBufferRef.current.appendBuffer(nextBuffer);
        } catch (error) {
          console.error('Error appending buffer:', error);
          showToast({
            message: localize('com_nav_buffer_append_error'),
            status: 'error',
          });
          // Put the chunk back at the head so it can be retried later.
          pendingBuffers.current.unshift(nextBuffer);
        }
      }
    }
  }, [showToast, localize]);
  // Attach the single 'audio/mpeg' SourceBuffer (once) and chain queue
  // draining off its 'updateend' event.
  const onSourceOpen = useCallback(() => {
    if (!sourceBufferRef.current && mediaSourceRef.current) {
      try {
        sourceBufferRef.current = mediaSourceRef.current.addSourceBuffer('audio/mpeg');
        sourceBufferRef.current.addEventListener('updateend', appendNextBuffer);
      } catch (error) {
        console.error('Error adding source buffer:', error);
        showToast({
          message: localize('com_nav_source_buffer_error'),
          status: 'error',
        });
      }
    }
  }, [showToast, localize, appendNextBuffer]);
  // Create the MediaSource and a detached <audio> element once, wire the
  // element to the source via an object URL, then set up the SourceBuffer as
  // soon as the source is (or becomes) open.
  const initializeMediaSource = useCallback(() => {
    if (!mediaSourceRef.current) {
      mediaSourceRef.current = new MediaSource();
      audioElementRef.current = new Audio();
      audioElementRef.current.src = URL.createObjectURL(mediaSourceRef.current);
    }
    const mediaSource = mediaSourceRef.current;
    if (mediaSource.readyState === 'open') {
      onSourceOpen();
    } else {
      mediaSource.addEventListener('sourceopen', onSourceOpen);
    }
  }, [onSourceOpen]);
  // Start synthesis for `text` and begin playback. Audio chunks arrive on the
  // stream's 'data' events and are queued/appended; 'end' closes the stream.
  const generateSpeechEdge = useCallback(
    (text: string) => {
      const generate = async () => {
        try {
          if (!ttsRef.current || !audioElementRef.current) {
            throw new Error('TTS or Audio element not initialized');
          }
          setIsSpeaking(true);
          // Drop any leftovers from a previous utterance.
          pendingBuffers.current = [];
          const result = await ttsRef.current.toStream(text);
          const readable = result.audioStream;
          readable.on('data', (chunk: Buffer) => {
            pendingBuffers.current.push(new Uint8Array(chunk));
            appendNextBuffer();
          });
          readable.on('end', () => {
            if (mediaSourceRef.current && mediaSourceRef.current.readyState === 'open') {
              mediaSourceRef.current.endOfStream();
            }
          });
          audioElementRef.current.onended = () => {
            setIsSpeaking(false);
          };
          await audioElementRef.current.play();
        } catch (error) {
          console.error('Error generating speech:', error);
          showToast({
            message: localize('com_nav_audio_play_error', { 0: (error as Error).message }),
            status: 'error',
          });
          setIsSpeaking(false);
        }
      };
      // Fire-and-forget; failures are handled (toast + state reset) inside.
      generate();
    },
    [setIsSpeaking, appendNextBuffer, showToast, localize],
  );
  // Stop playback: pause and rewind the element, close the stream if open, and
  // discard all queued chunks.
  const cancelSpeechEdge = useCallback(() => {
    try {
      if (audioElementRef.current) {
        audioElementRef.current.pause();
        audioElementRef.current.currentTime = 0;
      }
      if (mediaSourceRef.current && mediaSourceRef.current.readyState === 'open') {
        mediaSourceRef.current.endOfStream();
      }
      pendingBuffers.current = [];
      setIsSpeaking(false);
    } catch (error) {
      console.error('Error cancelling speech:', error);
      showToast({
        message: localize('com_nav_speech_cancel_error'),
        status: 'error',
      });
    }
  }, [setIsSpeaking, showToast, localize]);
  // Load the voice list (skipped entirely on unsupported browsers).
  useEffect(() => {
    if (!isBrowserSupported) {
      return;
    }
    fetchVoices();
  }, [fetchVoices, isBrowserSupported]);
  // Re-point the client at the newly selected voice whenever it changes.
  useEffect(() => {
    if (!isBrowserSupported) {
      return;
    }
    initializeTTS();
  }, [voiceName, initializeTTS, isBrowserSupported]);
  // Set up the MediaSource pipeline; revoke the object URL on teardown.
  // NOTE(review): the 'sourceopen' listener added in initializeMediaSource is
  // not removed by this cleanup.
  useEffect(() => {
    if (!isBrowserSupported) {
      return;
    }
    initializeMediaSource();
    return () => {
      if (mediaSourceRef.current) {
        URL.revokeObjectURL(audioElementRef.current?.src ?? '');
      }
    };
  }, [initializeMediaSource, isBrowserSupported]);
  // Unsupported browser: return inert functions. Every hook above has already
  // run, so this conditional return keeps the hook-call order stable.
  if (!isBrowserSupported) {
    return {
      generateSpeechEdge: () => ({}),
      cancelSpeechEdge: () => ({}),
      voices: [],
    };
  }
  return { generateSpeechEdge, cancelSpeechEdge, voices };
}
export default useTextToSpeechEdge;