mirror of
https://github.com/danny-avila/LibreChat.git
synced 2025-12-19 09:50:15 +01:00
👋 feat: remove Edge TTS (#6885)
* feat: remove Edge TTS
* remove the remaining edge code
* chore: cleanup
* chore: cleanup package-lock
This commit is contained in:
parent
c49f883e1a
commit
5d56f48879
13 changed files with 63 additions and 547 deletions
|
|
@ -3,4 +3,3 @@ export { default as useCustomAudioRef } from './useCustomAudioRef';
|
|||
export { default as usePauseGlobalAudio } from './usePauseGlobalAudio';
|
||||
export { default as useTTSExternal } from './useTTSExternal';
|
||||
export { default as useTTSBrowser } from './useTTSBrowser';
|
||||
export { default as useTTSEdge } from './useTTSEdge';
|
||||
|
|
|
|||
|
|
@ -1,100 +0,0 @@
|
|||
// client/src/hooks/Audio/useTTSEdge.ts
|
||||
import { useRef, useEffect, useState } from 'react';
|
||||
import { useRecoilState, useRecoilValue } from 'recoil';
|
||||
import { parseTextParts } from 'librechat-data-provider';
|
||||
import type { TMessageContentParts } from 'librechat-data-provider';
|
||||
import usePauseGlobalAudio from '~/hooks/Audio/usePauseGlobalAudio';
|
||||
import useTextToSpeechEdge from '~/hooks/Input/useTextToSpeechEdge';
|
||||
import useAudioRef from '~/hooks/Audio/useAudioRef';
|
||||
import { logger } from '~/utils';
|
||||
import store from '~/store';
|
||||
|
||||
type TUseTextToSpeech = {
|
||||
messageId?: string;
|
||||
content?: TMessageContentParts[] | string;
|
||||
isLast?: boolean;
|
||||
index?: number;
|
||||
};
|
||||
|
||||
const useTTSEdge = (props?: TUseTextToSpeech) => {
|
||||
const { content, isLast = false, index = 0 } = props ?? {};
|
||||
|
||||
const isMouseDownRef = useRef(false);
|
||||
const timerRef = useRef<number | undefined>(undefined);
|
||||
const [isSpeakingState, setIsSpeaking] = useState(false);
|
||||
const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
|
||||
|
||||
const { pauseGlobalAudio } = usePauseGlobalAudio(index);
|
||||
const [voice, setVoice] = useRecoilState(store.voice);
|
||||
const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));
|
||||
|
||||
const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);
|
||||
|
||||
const {
|
||||
generateSpeechEdge: generateSpeech,
|
||||
cancelSpeechEdge: cancelSpeech,
|
||||
voices,
|
||||
} = useTextToSpeechEdge({ setIsSpeaking });
|
||||
|
||||
useEffect(() => {
|
||||
const firstVoice = voices[0];
|
||||
if (voices.length && typeof firstVoice === 'object') {
|
||||
const lastSelectedVoice = voices.find((v) =>
|
||||
typeof v === 'object' ? v.value === voice : v === voice,
|
||||
);
|
||||
if (lastSelectedVoice != null) {
|
||||
const currentVoice =
|
||||
typeof lastSelectedVoice === 'object' ? lastSelectedVoice.value : lastSelectedVoice;
|
||||
logger.log('useTextToSpeech.ts - Effect:', { voices, voice: currentVoice });
|
||||
setVoice(currentVoice);
|
||||
return;
|
||||
}
|
||||
|
||||
logger.log('useTextToSpeech.ts - Effect:', { voices, voice: firstVoice.value });
|
||||
setVoice(firstVoice.value);
|
||||
}
|
||||
}, [setVoice, voice, voices]);
|
||||
|
||||
const handleMouseDown = () => {
|
||||
isMouseDownRef.current = true;
|
||||
timerRef.current = window.setTimeout(() => {
|
||||
if (isMouseDownRef.current) {
|
||||
const messageContent = content ?? '';
|
||||
const parsedMessage =
|
||||
typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
|
||||
generateSpeech(parsedMessage);
|
||||
}
|
||||
}, 1000);
|
||||
};
|
||||
|
||||
const handleMouseUp = () => {
|
||||
isMouseDownRef.current = false;
|
||||
if (timerRef.current != null) {
|
||||
window.clearTimeout(timerRef.current);
|
||||
}
|
||||
};
|
||||
|
||||
const toggleSpeech = () => {
|
||||
if (isSpeaking === true) {
|
||||
cancelSpeech();
|
||||
pauseGlobalAudio();
|
||||
} else {
|
||||
const messageContent = content ?? '';
|
||||
const parsedMessage =
|
||||
typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
|
||||
generateSpeech(parsedMessage);
|
||||
}
|
||||
};
|
||||
|
||||
return {
|
||||
handleMouseDown,
|
||||
handleMouseUp,
|
||||
toggleSpeech,
|
||||
isSpeaking,
|
||||
isLoading: false,
|
||||
audioRef,
|
||||
voices,
|
||||
};
|
||||
};
|
||||
|
||||
export default useTTSEdge;
|
||||
|
|
@ -6,7 +6,6 @@ import type { Option } from '~/common';
|
|||
import useTextToSpeechExternal from '~/hooks/Input/useTextToSpeechExternal';
|
||||
import useTextToSpeechBrowser from '~/hooks/Input/useTextToSpeechBrowser';
|
||||
import useGetAudioSettings from '~/hooks/Input/useGetAudioSettings';
|
||||
import useTextToSpeechEdge from '~/hooks/Input/useTextToSpeechEdge';
|
||||
import useAudioRef from '~/hooks/Audio/useAudioRef';
|
||||
import { usePauseGlobalAudio } from '../Audio';
|
||||
import { logger } from '~/utils';
|
||||
|
|
@ -40,12 +39,6 @@ const useTextToSpeech = (props?: TUseTextToSpeech) => {
|
|||
voices: voicesLocal,
|
||||
} = useTextToSpeechBrowser({ setIsSpeaking });
|
||||
|
||||
const {
|
||||
generateSpeechEdge,
|
||||
cancelSpeechEdge,
|
||||
voices: voicesEdge,
|
||||
} = useTextToSpeechEdge({ setIsSpeaking });
|
||||
|
||||
const {
|
||||
generateSpeechExternal,
|
||||
cancelSpeech: cancelSpeechExternal,
|
||||
|
|
@ -61,26 +54,23 @@ const useTextToSpeech = (props?: TUseTextToSpeech) => {
|
|||
|
||||
const generateSpeech = useMemo(() => {
|
||||
const map = {
|
||||
edge: generateSpeechEdge,
|
||||
browser: generateSpeechLocal,
|
||||
external: generateSpeechExternal,
|
||||
};
|
||||
|
||||
return map[textToSpeechEndpoint];
|
||||
}, [generateSpeechEdge, generateSpeechExternal, generateSpeechLocal, textToSpeechEndpoint]);
|
||||
}, [generateSpeechExternal, generateSpeechLocal, textToSpeechEndpoint]);
|
||||
|
||||
const cancelSpeech = useMemo(() => {
|
||||
const map = {
|
||||
edge: cancelSpeechEdge,
|
||||
browser: cancelSpeechLocal,
|
||||
external: cancelSpeechExternal,
|
||||
};
|
||||
return map[textToSpeechEndpoint];
|
||||
}, [cancelSpeechEdge, cancelSpeechExternal, cancelSpeechLocal, textToSpeechEndpoint]);
|
||||
}, [cancelSpeechExternal, cancelSpeechLocal, textToSpeechEndpoint]);
|
||||
|
||||
const isLoading = useMemo(() => {
|
||||
const map = {
|
||||
edge: false,
|
||||
browser: false,
|
||||
external: isLoadingExternal,
|
||||
};
|
||||
|
|
@ -89,13 +79,12 @@ const useTextToSpeech = (props?: TUseTextToSpeech) => {
|
|||
|
||||
const voices: Option[] | string[] = useMemo(() => {
|
||||
const voiceMap = {
|
||||
edge: voicesEdge,
|
||||
browser: voicesLocal,
|
||||
external: voicesExternal,
|
||||
};
|
||||
|
||||
return voiceMap[textToSpeechEndpoint];
|
||||
}, [textToSpeechEndpoint, voicesEdge, voicesExternal, voicesLocal]);
|
||||
}, [textToSpeechEndpoint, voicesExternal, voicesLocal]);
|
||||
|
||||
useEffect(() => {
|
||||
const firstVoice = voices[0];
|
||||
|
|
|
|||
|
|
@ -1,249 +0,0 @@
|
|||
import { useRecoilValue } from 'recoil';
|
||||
import { MsEdgeTTS, OUTPUT_FORMAT } from 'msedge-tts';
|
||||
import { useState, useCallback, useRef, useEffect, useMemo } from 'react';
|
||||
import type { VoiceOption } from '~/common';
|
||||
import { useToastContext } from '~/Providers/ToastContext';
|
||||
import useLocalize from '~/hooks/useLocalize';
|
||||
import store from '~/store';
|
||||
|
||||
interface UseTextToSpeechEdgeReturn {
|
||||
generateSpeechEdge: (text: string) => void;
|
||||
cancelSpeechEdge: () => void;
|
||||
voices: VoiceOption[];
|
||||
}
|
||||
|
||||
function useTextToSpeechEdge({
|
||||
setIsSpeaking,
|
||||
}: {
|
||||
setIsSpeaking: React.Dispatch<React.SetStateAction<boolean>>;
|
||||
}): UseTextToSpeechEdgeReturn {
|
||||
const localize = useLocalize();
|
||||
const [voices, setVoices] = useState<VoiceOption[]>([]);
|
||||
const voiceName = useRecoilValue(store.voice);
|
||||
const ttsRef = useRef<MsEdgeTTS | null>(null);
|
||||
const audioElementRef = useRef<HTMLAudioElement | null>(null);
|
||||
const mediaSourceRef = useRef<MediaSource | null>(null);
|
||||
const sourceBufferRef = useRef<SourceBuffer | null>(null);
|
||||
const pendingBuffers = useRef<Uint8Array[]>([]);
|
||||
const { showToast } = useToastContext();
|
||||
const initAttempts = useRef(0);
|
||||
|
||||
const isBrowserSupported = useMemo(
|
||||
() => typeof MediaSource !== 'undefined' && MediaSource.isTypeSupported('audio/mpeg'),
|
||||
[],
|
||||
);
|
||||
|
||||
const fetchVoices = useCallback(() => {
|
||||
if (!ttsRef.current) {
|
||||
ttsRef.current = new MsEdgeTTS();
|
||||
}
|
||||
ttsRef.current
|
||||
.getVoices()
|
||||
.then((voicesList) => {
|
||||
setVoices(
|
||||
voicesList.map((v) => ({
|
||||
value: v.ShortName,
|
||||
label: v.FriendlyName,
|
||||
})),
|
||||
);
|
||||
})
|
||||
.catch((error) => {
|
||||
console.error('Error fetching voices:', error);
|
||||
showToast({
|
||||
message: localize('com_nav_voices_fetch_error'),
|
||||
status: 'error',
|
||||
});
|
||||
});
|
||||
}, [showToast, localize]);
|
||||
|
||||
const initializeTTS = useCallback(() => {
|
||||
if (!ttsRef.current) {
|
||||
ttsRef.current = new MsEdgeTTS({
|
||||
enableLogger: true,
|
||||
});
|
||||
}
|
||||
const availableVoice: VoiceOption | undefined = voices.find((v) => v.value === voiceName);
|
||||
|
||||
if (availableVoice) {
|
||||
if (initAttempts.current > 3) {
|
||||
return;
|
||||
}
|
||||
ttsRef.current
|
||||
.setMetadata(availableVoice.value, OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3, {})
|
||||
.catch((error) => {
|
||||
initAttempts.current += 1;
|
||||
console.error('Error initializing TTS:', error);
|
||||
showToast({
|
||||
message: localize('com_nav_tts_init_error', { 0: (error as Error).message }),
|
||||
status: 'error',
|
||||
});
|
||||
});
|
||||
} else if (voices.length > 0) {
|
||||
ttsRef.current
|
||||
.setMetadata(voices[0].value, OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3, {})
|
||||
.catch((error) => {
|
||||
initAttempts.current += 1;
|
||||
console.error('Error initializing TTS:', error);
|
||||
showToast({
|
||||
message: localize('com_nav_tts_init_error', { 0: (error as Error).message }),
|
||||
status: 'error',
|
||||
});
|
||||
});
|
||||
}
|
||||
}, [voiceName, showToast, localize, voices]);
|
||||
|
||||
const appendNextBuffer = useCallback(() => {
|
||||
if (
|
||||
sourceBufferRef.current &&
|
||||
!sourceBufferRef.current.updating &&
|
||||
pendingBuffers.current.length > 0
|
||||
) {
|
||||
const nextBuffer = pendingBuffers.current.shift();
|
||||
if (nextBuffer) {
|
||||
try {
|
||||
sourceBufferRef.current.appendBuffer(nextBuffer);
|
||||
} catch (error) {
|
||||
console.error('Error appending buffer:', error);
|
||||
showToast({
|
||||
message: localize('com_nav_buffer_append_error'),
|
||||
status: 'error',
|
||||
});
|
||||
pendingBuffers.current.unshift(nextBuffer);
|
||||
}
|
||||
}
|
||||
}
|
||||
}, [showToast, localize]);
|
||||
|
||||
const onSourceOpen = useCallback(() => {
|
||||
if (!sourceBufferRef.current && mediaSourceRef.current) {
|
||||
try {
|
||||
sourceBufferRef.current = mediaSourceRef.current.addSourceBuffer('audio/mpeg');
|
||||
sourceBufferRef.current.addEventListener('updateend', appendNextBuffer);
|
||||
} catch (error) {
|
||||
console.error('Error adding source buffer:', error);
|
||||
showToast({
|
||||
message: localize('com_nav_source_buffer_error'),
|
||||
status: 'error',
|
||||
});
|
||||
}
|
||||
}
|
||||
}, [showToast, localize, appendNextBuffer]);
|
||||
|
||||
const initializeMediaSource = useCallback(() => {
|
||||
if (!mediaSourceRef.current) {
|
||||
mediaSourceRef.current = new MediaSource();
|
||||
audioElementRef.current = new Audio();
|
||||
audioElementRef.current.src = URL.createObjectURL(mediaSourceRef.current);
|
||||
}
|
||||
|
||||
const mediaSource = mediaSourceRef.current;
|
||||
if (mediaSource.readyState === 'open') {
|
||||
onSourceOpen();
|
||||
} else {
|
||||
mediaSource.addEventListener('sourceopen', onSourceOpen);
|
||||
}
|
||||
}, [onSourceOpen]);
|
||||
|
||||
const generateSpeechEdge = useCallback(
|
||||
(text: string) => {
|
||||
const generate = async () => {
|
||||
try {
|
||||
if (!ttsRef.current || !audioElementRef.current) {
|
||||
throw new Error('TTS or Audio element not initialized');
|
||||
}
|
||||
|
||||
setIsSpeaking(true);
|
||||
pendingBuffers.current = [];
|
||||
|
||||
const result = await ttsRef.current.toStream(text);
|
||||
const readable = result.audioStream;
|
||||
|
||||
readable.on('data', (chunk: Buffer) => {
|
||||
pendingBuffers.current.push(new Uint8Array(chunk));
|
||||
appendNextBuffer();
|
||||
});
|
||||
|
||||
readable.on('end', () => {
|
||||
if (mediaSourceRef.current && mediaSourceRef.current.readyState === 'open') {
|
||||
mediaSourceRef.current.endOfStream();
|
||||
}
|
||||
});
|
||||
|
||||
audioElementRef.current.onended = () => {
|
||||
setIsSpeaking(false);
|
||||
};
|
||||
|
||||
await audioElementRef.current.play();
|
||||
} catch (error) {
|
||||
console.error('Error generating speech:', error);
|
||||
showToast({
|
||||
message: localize('com_nav_audio_play_error', { 0: (error as Error).message }),
|
||||
status: 'error',
|
||||
});
|
||||
setIsSpeaking(false);
|
||||
}
|
||||
};
|
||||
|
||||
generate();
|
||||
},
|
||||
[setIsSpeaking, appendNextBuffer, showToast, localize],
|
||||
);
|
||||
|
||||
const cancelSpeechEdge = useCallback(() => {
|
||||
try {
|
||||
if (audioElementRef.current) {
|
||||
audioElementRef.current.pause();
|
||||
audioElementRef.current.currentTime = 0;
|
||||
}
|
||||
if (mediaSourceRef.current && mediaSourceRef.current.readyState === 'open') {
|
||||
mediaSourceRef.current.endOfStream();
|
||||
}
|
||||
pendingBuffers.current = [];
|
||||
setIsSpeaking(false);
|
||||
} catch (error) {
|
||||
console.error('Error cancelling speech:', error);
|
||||
showToast({
|
||||
message: localize('com_nav_speech_cancel_error'),
|
||||
status: 'error',
|
||||
});
|
||||
}
|
||||
}, [setIsSpeaking, showToast, localize]);
|
||||
|
||||
useEffect(() => {
|
||||
if (!isBrowserSupported) {
|
||||
return;
|
||||
}
|
||||
fetchVoices();
|
||||
}, [fetchVoices, isBrowserSupported]);
|
||||
|
||||
useEffect(() => {
|
||||
if (!isBrowserSupported) {
|
||||
return;
|
||||
}
|
||||
initializeTTS();
|
||||
}, [voiceName, initializeTTS, isBrowserSupported]);
|
||||
|
||||
useEffect(() => {
|
||||
if (!isBrowserSupported) {
|
||||
return;
|
||||
}
|
||||
initializeMediaSource();
|
||||
return () => {
|
||||
if (mediaSourceRef.current) {
|
||||
URL.revokeObjectURL(audioElementRef.current?.src ?? '');
|
||||
}
|
||||
};
|
||||
}, [initializeMediaSource, isBrowserSupported]);
|
||||
|
||||
if (!isBrowserSupported) {
|
||||
return {
|
||||
generateSpeechEdge: () => ({}),
|
||||
cancelSpeechEdge: () => ({}),
|
||||
voices: [],
|
||||
};
|
||||
}
|
||||
|
||||
return { generateSpeechEdge, cancelSpeechEdge, voices };
|
||||
}
|
||||
|
||||
export default useTextToSpeechEdge;
|
||||
Loading…
Add table
Add a link
Reference in a new issue