🗣️ feat: Edge TTS engine (#3358)

* feat: MS Edge TTS

* feat: Edge TTS; fix: STT hook
This commit is contained in:
Marco Beretta 2024-08-07 20:15:41 +02:00 committed by GitHub
parent 01a88991ab
commit b390ba781f
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 379 additions and 129 deletions

View file

@ -1,19 +1,25 @@
import { useRecoilState } from 'recoil';
import store from '~/store';
export enum AudioEndpoints {
export enum STTEndpoints {
browser = 'browser',
external = 'external',
}
export enum TTSEndpoints {
browser = 'browser',
edge = 'edge',
external = 'external',
}
const useGetAudioSettings = () => {
const [engineSTT] = useRecoilState<string>(store.engineSTT);
const [engineTTS] = useRecoilState<string>(store.engineTTS);
const externalSpeechToText = engineSTT === AudioEndpoints.external;
const externalTextToSpeech = engineTTS === AudioEndpoints.external;
const speechToTextEndpoint: STTEndpoints = engineSTT as STTEndpoints;
const textToSpeechEndpoint: TTSEndpoints = engineTTS as TTSEndpoints;
return { externalSpeechToText, externalTextToSpeech };
return { speechToTextEndpoint, textToSpeechEndpoint };
};
export default useGetAudioSettings;

View file

@ -4,8 +4,9 @@ import useSpeechToTextExternal from './useSpeechToTextExternal';
import useGetAudioSettings from './useGetAudioSettings';
const useSpeechToText = (handleTranscriptionComplete: (text: string) => void) => {
const { externalSpeechToText } = useGetAudioSettings();
const { speechToTextEndpoint } = useGetAudioSettings();
const [animatedText, setAnimatedText] = useState('');
const externalSpeechToText = speechToTextEndpoint === 'external';
const {
isListening: speechIsListeningBrowser,

View file

@ -9,7 +9,8 @@ const useSpeechToTextBrowser = () => {
const { showToast } = useToastContext();
const [languageSTT] = useRecoilState<string>(store.languageSTT);
const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
const { externalSpeechToText } = useGetAudioSettings();
const { speechToTextEndpoint } = useGetAudioSettings();
const isBrowserSTTEnabled = speechToTextEndpoint === 'browser';
const [isListening, setIsListening] = useState(false);
const {
@ -51,7 +52,7 @@ const useSpeechToTextBrowser = () => {
useEffect(() => {
const handleKeyDown = (e: KeyboardEvent) => {
if (e.shiftKey && e.altKey && e.code === 'KeyL' && !externalSpeechToText) {
if (e.shiftKey && e.altKey && e.code === 'KeyL' && !isBrowserSTTEnabled) {
toggleListening();
}
};

View file

@ -7,7 +7,8 @@ import useGetAudioSettings from './useGetAudioSettings';
const useSpeechToTextExternal = (onTranscriptionComplete: (text: string) => void) => {
const { showToast } = useToastContext();
const { externalSpeechToText } = useGetAudioSettings();
const { speechToTextEndpoint } = useGetAudioSettings();
const isExternalSTTEnabled = speechToTextEndpoint === 'external';
const [speechToText] = useRecoilState<boolean>(store.speechToText);
const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
const [autoSendText] = useRecoilState(store.autoSendText);
@ -194,7 +195,7 @@ const useSpeechToTextExternal = (onTranscriptionComplete: (text: string) => void
};
const handleKeyDown = async (e: KeyboardEvent) => {
if (e.shiftKey && e.altKey && e.code === 'KeyL' && !externalSpeechToText) {
if (e.shiftKey && e.altKey && e.code === 'KeyL' && isExternalSTTEnabled) {
if (!window.MediaRecorder) {
showToast({ message: 'MediaRecorder is not supported in this browser', status: 'error' });
return;

View file

@ -3,30 +3,67 @@ import { parseTextParts } from 'librechat-data-provider';
import type { TMessage } from 'librechat-data-provider';
import useTextToSpeechExternal from './useTextToSpeechExternal';
import useTextToSpeechBrowser from './useTextToSpeechBrowser';
import { usePauseGlobalAudio } from '../Audio';
import useGetAudioSettings from './useGetAudioSettings';
import useTextToSpeechEdge from './useTextToSpeechEdge';
import { usePauseGlobalAudio } from '../Audio';
const useTextToSpeech = (message: TMessage, isLast: boolean, index = 0) => {
const { externalTextToSpeech } = useGetAudioSettings();
const useTextToSpeech = (message?: TMessage, isLast = false, index = 0) => {
const { textToSpeechEndpoint } = useGetAudioSettings();
const { pauseGlobalAudio } = usePauseGlobalAudio(index);
const audioRef = useRef<HTMLAudioElement | null>(null);
const {
generateSpeechLocal: generateSpeechLocal,
cancelSpeechLocal: cancelSpeechLocal,
generateSpeechLocal,
cancelSpeechLocal,
isSpeaking: isSpeakingLocal,
voices: voicesLocal,
} = useTextToSpeechBrowser();
const {
generateSpeechExternal: generateSpeechExternal,
generateSpeechEdge,
cancelSpeechEdge,
isSpeaking: isSpeakingEdge,
voices: voicesEdge,
} = useTextToSpeechEdge();
const {
generateSpeechExternal,
cancelSpeech: cancelSpeechExternal,
isSpeaking: isSpeakingExternal,
isLoading: isLoading,
audioRef,
} = useTextToSpeechExternal(message.messageId, isLast, index);
const { pauseGlobalAudio } = usePauseGlobalAudio(index);
isLoading: isLoadingExternal,
audioRef: audioRefExternal,
voices: voicesExternal,
} = useTextToSpeechExternal(message?.messageId || '', isLast, index);
const generateSpeech = externalTextToSpeech ? generateSpeechExternal : generateSpeechLocal;
const cancelSpeech = externalTextToSpeech ? cancelSpeechExternal : cancelSpeechLocal;
const isSpeaking = externalTextToSpeech ? isSpeakingExternal : isSpeakingLocal;
let generateSpeech, cancelSpeech, isSpeaking, isLoading, voices;
switch (textToSpeechEndpoint) {
case 'external':
generateSpeech = generateSpeechExternal;
cancelSpeech = cancelSpeechExternal;
isSpeaking = isSpeakingExternal;
isLoading = isLoadingExternal;
if (audioRefExternal) {
audioRef.current = audioRefExternal.current;
}
voices = voicesExternal;
break;
case 'edge':
generateSpeech = generateSpeechEdge;
cancelSpeech = cancelSpeechEdge;
isSpeaking = isSpeakingEdge;
isLoading = false;
voices = voicesEdge;
break;
case 'browser':
default:
generateSpeech = generateSpeechLocal;
cancelSpeech = cancelSpeechLocal;
isSpeaking = isSpeakingLocal;
isLoading = false;
voices = voicesLocal;
break;
}
const isMouseDownRef = useRef(false);
const timerRef = useRef<number | undefined>(undefined);
@ -52,7 +89,6 @@ const useTextToSpeech = (message: TMessage, isLast: boolean, index = 0) => {
const toggleSpeech = () => {
if (isSpeaking) {
console.log('canceling message audio speech');
cancelSpeech();
pauseGlobalAudio();
} else {
@ -69,6 +105,7 @@ const useTextToSpeech = (message: TMessage, isLast: boolean, index = 0) => {
toggleSpeech,
isSpeaking,
isLoading,
voices,
audioRef,
};
};

View file

@ -2,6 +2,11 @@ import { useRecoilState } from 'recoil';
import { useState } from 'react';
import store from '~/store';
interface VoiceOption {
value: string;
display: string;
}
function useTextToSpeechBrowser() {
const [cloudBrowserVoices] = useRecoilState(store.cloudBrowserVoices);
const [isSpeaking, setIsSpeaking] = useState(false);
@ -32,7 +37,30 @@ function useTextToSpeechBrowser() {
setIsSpeaking(false);
};
return { generateSpeechLocal, cancelSpeechLocal, isSpeaking };
const voices = (): Promise<VoiceOption[]> => {
return new Promise((resolve) => {
const getAndMapVoices = () => {
const availableVoices = speechSynthesis
.getVoices()
.filter((v) => cloudBrowserVoices || v.localService === true);
const voiceOptions: VoiceOption[] = availableVoices.map((v) => ({
value: v.name,
display: v.name,
}));
resolve(voiceOptions);
};
if (speechSynthesis.getVoices().length) {
getAndMapVoices();
} else {
speechSynthesis.onvoiceschanged = getAndMapVoices;
}
});
};
return { generateSpeechLocal, cancelSpeechLocal, isSpeaking, voices };
}
export default useTextToSpeechBrowser;

View file

@ -0,0 +1,201 @@
import { useRecoilState } from 'recoil';
import { useState, useCallback, useRef, useEffect } from 'react';
import { MsEdgeTTS, OUTPUT_FORMAT } from 'msedge-tts';
import { useToastContext } from '~/Providers';
import useLocalize from '~/hooks/useLocalize';
import store from '~/store';
/** A selectable TTS voice shaped for option lists (value = voice ID, display = label shown to the user). */
interface Voice {
  value: string;
  display: string;
}
/** Public contract returned by the Edge TTS hook. */
interface UseTextToSpeechEdgeReturn {
  // Streams synthesized speech for the given text and starts playback.
  generateSpeechEdge: (text: string) => Promise<void>;
  // Stops playback immediately and discards queued audio.
  cancelSpeechEdge: () => void;
  // True while audio is playing.
  isSpeaking: boolean;
  // Fetches the list of available Edge voices.
  voices: () => Promise<Voice[]>;
}
/**
 * Text-to-speech via the MS Edge TTS service, streamed into an <audio> element
 * through the MediaSource Extensions API.
 *
 * Audio chunks arrive on a Node-style readable stream and are queued in
 * `pendingBuffers`, then appended to a SourceBuffer one at a time
 * (appendBuffer throws while a previous append is still updating).
 */
function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
  const localize = useLocalize();
  const [isSpeaking, setIsSpeaking] = useState<boolean>(false);
  const [voiceName] = useRecoilState<string>(store.voice);
  const ttsRef = useRef<MsEdgeTTS | null>(null);
  const audioElementRef = useRef<HTMLAudioElement | null>(null);
  const mediaSourceRef = useRef<MediaSource | null>(null);
  const sourceBufferRef = useRef<SourceBuffer | null>(null);
  // Chunks received from the TTS stream that have not yet been appended.
  const pendingBuffers = useRef<Uint8Array[]>([]);
  const { showToast } = useToastContext();
  /** Lazily creates the MsEdgeTTS client and applies the selected voice + MP3 output format. */
  const initializeTTS = useCallback(async (): Promise<void> => {
    if (!ttsRef.current) {
      ttsRef.current = new MsEdgeTTS();
    }
    try {
      await ttsRef.current.setMetadata(voiceName, OUTPUT_FORMAT.AUDIO_24KHZ_48KBITRATE_MONO_MP3);
    } catch (error) {
      console.error('Error initializing TTS:', error);
      showToast({
        message: localize('com_nav_tts_init_error', (error as Error).message),
        status: 'error',
      });
    }
  }, [voiceName, showToast, localize]);
  /**
   * Appends the next queued chunk to the SourceBuffer, strictly one at a time.
   * Invoked on every stream 'data' event and again on each SourceBuffer
   * 'updateend'. A failed chunk is pushed back so no audio data is lost.
   * NOTE: declared before `onSourceOpen` so it can be listed as a real
   * dependency (the original declared it after and suppressed the lint rule,
   * risking a stale closure).
   */
  const appendNextBuffer = useCallback((): void => {
    if (
      sourceBufferRef.current &&
      !sourceBufferRef.current.updating &&
      pendingBuffers.current.length > 0
    ) {
      const nextBuffer = pendingBuffers.current.shift();
      if (nextBuffer) {
        try {
          sourceBufferRef.current.appendBuffer(nextBuffer);
        } catch (error) {
          console.error('Error appending buffer:', error);
          showToast({
            message: localize('com_nav_buffer_append_error'),
            status: 'error',
          });
          // Re-queue so the chunk can be retried on the next 'updateend'.
          pendingBuffers.current.unshift(nextBuffer);
        }
      }
    }
  }, [showToast, localize]);
  /** Attaches the 'audio/mpeg' SourceBuffer once the MediaSource reports 'open'. */
  const onSourceOpen = useCallback((): void => {
    if (!sourceBufferRef.current && mediaSourceRef.current) {
      try {
        sourceBufferRef.current = mediaSourceRef.current.addSourceBuffer('audio/mpeg');
        // Drain queued chunks each time a previous append completes.
        sourceBufferRef.current.addEventListener('updateend', appendNextBuffer);
      } catch (error) {
        console.error('Error adding source buffer:', error);
        showToast({
          message: localize('com_nav_source_buffer_error'),
          status: 'error',
        });
      }
    }
  }, [appendNextBuffer, showToast, localize]);
  /**
   * Ensures an Audio element backed by an OPEN MediaSource is ready, resolving
   * once the SourceBuffer can accept data.
   * A MediaSource that has already ended (after endOfStream) never fires
   * 'sourceopen' again, so it is recreated here — the original kept the ended
   * instance, which made every playback after the first hang forever.
   */
  const initializeMediaSource = useCallback(async (): Promise<void> => {
    return new Promise<void>((resolve) => {
      if (mediaSourceRef.current && mediaSourceRef.current.readyState === 'ended') {
        if (audioElementRef.current?.src) {
          URL.revokeObjectURL(audioElementRef.current.src);
        }
        mediaSourceRef.current = null;
        sourceBufferRef.current = null;
      }
      if (!mediaSourceRef.current) {
        mediaSourceRef.current = new MediaSource();
        // Reuse the existing audio element across replays; only the source URL changes.
        audioElementRef.current = audioElementRef.current ?? new Audio();
        audioElementRef.current.src = URL.createObjectURL(mediaSourceRef.current);
      }
      const mediaSource = mediaSourceRef.current;
      if (mediaSource.readyState === 'open') {
        onSourceOpen();
        resolve();
      } else {
        const onSourceOpenWrapper = (): void => {
          onSourceOpen();
          resolve();
          mediaSource.removeEventListener('sourceopen', onSourceOpenWrapper);
        };
        mediaSource.addEventListener('sourceopen', onSourceOpenWrapper);
      }
    });
  }, [onSourceOpen]);
  /**
   * Streams synthesized speech for `text` and starts playback.
   * `isSpeaking` returns to false when the audio ends, errors, or is cancelled.
   */
  const generateSpeechEdge = useCallback(
    async (text: string): Promise<void> => {
      try {
        await initializeTTS();
        await initializeMediaSource();
        if (!ttsRef.current || !audioElementRef.current) {
          throw new Error('TTS or Audio element not initialized');
        }
        setIsSpeaking(true);
        pendingBuffers.current = [];
        const readable = await ttsRef.current.toStream(text);
        readable.on('data', (chunk: Buffer) => {
          pendingBuffers.current.push(new Uint8Array(chunk));
          appendNextBuffer();
        });
        readable.on('end', () => {
          if (mediaSourceRef.current && mediaSourceRef.current.readyState === 'open') {
            mediaSourceRef.current.endOfStream();
          }
        });
        // Without this handler a stream failure would leave isSpeaking stuck at true.
        readable.on('error', (error: Error) => {
          console.error('Error streaming speech:', error);
          showToast({
            message: localize('com_nav_audio_play_error', error.message),
            status: 'error',
          });
          setIsSpeaking(false);
        });
        audioElementRef.current.onended = () => {
          setIsSpeaking(false);
        };
        await audioElementRef.current.play();
      } catch (error) {
        console.error('Error generating speech:', error);
        showToast({
          message: localize('com_nav_audio_play_error', (error as Error).message),
          status: 'error',
        });
        setIsSpeaking(false);
      }
    },
    [initializeTTS, initializeMediaSource, appendNextBuffer, showToast, localize],
  );
  /** Stops playback immediately, rewinds, ends the stream, and drops queued audio. */
  const cancelSpeechEdge = useCallback((): void => {
    try {
      if (audioElementRef.current) {
        audioElementRef.current.pause();
        audioElementRef.current.currentTime = 0;
      }
      if (mediaSourceRef.current && mediaSourceRef.current.readyState === 'open') {
        mediaSourceRef.current.endOfStream();
      }
      pendingBuffers.current = [];
      setIsSpeaking(false);
    } catch (error) {
      console.error('Error cancelling speech:', error);
      showToast({
        message: localize('com_nav_speech_cancel_error'),
        status: 'error',
      });
    }
  }, [showToast, localize]);
  /** Fetches the available Edge voices, mapped for option lists; [] on failure. */
  const voices = useCallback(async (): Promise<Voice[]> => {
    if (!ttsRef.current) {
      ttsRef.current = new MsEdgeTTS();
    }
    try {
      const voicesList = await ttsRef.current.getVoices();
      return voicesList.map((v) => ({
        value: v.ShortName,
        display: v.FriendlyName,
      }));
    } catch (error) {
      console.error('Error fetching voices:', error);
      showToast({
        message: localize('com_nav_voices_fetch_error'),
        status: 'error',
      });
      return [];
    }
  }, [showToast, localize]);
  // Release the object URL backing the audio element on unmount.
  useEffect(() => {
    return () => {
      if (mediaSourceRef.current) {
        URL.revokeObjectURL(audioElementRef.current?.src || '');
      }
    };
  }, []);
  return { generateSpeechEdge, cancelSpeechEdge, isSpeaking, voices };
}
export default useTextToSpeechEdge;

View file

@ -1,6 +1,6 @@
import { useRecoilValue } from 'recoil';
import { useState, useMemo, useRef, useCallback, useEffect } from 'react';
import { useTextToSpeechMutation } from '~/data-provider';
import { useTextToSpeechMutation, useVoicesQuery } from '~/data-provider';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import useLocalize from '~/hooks/useLocalize';
import { useToastContext } from '~/Providers';
@ -178,7 +178,18 @@ function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0)
return isLocalSpeaking || (isLast && globalIsPlaying);
}, [isLocalSpeaking, globalIsPlaying, isLast]);
return { generateSpeechExternal, cancelSpeech, isLoading, isSpeaking, audioRef };
const useVoices = () => {
return useVoicesQuery().data ?? [];
};
return {
generateSpeechExternal,
cancelSpeech,
isLoading,
isSpeaking,
audioRef,
voices: useVoices,
};
}
export default useTextToSpeechExternal;