mirror of
https://github.com/danny-avila/LibreChat.git
synced 2025-12-20 02:10:15 +01:00
🔀 refactor: Modularize TTS Logic for Improved Browser support (#3657)
* WIP: message audio refactor * WIP: use MessageAudio by provider * fix: Update MessageAudio component to use TTSEndpoints enum * feat: Update useTextToSpeechBrowser hook to handle errors and improve error logging * feat: Add voice dropdown components for different TTS engines * docs: update incorrect `voices` example; changed `voice: ''` to `voices: ['alloy']` * feat: Add browser support check for Edge TTS engine component with error toast if not supported --------- Co-authored-by: Marco Beretta <81851188+berry-13@users.noreply.github.com>
This commit is contained in:
parent
bcde0beb47
commit
dba704079c
18 changed files with 784 additions and 187 deletions
|
|
@ -2,17 +2,6 @@ import { useMemo } from 'react';
|
|||
import { useRecoilValue } from 'recoil';
|
||||
import store from '~/store';
|
||||
|
||||
export enum STTEndpoints {
|
||||
browser = 'browser',
|
||||
external = 'external',
|
||||
}
|
||||
|
||||
export enum TTSEndpoints {
|
||||
browser = 'browser',
|
||||
edge = 'edge',
|
||||
external = 'external',
|
||||
}
|
||||
|
||||
const useGetAudioSettings = () => {
|
||||
const engineSTT = useRecoilValue<string>(store.engineSTT);
|
||||
const engineTTS = useRecoilValue<string>(store.engineTTS);
|
||||
|
|
|
|||
|
|
@ -3,10 +3,10 @@ import { useRef, useMemo, useEffect, useState } from 'react';
|
|||
import { parseTextParts } from 'librechat-data-provider';
|
||||
import type { TMessageContentParts } from 'librechat-data-provider';
|
||||
import type { Option } from '~/common';
|
||||
import useTextToSpeechExternal from './useTextToSpeechExternal';
|
||||
import useTextToSpeechBrowser from './useTextToSpeechBrowser';
|
||||
import useGetAudioSettings from './useGetAudioSettings';
|
||||
import useTextToSpeechEdge from './useTextToSpeechEdge';
|
||||
import useTextToSpeechExternal from '~/hooks/Input/useTextToSpeechExternal';
|
||||
import useTextToSpeechBrowser from '~/hooks/Input/useTextToSpeechBrowser';
|
||||
import useGetAudioSettings from '~/hooks/Input/useGetAudioSettings';
|
||||
import useTextToSpeechEdge from '~/hooks/Input/useTextToSpeechEdge';
|
||||
import useAudioRef from '~/hooks/Audio/useAudioRef';
|
||||
import { usePauseGlobalAudio } from '../Audio';
|
||||
import { logger } from '~/utils';
|
||||
|
|
|
|||
|
|
@ -1,43 +1,54 @@
|
|||
import { useRecoilState } from 'recoil';
|
||||
import { useState, useEffect, useCallback } from 'react';
|
||||
import type { VoiceOption } from '~/common';
|
||||
import store from '~/store';
|
||||
|
||||
interface VoiceOption {
|
||||
value: string;
|
||||
label: string;
|
||||
}
|
||||
|
||||
function useTextToSpeechBrowser({
|
||||
setIsSpeaking,
|
||||
}: {
|
||||
setIsSpeaking: (isSpeaking: boolean) => void;
|
||||
setIsSpeaking: React.Dispatch<React.SetStateAction<boolean>>;
|
||||
}) {
|
||||
const [cloudBrowserVoices] = useRecoilState(store.cloudBrowserVoices);
|
||||
const [voiceName] = useRecoilState(store.voice);
|
||||
const [voices, setVoices] = useState<VoiceOption[]>([]);
|
||||
|
||||
const updateVoices = useCallback(() => {
|
||||
const availableVoices = window.speechSynthesis
|
||||
.getVoices()
|
||||
.filter((v) => cloudBrowserVoices || v.localService === true);
|
||||
try {
|
||||
const availableVoices = window.speechSynthesis.getVoices();
|
||||
if (!Array.isArray(availableVoices)) {
|
||||
console.error('getVoices() did not return an array');
|
||||
return;
|
||||
}
|
||||
|
||||
const voiceOptions: VoiceOption[] = availableVoices.map((v) => ({
|
||||
value: v.name,
|
||||
label: v.name,
|
||||
}));
|
||||
const filteredVoices = availableVoices.filter(
|
||||
(v) => cloudBrowserVoices || v.localService === true,
|
||||
);
|
||||
const voiceOptions: VoiceOption[] = filteredVoices.map((v) => ({
|
||||
value: v.name,
|
||||
label: v.name,
|
||||
}));
|
||||
|
||||
setVoices(voiceOptions);
|
||||
setVoices(voiceOptions);
|
||||
} catch (error) {
|
||||
console.error('Error updating voices:', error);
|
||||
}
|
||||
}, [cloudBrowserVoices]);
|
||||
|
||||
useEffect(() => {
|
||||
if (window.speechSynthesis.getVoices().length) {
|
||||
updateVoices();
|
||||
} else {
|
||||
window.speechSynthesis.onvoiceschanged = updateVoices;
|
||||
const synth = window.speechSynthesis;
|
||||
|
||||
try {
|
||||
if (synth.getVoices().length) {
|
||||
updateVoices();
|
||||
} else {
|
||||
synth.onvoiceschanged = updateVoices;
|
||||
}
|
||||
} catch (error) {
|
||||
console.error('Error in useEffect:', error);
|
||||
}
|
||||
|
||||
return () => {
|
||||
window.speechSynthesis.onvoiceschanged = null;
|
||||
synth.onvoiceschanged = null;
|
||||
};
|
||||
}, [updateVoices]);
|
||||
|
||||
|
|
@ -46,22 +57,37 @@ function useTextToSpeechBrowser({
|
|||
const voice = voices.find((v) => v.value === voiceName);
|
||||
|
||||
if (!voice) {
|
||||
console.warn('Selected voice not found');
|
||||
return;
|
||||
}
|
||||
|
||||
synth.cancel();
|
||||
const utterance = new SpeechSynthesisUtterance(text);
|
||||
utterance.voice = synth.getVoices().find((v) => v.name === voice.value) || null;
|
||||
utterance.onend = () => {
|
||||
try {
|
||||
synth.cancel();
|
||||
const utterance = new SpeechSynthesisUtterance(text);
|
||||
utterance.voice = synth.getVoices().find((v) => v.name === voice.value) || null;
|
||||
utterance.onend = () => {
|
||||
setIsSpeaking(false);
|
||||
};
|
||||
utterance.onerror = (event) => {
|
||||
console.error('Speech synthesis error:', event);
|
||||
setIsSpeaking(false);
|
||||
};
|
||||
setIsSpeaking(true);
|
||||
synth.speak(utterance);
|
||||
} catch (error) {
|
||||
console.error('Error generating speech:', error);
|
||||
setIsSpeaking(false);
|
||||
};
|
||||
setIsSpeaking(true);
|
||||
synth.speak(utterance);
|
||||
}
|
||||
};
|
||||
|
||||
const cancelSpeechLocal = () => {
|
||||
window.speechSynthesis.cancel();
|
||||
setIsSpeaking(false);
|
||||
try {
|
||||
window.speechSynthesis.cancel();
|
||||
} catch (error) {
|
||||
console.error('Error cancelling speech:', error);
|
||||
} finally {
|
||||
setIsSpeaking(false);
|
||||
}
|
||||
};
|
||||
|
||||
return { generateSpeechLocal, cancelSpeechLocal, voices };
|
||||
|
|
|
|||
|
|
@ -1,28 +1,24 @@
|
|||
import { useRecoilValue } from 'recoil';
|
||||
import { useState, useCallback, useRef, useEffect, useMemo } from 'react';
|
||||
import { MsEdgeTTS, OUTPUT_FORMAT } from 'msedge-tts';
|
||||
import { useToastContext } from '~/Providers';
|
||||
import { useState, useCallback, useRef, useEffect, useMemo } from 'react';
|
||||
import type { VoiceOption } from '~/common';
|
||||
import { useToastContext } from '~/Providers/ToastContext';
|
||||
import useLocalize from '~/hooks/useLocalize';
|
||||
import store from '~/store';
|
||||
|
||||
interface Voice {
|
||||
value: string;
|
||||
label: string;
|
||||
}
|
||||
|
||||
interface UseTextToSpeechEdgeReturn {
|
||||
generateSpeechEdge: (text: string) => void;
|
||||
cancelSpeechEdge: () => void;
|
||||
voices: Voice[];
|
||||
voices: VoiceOption[];
|
||||
}
|
||||
|
||||
function useTextToSpeechEdge({
|
||||
setIsSpeaking,
|
||||
}: {
|
||||
setIsSpeaking: (isSpeaking: boolean) => void;
|
||||
setIsSpeaking: React.Dispatch<React.SetStateAction<boolean>>;
|
||||
}): UseTextToSpeechEdgeReturn {
|
||||
const localize = useLocalize();
|
||||
const [voices, setVoices] = useState<Voice[]>([]);
|
||||
const [voices, setVoices] = useState<VoiceOption[]>([]);
|
||||
const voiceName = useRecoilValue(store.voice);
|
||||
const ttsRef = useRef<MsEdgeTTS | null>(null);
|
||||
const audioElementRef = useRef<HTMLAudioElement | null>(null);
|
||||
|
|
@ -63,7 +59,7 @@ function useTextToSpeechEdge({
|
|||
if (!ttsRef.current) {
|
||||
ttsRef.current = new MsEdgeTTS();
|
||||
}
|
||||
const availableVoice: Voice | undefined = voices.find((v) => v.value === voiceName);
|
||||
const availableVoice: VoiceOption | undefined = voices.find((v) => v.value === voiceName);
|
||||
|
||||
if (availableVoice) {
|
||||
ttsRef.current
|
||||
|
|
@ -181,7 +177,7 @@ function useTextToSpeechEdge({
|
|||
|
||||
generate();
|
||||
},
|
||||
[appendNextBuffer, showToast, localize],
|
||||
[setIsSpeaking, appendNextBuffer, showToast, localize],
|
||||
);
|
||||
|
||||
const cancelSpeechEdge = useCallback(() => {
|
||||
|
|
@ -202,7 +198,7 @@ function useTextToSpeechEdge({
|
|||
status: 'error',
|
||||
});
|
||||
}
|
||||
}, [showToast, localize]);
|
||||
}, [setIsSpeaking, showToast, localize]);
|
||||
|
||||
useEffect(() => {
|
||||
if (!isBrowserSupported) {
|
||||
|
|
|
|||
|
|
@ -1,8 +1,8 @@
|
|||
import { useRecoilValue } from 'recoil';
|
||||
import { useState, useMemo, useRef, useCallback, useEffect } from 'react';
|
||||
import { useTextToSpeechMutation, useVoicesQuery } from '~/data-provider';
|
||||
import { useToastContext } from '~/Providers/ToastContext';
|
||||
import useLocalize from '~/hooks/useLocalize';
|
||||
import { useToastContext } from '~/Providers';
|
||||
import store from '~/store';
|
||||
|
||||
const createFormData = (text: string, voice: string) => {
|
||||
|
|
@ -13,7 +13,7 @@ const createFormData = (text: string, voice: string) => {
|
|||
};
|
||||
|
||||
type TUseTTSExternal = {
|
||||
setIsSpeaking: (isSpeaking: boolean) => void;
|
||||
setIsSpeaking: React.Dispatch<React.SetStateAction<boolean>>;
|
||||
audioRef: React.MutableRefObject<HTMLAudioElement | null>;
|
||||
messageId?: string;
|
||||
isLast: boolean;
|
||||
|
|
|
|||
Loading…
Add table
Add a link
Reference in a new issue