🔀 refactor: Modularize TTS Logic for Improved Browser Support (#3657)

* WIP: message audio refactor

* WIP: use MessageAudio by provider

* fix: Update MessageAudio component to use TTSEndpoints enum

* feat: Update useTextToSpeechBrowser hook to handle errors and improve error logging
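
A condensed sketch of the hardened call path, simplified from the `useTextToSpeechBrowser` diff further down (`setIsSpeaking` stands in for the state setter the hook receives; voice selection is omitted here):

```ts
// Simplified from useTextToSpeechBrowser: wrap Web Speech API calls in try/catch
// and reset the speaking flag on both error events and thrown exceptions.
function speakSafely(text: string, setIsSpeaking: (isSpeaking: boolean) => void) {
  const synth = window.speechSynthesis;
  try {
    synth.cancel(); // stop anything already playing
    const utterance = new SpeechSynthesisUtterance(text);
    utterance.onend = () => setIsSpeaking(false);
    utterance.onerror = (event) => {
      console.error('Speech synthesis error:', event);
      setIsSpeaking(false);
    };
    setIsSpeaking(true);
    synth.speak(utterance);
  } catch (error) {
    console.error('Error generating speech:', error);
    setIsSpeaking(false);
  }
}
```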

* feat: Add voice dropdown components for different TTS engines
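
The audio button and the voice dropdown now both dispatch on the active engine. Condensed from the MessageAudio.tsx rewrite in this diff (VoiceDropdown applies the same map with EdgeVoiceDropdown/BrowserVoiceDropdown/ExternalVoiceDropdown):

```tsx
// Condensed from MessageAudio.tsx below: the active TTS engine (Recoil state)
// selects the matching per-engine component via a TTSEndpoints-keyed map.
import { memo } from 'react';
import { useRecoilValue } from 'recoil';
import type { TMessageAudio } from '~/common';
import { BrowserTTS, EdgeTTS, ExternalTTS } from '~/components/Audio/TTS';
import { TTSEndpoints } from '~/common';
import store from '~/store';

const TTSComponents = {
  [TTSEndpoints.edge]: EdgeTTS,
  [TTSEndpoints.browser]: BrowserTTS,
  [TTSEndpoints.external]: ExternalTTS,
};

function MessageAudio(props: TMessageAudio) {
  const engineTTS = useRecoilValue<string>(store.engineTTS);
  const SelectedTTS = TTSComponents[engineTTS];
  return <SelectedTTS {...props} />;
}

export default memo(MessageAudio);
```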

* docs: update incorrect `voices` example

changed `voice: ''` to `voices: ['alloy']`
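
For reference, a sketch of the corrected block. Key names and comment style follow the librechat.yaml hunk at the end of this diff; the indentation is restored and `'alloy'` is the value cited above, so treat the exact docs context as approximate:

```yaml
# tts:
#   openai:
#     url: ''
#     apiKey: '${TTS_API_KEY}'
#     model: ''
#     voices: ['alloy']
```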

* feat: Add browser support check for Edge TTS engine component, with error toast if not supported
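
Trimmed from the EdgeTTS component added in this diff: Edge TTS streams MPEG audio through MediaSource, so the component feature-detects support once and shows a localized toast instead of attempting playback (the hidden audio element is likewise only rendered when the check passes). `showToast`, `localize`, and `toggleSpeech` come from the component's hooks:

```tsx
// Feature detection: MediaSource with MPEG support is required for Edge TTS playback.
const isBrowserSupported = useMemo(
  () => typeof MediaSource !== 'undefined' && MediaSource.isTypeSupported('audio/mpeg'),
  [],
);

const onClickCapture = () => {
  if (!isBrowserSupported) {
    showToast({
      message: localize('com_nav_tts_unsupported_error'),
      status: 'error',
    });
    return;
  }
  toggleSpeech(); // proceed only when playback is actually possible
};
```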

---------

Co-authored-by: Marco Beretta <81851188+berry-13@users.noreply.github.com>
Danny Avila 2024-08-15 11:34:25 -04:00 committed by GitHub
parent bcde0beb47
commit dba704079c
18 changed files with 784 additions and 187 deletions

View file

@@ -19,6 +19,7 @@ import type {
TStartupConfig,
EModelEndpoint,
AssistantsEndpoint,
TMessageContentParts,
AuthorizationTypeEnum,
TSetOption as SetOption,
TokenExchangeMethodEnum,
@@ -31,6 +32,17 @@ export enum PromptsEditorMode {
ADVANCED = 'advanced',
}
export enum STTEndpoints {
browser = 'browser',
external = 'external',
}
export enum TTSEndpoints {
browser = 'browser',
edge = 'edge',
external = 'external',
}
export type AudioChunk = {
audio: string;
isFinal: boolean;
@@ -374,6 +386,19 @@ export type Option = Record<string, unknown> & {
value: string | number | null;
};
export type VoiceOption = {
value: string;
label: string;
};
export type TMessageAudio = {
messageId?: string;
content?: TMessageContentParts[] | string;
className?: string;
isLast: boolean;
index: number;
};
export type OptionWithIcon = Option & { icon?: React.ReactNode };
export type MentionOption = OptionWithIcon & {
type: string;

View file

@@ -0,0 +1,256 @@
import { useEffect, useMemo } from 'react';
import { useRecoilValue } from 'recoil';
import type { TMessageAudio } from '~/common';
import { useLocalize, useTTSBrowser, useTTSEdge, useTTSExternal } from '~/hooks';
import { VolumeIcon, VolumeMuteIcon, Spinner } from '~/components/svg';
import { useToastContext } from '~/Providers/ToastContext';
import { logger } from '~/utils';
import store from '~/store';
export function BrowserTTS({ isLast, index, messageId, content, className }: TMessageAudio) {
const localize = useLocalize();
const playbackRate = useRecoilValue(store.playbackRate);
const { toggleSpeech, isSpeaking, isLoading, audioRef } = useTTSBrowser({
isLast,
index,
messageId,
content,
});
const renderIcon = (size: string) => {
if (isLoading === true) {
return <Spinner size={size} />;
}
if (isSpeaking === true) {
return <VolumeMuteIcon size={size} />;
}
return <VolumeIcon size={size} />;
};
useEffect(() => {
const messageAudio = document.getElementById(`audio-${messageId}`) as HTMLAudioElement | null;
if (!messageAudio) {
return;
}
if (playbackRate != null && playbackRate > 0 && messageAudio.playbackRate !== playbackRate) {
messageAudio.playbackRate = playbackRate;
}
}, [audioRef, isSpeaking, playbackRate, messageId]);
logger.log(
'MessageAudio: audioRef.current?.src, audioRef.current',
audioRef.current?.src,
audioRef.current,
);
return (
<>
<button
className={className}
onClickCapture={() => {
if (audioRef.current) {
audioRef.current.muted = false;
}
toggleSpeech();
}}
type="button"
title={isSpeaking === true ? localize('com_ui_stop') : localize('com_ui_read_aloud')}
>
{renderIcon('19')}
</button>
<audio
ref={audioRef}
controls
preload="none"
controlsList="nodownload nofullscreen noremoteplayback"
style={{
position: 'absolute',
overflow: 'hidden',
display: 'none',
height: '0px',
width: '0px',
}}
src={audioRef.current?.src}
onError={(error) => {
console.error('Error fetching audio:', error);
}}
id={`audio-${messageId}`}
muted
autoPlay
/>
</>
);
}
export function EdgeTTS({ isLast, index, messageId, content, className }: TMessageAudio) {
const localize = useLocalize();
const playbackRate = useRecoilValue(store.playbackRate);
const isBrowserSupported = useMemo(
() => typeof MediaSource !== 'undefined' && MediaSource.isTypeSupported('audio/mpeg'),
[],
);
const { showToast } = useToastContext();
const { toggleSpeech, isSpeaking, isLoading, audioRef } = useTTSEdge({
isLast,
index,
messageId,
content,
});
const renderIcon = (size: string) => {
if (isLoading === true) {
return <Spinner size={size} />;
}
if (isSpeaking === true) {
return <VolumeMuteIcon size={size} />;
}
return <VolumeIcon size={size} />;
};
useEffect(() => {
const messageAudio = document.getElementById(`audio-${messageId}`) as HTMLAudioElement | null;
if (!messageAudio) {
return;
}
if (playbackRate != null && playbackRate > 0 && messageAudio.playbackRate !== playbackRate) {
messageAudio.playbackRate = playbackRate;
}
}, [audioRef, isSpeaking, playbackRate, messageId]);
logger.log(
'MessageAudio: audioRef.current?.src, audioRef.current',
audioRef.current?.src,
audioRef.current,
);
return (
<>
<button
className={className}
onClickCapture={() => {
if (!isBrowserSupported) {
showToast({
message: localize('com_nav_tts_unsupported_error'),
status: 'error',
});
return;
}
if (audioRef.current) {
audioRef.current.muted = false;
}
toggleSpeech();
}}
type="button"
title={isSpeaking === true ? localize('com_ui_stop') : localize('com_ui_read_aloud')}
>
{renderIcon('19')}
</button>
{isBrowserSupported ? (
<audio
ref={audioRef}
controls
preload="none"
controlsList="nodownload nofullscreen noremoteplayback"
style={{
position: 'absolute',
overflow: 'hidden',
display: 'none',
height: '0px',
width: '0px',
}}
src={audioRef.current?.src}
onError={(error) => {
console.error('Error fetching audio:', error);
}}
id={`audio-${messageId}`}
muted
autoPlay
/>
) : null}
</>
);
}
export function ExternalTTS({ isLast, index, messageId, content, className }: TMessageAudio) {
const localize = useLocalize();
const playbackRate = useRecoilValue(store.playbackRate);
const { toggleSpeech, isSpeaking, isLoading, audioRef } = useTTSExternal({
isLast,
index,
messageId,
content,
});
const renderIcon = (size: string) => {
if (isLoading === true) {
return <Spinner size={size} />;
}
if (isSpeaking === true) {
return <VolumeMuteIcon size={size} />;
}
return <VolumeIcon size={size} />;
};
useEffect(() => {
const messageAudio = document.getElementById(`audio-${messageId}`) as HTMLAudioElement | null;
if (!messageAudio) {
return;
}
if (playbackRate != null && playbackRate > 0 && messageAudio.playbackRate !== playbackRate) {
messageAudio.playbackRate = playbackRate;
}
}, [audioRef, isSpeaking, playbackRate, messageId]);
logger.log(
'MessageAudio: audioRef.current?.src, audioRef.current',
audioRef.current?.src,
audioRef.current,
);
return (
<>
<button
className={className}
onClickCapture={() => {
if (audioRef.current) {
audioRef.current.muted = false;
}
toggleSpeech();
}}
type="button"
title={isSpeaking === true ? localize('com_ui_stop') : localize('com_ui_read_aloud')}
>
{renderIcon('19')}
</button>
<audio
ref={audioRef}
controls
preload="none"
controlsList="nodownload nofullscreen noremoteplayback"
style={{
position: 'absolute',
overflow: 'hidden',
display: 'none',
height: '0px',
width: '0px',
}}
src={audioRef.current?.src}
onError={(error) => {
console.error('Error fetching audio:', error);
}}
id={`audio-${messageId}`}
muted
autoPlay
/>
</>
);
}

View file

@@ -0,0 +1,94 @@
import React from 'react';
import { useRecoilState } from 'recoil';
import type { Option } from '~/common';
import DropdownNoState from '~/components/ui/DropdownNoState';
import { useLocalize, useTTSBrowser, useTTSEdge, useTTSExternal } from '~/hooks';
import { logger } from '~/utils';
import store from '~/store';
export function EdgeVoiceDropdown() {
const localize = useLocalize();
const { voices = [] } = useTTSEdge();
const [voice, setVoice] = useRecoilState(store.voice);
const handleVoiceChange = (newValue?: string | Option) => {
logger.log('Edge Voice changed:', newValue);
const newVoice = typeof newValue === 'string' ? newValue : newValue?.value;
if (newVoice != null) {
return setVoice(newVoice.toString());
}
};
return (
<div className="flex items-center justify-between">
<div>{localize('com_nav_voice_select')}</div>
<DropdownNoState
key={`edge-voice-dropdown-${voices.length}`}
value={voice}
options={voices}
onChange={handleVoiceChange}
sizeClasses="min-w-[200px] !max-w-[400px] [--anchor-max-width:400px]"
anchor="bottom start"
testId="EdgeVoiceDropdown"
/>
</div>
);
}
export function BrowserVoiceDropdown() {
const localize = useLocalize();
const { voices = [] } = useTTSBrowser();
const [voice, setVoice] = useRecoilState(store.voice);
const handleVoiceChange = (newValue?: string | Option) => {
logger.log('Browser Voice changed:', newValue);
const newVoice = typeof newValue === 'string' ? newValue : newValue?.value;
if (newVoice != null) {
return setVoice(newVoice.toString());
}
};
return (
<div className="flex items-center justify-between">
<div>{localize('com_nav_voice_select')}</div>
<DropdownNoState
key={`browser-voice-dropdown-${voices.length}`}
value={voice}
options={voices}
onChange={handleVoiceChange}
sizeClasses="min-w-[200px] !max-w-[400px] [--anchor-max-width:400px]"
anchor="bottom start"
testId="BrowserVoiceDropdown"
/>
</div>
);
}
export function ExternalVoiceDropdown() {
const localize = useLocalize();
const { voices = [] } = useTTSExternal();
const [voice, setVoice] = useRecoilState(store.voice);
const handleVoiceChange = (newValue?: string | Option) => {
logger.log('External Voice changed:', newValue);
const newVoice = typeof newValue === 'string' ? newValue : newValue?.value;
if (newVoice != null) {
return setVoice(newVoice.toString());
}
};
return (
<div className="flex items-center justify-between">
<div>{localize('com_nav_voice_select')}</div>
<DropdownNoState
key={`external-voice-dropdown-${voices.length}`}
value={voice}
options={voices}
onChange={handleVoiceChange}
sizeClasses="min-w-[200px] !max-w-[400px] [--anchor-max-width:400px]"
anchor="bottom start"
testId="ExternalVoiceDropdown"
/>
</div>
);
}

View file

@@ -79,6 +79,7 @@ export default function HoverButtons({
messageId={message.messageId}
content={message.content ?? message.text}
isLast={isLast}
className="hover-button rounded-md p-1 pl-0 text-gray-500 hover:bg-gray-100 hover:text-gray-500 dark:text-gray-400/70 dark:hover:bg-gray-700 dark:hover:text-gray-200 disabled:dark:hover:text-gray-400 md:group-hover:visible md:group-[.final-completion]:visible"
/>
)}
{isEditableEndpoint && (

View file

@@ -1,104 +1,22 @@
import { useEffect } from 'react';
// client/src/components/Chat/Messages/MessageAudio.tsx
import { memo } from 'react';
import { useRecoilValue } from 'recoil';
import type { TMessageContentParts } from 'librechat-data-provider';
import { VolumeIcon, VolumeMuteIcon, Spinner } from '~/components/svg';
import { useLocalize, useTextToSpeech } from '~/hooks';
import { logger } from '~/utils';
import type { TMessageAudio } from '~/common';
import { BrowserTTS, EdgeTTS, ExternalTTS } from '~/components/Audio/TTS';
import { TTSEndpoints } from '~/common';
import store from '~/store';
type THoverButtons = {
messageId?: string;
content?: TMessageContentParts[] | string;
isLast: boolean;
index: number;
};
function MessageAudio(props: TMessageAudio) {
const engineTTS = useRecoilValue<string>(store.engineTTS);
export default function MessageAudio({ isLast, index, messageId, content }: THoverButtons) {
const localize = useLocalize();
const playbackRate = useRecoilValue(store.playbackRate);
const { toggleSpeech, isSpeaking, isLoading, audioRef } = useTextToSpeech({
isLast,
index,
messageId,
content,
});
const renderIcon = (size: string) => {
if (isLoading === true) {
return <Spinner size={size} />;
}
if (isSpeaking === true) {
return <VolumeMuteIcon size={size} />;
}
return <VolumeIcon size={size} />;
const TTSComponents = {
[TTSEndpoints.edge]: EdgeTTS,
[TTSEndpoints.browser]: BrowserTTS,
[TTSEndpoints.external]: ExternalTTS,
};
useEffect(() => {
const messageAudio = document.getElementById(`audio-${messageId}`) as HTMLAudioElement | null;
if (!messageAudio) {
return;
}
if (playbackRate != null && playbackRate > 0 && messageAudio.playbackRate !== playbackRate) {
messageAudio.playbackRate = playbackRate;
}
}, [audioRef, isSpeaking, playbackRate, messageId]);
logger.log(
'MessageAudio: audioRef.current?.src, audioRef.current',
audioRef.current?.src,
audioRef.current,
);
return (
<>
<button
className="hover-button rounded-md p-1 pl-0 text-gray-500 hover:bg-gray-100 hover:text-gray-500 dark:text-gray-400/70 dark:hover:bg-gray-700 dark:hover:text-gray-200 disabled:dark:hover:text-gray-400 md:group-hover:visible md:group-[.final-completion]:visible"
// onMouseDownCapture={() => {
// if (audioRef.current) {
// audioRef.current.muted = false;
// }
// handleMouseDown();
// }}
// onMouseUpCapture={() => {
// if (audioRef.current) {
// audioRef.current.muted = false;
// }
// handleMouseUp();
// }}
onClickCapture={() => {
if (audioRef.current) {
audioRef.current.muted = false;
}
toggleSpeech();
}}
type="button"
title={isSpeaking === true ? localize('com_ui_stop') : localize('com_ui_read_aloud')}
>
{renderIcon('19')}
</button>
<audio
ref={audioRef}
controls
preload="none"
controlsList="nodownload nofullscreen noremoteplayback"
style={{
position: 'absolute',
overflow: 'hidden',
display: 'none',
height: '0px',
width: '0px',
}}
src={audioRef.current?.src}
onError={(error) => {
console.error('Error fetching audio:', error);
}}
id={`audio-${messageId}`}
muted
autoPlay
/>
</>
);
const SelectedTTS = TTSComponents[engineTTS];
return <SelectedTTS {...props} />;
}
export default memo(MessageAudio);

View file

@@ -1,37 +1,21 @@
import React from 'react';
import { useRecoilState, useRecoilValue } from 'recoil';
import type { Option } from '~/common';
import DropdownNoState from '~/components/ui/DropdownNoState';
import { useLocalize, useTextToSpeech } from '~/hooks';
import { logger } from '~/utils';
import { useRecoilValue } from 'recoil';
import {
EdgeVoiceDropdown,
BrowserVoiceDropdown,
ExternalVoiceDropdown,
} from '~/components/Audio/Voices';
import store from '~/store';
import { TTSEndpoints } from '~/common';
const voiceDropdownComponentsMap = {
[TTSEndpoints.edge]: EdgeVoiceDropdown,
[TTSEndpoints.browser]: BrowserVoiceDropdown,
[TTSEndpoints.external]: ExternalVoiceDropdown,
};
export default function VoiceDropdown() {
const localize = useLocalize();
const { voices = [] } = useTextToSpeech();
const [voice, setVoice] = useRecoilState(store.voice);
const engineTTS = useRecoilValue<string>(store.engineTTS);
const VoiceDropdownComponent = voiceDropdownComponentsMap[engineTTS];
const handleVoiceChange = (newValue?: string | Option) => {
logger.log('Voice changed:', newValue);
const newVoice = typeof newValue === 'string' ? newValue : newValue?.value;
if (newVoice != null) {
return setVoice(newVoice.toString());
}
};
return (
<div className="flex items-center justify-between">
<div>{localize('com_nav_voice_select')}</div>
<DropdownNoState
key={`voice-dropdown-${engineTTS}-${voices.length}`}
value={voice}
options={voices}
onChange={handleVoiceChange}
sizeClasses="min-w-[200px] !max-w-[400px] [--anchor-max-width:400px]"
anchor="bottom start"
testId="VoiceDropdown"
/>
</div>
);
return <VoiceDropdownComponent />;
}

View file

@@ -1,3 +1,6 @@
export * from './MediaSourceAppender';
export { default as useCustomAudioRef } from './useCustomAudioRef';
export { default as usePauseGlobalAudio } from './usePauseGlobalAudio';
export { default as useTTSExternal } from './useTTSExternal';
export { default as useTTSBrowser } from './useTTSBrowser';
export { default as useTTSEdge } from './useTTSEdge';

View file

@@ -0,0 +1,100 @@
// client/src/hooks/Audio/useTTSBrowser.ts
import { useRef, useEffect, useState } from 'react';
import { useRecoilState, useRecoilValue } from 'recoil';
import { parseTextParts } from 'librechat-data-provider';
import type { TMessageContentParts } from 'librechat-data-provider';
import useTextToSpeechBrowser from '~/hooks/Input/useTextToSpeechBrowser';
import usePauseGlobalAudio from '~/hooks/Audio/usePauseGlobalAudio';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import { logger } from '~/utils';
import store from '~/store';
type TUseTextToSpeech = {
messageId?: string;
content?: TMessageContentParts[] | string;
isLast?: boolean;
index?: number;
};
const useTTSBrowser = (props?: TUseTextToSpeech) => {
const { content, isLast = false, index = 0 } = props ?? {};
const isMouseDownRef = useRef(false);
const timerRef = useRef<number | undefined>(undefined);
const [isSpeakingState, setIsSpeaking] = useState(false);
const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
const { pauseGlobalAudio } = usePauseGlobalAudio(index);
const [voice, setVoice] = useRecoilState(store.voice);
const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));
const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);
const {
generateSpeechLocal: generateSpeech,
cancelSpeechLocal: cancelSpeech,
voices,
} = useTextToSpeechBrowser({ setIsSpeaking });
useEffect(() => {
const firstVoice = voices[0];
if (voices.length && typeof firstVoice === 'object') {
const lastSelectedVoice = voices.find((v) =>
typeof v === 'object' ? v.value === voice : v === voice,
);
if (lastSelectedVoice != null) {
const currentVoice =
typeof lastSelectedVoice === 'object' ? lastSelectedVoice.value : lastSelectedVoice;
logger.log('useTextToSpeech.ts - Effect:', { voices, voice: currentVoice });
setVoice(currentVoice);
return;
}
logger.log('useTextToSpeech.ts - Effect:', { voices, voice: firstVoice.value });
setVoice(firstVoice.value);
}
}, [setVoice, voice, voices]);
const handleMouseDown = () => {
isMouseDownRef.current = true;
timerRef.current = window.setTimeout(() => {
if (isMouseDownRef.current) {
const messageContent = content ?? '';
const parsedMessage =
typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
generateSpeech(parsedMessage);
}
}, 1000);
};
const handleMouseUp = () => {
isMouseDownRef.current = false;
if (timerRef.current != null) {
window.clearTimeout(timerRef.current);
}
};
const toggleSpeech = () => {
if (isSpeaking === true) {
cancelSpeech();
pauseGlobalAudio();
} else {
const messageContent = content ?? '';
const parsedMessage =
typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
generateSpeech(parsedMessage);
}
};
return {
handleMouseDown,
handleMouseUp,
toggleSpeech,
isSpeaking,
isLoading: false,
audioRef,
voices,
};
};
export default useTTSBrowser;

View file

@@ -0,0 +1,100 @@
// client/src/hooks/Audio/useTTSEdge.ts
import { useRef, useEffect, useState } from 'react';
import { useRecoilState, useRecoilValue } from 'recoil';
import { parseTextParts } from 'librechat-data-provider';
import type { TMessageContentParts } from 'librechat-data-provider';
import usePauseGlobalAudio from '~/hooks/Audio/usePauseGlobalAudio';
import useTextToSpeechEdge from '~/hooks/Input/useTextToSpeechEdge';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import { logger } from '~/utils';
import store from '~/store';
type TUseTextToSpeech = {
messageId?: string;
content?: TMessageContentParts[] | string;
isLast?: boolean;
index?: number;
};
const useTTSEdge = (props?: TUseTextToSpeech) => {
const { content, isLast = false, index = 0 } = props ?? {};
const isMouseDownRef = useRef(false);
const timerRef = useRef<number | undefined>(undefined);
const [isSpeakingState, setIsSpeaking] = useState(false);
const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
const { pauseGlobalAudio } = usePauseGlobalAudio(index);
const [voice, setVoice] = useRecoilState(store.voice);
const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));
const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);
const {
generateSpeechEdge: generateSpeech,
cancelSpeechEdge: cancelSpeech,
voices,
} = useTextToSpeechEdge({ setIsSpeaking });
useEffect(() => {
const firstVoice = voices[0];
if (voices.length && typeof firstVoice === 'object') {
const lastSelectedVoice = voices.find((v) =>
typeof v === 'object' ? v.value === voice : v === voice,
);
if (lastSelectedVoice != null) {
const currentVoice =
typeof lastSelectedVoice === 'object' ? lastSelectedVoice.value : lastSelectedVoice;
logger.log('useTextToSpeech.ts - Effect:', { voices, voice: currentVoice });
setVoice(currentVoice);
return;
}
logger.log('useTextToSpeech.ts - Effect:', { voices, voice: firstVoice.value });
setVoice(firstVoice.value);
}
}, [setVoice, voice, voices]);
const handleMouseDown = () => {
isMouseDownRef.current = true;
timerRef.current = window.setTimeout(() => {
if (isMouseDownRef.current) {
const messageContent = content ?? '';
const parsedMessage =
typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
generateSpeech(parsedMessage);
}
}, 1000);
};
const handleMouseUp = () => {
isMouseDownRef.current = false;
if (timerRef.current != null) {
window.clearTimeout(timerRef.current);
}
};
const toggleSpeech = () => {
if (isSpeaking === true) {
cancelSpeech();
pauseGlobalAudio();
} else {
const messageContent = content ?? '';
const parsedMessage =
typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
generateSpeech(parsedMessage);
}
};
return {
handleMouseDown,
handleMouseUp,
toggleSpeech,
isSpeaking,
isLoading: false,
audioRef,
voices,
};
};
export default useTTSEdge;

View file

@@ -0,0 +1,101 @@
// client/src/hooks/Audio/useTTSExternal.ts
import { useRef, useEffect, useState } from 'react';
import { useRecoilState, useRecoilValue } from 'recoil';
import { parseTextParts } from 'librechat-data-provider';
import type { TMessageContentParts } from 'librechat-data-provider';
import useTextToSpeechExternal from '~/hooks/Input/useTextToSpeechExternal';
import usePauseGlobalAudio from '~/hooks/Audio/usePauseGlobalAudio';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import { logger } from '~/utils';
import store from '~/store';
type TUseTextToSpeech = {
messageId?: string;
content?: TMessageContentParts[] | string;
isLast?: boolean;
index?: number;
};
const useTTSExternal = (props?: TUseTextToSpeech) => {
const { messageId, content, isLast = false, index = 0 } = props ?? {};
const isMouseDownRef = useRef(false);
const timerRef = useRef<number | undefined>(undefined);
const [isSpeakingState, setIsSpeaking] = useState(false);
const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
const { pauseGlobalAudio } = usePauseGlobalAudio(index);
const [voice, setVoice] = useRecoilState(store.voice);
const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));
const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);
const {
cancelSpeech,
generateSpeechExternal: generateSpeech,
isLoading,
voices,
} = useTextToSpeechExternal({
setIsSpeaking,
audioRef,
messageId,
isLast,
index,
});
useEffect(() => {
const firstVoice = voices[0];
if (voices.length) {
const lastSelectedVoice = voices.find((v) => v === voice);
if (lastSelectedVoice != null) {
logger.log('useTextToSpeech.ts - Effect:', { voices, voice: lastSelectedVoice });
setVoice(lastSelectedVoice.toString());
return;
}
logger.log('useTextToSpeech.ts - Effect:', { voices, voice: firstVoice });
setVoice(firstVoice.toString());
}
}, [setVoice, voice, voices]);
const handleMouseDown = () => {
isMouseDownRef.current = true;
timerRef.current = window.setTimeout(() => {
if (isMouseDownRef.current) {
const messageContent = content ?? '';
const parsedMessage =
typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
generateSpeech(parsedMessage, false);
}
}, 1000);
};
const handleMouseUp = () => {
isMouseDownRef.current = false;
if (timerRef.current != null) {
window.clearTimeout(timerRef.current);
}
};
const toggleSpeech = () => {
if (isSpeaking === true) {
cancelSpeech();
pauseGlobalAudio();
} else {
const messageContent = content ?? '';
const parsedMessage =
typeof messageContent === 'string' ? messageContent : parseTextParts(messageContent);
generateSpeech(parsedMessage, false);
}
};
return {
handleMouseDown,
handleMouseUp,
toggleSpeech,
isSpeaking,
isLoading,
audioRef,
voices,
};
};
export default useTTSExternal;

View file

@@ -2,17 +2,6 @@ import { useMemo } from 'react';
import { useRecoilValue } from 'recoil';
import store from '~/store';
export enum STTEndpoints {
browser = 'browser',
external = 'external',
}
export enum TTSEndpoints {
browser = 'browser',
edge = 'edge',
external = 'external',
}
const useGetAudioSettings = () => {
const engineSTT = useRecoilValue<string>(store.engineSTT);
const engineTTS = useRecoilValue<string>(store.engineTTS);

View file

@@ -3,10 +3,10 @@ import { useRef, useMemo, useEffect, useState } from 'react';
import { parseTextParts } from 'librechat-data-provider';
import type { TMessageContentParts } from 'librechat-data-provider';
import type { Option } from '~/common';
import useTextToSpeechExternal from './useTextToSpeechExternal';
import useTextToSpeechBrowser from './useTextToSpeechBrowser';
import useGetAudioSettings from './useGetAudioSettings';
import useTextToSpeechEdge from './useTextToSpeechEdge';
import useTextToSpeechExternal from '~/hooks/Input/useTextToSpeechExternal';
import useTextToSpeechBrowser from '~/hooks/Input/useTextToSpeechBrowser';
import useGetAudioSettings from '~/hooks/Input/useGetAudioSettings';
import useTextToSpeechEdge from '~/hooks/Input/useTextToSpeechEdge';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import { usePauseGlobalAudio } from '../Audio';
import { logger } from '~/utils';

View file

@@ -1,43 +1,54 @@
import { useRecoilState } from 'recoil';
import { useState, useEffect, useCallback } from 'react';
import type { VoiceOption } from '~/common';
import store from '~/store';
interface VoiceOption {
value: string;
label: string;
}
function useTextToSpeechBrowser({
setIsSpeaking,
}: {
setIsSpeaking: (isSpeaking: boolean) => void;
setIsSpeaking: React.Dispatch<React.SetStateAction<boolean>>;
}) {
const [cloudBrowserVoices] = useRecoilState(store.cloudBrowserVoices);
const [voiceName] = useRecoilState(store.voice);
const [voices, setVoices] = useState<VoiceOption[]>([]);
const updateVoices = useCallback(() => {
const availableVoices = window.speechSynthesis
.getVoices()
.filter((v) => cloudBrowserVoices || v.localService === true);
try {
const availableVoices = window.speechSynthesis.getVoices();
if (!Array.isArray(availableVoices)) {
console.error('getVoices() did not return an array');
return;
}
const voiceOptions: VoiceOption[] = availableVoices.map((v) => ({
value: v.name,
label: v.name,
}));
const filteredVoices = availableVoices.filter(
(v) => cloudBrowserVoices || v.localService === true,
);
const voiceOptions: VoiceOption[] = filteredVoices.map((v) => ({
value: v.name,
label: v.name,
}));
setVoices(voiceOptions);
setVoices(voiceOptions);
} catch (error) {
console.error('Error updating voices:', error);
}
}, [cloudBrowserVoices]);
useEffect(() => {
if (window.speechSynthesis.getVoices().length) {
updateVoices();
} else {
window.speechSynthesis.onvoiceschanged = updateVoices;
const synth = window.speechSynthesis;
try {
if (synth.getVoices().length) {
updateVoices();
} else {
synth.onvoiceschanged = updateVoices;
}
} catch (error) {
console.error('Error in useEffect:', error);
}
return () => {
window.speechSynthesis.onvoiceschanged = null;
synth.onvoiceschanged = null;
};
}, [updateVoices]);
@@ -46,22 +57,37 @@ function useTextToSpeechBrowser({
const voice = voices.find((v) => v.value === voiceName);
if (!voice) {
console.warn('Selected voice not found');
return;
}
synth.cancel();
const utterance = new SpeechSynthesisUtterance(text);
utterance.voice = synth.getVoices().find((v) => v.name === voice.value) || null;
utterance.onend = () => {
try {
synth.cancel();
const utterance = new SpeechSynthesisUtterance(text);
utterance.voice = synth.getVoices().find((v) => v.name === voice.value) || null;
utterance.onend = () => {
setIsSpeaking(false);
};
utterance.onerror = (event) => {
console.error('Speech synthesis error:', event);
setIsSpeaking(false);
};
setIsSpeaking(true);
synth.speak(utterance);
} catch (error) {
console.error('Error generating speech:', error);
setIsSpeaking(false);
};
setIsSpeaking(true);
synth.speak(utterance);
}
};
const cancelSpeechLocal = () => {
window.speechSynthesis.cancel();
setIsSpeaking(false);
try {
window.speechSynthesis.cancel();
} catch (error) {
console.error('Error cancelling speech:', error);
} finally {
setIsSpeaking(false);
}
};
return { generateSpeechLocal, cancelSpeechLocal, voices };

View file

@@ -1,28 +1,24 @@
import { useRecoilValue } from 'recoil';
import { useState, useCallback, useRef, useEffect, useMemo } from 'react';
import { MsEdgeTTS, OUTPUT_FORMAT } from 'msedge-tts';
import { useToastContext } from '~/Providers';
import { useState, useCallback, useRef, useEffect, useMemo } from 'react';
import type { VoiceOption } from '~/common';
import { useToastContext } from '~/Providers/ToastContext';
import useLocalize from '~/hooks/useLocalize';
import store from '~/store';
interface Voice {
value: string;
label: string;
}
interface UseTextToSpeechEdgeReturn {
generateSpeechEdge: (text: string) => void;
cancelSpeechEdge: () => void;
voices: Voice[];
voices: VoiceOption[];
}
function useTextToSpeechEdge({
setIsSpeaking,
}: {
setIsSpeaking: (isSpeaking: boolean) => void;
setIsSpeaking: React.Dispatch<React.SetStateAction<boolean>>;
}): UseTextToSpeechEdgeReturn {
const localize = useLocalize();
const [voices, setVoices] = useState<Voice[]>([]);
const [voices, setVoices] = useState<VoiceOption[]>([]);
const voiceName = useRecoilValue(store.voice);
const ttsRef = useRef<MsEdgeTTS | null>(null);
const audioElementRef = useRef<HTMLAudioElement | null>(null);
@@ -63,7 +59,7 @@ function useTextToSpeechEdge({
if (!ttsRef.current) {
ttsRef.current = new MsEdgeTTS();
}
const availableVoice: Voice | undefined = voices.find((v) => v.value === voiceName);
const availableVoice: VoiceOption | undefined = voices.find((v) => v.value === voiceName);
if (availableVoice) {
ttsRef.current
@@ -181,7 +177,7 @@ function useTextToSpeechEdge({
generate();
},
[appendNextBuffer, showToast, localize],
[setIsSpeaking, appendNextBuffer, showToast, localize],
);
const cancelSpeechEdge = useCallback(() => {
@@ -202,7 +198,7 @@ function useTextToSpeechEdge({
status: 'error',
});
}
}, [showToast, localize]);
}, [setIsSpeaking, showToast, localize]);
useEffect(() => {
if (!isBrowserSupported) {

View file

@@ -1,8 +1,8 @@
import { useRecoilValue } from 'recoil';
import { useState, useMemo, useRef, useCallback, useEffect } from 'react';
import { useTextToSpeechMutation, useVoicesQuery } from '~/data-provider';
import { useToastContext } from '~/Providers/ToastContext';
import useLocalize from '~/hooks/useLocalize';
import { useToastContext } from '~/Providers';
import store from '~/store';
const createFormData = (text: string, voice: string) => {
@@ -13,7 +13,7 @@ const createFormData = (text: string, voice: string) => {
};
type TUseTTSExternal = {
setIsSpeaking: (isSpeaking: boolean) => void;
setIsSpeaking: React.Dispatch<React.SetStateAction<boolean>>;
audioRef: React.MutableRefObject<HTMLAudioElement | null>;
messageId?: string;
isLast: boolean;

View file

@@ -1,3 +1,4 @@
export * from './Audio';
export * from './Assistants';
export * from './Chat';
export * from './Config';

View file

@@ -664,6 +664,8 @@ export default {
com_nav_audio_process_error: 'Error processing audio: {0}',
com_nav_long_audio_warning: 'Longer texts will take longer to process.',
com_nav_tts_init_error: 'Failed to initialize text-to-speech: {0}',
com_nav_tts_unsupported_error:
'Text-to-speech for the selected engine is not supported in this browser.',
com_nav_source_buffer_error: 'Error setting up audio playback. Please refresh the page.',
com_nav_media_source_init_error:
'Unable to prepare audio player. Please check your browser settings.',

View file

@@ -31,7 +31,8 @@ registration:
# url: ''
# apiKey: '${TTS_API_KEY}'
# model: ''
# voice: ''
# voices: ['']
#
# stt:
# openai: