🎧 fix(TTS): Improve State of audio playback, hook patterns, and fix undefined MediaSource (#3632)

This commit is contained in:
Danny Avila 2024-08-13 12:08:55 -04:00 committed by GitHub
parent e3ebcfd2b1
commit dc8d30ad90
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
6 changed files with 108 additions and 72 deletions

View file

@ -5,9 +5,9 @@ import { useQueryClient } from '@tanstack/react-query';
import { useRecoilState, useRecoilValue, useSetRecoilState } from 'recoil'; import { useRecoilState, useRecoilValue, useSetRecoilState } from 'recoil';
import type { TMessage } from 'librechat-data-provider'; import type { TMessage } from 'librechat-data-provider';
import { useCustomAudioRef, MediaSourceAppender, usePauseGlobalAudio } from '~/hooks/Audio'; import { useCustomAudioRef, MediaSourceAppender, usePauseGlobalAudio } from '~/hooks/Audio';
import { getLatestText, logger } from '~/utils';
import { useAuthContext } from '~/hooks'; import { useAuthContext } from '~/hooks';
import { globalAudioId } from '~/common'; import { globalAudioId } from '~/common';
import { getLatestText } from '~/utils';
import store from '~/store'; import store from '~/store';
function timeoutPromise(ms: number, message?: string) { function timeoutPromise(ms: number, message?: string) {
@ -51,7 +51,7 @@ export default function StreamAudio({ index = 0 }) {
const latestText = getLatestText(latestMessage); const latestText = getLatestText(latestMessage);
const shouldFetch = !!( const shouldFetch = !!(
token && token != null &&
automaticPlayback && automaticPlayback &&
isSubmitting && isSubmitting &&
latestMessage && latestMessage &&
@ -60,7 +60,7 @@ export default function StreamAudio({ index = 0 }) {
latestMessage.messageId && latestMessage.messageId &&
!latestMessage.messageId.includes('_') && !latestMessage.messageId.includes('_') &&
!isFetching && !isFetching &&
activeRunId && activeRunId != null &&
activeRunId !== audioRunId activeRunId !== audioRunId
); );
@ -109,7 +109,8 @@ export default function StreamAudio({ index = 0 }) {
const reader = response.body.getReader(); const reader = response.body.getReader();
const type = 'audio/mpeg'; const type = 'audio/mpeg';
const browserSupportsType = MediaSource.isTypeSupported(type); const browserSupportsType =
typeof MediaSource !== 'undefined' && MediaSource.isTypeSupported(type);
let mediaSource: MediaSourceAppender | undefined; let mediaSource: MediaSourceAppender | undefined;
if (browserSupportsType) { if (browserSupportsType) {
mediaSource = new MediaSourceAppender(type); mediaSource = new MediaSourceAppender(type);
@ -210,6 +211,7 @@ export default function StreamAudio({ index = 0 }) {
// eslint-disable-next-line react-hooks/exhaustive-deps // eslint-disable-next-line react-hooks/exhaustive-deps
}, [paramId]); }, [paramId]);
logger.log('StreamAudio.tsx - globalAudioURL:', globalAudioURL);
return ( return (
<audio <audio
ref={audioRef} ref={audioRef}
@ -222,7 +224,7 @@ export default function StreamAudio({ index = 0 }) {
height: '0px', height: '0px',
width: '0px', width: '0px',
}} }}
src={globalAudioURL || undefined} src={globalAudioURL ?? undefined}
id={globalAudioId} id={globalAudioId}
muted muted
autoPlay autoPlay

View file

@ -3,6 +3,7 @@ import { useRecoilValue } from 'recoil';
import type { TMessageContentParts } from 'librechat-data-provider'; import type { TMessageContentParts } from 'librechat-data-provider';
import { VolumeIcon, VolumeMuteIcon, Spinner } from '~/components/svg'; import { VolumeIcon, VolumeMuteIcon, Spinner } from '~/components/svg';
import { useLocalize, useTextToSpeech } from '~/hooks'; import { useLocalize, useTextToSpeech } from '~/hooks';
import { logger } from '~/utils';
import store from '~/store'; import store from '~/store';
type THoverButtons = { type THoverButtons = {
@ -45,6 +46,12 @@ export default function MessageAudio({ isLast, index, messageId, content }: THov
} }
}, [audioRef, isSpeaking, playbackRate, messageId]); }, [audioRef, isSpeaking, playbackRate, messageId]);
logger.log(
'MessageAudio: audioRef.current?.src, audioRef.current',
audioRef.current?.src,
audioRef.current,
);
return ( return (
<> <>
<button <button
@ -75,6 +82,7 @@ export default function MessageAudio({ isLast, index, messageId, content }: THov
<audio <audio
ref={audioRef} ref={audioRef}
controls controls
preload="none"
controlsList="nodownload nofullscreen noremoteplayback" controlsList="nodownload nofullscreen noremoteplayback"
style={{ style={{
position: 'absolute', position: 'absolute',
@ -83,7 +91,10 @@ export default function MessageAudio({ isLast, index, messageId, content }: THov
height: '0px', height: '0px',
width: '0px', width: '0px',
}} }}
src={audioRef.current?.src ?? undefined} src={audioRef.current?.src}
onError={(error) => {
console.error('Error fetching audio:', error);
}}
id={`audio-${messageId}`} id={`audio-${messageId}`}
muted muted
autoPlay autoPlay

View file

@ -1,5 +1,5 @@
import { useRecoilState } from 'recoil'; import { useRecoilState, useRecoilValue } from 'recoil';
import { useRef, useMemo, useEffect } from 'react'; import { useRef, useMemo, useEffect, useState } from 'react';
import { parseTextParts } from 'librechat-data-provider'; import { parseTextParts } from 'librechat-data-provider';
import type { TMessageContentParts } from 'librechat-data-provider'; import type { TMessageContentParts } from 'librechat-data-provider';
import type { Option } from '~/common'; import type { Option } from '~/common';
@ -7,6 +7,7 @@ import useTextToSpeechExternal from './useTextToSpeechExternal';
import useTextToSpeechBrowser from './useTextToSpeechBrowser'; import useTextToSpeechBrowser from './useTextToSpeechBrowser';
import useGetAudioSettings from './useGetAudioSettings'; import useGetAudioSettings from './useGetAudioSettings';
import useTextToSpeechEdge from './useTextToSpeechEdge'; import useTextToSpeechEdge from './useTextToSpeechEdge';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import { usePauseGlobalAudio } from '../Audio'; import { usePauseGlobalAudio } from '../Audio';
import { logger } from '~/utils'; import { logger } from '~/utils';
import store from '~/store'; import store from '~/store';
@ -20,41 +21,77 @@ type TUseTextToSpeech = {
const useTextToSpeech = (props?: TUseTextToSpeech) => { const useTextToSpeech = (props?: TUseTextToSpeech) => {
const { messageId, content, isLast = false, index = 0 } = props ?? {}; const { messageId, content, isLast = false, index = 0 } = props ?? {};
const [voice, setVoice] = useRecoilState(store.voice);
const isMouseDownRef = useRef(false);
const timerRef = useRef<number | undefined>(undefined);
const [isSpeakingState, setIsSpeaking] = useState(false);
const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
const { textToSpeechEndpoint } = useGetAudioSettings(); const { textToSpeechEndpoint } = useGetAudioSettings();
const { pauseGlobalAudio } = usePauseGlobalAudio(index); const { pauseGlobalAudio } = usePauseGlobalAudio(index);
const audioRef = useRef<HTMLAudioElement | null>(null); const [voice, setVoice] = useRecoilState(store.voice);
const globalIsPlaying = useRecoilValue(store.globalAudioPlayingFamily(index));
const isSpeaking = isSpeakingState || (isLast && globalIsPlaying);
const { const {
generateSpeechLocal, generateSpeechLocal,
cancelSpeechLocal, cancelSpeechLocal,
isSpeaking: isSpeakingLocal,
voices: voicesLocal, voices: voicesLocal,
} = useTextToSpeechBrowser(); } = useTextToSpeechBrowser({ setIsSpeaking });
const { const {
generateSpeechEdge, generateSpeechEdge,
cancelSpeechEdge, cancelSpeechEdge,
isSpeaking: isSpeakingEdge,
voices: voicesEdge, voices: voicesEdge,
} = useTextToSpeechEdge(); } = useTextToSpeechEdge({ setIsSpeaking });
const { const {
generateSpeechExternal, generateSpeechExternal,
cancelSpeech: cancelSpeechExternal, cancelSpeech: cancelSpeechExternal,
isSpeaking: isSpeakingExternal,
isLoading: isLoadingExternal, isLoading: isLoadingExternal,
audioRef: audioRefExternal,
voices: voicesExternal, voices: voicesExternal,
} = useTextToSpeechExternal(messageId ?? '', isLast, index); } = useTextToSpeechExternal({
setIsSpeaking,
audioRef,
messageId,
isLast,
index,
});
let generateSpeech, cancelSpeech, isSpeaking, isLoading; const generateSpeech = useMemo(() => {
const map = {
edge: generateSpeechEdge,
browser: generateSpeechLocal,
external: generateSpeechExternal,
};
return map[textToSpeechEndpoint];
}, [generateSpeechEdge, generateSpeechExternal, generateSpeechLocal, textToSpeechEndpoint]);
const cancelSpeech = useMemo(() => {
const map = {
edge: cancelSpeechEdge,
browser: cancelSpeechLocal,
external: cancelSpeechExternal,
};
return map[textToSpeechEndpoint];
}, [cancelSpeechEdge, cancelSpeechExternal, cancelSpeechLocal, textToSpeechEndpoint]);
const isLoading = useMemo(() => {
const map = {
edge: false,
browser: false,
external: isLoadingExternal,
};
return map[textToSpeechEndpoint];
}, [isLoadingExternal, textToSpeechEndpoint]);
const voices: Option[] | string[] = useMemo(() => { const voices: Option[] | string[] = useMemo(() => {
const voiceMap = { const voiceMap = {
external: voicesExternal,
edge: voicesEdge, edge: voicesEdge,
browser: voicesLocal, browser: voicesLocal,
external: voicesExternal,
}; };
return voiceMap[textToSpeechEndpoint]; return voiceMap[textToSpeechEndpoint];
@ -88,34 +125,6 @@ const useTextToSpeech = (props?: TUseTextToSpeech) => {
} }
}, [setVoice, textToSpeechEndpoint, voice, voices]); }, [setVoice, textToSpeechEndpoint, voice, voices]);
switch (textToSpeechEndpoint) {
case 'external':
generateSpeech = generateSpeechExternal;
cancelSpeech = cancelSpeechExternal;
isSpeaking = isSpeakingExternal;
isLoading = isLoadingExternal;
if (audioRefExternal.current) {
audioRef.current = audioRefExternal.current;
}
break;
case 'edge':
generateSpeech = generateSpeechEdge;
cancelSpeech = cancelSpeechEdge;
isSpeaking = isSpeakingEdge;
isLoading = false;
break;
case 'browser':
default:
generateSpeech = generateSpeechLocal;
cancelSpeech = cancelSpeechLocal;
isSpeaking = isSpeakingLocal;
isLoading = false;
break;
}
const isMouseDownRef = useRef(false);
const timerRef = useRef<number | undefined>(undefined);
const handleMouseDown = () => { const handleMouseDown = () => {
isMouseDownRef.current = true; isMouseDownRef.current = true;
timerRef.current = window.setTimeout(() => { timerRef.current = window.setTimeout(() => {

View file

@ -7,9 +7,12 @@ interface VoiceOption {
label: string; label: string;
} }
function useTextToSpeechBrowser() { function useTextToSpeechBrowser({
setIsSpeaking,
}: {
setIsSpeaking: (isSpeaking: boolean) => void;
}) {
const [cloudBrowserVoices] = useRecoilState(store.cloudBrowserVoices); const [cloudBrowserVoices] = useRecoilState(store.cloudBrowserVoices);
const [isSpeaking, setIsSpeaking] = useState(false);
const [voiceName] = useRecoilState(store.voice); const [voiceName] = useRecoilState(store.voice);
const [voices, setVoices] = useState<VoiceOption[]>([]); const [voices, setVoices] = useState<VoiceOption[]>([]);
@ -61,7 +64,7 @@ function useTextToSpeechBrowser() {
setIsSpeaking(false); setIsSpeaking(false);
}; };
return { generateSpeechLocal, cancelSpeechLocal, isSpeaking, voices }; return { generateSpeechLocal, cancelSpeechLocal, voices };
} }
export default useTextToSpeechBrowser; export default useTextToSpeechBrowser;

View file

@ -13,14 +13,16 @@ interface Voice {
interface UseTextToSpeechEdgeReturn { interface UseTextToSpeechEdgeReturn {
generateSpeechEdge: (text: string) => void; generateSpeechEdge: (text: string) => void;
cancelSpeechEdge: () => void; cancelSpeechEdge: () => void;
isSpeaking: boolean;
voices: Voice[]; voices: Voice[];
} }
function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn { function useTextToSpeechEdge({
setIsSpeaking,
}: {
setIsSpeaking: (isSpeaking: boolean) => void;
}): UseTextToSpeechEdgeReturn {
const localize = useLocalize(); const localize = useLocalize();
const [voices, setVoices] = useState<Voice[]>([]); const [voices, setVoices] = useState<Voice[]>([]);
const [isSpeaking, setIsSpeaking] = useState<boolean>(false);
const voiceName = useRecoilValue(store.voice); const voiceName = useRecoilValue(store.voice);
const ttsRef = useRef<MsEdgeTTS | null>(null); const ttsRef = useRef<MsEdgeTTS | null>(null);
const audioElementRef = useRef<HTMLAudioElement | null>(null); const audioElementRef = useRef<HTMLAudioElement | null>(null);
@ -29,7 +31,10 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
const pendingBuffers = useRef<Uint8Array[]>([]); const pendingBuffers = useRef<Uint8Array[]>([]);
const { showToast } = useToastContext(); const { showToast } = useToastContext();
const isBrowserSupported = useMemo(() => MediaSource.isTypeSupported('audio/mpeg'), []); const isBrowserSupported = useMemo(
() => typeof MediaSource !== 'undefined' && MediaSource.isTypeSupported('audio/mpeg'),
[],
);
const fetchVoices = useCallback(() => { const fetchVoices = useCallback(() => {
if (!ttsRef.current) { if (!ttsRef.current) {
@ -146,7 +151,7 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
setIsSpeaking(true); setIsSpeaking(true);
pendingBuffers.current = []; pendingBuffers.current = [];
const readable = await ttsRef.current.toStream(text); const readable = ttsRef.current.toStream(text);
readable.on('data', (chunk: Buffer) => { readable.on('data', (chunk: Buffer) => {
pendingBuffers.current.push(new Uint8Array(chunk)); pendingBuffers.current.push(new Uint8Array(chunk));
@ -200,21 +205,21 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
}, [showToast, localize]); }, [showToast, localize]);
useEffect(() => { useEffect(() => {
if (!MediaSource.isTypeSupported('audio/mpeg')) { if (!isBrowserSupported) {
return; return;
} }
fetchVoices(); fetchVoices();
}, [fetchVoices]); }, [fetchVoices, isBrowserSupported]);
useEffect(() => { useEffect(() => {
if (!MediaSource.isTypeSupported('audio/mpeg')) { if (!isBrowserSupported) {
return; return;
} }
initializeTTS(); initializeTTS();
}, [voiceName, initializeTTS]); }, [voiceName, initializeTTS, isBrowserSupported]);
useEffect(() => { useEffect(() => {
if (!MediaSource.isTypeSupported('audio/mpeg')) { if (!isBrowserSupported) {
return; return;
} }
initializeMediaSource(); initializeMediaSource();
@ -223,18 +228,17 @@ function useTextToSpeechEdge(): UseTextToSpeechEdgeReturn {
URL.revokeObjectURL(audioElementRef.current?.src ?? ''); URL.revokeObjectURL(audioElementRef.current?.src ?? '');
} }
}; };
}, [initializeMediaSource]); }, [initializeMediaSource, isBrowserSupported]);
if (!isBrowserSupported) { if (!isBrowserSupported) {
return { return {
generateSpeechEdge: () => ({}), generateSpeechEdge: () => ({}),
cancelSpeechEdge: () => ({}), cancelSpeechEdge: () => ({}),
isSpeaking: false,
voices: [], voices: [],
}; };
} }
return { generateSpeechEdge, cancelSpeechEdge, isSpeaking, voices }; return { generateSpeechEdge, cancelSpeechEdge, voices };
} }
export default useTextToSpeechEdge; export default useTextToSpeechEdge;

View file

@ -1,7 +1,6 @@
import { useRecoilValue } from 'recoil'; import { useRecoilValue } from 'recoil';
import { useState, useMemo, useRef, useCallback, useEffect } from 'react'; import { useState, useMemo, useRef, useCallback, useEffect } from 'react';
import { useTextToSpeechMutation, useVoicesQuery } from '~/data-provider'; import { useTextToSpeechMutation, useVoicesQuery } from '~/data-provider';
import useAudioRef from '~/hooks/Audio/useAudioRef';
import useLocalize from '~/hooks/useLocalize'; import useLocalize from '~/hooks/useLocalize';
import { useToastContext } from '~/Providers'; import { useToastContext } from '~/Providers';
import store from '~/store'; import store from '~/store';
@ -13,7 +12,21 @@ const createFormData = (text: string, voice: string) => {
return formData; return formData;
}; };
function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0) { type TUseTTSExternal = {
setIsSpeaking: (isSpeaking: boolean) => void;
audioRef: React.MutableRefObject<HTMLAudioElement | null>;
messageId?: string;
isLast: boolean;
index?: number;
};
function useTextToSpeechExternal({
setIsSpeaking,
audioRef,
messageId,
isLast,
index = 0,
}: TUseTTSExternal) {
const localize = useLocalize(); const localize = useLocalize();
const { showToast } = useToastContext(); const { showToast } = useToastContext();
const voice = useRecoilValue(store.voice); const voice = useRecoilValue(store.voice);
@ -21,8 +34,7 @@ function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0)
const playbackRate = useRecoilValue(store.playbackRate); const playbackRate = useRecoilValue(store.playbackRate);
const [downloadFile, setDownloadFile] = useState(false); const [downloadFile, setDownloadFile] = useState(false);
const [isLocalSpeaking, setIsSpeaking] = useState(false);
const { audioRef } = useAudioRef({ setIsPlaying: setIsSpeaking });
const promiseAudioRef = useRef<HTMLAudioElement | null>(null); const promiseAudioRef = useRef<HTMLAudioElement | null>(null);
/* Global Audio Variables */ /* Global Audio Variables */
@ -174,17 +186,12 @@ function useTextToSpeechExternal(messageId: string, isLast: boolean, index = 0)
return isProcessing || (isLast && globalIsFetching && !globalIsPlaying); return isProcessing || (isLast && globalIsFetching && !globalIsPlaying);
}, [isProcessing, globalIsFetching, globalIsPlaying, isLast]); }, [isProcessing, globalIsFetching, globalIsPlaying, isLast]);
const isSpeaking = useMemo(() => {
return isLocalSpeaking || (isLast && globalIsPlaying);
}, [isLocalSpeaking, globalIsPlaying, isLast]);
const { data: voicesData = [] } = useVoicesQuery(); const { data: voicesData = [] } = useVoicesQuery();
return { return {
generateSpeechExternal, generateSpeechExternal,
cancelSpeech, cancelSpeech,
isLoading, isLoading,
isSpeaking,
audioRef, audioRef,
voices: voicesData, voices: voicesData,
}; };