⏯️ fix(tts): Resolve Voice Selection and Manual Playback Issues (#2845)

* fix: voice setting for autoplayback TTS

* fix(useTextToSpeechExternal): resolve stateful playback issues and consolidate state logic

* refactor: initialize tts voice and provider schema once per request

* fix(tts): edge case, longer text inputs. TODO: use continuous stream for longer text inputs

* fix(tts): pause global audio on conversation change

* refactor: keyvMongo ban cache to allow db updates for unbanning, to prevent server restart

* chore: eslint fix

* refactor: make ban cache exclusively keyvMongo
This commit is contained in:
Danny Avila 2024-05-23 16:27:36 -04:00 committed by GitHub
parent 8e66683577
commit 514a502b9c
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
10 changed files with 332 additions and 178 deletions

View file

@@ -1,10 +1,10 @@
import { useParams } from 'react-router-dom';
import { useEffect, useCallback } from 'react';
import { QueryKeys } from 'librechat-data-provider';
import { useQueryClient } from '@tanstack/react-query';
import { useEffect, useCallback } from 'react';
import { useRecoilState, useRecoilValue, useSetRecoilState } from 'recoil';
import type { TMessage } from 'librechat-data-provider';
import { useCustomAudioRef, MediaSourceAppender } from '~/hooks/Audio';
import { useCustomAudioRef, MediaSourceAppender, usePauseGlobalAudio } from '~/hooks/Audio';
import { useAuthContext } from '~/hooks';
import { globalAudioId } from '~/common';
import store from '~/store';
@@ -24,6 +24,7 @@ export default function StreamAudio({ index = 0 }) {
const cacheTTS = useRecoilValue(store.cacheTTS);
const playbackRate = useRecoilValue(store.playbackRate);
const voice = useRecoilValue(store.voice);
const activeRunId = useRecoilValue(store.activeRunFamily(index));
const automaticPlayback = useRecoilValue(store.automaticPlayback);
const isSubmitting = useRecoilValue(store.isSubmittingFamily(index));
@@ -34,6 +35,7 @@ export default function StreamAudio({ index = 0 }) {
const [globalAudioURL, setGlobalAudioURL] = useRecoilState(store.globalAudioURLFamily(index));
const { audioRef } = useCustomAudioRef({ setIsPlaying });
const { pauseGlobalAudio } = usePauseGlobalAudio();
const { conversationId: paramId } = useParams();
const queryParam = paramId === 'new' ? paramId : latestMessage?.conversationId ?? paramId ?? '';
@@ -90,7 +92,7 @@ export default function StreamAudio({ index = 0 }) {
const response = await fetch('/api/files/tts', {
method: 'POST',
headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${token}` },
body: JSON.stringify({ messageId: latestMessage?.messageId, runId: activeRunId }),
body: JSON.stringify({ messageId: latestMessage?.messageId, runId: activeRunId, voice }),
});
if (!response.ok) {
@@ -166,6 +168,7 @@ export default function StreamAudio({ index = 0 }) {
audioRunId,
cacheTTS,
audioRef,
voice,
token,
]);
@@ -180,6 +183,12 @@ export default function StreamAudio({ index = 0 }) {
}
}, [audioRef, globalAudioURL, playbackRate]);
useEffect(() => {
pauseGlobalAudio();
// We only want the effect to run when the paramId changes
// eslint-disable-next-line react-hooks/exhaustive-deps
}, [paramId]);
return (
<audio
ref={audioRef}

View file

@@ -1,6 +1,7 @@
import { useRecoilValue } from 'recoil';
import { useCallback, useEffect, useState, useMemo } from 'react';
import { useCallback, useEffect, useState, useMemo, useRef } from 'react';
import { useTextToSpeechMutation } from '~/data-provider';
import useLocalize from '~/hooks/useLocalize';
import { useToastContext } from '~/Providers';
import store from '~/store';
@@ -12,16 +13,16 @@ const createFormData = (text: string, voice: string) => {
};
function useTextToSpeechExternal(isLast: boolean, index = 0) {
const localize = useLocalize();
const { showToast } = useToastContext();
const voice = useRecoilValue(store.voice);
const cacheTTS = useRecoilValue(store.cacheTTS);
const playbackRate = useRecoilValue(store.playbackRate);
const [text, setText] = useState<string | null>(null);
const audioRef = useRef<HTMLAudioElement | null>(null);
const [downloadFile, setDownloadFile] = useState(false);
const [isLocalSpeaking, setIsSpeaking] = useState(false);
const [blobUrl, setBlobUrl] = useState<string | null>(null);
const [audio, setAudio] = useState<HTMLAudioElement | null>(null);
/* Global Audio Variables */
const globalIsFetching = useRecoilValue(store.globalAudioFetchingFamily(index));
@@ -29,10 +30,13 @@ function useTextToSpeechExternal(isLast: boolean, index = 0) {
const playAudio = (blobUrl: string) => {
const newAudio = new Audio(blobUrl);
if (playbackRate && playbackRate !== 1) {
newAudio.playbackRate = playbackRate;
}
const initializeAudio = () => {
if (playbackRate && playbackRate !== 1) {
newAudio.playbackRate = playbackRate;
}
};
initializeAudio();
const playPromise = () => newAudio.play().then(() => setIsSpeaking(true));
playPromise().catch((error: Error) => {
@@ -40,10 +44,12 @@ function useTextToSpeechExternal(isLast: boolean, index = 0) {
error?.message &&
error.message.includes('The play() request was interrupted by a call to pause()')
) {
console.log('Play request was interrupted by a call to pause()');
initializeAudio();
return playPromise().catch(console.error);
}
console.error(error);
showToast({ message: `Error playing audio: ${error.message}`, status: 'error' });
showToast({ message: localize('com_nav_audio_play_error', error.message), status: 'error' });
});
newAudio.onended = () => {
@@ -52,8 +58,7 @@ function useTextToSpeechExternal(isLast: boolean, index = 0) {
setIsSpeaking(false);
};
setAudio(newAudio);
setBlobUrl(blobUrl);
audioRef.current = newAudio;
};
const downloadAudio = (blobUrl: string) => {
@@ -65,35 +70,32 @@ function useTextToSpeechExternal(isLast: boolean, index = 0) {
};
const { mutate: processAudio, isLoading: isProcessing } = useTextToSpeechMutation({
onSuccess: async (data: ArrayBuffer) => {
onMutate: (variables) => {
const inputText = (variables.get('input') ?? '') as string;
if (inputText.length >= 4096) {
showToast({
message: localize('com_nav_long_audio_warning'),
status: 'warning',
});
}
},
onSuccess: async (data: ArrayBuffer, variables) => {
try {
const mediaSource = new MediaSource();
const audio = new Audio();
audio.src = URL.createObjectURL(mediaSource);
audio.autoplay = true;
const inputText = (variables.get('input') ?? '') as string;
const audioBlob = new Blob([data], { type: 'audio/mpeg' });
mediaSource.onsourceopen = () => {
const sourceBuffer = mediaSource.addSourceBuffer('audio/mpeg');
sourceBuffer.appendBuffer(data);
};
audio.onended = () => {
URL.revokeObjectURL(audio.src);
setIsSpeaking(false);
};
setAudio(audio);
if (cacheTTS) {
if (cacheTTS && inputText) {
const cache = await caches.open('tts-responses');
const request = new Request(text!);
const response = new Response(new Blob([data], { type: 'audio/mpeg' }));
const request = new Request(inputText!);
const response = new Response(audioBlob);
cache.put(request, response);
}
const blobUrl = URL.createObjectURL(audioBlob);
if (downloadFile) {
downloadAudio(audio.src);
downloadAudio(blobUrl);
}
playAudio(blobUrl);
} catch (error) {
showToast({
message: `Error processing audio: ${(error as Error).message}`,
@@ -102,13 +104,15 @@ function useTextToSpeechExternal(isLast: boolean, index = 0) {
}
},
onError: (error: unknown) => {
showToast({ message: `Error: ${(error as Error).message}`, status: 'error' });
showToast({
message: localize('com_nav_audio_process_error', (error as Error).message),
status: 'error',
});
},
});
const generateSpeechExternal = async (text: string, download: boolean) => {
setText(text);
const cachedResponse = await getCachedResponse(text);
const cachedResponse = await caches.match(text);
if (cachedResponse && cacheTTS) {
handleCachedResponse(cachedResponse, download);
@@ -119,8 +123,6 @@ function useTextToSpeechExternal(isLast: boolean, index = 0) {
}
};
const getCachedResponse = async (text: string) => await caches.match(text);
const handleCachedResponse = async (cachedResponse: Response, download: boolean) => {
const audioBlob = await cachedResponse.blob();
const blobUrl = URL.createObjectURL(audioBlob);
@@ -132,12 +134,13 @@ function useTextToSpeechExternal(isLast: boolean, index = 0) {
};
const cancelSpeech = useCallback(() => {
if (audio) {
audio.pause();
blobUrl && URL.revokeObjectURL(blobUrl);
if (audioRef.current) {
audioRef.current.pause();
audioRef.current.src && URL.revokeObjectURL(audioRef.current.src);
audioRef.current = null;
setIsSpeaking(false);
}
}, [audio, blobUrl]);
}, []);
useEffect(() => cancelSpeech, [cancelSpeech]);

View file

@@ -550,6 +550,9 @@ export default {
com_nav_auto_transcribe_audio: 'Auto transcribe audio',
com_nav_db_sensitivity: 'Decibel sensitivity',
com_nav_playback_rate: 'Audio Playback Rate',
com_nav_audio_play_error: 'Error playing audio: {0}',
com_nav_audio_process_error: 'Error processing audio: {0}',
com_nav_long_audio_warning: 'Longer texts will take longer to process.',
com_nav_engine: 'Engine',
com_nav_browser: 'Browser',
com_nav_external: 'External',