mirror of
https://github.com/danny-avila/LibreChat.git
synced 2025-09-22 06:00:56 +02:00
🎵 feat: Cumulative Transcription Support for External STT (#9318)
* 🔧 fix: TTS and STT Services to use AppConfig
- Updated `getProviderSchema` and `getProvider` methods to accept an optional `appConfig` parameter, allowing for more flexible configuration retrieval.
- Improved error handling by ensuring that the app configuration is checked before accessing TTS and STT schemas.
- Refactored `processTextToSpeech` and `streamAudio` methods to utilize the new `appConfig` parameter for better clarity and maintainability.
* feat: Cumulative Transcription Support for External STT
* style: fix medium-sized styling for admin settings dialogs
This commit is contained in:
parent
c3e88b97c8
commit
15d7a3d221
7 changed files with 58 additions and 30 deletions
|
@ -109,9 +109,11 @@ class STTService {
|
||||||
* @throws {Error} If no STT schema is set, multiple providers are set, or no provider is set.
|
* @throws {Error} If no STT schema is set, multiple providers are set, or no provider is set.
|
||||||
*/
|
*/
|
||||||
async getProviderSchema(req) {
|
async getProviderSchema(req) {
|
||||||
const appConfig = await getAppConfig({
|
const appConfig =
|
||||||
role: req?.user?.role,
|
req.config ??
|
||||||
});
|
(await getAppConfig({
|
||||||
|
role: req?.user?.role,
|
||||||
|
}));
|
||||||
const sttSchema = appConfig?.speech?.stt;
|
const sttSchema = appConfig?.speech?.stt;
|
||||||
if (!sttSchema) {
|
if (!sttSchema) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
|
|
|
@ -35,11 +35,12 @@ class TTSService {
|
||||||
|
|
||||||
/**
|
/**
|
||||||
* Retrieves the configured TTS provider.
|
* Retrieves the configured TTS provider.
|
||||||
|
* @param {AppConfig | null | undefined} [appConfig] - The app configuration object.
|
||||||
* @returns {string} The name of the configured provider.
|
* @returns {string} The name of the configured provider.
|
||||||
* @throws {Error} If no provider is set or multiple providers are set.
|
* @throws {Error} If no provider is set or multiple providers are set.
|
||||||
*/
|
*/
|
||||||
getProvider() {
|
getProvider(appConfig) {
|
||||||
const ttsSchema = this.customConfig.speech.tts;
|
const ttsSchema = appConfig?.speech?.tts;
|
||||||
if (!ttsSchema) {
|
if (!ttsSchema) {
|
||||||
throw new Error(
|
throw new Error(
|
||||||
'No TTS schema is set. Did you configure TTS in the custom config (librechat.yaml)?',
|
'No TTS schema is set. Did you configure TTS in the custom config (librechat.yaml)?',
|
||||||
|
@ -276,8 +277,8 @@ class TTSService {
|
||||||
/**
|
/**
|
||||||
* Processes a text-to-speech request.
|
* Processes a text-to-speech request.
|
||||||
* @async
|
* @async
|
||||||
* @param {Object} req - The request object.
|
* @param {ServerRequest} req - The request object.
|
||||||
* @param {Object} res - The response object.
|
* @param {ServerResponse} res - The response object.
|
||||||
* @returns {Promise<void>}
|
* @returns {Promise<void>}
|
||||||
*/
|
*/
|
||||||
async processTextToSpeech(req, res) {
|
async processTextToSpeech(req, res) {
|
||||||
|
@ -287,12 +288,14 @@ class TTSService {
|
||||||
return res.status(400).send('Missing text in request body');
|
return res.status(400).send('Missing text in request body');
|
||||||
}
|
}
|
||||||
|
|
||||||
const appConfig = await getAppConfig({
|
const appConfig =
|
||||||
role: req.user?.role,
|
req.config ??
|
||||||
});
|
(await getAppConfig({
|
||||||
|
role: req.user?.role,
|
||||||
|
}));
|
||||||
try {
|
try {
|
||||||
res.setHeader('Content-Type', 'audio/mpeg');
|
res.setHeader('Content-Type', 'audio/mpeg');
|
||||||
const provider = this.getProvider();
|
const provider = this.getProvider(appConfig);
|
||||||
const ttsSchema = appConfig?.speech?.tts?.[provider];
|
const ttsSchema = appConfig?.speech?.tts?.[provider];
|
||||||
const voice = await this.getVoice(ttsSchema, requestVoice);
|
const voice = await this.getVoice(ttsSchema, requestVoice);
|
||||||
|
|
||||||
|
@ -344,14 +347,19 @@ class TTSService {
|
||||||
/**
|
/**
|
||||||
* Streams audio data from the TTS provider.
|
* Streams audio data from the TTS provider.
|
||||||
* @async
|
* @async
|
||||||
* @param {Object} req - The request object.
|
* @param {ServerRequest} req - The request object.
|
||||||
* @param {Object} res - The response object.
|
* @param {ServerResponse} res - The response object.
|
||||||
* @returns {Promise<void>}
|
* @returns {Promise<void>}
|
||||||
*/
|
*/
|
||||||
async streamAudio(req, res) {
|
async streamAudio(req, res) {
|
||||||
res.setHeader('Content-Type', 'audio/mpeg');
|
res.setHeader('Content-Type', 'audio/mpeg');
|
||||||
const provider = this.getProvider();
|
const appConfig =
|
||||||
const ttsSchema = this.customConfig.speech.tts[provider];
|
req.config ??
|
||||||
|
(await getAppConfig({
|
||||||
|
role: req.user?.role,
|
||||||
|
}));
|
||||||
|
const provider = this.getProvider(appConfig);
|
||||||
|
const ttsSchema = appConfig?.speech?.tts?.[provider];
|
||||||
const voice = await this.getVoice(ttsSchema, req.body.voice);
|
const voice = await this.getVoice(ttsSchema, req.body.voice);
|
||||||
|
|
||||||
let shouldContinue = true;
|
let shouldContinue = true;
|
||||||
|
@ -436,8 +444,8 @@ async function createTTSService() {
|
||||||
/**
|
/**
|
||||||
* Wrapper function for text-to-speech processing.
|
* Wrapper function for text-to-speech processing.
|
||||||
* @async
|
* @async
|
||||||
* @param {Object} req - The request object.
|
* @param {ServerRequest} req - The request object.
|
||||||
* @param {Object} res - The response object.
|
* @param {ServerResponse} res - The response object.
|
||||||
* @returns {Promise<void>}
|
* @returns {Promise<void>}
|
||||||
*/
|
*/
|
||||||
async function textToSpeech(req, res) {
|
async function textToSpeech(req, res) {
|
||||||
|
@ -460,11 +468,12 @@ async function streamAudio(req, res) {
|
||||||
/**
|
/**
|
||||||
* Wrapper function to get the configured TTS provider.
|
* Wrapper function to get the configured TTS provider.
|
||||||
* @async
|
* @async
|
||||||
|
* @param {AppConfig | null | undefined} appConfig - The app configuration object.
|
||||||
* @returns {Promise<string>} A promise that resolves to the name of the configured provider.
|
* @returns {Promise<string>} A promise that resolves to the name of the configured provider.
|
||||||
*/
|
*/
|
||||||
async function getProvider() {
|
async function getProvider(appConfig) {
|
||||||
const ttsService = await createTTSService();
|
const ttsService = await createTTSService();
|
||||||
return ttsService.getProvider();
|
return ttsService.getProvider(appConfig);
|
||||||
}
|
}
|
||||||
|
|
||||||
module.exports = {
|
module.exports = {
|
||||||
|
|
|
@ -1,10 +1,11 @@
|
||||||
import { useCallback, useRef } from 'react';
|
import { useCallback, useRef } from 'react';
|
||||||
import { useToastContext, TooltipAnchor, ListeningIcon, Spinner } from '@librechat/client';
|
import { useToastContext, TooltipAnchor, ListeningIcon, Spinner } from '@librechat/client';
|
||||||
import { useLocalize, useSpeechToText } from '~/hooks';
|
import { useLocalize, useSpeechToText, useGetAudioSettings } from '~/hooks';
|
||||||
import { useChatFormContext } from '~/Providers';
|
import { useChatFormContext } from '~/Providers';
|
||||||
import { globalAudioId } from '~/common';
|
import { globalAudioId } from '~/common';
|
||||||
import { cn } from '~/utils';
|
import { cn } from '~/utils';
|
||||||
|
|
||||||
|
const isExternalSTT = (speechToTextEndpoint: string) => speechToTextEndpoint === 'external';
|
||||||
export default function AudioRecorder({
|
export default function AudioRecorder({
|
||||||
disabled,
|
disabled,
|
||||||
ask,
|
ask,
|
||||||
|
@ -21,6 +22,8 @@ export default function AudioRecorder({
|
||||||
const { setValue, reset, getValues } = methods;
|
const { setValue, reset, getValues } = methods;
|
||||||
const localize = useLocalize();
|
const localize = useLocalize();
|
||||||
const { showToast } = useToastContext();
|
const { showToast } = useToastContext();
|
||||||
|
const { speechToTextEndpoint } = useGetAudioSettings();
|
||||||
|
|
||||||
const existingTextRef = useRef<string>('');
|
const existingTextRef = useRef<string>('');
|
||||||
|
|
||||||
const onTranscriptionComplete = useCallback(
|
const onTranscriptionComplete = useCallback(
|
||||||
|
@ -38,23 +41,34 @@ export default function AudioRecorder({
|
||||||
console.log('Unmuting global audio');
|
console.log('Unmuting global audio');
|
||||||
globalAudio.muted = false;
|
globalAudio.muted = false;
|
||||||
}
|
}
|
||||||
ask({ text });
|
/** For external STT, append existing text to the transcription */
|
||||||
|
const finalText =
|
||||||
|
isExternalSTT(speechToTextEndpoint) && existingTextRef.current
|
||||||
|
? `${existingTextRef.current} ${text}`
|
||||||
|
: text;
|
||||||
|
ask({ text: finalText });
|
||||||
reset({ text: '' });
|
reset({ text: '' });
|
||||||
existingTextRef.current = '';
|
existingTextRef.current = '';
|
||||||
}
|
}
|
||||||
},
|
},
|
||||||
[ask, reset, showToast, localize, isSubmitting],
|
[ask, reset, showToast, localize, isSubmitting, speechToTextEndpoint],
|
||||||
);
|
);
|
||||||
|
|
||||||
const setText = useCallback(
|
const setText = useCallback(
|
||||||
(text: string) => {
|
(text: string) => {
|
||||||
/** The transcript is cumulative, so we only need to prepend the existing text once */
|
let newText = text;
|
||||||
const newText = existingTextRef.current ? `${existingTextRef.current} ${text}` : text;
|
if (isExternalSTT(speechToTextEndpoint)) {
|
||||||
|
/** For external STT, the text comes as a complete transcription, so append to existing */
|
||||||
|
newText = existingTextRef.current ? `${existingTextRef.current} ${text}` : text;
|
||||||
|
} else {
|
||||||
|
/** For browser STT, the transcript is cumulative, so we only need to prepend the existing text once */
|
||||||
|
newText = existingTextRef.current ? `${existingTextRef.current} ${text}` : text;
|
||||||
|
}
|
||||||
setValue('text', newText, {
|
setValue('text', newText, {
|
||||||
shouldValidate: true,
|
shouldValidate: true,
|
||||||
});
|
});
|
||||||
},
|
},
|
||||||
[setValue],
|
[setValue, speechToTextEndpoint],
|
||||||
);
|
);
|
||||||
|
|
||||||
const { isListening, isLoading, startRecording, stopRecording } = useSpeechToText(
|
const { isListening, isLoading, startRecording, stopRecording } = useSpeechToText(
|
||||||
|
@ -73,7 +87,10 @@ export default function AudioRecorder({
|
||||||
|
|
||||||
const handleStopRecording = async () => {
|
const handleStopRecording = async () => {
|
||||||
stopRecording();
|
stopRecording();
|
||||||
existingTextRef.current = '';
|
/** For browser STT, clear the reference since text was already being updated */
|
||||||
|
if (!isExternalSTT(speechToTextEndpoint)) {
|
||||||
|
existingTextRef.current = '';
|
||||||
|
}
|
||||||
};
|
};
|
||||||
|
|
||||||
const renderIcon = () => {
|
const renderIcon = () => {
|
||||||
|
|
|
@ -153,7 +153,7 @@ const AdminSettings = () => {
|
||||||
<span className="hidden sm:flex">{localize('com_ui_admin')}</span>
|
<span className="hidden sm:flex">{localize('com_ui_admin')}</span>
|
||||||
</Button>
|
</Button>
|
||||||
</OGDialogTrigger>
|
</OGDialogTrigger>
|
||||||
<OGDialogContent className="max-w-lg border-border-light bg-surface-primary text-text-primary md:w-1/4">
|
<OGDialogContent className="max-w-lg border-border-light bg-surface-primary text-text-primary lg:w-1/4">
|
||||||
<OGDialogTitle>
|
<OGDialogTitle>
|
||||||
{`${localize('com_ui_admin_settings')} - ${localize('com_ui_prompts')}`}
|
{`${localize('com_ui_admin_settings')} - ${localize('com_ui_prompts')}`}
|
||||||
</OGDialogTitle>
|
</OGDialogTitle>
|
||||||
|
|
|
@ -163,7 +163,7 @@ const PeoplePickerAdminSettings = () => {
|
||||||
{localize('com_ui_admin_settings')}
|
{localize('com_ui_admin_settings')}
|
||||||
</Button>
|
</Button>
|
||||||
</OGDialogTrigger>
|
</OGDialogTrigger>
|
||||||
<OGDialogContent className="w-full border-border-light bg-surface-primary text-text-primary md:w-1/4">
|
<OGDialogContent className="w-full border-border-light bg-surface-primary text-text-primary lg:w-1/4">
|
||||||
<OGDialogTitle>{`${localize('com_ui_admin_settings')} - ${localize(
|
<OGDialogTitle>{`${localize('com_ui_admin_settings')} - ${localize(
|
||||||
'com_ui_people_picker',
|
'com_ui_people_picker',
|
||||||
)}`}</OGDialogTitle>
|
)}`}</OGDialogTitle>
|
||||||
|
|
|
@ -157,7 +157,7 @@ const AdminSettings = () => {
|
||||||
{localize('com_ui_admin_settings')}
|
{localize('com_ui_admin_settings')}
|
||||||
</Button>
|
</Button>
|
||||||
</OGDialogTrigger>
|
</OGDialogTrigger>
|
||||||
<OGDialogContent className="border-border-light bg-surface-primary text-text-primary md:w-1/4">
|
<OGDialogContent className="border-border-light bg-surface-primary text-text-primary lg:w-1/4">
|
||||||
<OGDialogTitle>{`${localize('com_ui_admin_settings')} - ${localize(
|
<OGDialogTitle>{`${localize('com_ui_admin_settings')} - ${localize(
|
||||||
'com_ui_agents',
|
'com_ui_agents',
|
||||||
)}`}</OGDialogTitle>
|
)}`}</OGDialogTitle>
|
||||||
|
|
|
@ -146,7 +146,7 @@ const AdminSettings = () => {
|
||||||
{localize('com_ui_admin_settings')}
|
{localize('com_ui_admin_settings')}
|
||||||
</Button>
|
</Button>
|
||||||
</OGDialogTrigger>
|
</OGDialogTrigger>
|
||||||
<OGDialogContent className="border-border-light bg-surface-primary text-text-primary md:w-1/4">
|
<OGDialogContent className="border-border-light bg-surface-primary text-text-primary lg:w-1/4">
|
||||||
<OGDialogTitle>{`${localize('com_ui_admin_settings')} - ${localize(
|
<OGDialogTitle>{`${localize('com_ui_admin_settings')} - ${localize(
|
||||||
'com_ui_memories',
|
'com_ui_memories',
|
||||||
)}`}</OGDialogTitle>
|
)}`}</OGDialogTitle>
|
||||||
|
|
Loading…
Add table
Add a link
Reference in a new issue