🎵 feat: Cumulative Transcription Support for External STT (#9318)

* 🔧 fix: TTS and STT Services to use AppConfig

- Updated the `getProviderSchema` and `getProvider` methods to accept an optional `appConfig` parameter, allowing for more flexible configuration retrieval.
- Improved error handling by ensuring the app configuration is resolved before the TTS and STT schemas are accessed.
- Refactored the `processTextToSpeech` and `streamAudio` methods to use the new `appConfig` parameter for better clarity and maintainability (the overall pattern is sketched below).
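
The shared pattern behind these changes is a request-scoped config lookup with a fallback: if middleware has already attached `req.config`, it is used directly; otherwise the config is fetched via `getAppConfig`, and the resolved object is then passed into `getProvider` instead of reading `this.customConfig`. Below is a minimal TypeScript sketch of that idea; the `AppConfig`/`RequestLike` shapes and the `resolveAppConfig` helper are illustrative stand-ins, not the project's actual definitions.

```ts
// Illustrative shapes only; the real AppConfig and request types live in the LibreChat codebase.
interface AppConfig {
  speech?: { tts?: Record<string, object>; stt?: Record<string, object> };
}
interface RequestLike {
  config?: AppConfig;
  user?: { role?: string };
}

// Stand-in for the real config loader.
async function getAppConfig(_opts: { role?: string }): Promise<AppConfig> {
  return { speech: { tts: { openai: { apiKey: '...' } } } };
}

/** Resolve the app config once per request: prefer the cached req.config, else load it. */
async function resolveAppConfig(req: RequestLike): Promise<AppConfig> {
  return req.config ?? (await getAppConfig({ role: req.user?.role }));
}

/** getProvider now receives the resolved config instead of reading a cached custom config. */
function getProvider(appConfig?: AppConfig): string {
  const ttsSchema = appConfig?.speech?.tts;
  if (!ttsSchema) {
    throw new Error('No TTS schema is set. Did you configure TTS in the custom config (librechat.yaml)?');
  }
  const providers = Object.keys(ttsSchema);
  if (providers.length !== 1) {
    throw new Error('Exactly one TTS provider must be configured.');
  }
  return providers[0];
}

// Usage sketch: a handler resolves the config once and threads it through.
async function handleTTSRequest(req: RequestLike): Promise<string> {
  const appConfig = await resolveAppConfig(req);
  return getProvider(appConfig);
}
```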

* feat: Cumulative Transcription Support for External STT (the append behavior is sketched below)
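
In practical terms, the browser-based speech recognition hook already streams a cumulative transcript, while an external STT provider returns each recording as a complete, standalone transcription. The client therefore accumulates external segments itself by appending each new result to the text already in the input before submitting. A small standalone sketch of that accumulation, assuming the same `'external'` endpoint value used in the component; `appendTranscription` and `finalizeTranscription` are hypothetical helpers for illustration:

```ts
const isExternalSTT = (speechToTextEndpoint: string) => speechToTextEndpoint === 'external';

/** For external STT, each result is a complete segment, so append it to what is already typed. */
function appendTranscription(existingText: string, segment: string): string {
  return existingText ? `${existingText} ${segment}` : segment;
}

/** Build the final message text before submitting, mirroring the append done on completion. */
function finalizeTranscription(existingText: string, text: string, endpoint: string): string {
  return isExternalSTT(endpoint) && existingText ? `${existingText} ${text}` : text;
}

// Example: two external-STT recordings accumulate in the input, then submit as one message.
let accumulated = '';
accumulated = appendTranscription(accumulated, 'Hello there,');       // "Hello there,"
accumulated = appendTranscription(accumulated, 'how are you today?'); // "Hello there, how are you today?"
console.log(finalizeTranscription(accumulated, 'Thanks.', 'external'));
// -> "Hello there, how are you today? Thanks."
```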

* style: fix admin settings dialog sizing at medium screen widths
Danny Avila, 2025-08-27 18:56:04 -04:00 (committed by GitHub)
commit 15d7a3d221 (parent c3e88b97c8)
7 changed files with 58 additions and 30 deletions


@@ -109,9 +109,11 @@ class STTService {
    * @throws {Error} If no STT schema is set, multiple providers are set, or no provider is set.
    */
   async getProviderSchema(req) {
-    const appConfig = await getAppConfig({
-      role: req?.user?.role,
-    });
+    const appConfig =
+      req.config ??
+      (await getAppConfig({
+        role: req?.user?.role,
+      }));
     const sttSchema = appConfig?.speech?.stt;
     if (!sttSchema) {
       throw new Error(


@@ -35,11 +35,12 @@ class TTSService {
   /**
    * Retrieves the configured TTS provider.
+   * @param {AppConfig | null | undefined} [appConfig] - The app configuration object.
    * @returns {string} The name of the configured provider.
    * @throws {Error} If no provider is set or multiple providers are set.
    */
-  getProvider() {
-    const ttsSchema = this.customConfig.speech.tts;
+  getProvider(appConfig) {
+    const ttsSchema = appConfig?.speech?.tts;
     if (!ttsSchema) {
       throw new Error(
         'No TTS schema is set. Did you configure TTS in the custom config (librechat.yaml)?',
@@ -276,8 +277,8 @@ class TTSService {
   /**
    * Processes a text-to-speech request.
    * @async
-   * @param {Object} req - The request object.
-   * @param {Object} res - The response object.
+   * @param {ServerRequest} req - The request object.
+   * @param {ServerResponse} res - The response object.
    * @returns {Promise<void>}
    */
   async processTextToSpeech(req, res) {
@@ -287,12 +288,14 @@ class TTSService {
       return res.status(400).send('Missing text in request body');
     }
-    const appConfig = await getAppConfig({
-      role: req.user?.role,
-    });
+    const appConfig =
+      req.config ??
+      (await getAppConfig({
+        role: req.user?.role,
+      }));
     try {
       res.setHeader('Content-Type', 'audio/mpeg');
-      const provider = this.getProvider();
+      const provider = this.getProvider(appConfig);
       const ttsSchema = appConfig?.speech?.tts?.[provider];
       const voice = await this.getVoice(ttsSchema, requestVoice);
@@ -344,14 +347,19 @@ class TTSService {
   /**
    * Streams audio data from the TTS provider.
    * @async
-   * @param {Object} req - The request object.
-   * @param {Object} res - The response object.
+   * @param {ServerRequest} req - The request object.
+   * @param {ServerResponse} res - The response object.
    * @returns {Promise<void>}
    */
   async streamAudio(req, res) {
     res.setHeader('Content-Type', 'audio/mpeg');
-    const provider = this.getProvider();
-    const ttsSchema = this.customConfig.speech.tts[provider];
+    const appConfig =
+      req.config ??
+      (await getAppConfig({
+        role: req.user?.role,
+      }));
+    const provider = this.getProvider(appConfig);
+    const ttsSchema = appConfig?.speech?.tts?.[provider];
     const voice = await this.getVoice(ttsSchema, req.body.voice);
 
     let shouldContinue = true;
@@ -436,8 +444,8 @@ async function createTTSService() {
 /**
  * Wrapper function for text-to-speech processing.
  * @async
- * @param {Object} req - The request object.
- * @param {Object} res - The response object.
+ * @param {ServerRequest} req - The request object.
+ * @param {ServerResponse} res - The response object.
  * @returns {Promise<void>}
  */
 async function textToSpeech(req, res) {
@@ -460,11 +468,12 @@ async function streamAudio(req, res) {
 /**
  * Wrapper function to get the configured TTS provider.
  * @async
+ * @param {AppConfig | null | undefined} appConfig - The app configuration object.
  * @returns {Promise<string>} A promise that resolves to the name of the configured provider.
  */
-async function getProvider() {
+async function getProvider(appConfig) {
   const ttsService = await createTTSService();
-  return ttsService.getProvider();
+  return ttsService.getProvider(appConfig);
 }
 
 module.exports = {


@@ -1,10 +1,11 @@
 import { useCallback, useRef } from 'react';
 import { useToastContext, TooltipAnchor, ListeningIcon, Spinner } from '@librechat/client';
-import { useLocalize, useSpeechToText } from '~/hooks';
+import { useLocalize, useSpeechToText, useGetAudioSettings } from '~/hooks';
 import { useChatFormContext } from '~/Providers';
 import { globalAudioId } from '~/common';
 import { cn } from '~/utils';
 
+const isExternalSTT = (speechToTextEndpoint: string) => speechToTextEndpoint === 'external';
 export default function AudioRecorder({
   disabled,
   ask,
@@ -21,6 +22,8 @@ export default function AudioRecorder({
   const { setValue, reset, getValues } = methods;
   const localize = useLocalize();
   const { showToast } = useToastContext();
+  const { speechToTextEndpoint } = useGetAudioSettings();
+
   const existingTextRef = useRef<string>('');
 
   const onTranscriptionComplete = useCallback(
@@ -38,23 +41,34 @@ export default function AudioRecorder({
           console.log('Unmuting global audio');
           globalAudio.muted = false;
         }
-        ask({ text });
+
+        /** For external STT, append existing text to the transcription */
+        const finalText =
+          isExternalSTT(speechToTextEndpoint) && existingTextRef.current
+            ? `${existingTextRef.current} ${text}`
+            : text;
+        ask({ text: finalText });
         reset({ text: '' });
         existingTextRef.current = '';
       }
     },
-    [ask, reset, showToast, localize, isSubmitting],
+    [ask, reset, showToast, localize, isSubmitting, speechToTextEndpoint],
   );
 
   const setText = useCallback(
     (text: string) => {
-      /** The transcript is cumulative, so we only need to prepend the existing text once */
-      const newText = existingTextRef.current ? `${existingTextRef.current} ${text}` : text;
+      let newText = text;
+      if (isExternalSTT(speechToTextEndpoint)) {
+        /** For external STT, the text comes as a complete transcription, so append to existing */
+        newText = existingTextRef.current ? `${existingTextRef.current} ${text}` : text;
+      } else {
+        /** For browser STT, the transcript is cumulative, so we only need to prepend the existing text once */
+        newText = existingTextRef.current ? `${existingTextRef.current} ${text}` : text;
+      }
       setValue('text', newText, {
         shouldValidate: true,
       });
     },
-    [setValue],
+    [setValue, speechToTextEndpoint],
   );
 
   const { isListening, isLoading, startRecording, stopRecording } = useSpeechToText(
@@ -73,7 +87,10 @@ export default function AudioRecorder({
   const handleStopRecording = async () => {
     stopRecording();
 
-    existingTextRef.current = '';
+    /** For browser STT, clear the reference since text was already being updated */
+    if (!isExternalSTT(speechToTextEndpoint)) {
+      existingTextRef.current = '';
+    }
   };
 
   const renderIcon = () => {


@@ -153,7 +153,7 @@ const AdminSettings = () => {
           <span className="hidden sm:flex">{localize('com_ui_admin')}</span>
         </Button>
       </OGDialogTrigger>
-      <OGDialogContent className="max-w-lg border-border-light bg-surface-primary text-text-primary md:w-1/4">
+      <OGDialogContent className="max-w-lg border-border-light bg-surface-primary text-text-primary lg:w-1/4">
         <OGDialogTitle>
           {`${localize('com_ui_admin_settings')} - ${localize('com_ui_prompts')}`}
         </OGDialogTitle>


@@ -163,7 +163,7 @@ const PeoplePickerAdminSettings = () => {
           {localize('com_ui_admin_settings')}
         </Button>
       </OGDialogTrigger>
-      <OGDialogContent className="w-full border-border-light bg-surface-primary text-text-primary md:w-1/4">
+      <OGDialogContent className="w-full border-border-light bg-surface-primary text-text-primary lg:w-1/4">
         <OGDialogTitle>{`${localize('com_ui_admin_settings')} - ${localize(
           'com_ui_people_picker',
         )}`}</OGDialogTitle>


@@ -157,7 +157,7 @@ const AdminSettings = () => {
           {localize('com_ui_admin_settings')}
         </Button>
       </OGDialogTrigger>
-      <OGDialogContent className="border-border-light bg-surface-primary text-text-primary md:w-1/4">
+      <OGDialogContent className="border-border-light bg-surface-primary text-text-primary lg:w-1/4">
         <OGDialogTitle>{`${localize('com_ui_admin_settings')} - ${localize(
           'com_ui_agents',
         )}`}</OGDialogTitle>


@@ -146,7 +146,7 @@ const AdminSettings = () => {
           {localize('com_ui_admin_settings')}
         </Button>
       </OGDialogTrigger>
-      <OGDialogContent className="border-border-light bg-surface-primary text-text-primary md:w-1/4">
+      <OGDialogContent className="border-border-light bg-surface-primary text-text-primary lg:w-1/4">
         <OGDialogTitle>{`${localize('com_ui_admin_settings')} - ${localize(
           'com_ui_memories',
         )}`}</OGDialogTitle>