🎵 feat: Cumulative Transcription Support for External STT (#9318)

* 🔧 fix: TTS and STT Services to use AppConfig
  - Updated `getProviderSchema` and `getProvider` methods to accept an optional `appConfig` parameter, allowing for more flexible configuration retrieval.
  - Improved error handling by ensuring that the app configuration is checked before accessing TTS and STT schemas.
  - Refactored `processTextToSpeech` and `streamAudio` methods to utilize the new `appConfig` parameter for better clarity and maintainability.
* feat: Cumulative Transcription Support for STT External
* style: fix medium-sized styling for admin settings dialogs
2025-12-17 00:40:14 +01:00 · 2025-08-27 18:56:04 -04:00 · 2025-08-27 18:56:04 -04:00 · 15d7a3d221
commit 15d7a3d221
parent c3e88b97c8
7 changed files with 58 additions and 30 deletions
--- a/api/server/services/Files/Audio/STTService.js
+++ b/api/server/services/Files/Audio/STTService.js
@@ -109,9 +109,11 @@ class STTService {
   * @throws {Error} If no STT schema is set, multiple providers are set, or no provider is set.
   */
  async getProviderSchema(req) {
-    const appConfig = await getAppConfig({
+    const appConfig =
+      req.config ??
+      (await getAppConfig({
        role: req?.user?.role,
-    });
+      }));
    const sttSchema = appConfig?.speech?.stt;
    if (!sttSchema) {
      throw new Error(
--- a/api/server/services/Files/Audio/TTSService.js
+++ b/api/server/services/Files/Audio/TTSService.js
@@ -35,11 +35,12 @@ class TTSService {

  /**
   * Retrieves the configured TTS provider.
+   * @param {AppConfig | null | undefined} [appConfig] - The app configuration object.
   * @returns {string} The name of the configured provider.
   * @throws {Error} If no provider is set or multiple providers are set.
   */
-  getProvider() {
-    const ttsSchema = this.customConfig.speech.tts;
+  getProvider(appConfig) {
+    const ttsSchema = appConfig?.speech?.tts;
    if (!ttsSchema) {
      throw new Error(
        'No TTS schema is set. Did you configure TTS in the custom config (librechat.yaml)?',
@@ -276,8 +277,8 @@ class TTSService {
  /**
   * Processes a text-to-speech request.
   * @async
-   * @param {Object} req - The request object.
-   * @param {Object} res - The response object.
+   * @param {ServerRequest} req - The request object.
+   * @param {ServerResponse} res - The response object.
   * @returns {Promise<void>}
   */
  async processTextToSpeech(req, res) {
@@ -287,12 +288,14 @@ class TTSService {
      return res.status(400).send('Missing text in request body');
    }

-    const appConfig = await getAppConfig({
+    const appConfig =
+      req.config ??
+      (await getAppConfig({
        role: req.user?.role,
-    });
+      }));
    try {
      res.setHeader('Content-Type', 'audio/mpeg');
-      const provider = this.getProvider();
+      const provider = this.getProvider(appConfig);
      const ttsSchema = appConfig?.speech?.tts?.[provider];
      const voice = await this.getVoice(ttsSchema, requestVoice);

@@ -344,14 +347,19 @@ class TTSService {
  /**
   * Streams audio data from the TTS provider.
   * @async
-   * @param {Object} req - The request object.
-   * @param {Object} res - The response object.
+   * @param {ServerRequest} req - The request object.
+   * @param {ServerResponse} res - The response object.
   * @returns {Promise<void>}
   */
  async streamAudio(req, res) {
    res.setHeader('Content-Type', 'audio/mpeg');
-    const provider = this.getProvider();
-    const ttsSchema = this.customConfig.speech.tts[provider];
+    const appConfig =
+      req.config ??
+      (await getAppConfig({
+        role: req.user?.role,
+      }));
+    const provider = this.getProvider(appConfig);
+    const ttsSchema = appConfig?.speech?.tts?.[provider];
    const voice = await this.getVoice(ttsSchema, req.body.voice);

    let shouldContinue = true;
@@ -436,8 +444,8 @@ async function createTTSService() {
 /**
 * Wrapper function for text-to-speech processing.
 * @async
- * @param {Object} req - The request object.
- * @param {Object} res - The response object.
+ * @param {ServerRequest} req - The request object.
+ * @param {ServerResponse} res - The response object.
 * @returns {Promise<void>}
 */
 async function textToSpeech(req, res) {
@@ -460,11 +468,12 @@ async function streamAudio(req, res) {
 /**
 * Wrapper function to get the configured TTS provider.
 * @async
+ * @param {AppConfig | null | undefined} appConfig - The app configuration object.
 * @returns {Promise<string>} A promise that resolves to the name of the configured provider.
 */
-async function getProvider() {
+async function getProvider(appConfig) {
  const ttsService = await createTTSService();
-  return ttsService.getProvider();
+  return ttsService.getProvider(appConfig);
 }

 module.exports = {
--- a/client/src/components/Chat/Input/AudioRecorder.tsx
+++ b/client/src/components/Chat/Input/AudioRecorder.tsx
@@ -1,10 +1,11 @@
 import { useCallback, useRef } from 'react';
 import { useToastContext, TooltipAnchor, ListeningIcon, Spinner } from '@librechat/client';
-import { useLocalize, useSpeechToText } from '~/hooks';
+import { useLocalize, useSpeechToText, useGetAudioSettings } from '~/hooks';
 import { useChatFormContext } from '~/Providers';
 import { globalAudioId } from '~/common';
 import { cn } from '~/utils';

+const isExternalSTT = (speechToTextEndpoint: string) => speechToTextEndpoint === 'external';
 export default function AudioRecorder({
  disabled,
  ask,
@@ -21,6 +22,8 @@ export default function AudioRecorder({
  const { setValue, reset, getValues } = methods;
  const localize = useLocalize();
  const { showToast } = useToastContext();
+  const { speechToTextEndpoint } = useGetAudioSettings();
+
  const existingTextRef = useRef<string>('');

  const onTranscriptionComplete = useCallback(
@@ -38,23 +41,34 @@ export default function AudioRecorder({
          console.log('Unmuting global audio');
          globalAudio.muted = false;
        }
-        ask({ text });
+        /** For external STT, append existing text to the transcription */
+        const finalText =
+          isExternalSTT(speechToTextEndpoint) && existingTextRef.current
+            ? `${existingTextRef.current} ${text}`
+            : text;
+        ask({ text: finalText });
        reset({ text: '' });
        existingTextRef.current = '';
      }
    },
-    [ask, reset, showToast, localize, isSubmitting],
+    [ask, reset, showToast, localize, isSubmitting, speechToTextEndpoint],
  );

  const setText = useCallback(
    (text: string) => {
-      /** The transcript is cumulative, so we only need to prepend the existing text once */
-      const newText = existingTextRef.current ? `${existingTextRef.current} ${text}` : text;
+      let newText = text;
+      if (isExternalSTT(speechToTextEndpoint)) {
+        /** For external STT, the text comes as a complete transcription, so append to existing */
+        newText = existingTextRef.current ? `${existingTextRef.current} ${text}` : text;
+      } else {
+        /** For browser STT, the transcript is cumulative, so we only need to prepend the existing text once */
+        newText = existingTextRef.current ? `${existingTextRef.current} ${text}` : text;
+      }
      setValue('text', newText, {
        shouldValidate: true,
      });
    },
-    [setValue],
+    [setValue, speechToTextEndpoint],
  );

  const { isListening, isLoading, startRecording, stopRecording } = useSpeechToText(
@@ -73,7 +87,10 @@ export default function AudioRecorder({

  const handleStopRecording = async () => {
    stopRecording();
+    /** For browser STT, clear the reference since text was already being updated */
+    if (!isExternalSTT(speechToTextEndpoint)) {
      existingTextRef.current = '';
+    }
  };

  const renderIcon = () => {
--- a/client/src/components/Prompts/AdminSettings.tsx
+++ b/client/src/components/Prompts/AdminSettings.tsx
@@ -153,7 +153,7 @@ const AdminSettings = () => {
            <span className="hidden sm:flex">{localize('com_ui_admin')}</span>
          </Button>
        </OGDialogTrigger>
-        <OGDialogContent className="max-w-lg border-border-light bg-surface-primary text-text-primary md:w-1/4">
+        <OGDialogContent className="max-w-lg border-border-light bg-surface-primary text-text-primary lg:w-1/4">
          <OGDialogTitle>
            {`${localize('com_ui_admin_settings')} - ${localize('com_ui_prompts')}`}
          </OGDialogTitle>
--- a/client/src/components/Sharing/PeoplePickerAdminSettings.tsx
+++ b/client/src/components/Sharing/PeoplePickerAdminSettings.tsx
@@ -163,7 +163,7 @@ const PeoplePickerAdminSettings = () => {
          {localize('com_ui_admin_settings')}
        </Button>
      </OGDialogTrigger>
-      <OGDialogContent className="w-full border-border-light bg-surface-primary text-text-primary md:w-1/4">
+      <OGDialogContent className="w-full border-border-light bg-surface-primary text-text-primary lg:w-1/4">
        <OGDialogTitle>{`${localize('com_ui_admin_settings')} - ${localize(
          'com_ui_people_picker',
        )}`}</OGDialogTitle>
--- a/client/src/components/SidePanel/Agents/AdminSettings.tsx
+++ b/client/src/components/SidePanel/Agents/AdminSettings.tsx
@@ -157,7 +157,7 @@ const AdminSettings = () => {
          {localize('com_ui_admin_settings')}
        </Button>
      </OGDialogTrigger>
-      <OGDialogContent className="border-border-light bg-surface-primary text-text-primary md:w-1/4">
+      <OGDialogContent className="border-border-light bg-surface-primary text-text-primary lg:w-1/4">
        <OGDialogTitle>{`${localize('com_ui_admin_settings')} - ${localize(
          'com_ui_agents',
        )}`}</OGDialogTitle>
--- a/client/src/components/SidePanel/Memories/AdminSettings.tsx
+++ b/client/src/components/SidePanel/Memories/AdminSettings.tsx
@@ -146,7 +146,7 @@ const AdminSettings = () => {
          {localize('com_ui_admin_settings')}
        </Button>
      </OGDialogTrigger>
-      <OGDialogContent className="border-border-light bg-surface-primary text-text-primary md:w-1/4">
+      <OGDialogContent className="border-border-light bg-surface-primary text-text-primary lg:w-1/4">
        <OGDialogTitle>{`${localize('com_ui_admin_settings')} - ${localize(
          'com_ui_memories',
        )}`}</OGDialogTitle>