🔊 fix(tts): NotAllowedError (mobile/safari), Unsupported MediaSource type (firefox), Hide Audio Element (#2854)

* fix: hide audio element on mobile

* chore: add tts docs link

* fix: select voice option on first render

* fix: NotAllowedError, prevent async playback for mobile triggers, consolidate MessageAudio code, use user-triggered unmutes

* fix: Firefox/unsupported type for MediaSource hack

* refactor(STT): make icon red when recording; consolidate logic into AudioRecorder component

* fix: revert Redis changes to use separate client for sessions
This commit is contained in:
Danny Avila 2024-05-24 12:18:11 -04:00 committed by GitHub
parent dcd2e3e62d
commit 35ba4ba1a4
No known key found for this signature in database
GPG key ID: B5690EEEBB952194
14 changed files with 421 additions and 130 deletions

View file

@ -1,16 +1,46 @@
import React from 'react';
import { ListeningIcon, Spinner, SpeechIcon } from '~/components/svg';
import { useEffect } from 'react';
import type { UseFormReturn } from 'react-hook-form';
import { TooltipProvider, Tooltip, TooltipTrigger, TooltipContent } from '~/components/ui/';
import { useLocalize } from '~/hooks';
import { ListeningIcon, Spinner } from '~/components/svg';
import { useLocalize, useSpeechToText } from '~/hooks';
import { globalAudioId } from '~/common';
export default function AudioRecorder({
isListening,
isLoading,
startRecording,
stopRecording,
textAreaRef,
methods,
ask,
disabled,
}: {
textAreaRef: React.RefObject<HTMLTextAreaElement>;
methods: UseFormReturn<{ text: string }>;
ask: (data: { text: string }) => void;
disabled: boolean;
}) {
const localize = useLocalize();
/**
 * Called when speech-to-text finishes. Unmutes the shared audio element
 * (so later TTS playback is allowed), submits the transcribed text, and
 * resets the form/transcript state. No-op for an empty transcription.
 */
const handleTranscriptionComplete = (text: string) => {
  if (!text) {
    return;
  }
  const globalAudio = document.getElementById(globalAudioId) as HTMLAudioElement;
  if (globalAudio) {
    console.log('Unmuting global audio');
    globalAudio.muted = false;
  }
  ask({ text });
  methods.reset({ text: '' });
  clearText();
};
const { isListening, isLoading, startRecording, stopRecording, speechText, clearText } =
useSpeechToText(handleTranscriptionComplete);
useEffect(() => {
if (textAreaRef.current) {
textAreaRef.current.value = speechText;
methods.setValue('text', speechText, { shouldValidate: true });
}
}, [speechText, methods, textAreaRef]);
// Thin async wrapper around startRecording so the click handler can await it.
// NOTE(review): presumably kept in the direct call path of a user gesture for
// browser media-permission purposes — confirm against the button's onClick.
const handleStartRecording = async () => {
  await startRecording();
};
@ -19,6 +49,16 @@ export default function AudioRecorder({
await stopRecording();
};
const renderIcon = () => {
if (isListening) {
return <ListeningIcon className="stroke-red-500" />;
}
if (isLoading) {
return <Spinner className="stroke-gray-700 dark:stroke-gray-300" />;
}
return <ListeningIcon className="stroke-gray-700 dark:stroke-gray-300" />;
};
return (
<TooltipProvider delayDuration={250}>
<Tooltip>
@ -29,13 +69,7 @@ export default function AudioRecorder({
className="absolute bottom-1.5 right-12 flex h-[30px] w-[30px] items-center justify-center rounded-lg p-0.5 transition-colors hover:bg-gray-200 dark:hover:bg-gray-700 md:bottom-3 md:right-12"
type="button"
>
{isListening ? (
<SpeechIcon className="stroke-gray-700 dark:stroke-gray-300" />
) : isLoading ? (
<Spinner className="stroke-gray-700 dark:stroke-gray-300" />
) : (
<ListeningIcon className="stroke-gray-700 dark:stroke-gray-300" />
)}
{renderIcon()}
</button>
</TooltipTrigger>
<TooltipContent side="top" sideOffset={10}>

View file

@ -1,6 +1,6 @@
import { useForm } from 'react-hook-form';
import { useRecoilState, useRecoilValue } from 'recoil';
import { memo, useCallback, useRef, useMemo, useEffect } from 'react';
import { memo, useCallback, useRef, useMemo } from 'react';
import {
supportsFiles,
mergeFileConfig,
@ -8,7 +8,7 @@ import {
fileConfig as defaultFileConfig,
} from 'librechat-data-provider';
import { useChatContext, useAssistantsMapContext } from '~/Providers';
import { useRequiresKey, useTextarea, useSpeechToText } from '~/hooks';
import { useRequiresKey, useTextarea } from '~/hooks';
import { TextareaAutosize } from '~/components/ui';
import { useGetFileConfig } from '~/data-provider';
import { cn, removeFocusOutlines } from '~/utils';
@ -72,24 +72,6 @@ const ChatForm = ({ index = 0 }) => {
const { endpoint: _endpoint, endpointType } = conversation ?? { endpoint: null };
const endpoint = endpointType ?? _endpoint;
const handleTranscriptionComplete = (text: string) => {
if (text) {
ask({ text });
methods.reset({ text: '' });
clearText();
}
};
const { isListening, isLoading, startRecording, stopRecording, speechText, clearText } =
useSpeechToText(handleTranscriptionComplete);
useEffect(() => {
if (textAreaRef.current) {
textAreaRef.current.value = speechText;
methods.setValue('text', speechText, { shouldValidate: true });
}
}, [speechText, methods]);
const { data: fileConfig = defaultFileConfig } = useGetFileConfig({
select: (data) => mergeFileConfig(data),
});
@ -183,11 +165,10 @@ const ChatForm = ({ index = 0 }) => {
)}
{SpeechToText && (
<AudioRecorder
isListening={isListening}
isLoading={isLoading}
startRecording={startRecording}
stopRecording={stopRecording}
disabled={!!disableInputs}
textAreaRef={textAreaRef}
ask={submitMessage}
methods={methods}
/>
)}
{TextToSpeech && automaticPlayback && <StreamAudio index={index} />}

View file

@ -88,7 +88,7 @@ export default function StreamAudio({ index = 0 }) {
return;
}
console.log('Fetching audio...');
console.log('Fetching audio...', navigator.userAgent);
const response = await fetch('/api/files/tts', {
method: 'POST',
headers: { 'Content-Type': 'application/json', Authorization: `Bearer ${token}` },
@ -103,8 +103,14 @@ export default function StreamAudio({ index = 0 }) {
}
const reader = response.body.getReader();
const mediaSource = new MediaSourceAppender('audio/mpeg');
setGlobalAudioURL(mediaSource.mediaSourceUrl);
const type = 'audio/mpeg';
const browserSupportsType = MediaSource.isTypeSupported(type);
let mediaSource: MediaSourceAppender | undefined;
if (browserSupportsType) {
mediaSource = new MediaSourceAppender(type);
setGlobalAudioURL(mediaSource.mediaSourceUrl);
}
setAudioRunId(activeRunId);
let done = false;
@ -120,7 +126,7 @@ export default function StreamAudio({ index = 0 }) {
if (cacheTTS && value) {
chunks.push(value);
}
if (value) {
if (value && mediaSource) {
mediaSource.addData(value);
}
done = readerDone;
@ -136,8 +142,19 @@ export default function StreamAudio({ index = 0 }) {
if (!cacheKey) {
throw new Error('Cache key not found');
}
const audioBlob = new Blob(chunks, { type: 'audio/mpeg' });
cache.put(cacheKey, new Response(audioBlob));
const audioBlob = new Blob(chunks, { type });
const cachedResponse = new Response(audioBlob);
await cache.put(cacheKey, cachedResponse);
if (!browserSupportsType) {
const unconsumedResponse = await cache.match(cacheKey);
if (!unconsumedResponse) {
throw new Error('Failed to fetch audio from cache');
}
const audioBlob = await unconsumedResponse.blob();
const blobUrl = URL.createObjectURL(audioBlob);
setGlobalAudioURL(blobUrl);
}
setIsFetching(false);
}
console.log('Audio stream reading ended');
@ -194,9 +211,16 @@ export default function StreamAudio({ index = 0 }) {
ref={audioRef}
controls
controlsList="nodownload nofullscreen noremoteplayback"
className="absolute h-0 w-0 overflow-hidden"
style={{
position: 'absolute',
overflow: 'hidden',
display: 'none',
height: '0px',
width: '0px',
}}
src={globalAudioURL || undefined}
id={globalAudioId}
muted
autoPlay
/>
);