From e309c6abef5757ee6b1d4a9c94e52145ce068b72 Mon Sep 17 00:00:00 2001
From: Danny Avila <danny@librechat.ai>
Date: Thu, 16 Jan 2025 17:38:59 -0500
Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=AF=20fix:=20Prevent=20UI=20De-sync=20?=
 =?UTF-8?q?By=20Removing=20Redundant=20States=20(#5333)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* fix: remove local state from Dropdown causing de-sync

* refactor: clean up STT code, avoid redundant states to prevent de-sync and side effects

* fix: reset transcript after sending final text to prevent data loss

* fix: clear timeout on component unmount to prevent memory leaks
---
 .../components/Chat/Input/AudioRecorder.tsx   | 98 ++++++++++---------
 client/src/components/Chat/Input/ChatForm.tsx |  7 +-
 .../SettingsTabs/Chat/FontSizeSelector.tsx    |  1 -
 client/src/components/ui/Dropdown.tsx         |  9 +-
 client/src/hooks/Input/useSpeechToText.ts     | 63 +++---------
 .../src/hooks/Input/useSpeechToTextBrowser.ts | 79 +++++++++++----
 .../hooks/Input/useSpeechToTextExternal.ts    | 36 ++++---
 client/src/localization/languages/Eng.ts      |  1 +
 8 files changed, 149 insertions(+), 145 deletions(-)
diff --git a/client/src/components/Chat/Input/AudioRecorder.tsx b/client/src/components/Chat/Input/AudioRecorder.tsx
index a8754749ad..96e29ec502 100644
--- a/client/src/components/Chat/Input/AudioRecorder.tsx
+++ b/client/src/components/Chat/Input/AudioRecorder.tsx
@@ -1,73 +1,79 @@
-import { useEffect } from 'react';
+import { useCallback } from 'react';
+import { useChatFormContext, useToastContext } from '~/Providers';
 import { ListeningIcon, Spinner } from '~/components/svg';
 import { useLocalize, useSpeechToText } from '~/hooks';
-import { useChatFormContext } from '~/Providers';
 import { TooltipAnchor } from '~/components/ui';
 import { globalAudioId } from '~/common';
 import { cn } from '~/utils';
 
 export default function AudioRecorder({
-  textAreaRef,
-  methods,
-  ask,
   isRTL,
   disabled,
+  ask,
+  methods,
+  textAreaRef,
+  isSubmitting,
 }: {
-  textAreaRef: React.RefObject<HTMLTextAreaElement>;
-  methods: ReturnType<typeof useChatFormContext>;
-  ask: (data: { text: string }) => void;
   isRTL: boolean;
   disabled: boolean;
+  ask: (data: { text: string }) => void;
+  methods: ReturnType<typeof useChatFormContext>;
+  textAreaRef: React.RefObject<HTMLTextAreaElement>;
+  isSubmitting: boolean;
 }) {
+  const { setValue, reset } = methods;
   const localize = useLocalize();
+  const { showToast } = useToastContext();
 
-  const handleTranscriptionComplete = (text: string) => {
-    if (text) {
-      const globalAudio = document.getElementById(globalAudioId) as HTMLAudioElement;
-      if (globalAudio) {
-        console.log('Unmuting global audio');
-        globalAudio.muted = false;
+  const onTranscriptionComplete = useCallback(
+    (text: string) => {
+      if (isSubmitting) {
+        showToast({
+          message: localize('com_ui_speech_while_submitting'),
+          status: 'error',
+        });
+        return;
       }
-      ask({ text });
-      methods.reset({ text: '' });
-      clearText();
-    }
-  };
+      if (text) {
+        const globalAudio = document.getElementById(globalAudioId) as HTMLAudioElement | null;
+        if (globalAudio) {
+          console.log('Unmuting global audio');
+          globalAudio.muted = false;
+        }
+        ask({ text });
+        reset({ text: '' });
+      }
+    },
+    [ask, reset, showToast, localize, isSubmitting],
+  );
 
-  const {
-    isListening,
-    isLoading,
-    startRecording,
-    stopRecording,
-    interimTranscript,
-    speechText,
-    clearText,
-  } = useSpeechToText(handleTranscriptionComplete);
-
-  useEffect(() => {
-    if (isListening && textAreaRef.current) {
-      methods.setValue('text', interimTranscript, {
+  const setText = useCallback(
+    (text: string) => {
+      setValue('text', text, {
         shouldValidate: true,
       });
-    } else if (textAreaRef.current) {
-      textAreaRef.current.value = speechText;
-      methods.setValue('text', speechText, { shouldValidate: true });
-    }
-  }, [interimTranscript, speechText, methods, textAreaRef]);
+    },
+    [setValue],
+  );
 
-  const handleStartRecording = async () => {
-    await startRecording();
-  };
+  const { isListening, isLoading, startRecording, stopRecording } = useSpeechToText(
+    setText,
+    onTranscriptionComplete,
+  );
 
-  const handleStopRecording = async () => {
-    await stopRecording();
-  };
+  if (!textAreaRef.current) {
+    return null;
+  }
+
+  const handleStartRecording = async () => startRecording();
+
+  const handleStopRecording = async () => stopRecording();
 
   const renderIcon = () => {
-    if (isListening) {
+    if (isListening === true) {
       return <ListeningIcon className="stroke-red-500" />;
     }
-    if (isLoading) {
+    if (isLoading === true) {
       return <Spinner className="stroke-gray-700 dark:stroke-gray-300" />;
     }
     return <ListeningIcon className="stroke-gray-700 dark:stroke-gray-300" />;
@@ -77,7 +83,7 @@ export default function AudioRecorder({
     <TooltipAnchor
       id="audio-recorder"
       aria-label={localize('com_ui_use_micrphone')}
-      onClick={isListening ? handleStopRecording : handleStartRecording}
+      onClick={isListening === true ? handleStopRecording : handleStartRecording}
       disabled={disabled}
       className={cn(
         'absolute flex size-[35px] items-center justify-center rounded-full p-1 transition-colors hover:bg-surface-hover',
diff --git a/client/src/components/Chat/Input/ChatForm.tsx b/client/src/components/Chat/Input/ChatForm.tsx
index c0b61f4fec..944a6ed6c4 100644
--- a/client/src/components/Chat/Input/ChatForm.tsx
+++ b/client/src/components/Chat/Input/ChatForm.tsx
@@ -228,11 +228,12 @@ const ChatForm = ({ index = 0 }) => {
             </FileFormWrapper>
             {SpeechToText && (
               <AudioRecorder
-                disabled={!!disableInputs}
-                textAreaRef={textAreaRef}
-                ask={submitMessage}
                 isRTL={isRTL}
                 methods={methods}
+                ask={submitMessage}
+                textAreaRef={textAreaRef}
+                disabled={!!disableInputs}
+                isSubmitting={isSubmitting}
               />
             )}
             {TextToSpeech && automaticPlayback && <StreamAudio index={index} />}
diff --git a/client/src/components/Nav/SettingsTabs/Chat/FontSizeSelector.tsx b/client/src/components/Nav/SettingsTabs/Chat/FontSizeSelector.tsx
index e140c8a4d7..b442c86cdc 100644
--- a/client/src/components/Nav/SettingsTabs/Chat/FontSizeSelector.tsx
+++ b/client/src/components/Nav/SettingsTabs/Chat/FontSizeSelector.tsx
@@ -30,7 +30,6 @@ export default function FontSizeSelector() {
         onChange={handleChange}
         testId="font-size-selector"
         sizeClasses="w-[150px]"
-        anchor="bottom start"
       />
     </div>
   );
diff --git a/client/src/components/ui/Dropdown.tsx b/client/src/components/ui/Dropdown.tsx
index 785cef36de..f89ae7dc78 100644
--- a/client/src/components/ui/Dropdown.tsx
+++ b/client/src/components/ui/Dropdown.tsx
@@ -1,10 +1,10 @@
-import React, { useState } from 'react';
+import React from 'react';
 import * as Select from '@ariakit/react/select';
 import type { Option } from '~/common';
 import { cn } from '~/utils/';
 
 interface DropdownProps {
-  value: string;
+  value?: string;
   label?: string;
   onChange: (value: string) => void;
   options: string[] | Option[];
@@ -14,7 +14,7 @@ interface DropdownProps {
 }
 
 const Dropdown: React.FC<DropdownProps> = ({
-  value: initialValue,
+  value: selectedValue,
   label = '',
   onChange,
   options,
@@ -22,10 +22,7 @@ const Dropdown: React.FC<DropdownProps> = ({
   sizeClasses,
   testId = 'dropdown-menu',
 }) => {
-  const [selectedValue, setSelectedValue] = useState(initialValue);
-
   const handleChange = (value: string) => {
-    setSelectedValue(value);
     onChange(value);
   };
 
diff --git a/client/src/hooks/Input/useSpeechToText.ts b/client/src/hooks/Input/useSpeechToText.ts
index da09926b4e..705b870dc6 100644
--- a/client/src/hooks/Input/useSpeechToText.ts
+++ b/client/src/hooks/Input/useSpeechToText.ts
@@ -1,83 +1,48 @@
-import { useState, useEffect } from 'react';
 import useSpeechToTextBrowser from './useSpeechToTextBrowser';
 import useSpeechToTextExternal from './useSpeechToTextExternal';
 import useGetAudioSettings from './useGetAudioSettings';
 
-const useSpeechToText = (handleTranscriptionComplete: (text: string) => void) => {
+const useSpeechToText = (
+  setText: (text: string) => void,
+  onTranscriptionComplete: (text: string) => void,
+): {
+  isLoading?: boolean;
+  isListening?: boolean;
+  stopRecording: () => void | (() => Promise<void>);
+  startRecording: () => void | (() => Promise<void>);
+} => {
   const { speechToTextEndpoint } = useGetAudioSettings();
-  const [animatedText, setAnimatedText] = useState('');
   const externalSpeechToText = speechToTextEndpoint === 'external';
 
   const {
     isListening: speechIsListeningBrowser,
     isLoading: speechIsLoadingBrowser,
-    interimTranscript: interimTranscriptBrowser,
-    text: speechTextBrowser,
     startRecording: startSpeechRecordingBrowser,
     stopRecording: stopSpeechRecordingBrowser,
-  } = useSpeechToTextBrowser();
+  } = useSpeechToTextBrowser(setText, onTranscriptionComplete);
 
   const {
     isListening: speechIsListeningExternal,
     isLoading: speechIsLoadingExternal,
-    text: speechTextExternal,
     externalStartRecording: startSpeechRecordingExternal,
     externalStopRecording: stopSpeechRecordingExternal,
-    clearText,
-  } = useSpeechToTextExternal(handleTranscriptionComplete);
+  } = useSpeechToTextExternal(setText, onTranscriptionComplete);
 
   const isListening = externalSpeechToText ? speechIsListeningExternal : speechIsListeningBrowser;
   const isLoading = externalSpeechToText ? speechIsLoadingExternal : speechIsLoadingBrowser;
-  const speechTextForm = externalSpeechToText ? speechTextExternal : speechTextBrowser;
+
   const startRecording = externalSpeechToText
     ? startSpeechRecordingExternal
     : startSpeechRecordingBrowser;
   const stopRecording = externalSpeechToText
     ? stopSpeechRecordingExternal
     : stopSpeechRecordingBrowser;
-  const speechText =
-    isListening || (speechTextExternal && speechTextExternal.length > 0)
-      ? speechTextExternal
-      : speechTextForm || '';
-  // for a future real-time STT external
-  const interimTranscript = externalSpeechToText ? '' : interimTranscriptBrowser;
-
-  const animateTextTyping = (text: string) => {
-    const totalDuration = 2000;
-    const frameRate = 60;
-    const totalFrames = totalDuration / (1000 / frameRate);
-    const charsPerFrame = Math.ceil(text.length / totalFrames);
-    let currentIndex = 0;
-
-    const animate = () => {
-      currentIndex += charsPerFrame;
-      const currentText = text.substring(0, currentIndex);
-      setAnimatedText(currentText);
-
-      if (currentIndex < text.length) {
-        requestAnimationFrame(animate);
-      } else {
-        setAnimatedText(text);
-      }
-    };
-
-    requestAnimationFrame(animate);
-  };
-
-  useEffect(() => {
-    if (speechText && externalSpeechToText) {
-      animateTextTyping(speechText);
-    }
-  }, [speechText, externalSpeechToText]);
 
   return {
-    isListening,
     isLoading,
-    startRecording,
+    isListening,
     stopRecording,
-    interimTranscript,
-    speechText: externalSpeechToText ? animatedText : speechText,
-    clearText,
+    startRecording,
   };
 };
 
diff --git a/client/src/hooks/Input/useSpeechToTextBrowser.ts b/client/src/hooks/Input/useSpeechToTextBrowser.ts
index 75393efc72..1d31c3348d 100644
--- a/client/src/hooks/Input/useSpeechToTextBrowser.ts
+++ b/client/src/hooks/Input/useSpeechToTextBrowser.ts
@@ -1,25 +1,72 @@
-import { useEffect, useState } from 'react';
+import { useEffect, useRef, useMemo } from 'react';
 import { useRecoilState } from 'recoil';
-import { useToastContext } from '~/Providers';
-import store from '~/store';
 import SpeechRecognition, { useSpeechRecognition } from 'react-speech-recognition';
 import useGetAudioSettings from './useGetAudioSettings';
+import { useToastContext } from '~/Providers';
+import store from '~/store';
 
-const useSpeechToTextBrowser = () => {
+const useSpeechToTextBrowser = (
+  setText: (text: string) => void,
+  onTranscriptionComplete: (text: string) => void,
+) => {
   const { showToast } = useToastContext();
-  const [languageSTT] = useRecoilState<string>(store.languageSTT);
-  const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
   const { speechToTextEndpoint } = useGetAudioSettings();
   const isBrowserSTTEnabled = speechToTextEndpoint === 'browser';
-  const [isListening, setIsListening] = useState(false);
+
+  const lastTranscript = useRef<string | null>(null);
+  const lastInterim = useRef<string | null>(null);
+  const timeoutRef = useRef<NodeJS.Timeout | null>();
+  const [autoSendText] = useRecoilState(store.autoSendText);
+  const [languageSTT] = useRecoilState<string>(store.languageSTT);
+  const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
 
   const {
-    interimTranscript,
-    finalTranscript,
     listening,
-    browserSupportsSpeechRecognition,
+    finalTranscript,
+    resetTranscript,
+    interimTranscript,
     isMicrophoneAvailable,
+    browserSupportsSpeechRecognition,
   } = useSpeechRecognition();
+  const isListening = useMemo(() => listening, [listening]);
+
+  useEffect(() => {
+    if (interimTranscript == null || interimTranscript === '') {
+      return;
+    }
+
+    if (lastInterim.current === interimTranscript) {
+      return;
+    }
+
+    setText(interimTranscript);
+    lastInterim.current = interimTranscript;
+  }, [setText, interimTranscript]);
+
+  useEffect(() => {
+    if (finalTranscript == null || finalTranscript === '') {
+      return;
+    }
+
+    if (lastTranscript.current === finalTranscript) {
+      return;
+    }
+
+    setText(finalTranscript);
+    lastTranscript.current = finalTranscript;
+    if (autoSendText > -1 && finalTranscript.length > 0) {
+      timeoutRef.current = setTimeout(() => {
+        onTranscriptionComplete(finalTranscript);
+        resetTranscript();
+      }, autoSendText * 1000);
+    }
+
+    return () => {
+      if (timeoutRef.current) {
+        clearTimeout(timeoutRef.current);
+      }
+    };
+  }, [setText, onTranscriptionComplete, resetTranscript, finalTranscript, autoSendText]);
 
   const toggleListening = () => {
     if (!browserSupportsSpeechRecognition) {
@@ -38,11 +85,9 @@ const useSpeechToTextBrowser = () => {
       return;
     }
 
-    if (listening) {
-      setIsListening(false);
+    if (isListening === true) {
       SpeechRecognition.stopListening();
     } else {
-      setIsListening(true);
       SpeechRecognition.startListening({
         language: languageSTT,
         continuous: autoTranscribeAudio,
@@ -61,17 +106,9 @@ const useSpeechToTextBrowser = () => {
     return () => window.removeEventListener('keydown', handleKeyDown);
   }, []);
 
-  useEffect(() => {
-    if (!listening) {
-      setIsListening(false);
-    }
-  }, [listening]);
-
   return {
     isListening,
     isLoading: false,
-    interimTranscript,
-    text: finalTranscript,
     startRecording: toggleListening,
     stopRecording: toggleListening,
   };
diff --git a/client/src/hooks/Input/useSpeechToTextExternal.ts b/client/src/hooks/Input/useSpeechToTextExternal.ts
index ea96f31f51..b9f0ee94d8 100644
--- a/client/src/hooks/Input/useSpeechToTextExternal.ts
+++ b/client/src/hooks/Input/useSpeechToTextExternal.ts
@@ -1,27 +1,31 @@
 import { useState, useEffect, useRef } from 'react';
 import { useRecoilState } from 'recoil';
 import { useSpeechToTextMutation } from '~/data-provider';
+import useGetAudioSettings from './useGetAudioSettings';
 import { useToastContext } from '~/Providers';
 import store from '~/store';
-import useGetAudioSettings from './useGetAudioSettings';
 
-const useSpeechToTextExternal = (onTranscriptionComplete: (text: string) => void) => {
+const useSpeechToTextExternal = (
+  setText: (text: string) => void,
+  onTranscriptionComplete: (text: string) => void,
+) => {
   const { showToast } = useToastContext();
   const { speechToTextEndpoint } = useGetAudioSettings();
   const isExternalSTTEnabled = speechToTextEndpoint === 'external';
-  const [speechToText] = useRecoilState<boolean>(store.speechToText);
-  const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
-  const [autoSendText] = useRecoilState(store.autoSendText);
-  const [text, setText] = useState<string>('');
-  const [isListening, setIsListening] = useState(false);
+  const audioStream = useRef<MediaStream | null>(null);
+  const animationFrameIdRef = useRef<number | null>(null);
+  const audioContextRef = useRef<AudioContext | null>(null);
+  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+
   const [permission, setPermission] = useState(false);
+  const [isListening, setIsListening] = useState(false);
   const [audioChunks, setAudioChunks] = useState<Blob[]>([]);
   const [isRequestBeingMade, setIsRequestBeingMade] = useState(false);
+
   const [minDecibels] = useRecoilState(store.decibelValue);
-  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
-  const audioStream = useRef<MediaStream | null>(null);
-  const audioContextRef = useRef<AudioContext | null>(null);
-  const animationFrameIdRef = useRef<number | null>(null);
+  const [autoSendText] = useRecoilState(store.autoSendText);
+  const [speechToText] = useRecoilState<boolean>(store.speechToText);
+  const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
 
   const { mutate: processAudio, isLoading: isProcessing } = useSpeechToTextMutation({
     onSuccess: (data) => {
@@ -54,10 +58,6 @@ const useSpeechToTextExternal = (onTranscriptionComplete: (text: string) => void
     }
   };
 
-  const clearText = () => {
-    setText('');
-  };
-
   const getMicrophonePermission = async () => {
     try {
       const streamData = await navigator.mediaDevices.getUserMedia({
@@ -226,11 +226,9 @@ const useSpeechToTextExternal = (onTranscriptionComplete: (text: string) => void
 
   return {
     isListening,
-    isLoading: isProcessing,
-    text,
-    externalStartRecording,
     externalStopRecording,
-    clearText,
+    externalStartRecording,
+    isLoading: isProcessing,
   };
 };
 
diff --git a/client/src/localization/languages/Eng.ts b/client/src/localization/languages/Eng.ts
index 64aa137140..a33c49b78e 100644
--- a/client/src/localization/languages/Eng.ts
+++ b/client/src/localization/languages/Eng.ts
@@ -895,4 +895,5 @@ export default {
   com_ui_decline: 'I do not accept',
   com_ui_terms_and_conditions: 'Terms and Conditions',
   com_ui_no_terms_content: 'No terms and conditions content to display',
+  com_ui_speech_while_submitting: 'Can\'t submit speech while a response is being generated',
 };