Mirror of https://github.com/danny-avila/LibreChat.git (synced 2025-12-16 16:30:15 +01:00)
🎯 fix: Prevent UI De-sync By Removing Redundant States (#5333)
* fix: remove local state from Dropdown causing de-sync
* refactor: cleanup STT code, avoid redundant states to prevent de-sync and side effects
* fix: reset transcript after sending final text to prevent data loss
* fix: clear timeout on component unmount to prevent memory leaks
parent b55e695541
commit e309c6abef
8 changed files with 149 additions and 145 deletions
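Background for the fix: when a component copies a prop into its own useState, the copy and the prop can diverge, which is the de-sync the title refers to. A minimal TypeScript sketch of the problem and the controlled alternative (component and prop names here are illustrative, not taken from the diff):

    import React, { useState } from 'react';

    type Props = { value: string; onChange: (value: string) => void };

    // Anti-pattern: the prop is copied into local state once, so later prop
    // updates from the parent never reach the UI and the two copies drift apart.
    export function MirroredSelect({ value, onChange }: Props) {
      const [selected, setSelected] = useState(value);
      const handle = (e: React.ChangeEvent<HTMLSelectElement>) => {
        setSelected(e.target.value);
        onChange(e.target.value);
      };
      return (
        <select value={selected} onChange={handle}>
          <option value="small">small</option>
          <option value="large">large</option>
        </select>
      );
    }

    // Controlled pattern: the parent owns the value and the component only
    // reports changes, so there is a single source of truth.
    export function ControlledSelect({ value, onChange }: Props) {
      return (
        <select value={value} onChange={(e) => onChange(e.target.value)}>
          <option value="small">small</option>
          <option value="large">large</option>
        </select>
      );
    }

The Dropdown change below follows the second shape; the STT hooks apply the same idea to transcript state.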
@@ -1,73 +1,79 @@
-import { useEffect } from 'react';
+import { useCallback } from 'react';
+import { useChatFormContext, useToastContext } from '~/Providers';
 import { ListeningIcon, Spinner } from '~/components/svg';
 import { useLocalize, useSpeechToText } from '~/hooks';
-import { useChatFormContext } from '~/Providers';
 import { TooltipAnchor } from '~/components/ui';
 import { globalAudioId } from '~/common';
 import { cn } from '~/utils';
 
 export default function AudioRecorder({
-  textAreaRef,
-  methods,
-  ask,
   isRTL,
   disabled,
+  ask,
+  methods,
+  textAreaRef,
+  isSubmitting,
 }: {
-  textAreaRef: React.RefObject<HTMLTextAreaElement>;
-  methods: ReturnType<typeof useChatFormContext>;
-  ask: (data: { text: string }) => void;
   isRTL: boolean;
   disabled: boolean;
+  ask: (data: { text: string }) => void;
+  methods: ReturnType<typeof useChatFormContext>;
+  textAreaRef: React.RefObject<HTMLTextAreaElement>;
+  isSubmitting: boolean;
 }) {
+  const { setValue, reset } = methods;
   const localize = useLocalize();
+  const { showToast } = useToastContext();
 
-  const handleTranscriptionComplete = (text: string) => {
-    if (text) {
-      const globalAudio = document.getElementById(globalAudioId) as HTMLAudioElement;
-      if (globalAudio) {
-        console.log('Unmuting global audio');
-        globalAudio.muted = false;
+  const onTranscriptionComplete = useCallback(
+    (text: string) => {
+      if (isSubmitting) {
+        showToast({
+          message: localize('com_ui_speech_while_submitting'),
+          status: 'error',
+        });
+        return;
       }
-      ask({ text });
-      methods.reset({ text: '' });
-      clearText();
-    }
-  };
+      if (text) {
+        const globalAudio = document.getElementById(globalAudioId) as HTMLAudioElement | null;
+        if (globalAudio) {
+          console.log('Unmuting global audio');
+          globalAudio.muted = false;
+        }
+        ask({ text });
+        reset({ text: '' });
+      }
+    },
+    [ask, reset, showToast, localize, isSubmitting],
+  );
 
-  const {
-    isListening,
-    isLoading,
-    startRecording,
-    stopRecording,
-    interimTranscript,
-    speechText,
-    clearText,
-  } = useSpeechToText(handleTranscriptionComplete);
-
-  useEffect(() => {
-    if (isListening && textAreaRef.current) {
-      methods.setValue('text', interimTranscript, {
+  const setText = useCallback(
+    (text: string) => {
+      setValue('text', text, {
         shouldValidate: true,
       });
-    } else if (textAreaRef.current) {
-      textAreaRef.current.value = speechText;
-      methods.setValue('text', speechText, { shouldValidate: true });
-    }
-  }, [interimTranscript, speechText, methods, textAreaRef]);
+    },
+    [setValue],
+  );
 
-  const handleStartRecording = async () => {
-    await startRecording();
-  };
+  const { isListening, isLoading, startRecording, stopRecording } = useSpeechToText(
+    setText,
+    onTranscriptionComplete,
+  );
 
-  const handleStopRecording = async () => {
-    await stopRecording();
-  };
+  if (!textAreaRef.current) {
+    return null;
+  }
+
+  const handleStartRecording = async () => startRecording();
+
+  const handleStopRecording = async () => stopRecording();
 
   const renderIcon = () => {
-    if (isListening) {
+    if (isListening === true) {
       return <ListeningIcon className="stroke-red-500" />;
     }
-    if (isLoading) {
+    if (isLoading === true) {
       return <Spinner className="stroke-gray-700 dark:stroke-gray-300" />;
     }
     return <ListeningIcon className="stroke-gray-700 dark:stroke-gray-300" />;

@@ -77,7 +83,7 @@ export default function AudioRecorder({
     <TooltipAnchor
       id="audio-recorder"
       aria-label={localize('com_ui_use_micrphone')}
-      onClick={isListening ? handleStopRecording : handleStartRecording}
+      onClick={isListening === true ? handleStopRecording : handleStartRecording}
       disabled={disabled}
       className={cn(
         'absolute flex size-[35px] items-center justify-center rounded-full p-1 transition-colors hover:bg-surface-hover',
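A side note on the useCallback wrappers introduced above: the speech hook runs effects that list these callbacks as dependencies, so giving them a stable identity keeps those effects from re-firing on every render. A small, self-contained sketch of that interaction (hook and parameter names are illustrative):

    import { useCallback, useEffect } from 'react';

    // A consumer hook that reacts to new transcript text.
    function useTranscriptSync(onText: (text: string) => void, transcript: string) {
      useEffect(() => {
        if (transcript !== '') {
          onText(transcript);
        }
        // If `onText` were recreated on every render, this effect would re-run
        // far more often than the transcript actually changes.
      }, [onText, transcript]);
    }

    export function useStableHandler(transcript: string, save: (text: string) => void) {
      // useCallback pins the handler's identity to `save`, not to each render.
      const onText = useCallback((text: string) => save(text), [save]);
      useTranscriptSync(onText, transcript);
    }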
@@ -228,11 +228,12 @@ const ChatForm = ({ index = 0 }) => {
           </FileFormWrapper>
           {SpeechToText && (
             <AudioRecorder
-              disabled={!!disableInputs}
-              textAreaRef={textAreaRef}
-              ask={submitMessage}
               isRTL={isRTL}
               methods={methods}
+              ask={submitMessage}
+              textAreaRef={textAreaRef}
+              disabled={!!disableInputs}
+              isSubmitting={isSubmitting}
             />
           )}
           {TextToSpeech && automaticPlayback && <StreamAudio index={index} />}
@@ -30,7 +30,6 @@ export default function FontSizeSelector() {
         onChange={handleChange}
         testId="font-size-selector"
         sizeClasses="w-[150px]"
-        anchor="bottom start"
       />
     </div>
   );
@@ -1,10 +1,10 @@
-import React, { useState } from 'react';
+import React from 'react';
 import * as Select from '@ariakit/react/select';
 import type { Option } from '~/common';
 import { cn } from '~/utils/';
 
 interface DropdownProps {
-  value: string;
+  value?: string;
   label?: string;
   onChange: (value: string) => void;
   options: string[] | Option[];

@@ -14,7 +14,7 @@ interface DropdownProps {
 }
 
 const Dropdown: React.FC<DropdownProps> = ({
-  value: initialValue,
+  value: selectedValue,
   label = '',
   onChange,
   options,

@@ -22,10 +22,7 @@ const Dropdown: React.FC<DropdownProps> = ({
   sizeClasses,
   testId = 'dropdown-menu',
 }) => {
-  const [selectedValue, setSelectedValue] = useState(initialValue);
-
   const handleChange = (value: string) => {
-    setSelectedValue(value);
     onChange(value);
   };
 
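With the internal useState gone, Dropdown simply renders whatever `value` its parent passes and reports changes through `onChange`, so the parent remains the single owner of the selection. A hedged usage sketch (the import path, option strings, and the plain-useState parent are assumptions for illustration; real callers may read the value from settings state instead):

    import React, { useState } from 'react';
    import Dropdown from '~/components/ui/Dropdown'; // assumed path

    export default function FontSizeExample() {
      // The parent owns the selected value; Dropdown never caches its own copy.
      const [fontSize, setFontSize] = useState('text-base');

      return (
        <Dropdown
          value={fontSize}
          onChange={setFontSize}
          options={['text-sm', 'text-base', 'text-lg']}
          testId="font-size-example"
        />
      );
    }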
@@ -1,83 +1,48 @@
-import { useState, useEffect } from 'react';
 import useSpeechToTextBrowser from './useSpeechToTextBrowser';
 import useSpeechToTextExternal from './useSpeechToTextExternal';
 import useGetAudioSettings from './useGetAudioSettings';
 
-const useSpeechToText = (handleTranscriptionComplete: (text: string) => void) => {
+const useSpeechToText = (
+  setText: (text: string) => void,
+  onTranscriptionComplete: (text: string) => void,
+): {
+  isLoading?: boolean;
+  isListening?: boolean;
+  stopRecording: () => void | (() => Promise<void>);
+  startRecording: () => void | (() => Promise<void>);
+} => {
   const { speechToTextEndpoint } = useGetAudioSettings();
-  const [animatedText, setAnimatedText] = useState('');
   const externalSpeechToText = speechToTextEndpoint === 'external';
 
   const {
     isListening: speechIsListeningBrowser,
     isLoading: speechIsLoadingBrowser,
-    interimTranscript: interimTranscriptBrowser,
-    text: speechTextBrowser,
     startRecording: startSpeechRecordingBrowser,
     stopRecording: stopSpeechRecordingBrowser,
-  } = useSpeechToTextBrowser();
+  } = useSpeechToTextBrowser(setText, onTranscriptionComplete);
 
   const {
     isListening: speechIsListeningExternal,
     isLoading: speechIsLoadingExternal,
-    text: speechTextExternal,
     externalStartRecording: startSpeechRecordingExternal,
     externalStopRecording: stopSpeechRecordingExternal,
-    clearText,
-  } = useSpeechToTextExternal(handleTranscriptionComplete);
+  } = useSpeechToTextExternal(setText, onTranscriptionComplete);
 
   const isListening = externalSpeechToText ? speechIsListeningExternal : speechIsListeningBrowser;
   const isLoading = externalSpeechToText ? speechIsLoadingExternal : speechIsLoadingBrowser;
-  const speechTextForm = externalSpeechToText ? speechTextExternal : speechTextBrowser;
 
   const startRecording = externalSpeechToText
     ? startSpeechRecordingExternal
     : startSpeechRecordingBrowser;
   const stopRecording = externalSpeechToText
    ? stopSpeechRecordingExternal
    : stopSpeechRecordingBrowser;
-  const speechText =
-    isListening || (speechTextExternal && speechTextExternal.length > 0)
-      ? speechTextExternal
-      : speechTextForm || '';
-  // for a future real-time STT external
-  const interimTranscript = externalSpeechToText ? '' : interimTranscriptBrowser;
-
-  const animateTextTyping = (text: string) => {
-    const totalDuration = 2000;
-    const frameRate = 60;
-    const totalFrames = totalDuration / (1000 / frameRate);
-    const charsPerFrame = Math.ceil(text.length / totalFrames);
-    let currentIndex = 0;
-
-    const animate = () => {
-      currentIndex += charsPerFrame;
-      const currentText = text.substring(0, currentIndex);
-      setAnimatedText(currentText);
-
-      if (currentIndex < text.length) {
-        requestAnimationFrame(animate);
-      } else {
-        setAnimatedText(text);
-      }
-    };
-
-    requestAnimationFrame(animate);
-  };
-
-  useEffect(() => {
-    if (speechText && externalSpeechToText) {
-      animateTextTyping(speechText);
-    }
-  }, [speechText, externalSpeechToText]);
-
   return {
-    isListening,
     isLoading,
-    startRecording,
+    isListening,
     stopRecording,
-    interimTranscript,
-    speechText: externalSpeechToText ? animatedText : speechText,
-    clearText,
+    startRecording,
   };
 };
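The slimmed-down useSpeechToText above acts as a facade: both engine hooks are always called (hook call order must stay stable across renders), and only the results of the configured engine are exposed. A generic sketch of that selection step (type and function names are illustrative):

    // Both engines' results exist on every render; the facade surfaces one set.
    type Engine = {
      isListening?: boolean;
      isLoading?: boolean;
      startRecording: () => void;
      stopRecording: () => void;
    };

    export function selectEngine(
      endpoint: 'browser' | 'external',
      browser: Engine,
      external: Engine,
    ): Engine {
      const useExternal = endpoint === 'external';
      return {
        isListening: useExternal ? external.isListening : browser.isListening,
        isLoading: useExternal ? external.isLoading : browser.isLoading,
        startRecording: useExternal ? external.startRecording : browser.startRecording,
        stopRecording: useExternal ? external.stopRecording : browser.stopRecording,
      };
    }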
@@ -1,25 +1,72 @@
-import { useEffect, useState } from 'react';
+import { useEffect, useRef, useMemo } from 'react';
 import { useRecoilState } from 'recoil';
-import { useToastContext } from '~/Providers';
-import store from '~/store';
 import SpeechRecognition, { useSpeechRecognition } from 'react-speech-recognition';
 import useGetAudioSettings from './useGetAudioSettings';
+import { useToastContext } from '~/Providers';
+import store from '~/store';
 
-const useSpeechToTextBrowser = () => {
+const useSpeechToTextBrowser = (
+  setText: (text: string) => void,
+  onTranscriptionComplete: (text: string) => void,
+) => {
   const { showToast } = useToastContext();
-  const [languageSTT] = useRecoilState<string>(store.languageSTT);
-  const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
   const { speechToTextEndpoint } = useGetAudioSettings();
   const isBrowserSTTEnabled = speechToTextEndpoint === 'browser';
-  const [isListening, setIsListening] = useState(false);
+
+  const lastTranscript = useRef<string | null>(null);
+  const lastInterim = useRef<string | null>(null);
+  const timeoutRef = useRef<NodeJS.Timeout | null>();
+  const [autoSendText] = useRecoilState(store.autoSendText);
+  const [languageSTT] = useRecoilState<string>(store.languageSTT);
+  const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
 
   const {
-    interimTranscript,
-    finalTranscript,
     listening,
-    browserSupportsSpeechRecognition,
+    finalTranscript,
+    resetTranscript,
+    interimTranscript,
     isMicrophoneAvailable,
+    browserSupportsSpeechRecognition,
   } = useSpeechRecognition();
+  const isListening = useMemo(() => listening, [listening]);
+
+  useEffect(() => {
+    if (interimTranscript == null || interimTranscript === '') {
+      return;
+    }
+
+    if (lastInterim.current === interimTranscript) {
+      return;
+    }
+
+    setText(interimTranscript);
+    lastInterim.current = interimTranscript;
+  }, [setText, interimTranscript]);
+
+  useEffect(() => {
+    if (finalTranscript == null || finalTranscript === '') {
+      return;
+    }
+
+    if (lastTranscript.current === finalTranscript) {
+      return;
+    }
+
+    setText(finalTranscript);
+    lastTranscript.current = finalTranscript;
+    if (autoSendText > -1 && finalTranscript.length > 0) {
+      timeoutRef.current = setTimeout(() => {
+        onTranscriptionComplete(finalTranscript);
+        resetTranscript();
+      }, autoSendText * 1000);
+    }
+
+    return () => {
+      if (timeoutRef.current) {
+        clearTimeout(timeoutRef.current);
+      }
+    };
+  }, [setText, onTranscriptionComplete, resetTranscript, finalTranscript, autoSendText]);
 
   const toggleListening = () => {
     if (!browserSupportsSpeechRecognition) {

@@ -38,11 +85,9 @@ const useSpeechToTextBrowser = () => {
       return;
     }
 
-    if (listening) {
-      setIsListening(false);
+    if (isListening === true) {
       SpeechRecognition.stopListening();
     } else {
-      setIsListening(true);
       SpeechRecognition.startListening({
         language: languageSTT,
         continuous: autoTranscribeAudio,

@@ -61,17 +106,9 @@ const useSpeechToTextBrowser = () => {
     return () => window.removeEventListener('keydown', handleKeyDown);
   }, []);
 
-  useEffect(() => {
-    if (!listening) {
-      setIsListening(false);
-    }
-  }, [listening]);
-
   return {
     isListening,
     isLoading: false,
-    interimTranscript,
-    text: finalTranscript,
     startRecording: toggleListening,
     stopRecording: toggleListening,
   };
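The auto-send effect above arms a setTimeout and clears it in the effect's cleanup, which is the memory-leak fix named in the commit message: a pending timer can no longer fire after the component unmounts. A self-contained sketch of the same pattern (hook name, delay semantics, and the send callback are illustrative):

    import { useEffect, useRef } from 'react';

    export function useAutoSend(finalText: string, delaySec: number, send: (text: string) => void) {
      const timeoutRef = useRef<ReturnType<typeof setTimeout> | null>(null);

      useEffect(() => {
        if (delaySec > -1 && finalText.length > 0) {
          timeoutRef.current = setTimeout(() => send(finalText), delaySec * 1000);
        }

        // Cleanup runs before the next effect and on unmount, so a timer armed
        // for an old transcript (or an unmounted component) is always cleared.
        return () => {
          if (timeoutRef.current) {
            clearTimeout(timeoutRef.current);
          }
        };
      }, [finalText, delaySec, send]);
    }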
@@ -1,27 +1,31 @@
 import { useState, useEffect, useRef } from 'react';
 import { useRecoilState } from 'recoil';
 import { useSpeechToTextMutation } from '~/data-provider';
-import useGetAudioSettings from './useGetAudioSettings';
 import { useToastContext } from '~/Providers';
 import store from '~/store';
+import useGetAudioSettings from './useGetAudioSettings';
 
-const useSpeechToTextExternal = (onTranscriptionComplete: (text: string) => void) => {
+const useSpeechToTextExternal = (
+  setText: (text: string) => void,
+  onTranscriptionComplete: (text: string) => void,
+) => {
   const { showToast } = useToastContext();
   const { speechToTextEndpoint } = useGetAudioSettings();
   const isExternalSTTEnabled = speechToTextEndpoint === 'external';
-  const [speechToText] = useRecoilState<boolean>(store.speechToText);
-  const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
-  const [autoSendText] = useRecoilState(store.autoSendText);
-  const [text, setText] = useState<string>('');
-  const [isListening, setIsListening] = useState(false);
-  const audioStream = useRef<MediaStream | null>(null);
-  const animationFrameIdRef = useRef<number | null>(null);
-  const audioContextRef = useRef<AudioContext | null>(null);
-  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
 
   const [permission, setPermission] = useState(false);
+  const [isListening, setIsListening] = useState(false);
   const [audioChunks, setAudioChunks] = useState<Blob[]>([]);
   const [isRequestBeingMade, setIsRequestBeingMade] = useState(false);
 
   const [minDecibels] = useRecoilState(store.decibelValue);
+  const mediaRecorderRef = useRef<MediaRecorder | null>(null);
+  const audioStream = useRef<MediaStream | null>(null);
+  const audioContextRef = useRef<AudioContext | null>(null);
+  const animationFrameIdRef = useRef<number | null>(null);
+  const [autoSendText] = useRecoilState(store.autoSendText);
+  const [speechToText] = useRecoilState<boolean>(store.speechToText);
+  const [autoTranscribeAudio] = useRecoilState<boolean>(store.autoTranscribeAudio);
 
   const { mutate: processAudio, isLoading: isProcessing } = useSpeechToTextMutation({
     onSuccess: (data) => {

@@ -54,10 +58,6 @@ const useSpeechToTextExternal = (onTranscriptionComplete: (text: string) => void
     }
   };
 
-  const clearText = () => {
-    setText('');
-  };
-
   const getMicrophonePermission = async () => {
     try {
       const streamData = await navigator.mediaDevices.getUserMedia({

@@ -226,11 +226,9 @@ const useSpeechToTextExternal = (onTranscriptionComplete: (text: string) => void
 
   return {
     isListening,
-    isLoading: isProcessing,
-    text,
-    externalStartRecording,
     externalStopRecording,
-    clearText,
+    externalStartRecording,
+    isLoading: isProcessing,
   };
 };
@@ -895,4 +895,5 @@ export default {
   com_ui_decline: 'I do not accept',
   com_ui_terms_and_conditions: 'Terms and Conditions',
   com_ui_no_terms_content: 'No terms and conditions content to display',
+  com_ui_speech_while_submitting: 'Can\'t submit speech while a response is being generated',
 };