🗣️ feat: add support for gpt-4o-transcribe models (#6483)

2025-12-17 08:50:15 +01:00 · 2025-03-23 16:26:06 +01:00 · 2025-03-23 16:26:06 +01:00 · 20f353630e
commit 20f353630e
parent 842b68fc32
2 changed files with 124 additions and 5 deletions
--- a/api/server/services/Files/Audio/STTService.js
+++ b/api/server/services/Files/Audio/STTService.js
@ -7,6 +7,78 @@ const { getCustomConfig } = require('~/server/services/Config');
 const { genAzureEndpoint } = require('~/utils');
 const { logger } = require('~/config');

+/**
+ * Maps MIME types to their corresponding file extensions for audio files.
+ * @type {Object}
+ */
+const MIME_TO_EXTENSION_MAP = {
+  // MP4 container formats
+  'audio/mp4': 'm4a',
+  'audio/x-m4a': 'm4a',
+  // Ogg formats
+  'audio/ogg': 'ogg',
+  'audio/vorbis': 'ogg',
+  'application/ogg': 'ogg',
+  // Wave formats
+  'audio/wav': 'wav',
+  'audio/x-wav': 'wav',
+  'audio/wave': 'wav',
+  // MP3 formats
+  'audio/mp3': 'mp3',
+  'audio/mpeg': 'mp3',
+  'audio/mpeg3': 'mp3',
+  // WebM formats
+  'audio/webm': 'webm',
+  // Additional formats
+  'audio/flac': 'flac',
+  'audio/x-flac': 'flac',
+};
+
+/**
+ * Gets the file extension from the MIME type.
+ * @param {string} mimeType - The MIME type.
+ * @returns {string} The file extension.
+ */
+function getFileExtensionFromMime(mimeType) {
+  // Default fallback
+  if (!mimeType) {
+    return 'webm';
+  }
+
+  // Direct lookup (fastest)
+  const extension = MIME_TO_EXTENSION_MAP[mimeType];
+  if (extension) {
+    return extension;
+  }
+
+  // Try to extract subtype as fallback
+  const subtype = mimeType.split('/')[1]?.toLowerCase();
+
+  // If subtype matches a known extension
+  if (['mp3', 'mp4', 'ogg', 'wav', 'webm', 'm4a', 'flac'].includes(subtype)) {
+    return subtype === 'mp4' ? 'm4a' : subtype;
+  }
+
+  // Generic checks for partial matches
+  if (subtype?.includes('mp4') || subtype?.includes('m4a')) {
+    return 'm4a';
+  }
+  if (subtype?.includes('ogg')) {
+    return 'ogg';
+  }
+  if (subtype?.includes('wav')) {
+    return 'wav';
+  }
+  if (subtype?.includes('mp3') || subtype?.includes('mpeg')) {
+    return 'mp3';
+  }
+  if (subtype?.includes('webm')) {
+    return 'webm';
+  }
+
+  return 'webm'; // Default fallback
+}
+
 /**
 * Service class for handling Speech-to-Text (STT) operations.
 * @class
@ -170,8 +242,10 @@ class STTService {
      throw new Error('Invalid provider');
    }

+    const fileExtension = getFileExtensionFromMime(audioFile.mimetype);
+
    const audioReadStream = Readable.from(audioBuffer);
-    audioReadStream.path = 'audio.wav';
+    audioReadStream.path = `audio.${fileExtension}`;

    const [url, data, headers] = strategy.call(this, sttSchema, audioReadStream, audioFile);

--- a/client/src/hooks/Input/useSpeechToTextExternal.ts
+++ b/client/src/hooks/Input/useSpeechToTextExternal.ts
@ -21,6 +21,7 @@ const useSpeechToTextExternal = (
  const [isListening, setIsListening] = useState(false);
  const [audioChunks, setAudioChunks] = useState<Blob[]>([]);
  const [isRequestBeingMade, setIsRequestBeingMade] = useState(false);
+  const [audioMimeType, setAudioMimeType] = useState<string>('audio/webm');

  const [minDecibels] = useRecoilState(store.decibelValue);
  const [autoSendText] = useRecoilState(store.autoSendText);
@ -48,6 +49,44 @@ const useSpeechToTextExternal = (
    },
  });

+  /**
+   * Chooses the MIME type to record with: the first candidate, in preference
+   * order, that `MediaRecorder.isTypeSupported` accepts. When none is accepted,
+   * a type is guessed from the browser's user-agent string instead.
+   */
+  const getBestSupportedMimeType = () => {
+    const candidates = [
+      'audio/webm',
+      'audio/webm;codecs=opus',
+      'audio/mp4',
+      'audio/ogg;codecs=opus',
+      'audio/ogg',
+      'audio/wav',
+    ];
+
+    const supported = candidates.find((candidate) => MediaRecorder.isTypeSupported(candidate));
+    if (supported !== undefined) {
+      return supported;
+    }
+
+    // No candidate was reported as supported: fall back to user-agent sniffing.
+    const userAgent = navigator.userAgent.toLowerCase();
+    const looksLikeSafari = userAgent.includes('safari') && !userAgent.includes('chrome');
+    if (looksLikeSafari) {
+      return 'audio/mp4';
+    }
+    return userAgent.includes('firefox') ? 'audio/ogg' : 'audio/webm';
+  };
+
+  /**
+   * Maps a recording MIME type to the extension used in the uploaded file name.
+   * The first marker contained in `mimeType` wins; anything else is `webm`.
+   */
+  const getFileExtension = (mimeType: string) => {
+    const extensionByMarker = [
+      ['mp4', 'm4a'],
+      ['ogg', 'ogg'],
+      ['wav', 'wav'],
+    ] as const;
+
+    for (const [marker, extension] of extensionByMarker) {
+      if (mimeType.includes(marker)) {
+        return extension;
+      }
+    }
+    return 'webm';
+  };
+
  const cleanup = () => {
    if (mediaRecorderRef.current) {
      mediaRecorderRef.current.removeEventListener('dataavailable', (event: BlobEvent) => {
@ -73,12 +112,13 @@ const useSpeechToTextExternal = (

  const handleStop = () => {
    if (audioChunks.length > 0) {
-      const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
+      const audioBlob = new Blob(audioChunks, { type: audioMimeType });
+      const fileExtension = getFileExtension(audioMimeType);

      setAudioChunks([]);

      const formData = new FormData();
-      formData.append('audio', audioBlob, 'audio.wav');
+      formData.append('audio', audioBlob, `audio.${fileExtension}`);
      setIsRequestBeingMade(true);
      cleanup();
      processAudio(formData);
@ -133,7 +173,12 @@ const useSpeechToTextExternal = (
    if (audioStream.current) {
      try {
        setAudioChunks([]);
-        mediaRecorderRef.current = new MediaRecorder(audioStream.current);
+        const bestMimeType = getBestSupportedMimeType();
+        setAudioMimeType(bestMimeType);
+
+        mediaRecorderRef.current = new MediaRecorder(audioStream.current, {
+          mimeType: bestMimeType,
+        });
        mediaRecorderRef.current.addEventListener('dataavailable', (event: BlobEvent) => {
          audioChunks.push(event.data);
        });
@ -221,7 +266,7 @@ const useSpeechToTextExternal = (
    return () => {
      window.removeEventListener('keydown', handleKeyDown);
    };
-    // eslint-disable-next-line react-hooks/exhaustive-deps
+
  }, [isListening]);

  return {