diff --git a/api/server/services/Files/Audio/STTService.js b/api/server/services/Files/Audio/STTService.js
index ea8d6ffaac..d6c8cc4146 100644
--- a/api/server/services/Files/Audio/STTService.js
+++ b/api/server/services/Files/Audio/STTService.js
@@ -7,6 +7,88 @@ const { getCustomConfig } = require('~/server/services/Config');
 const { genAzureEndpoint } = require('~/utils');
 const { logger } = require('~/config');
 
+/**
+ * Maps MIME types to their corresponding file extensions for audio files.
+ * Keys are lowercase and carry no parameters (no `;codecs=...` suffix).
+ * @type {Object}
+ */
+const MIME_TO_EXTENSION_MAP = {
+  // MP4 container formats
+  'audio/mp4': 'm4a',
+  'audio/x-m4a': 'm4a',
+  // Ogg formats
+  'audio/ogg': 'ogg',
+  'audio/vorbis': 'ogg',
+  'application/ogg': 'ogg',
+  // Wave formats
+  'audio/wav': 'wav',
+  'audio/x-wav': 'wav',
+  'audio/wave': 'wav',
+  // MP3 formats
+  'audio/mp3': 'mp3',
+  'audio/mpeg': 'mp3',
+  'audio/mpeg3': 'mp3',
+  // WebM formats
+  'audio/webm': 'webm',
+  // Additional formats
+  'audio/flac': 'flac',
+  'audio/x-flac': 'flac',
+};
+
+/**
+ * Gets the file extension from the MIME type.
+ *
+ * The type is normalized before matching: parameters such as `;codecs=opus`
+ * are dropped and the rest is lowercased, so `audio/webm;codecs=opus` and
+ * `AUDIO/WAV` resolve through the direct lookup.
+ * @param {string} mimeType - The MIME type.
+ * @returns {string} The file extension; `webm` when the type is missing or unrecognized.
+ */
+function getFileExtensionFromMime(mimeType) {
+  // Default fallback (also covers non-string values, which cannot be parsed)
+  if (typeof mimeType !== 'string' || !mimeType) {
+    return 'webm';
+  }
+
+  // Drop parameters and normalize case so the lookup below is exact
+  const essence = mimeType.split(';')[0].trim().toLowerCase();
+
+  // Direct lookup (fastest); own-property check skips inherited keys
+  if (Object.prototype.hasOwnProperty.call(MIME_TO_EXTENSION_MAP, essence)) {
+    return MIME_TO_EXTENSION_MAP[essence];
+  }
+
+  // Try to extract subtype as fallback
+  const subtype = essence.split('/')[1] ?? '';
+
+  // If subtype matches a known extension
+  if (['mp3', 'mp4', 'ogg', 'wav', 'webm', 'm4a', 'flac'].includes(subtype)) {
+    return subtype === 'mp4' ? 'm4a' : subtype;
+  }
+
+  // Generic checks for partial matches
+  if (subtype.includes('mp4') || subtype.includes('m4a')) {
+    return 'm4a';
+  }
+  if (subtype.includes('ogg')) {
+    return 'ogg';
+  }
+  if (subtype.includes('wav')) {
+    return 'wav';
+  }
+  if (subtype.includes('mp3') || subtype.includes('mpeg')) {
+    return 'mp3';
+  }
+  if (subtype.includes('flac')) {
+    return 'flac';
+  }
+  if (subtype.includes('webm')) {
+    return 'webm';
+  }
+
+  return 'webm'; // Default fallback
+}
+
 /**
  * Service class for handling Speech-to-Text (STT) operations.
  * @class
@@ -170,8 +252,10 @@ class STTService {
       throw new Error('Invalid provider');
     }
 
+    const fileExtension = getFileExtensionFromMime(audioFile.mimetype);
+
     const audioReadStream = Readable.from(audioBuffer);
-    audioReadStream.path = 'audio.wav';
+    audioReadStream.path = `audio.${fileExtension}`;
 
     const [url, data, headers] = strategy.call(this, sttSchema, audioReadStream, audioFile);
 
diff --git a/client/src/hooks/Input/useSpeechToTextExternal.ts b/client/src/hooks/Input/useSpeechToTextExternal.ts
index b9f0ee94d8..5ddccc9f3e 100644
--- a/client/src/hooks/Input/useSpeechToTextExternal.ts
+++ b/client/src/hooks/Input/useSpeechToTextExternal.ts
@@ -21,6 +21,7 @@ const useSpeechToTextExternal = (
   const [isListening, setIsListening] = useState(false);
   const [audioChunks, setAudioChunks] = useState([]);
   const [isRequestBeingMade, setIsRequestBeingMade] = useState(false);
+  const [audioMimeType, setAudioMimeType] = useState('audio/webm');
 
   const [minDecibels] = useRecoilState(store.decibelValue);
   const [autoSendText] = useRecoilState(store.autoSendText);
@@ -48,6 +49,50 @@ const useSpeechToTextExternal = (
     },
   });
 
+  /**
+   * Picks the first recording MIME type that MediaRecorder reports as supported.
+   * When none is reported, falls back to a guess based on the user agent; that
+   * guess is not confirmed by the browser, so it must not be forced on the recorder.
+   */
+  const getBestSupportedMimeType = () => {
+    const types = [
+      'audio/webm',
+      'audio/webm;codecs=opus',
+      'audio/mp4',
+      'audio/ogg;codecs=opus',
+      'audio/ogg',
+      'audio/wav',
+    ];
+
+    for (const type of types) {
+      if (MediaRecorder.isTypeSupported(type)) {
+        return type;
+      }
+    }
+
+    const ua = navigator.userAgent.toLowerCase();
+    if (ua.includes('safari') && !ua.includes('chrome')) {
+      return 'audio/mp4';
+    } else if (ua.includes('firefox')) {
+      return 'audio/ogg';
+    } else {
+      return 'audio/webm';
+    }
+  };
+
+  /** Maps a recording MIME type to the file extension used for the upload name. */
+  const getFileExtension = (mimeType: string) => {
+    if (mimeType.includes('mp4')) {
+      return 'm4a';
+    } else if (mimeType.includes('ogg')) {
+      return 'ogg';
+    } else if (mimeType.includes('wav')) {
+      return 'wav';
+    } else {
+      return 'webm';
+    }
+  };
+
   const cleanup = () => {
     if (mediaRecorderRef.current) {
       mediaRecorderRef.current.removeEventListener('dataavailable', (event: BlobEvent) => {
@@ -73,12 +118,16 @@ const useSpeechToTextExternal = (
 
   const handleStop = () => {
     if (audioChunks.length > 0) {
-      const audioBlob = new Blob(audioChunks, { type: 'audio/wav' });
+      // Prefer the type the recorder reports; the `audioMimeType` state may be
+      // stale if this handler was captured before the state update rendered.
+      const recordedMimeType = mediaRecorderRef.current?.mimeType || audioMimeType;
+      const audioBlob = new Blob(audioChunks, { type: recordedMimeType });
+      const fileExtension = getFileExtension(recordedMimeType);
 
       setAudioChunks([]);
 
       const formData = new FormData();
-      formData.append('audio', audioBlob, 'audio.wav');
+      formData.append('audio', audioBlob, `audio.${fileExtension}`);
       setIsRequestBeingMade(true);
       cleanup();
       processAudio(formData);
@@ -133,7 +182,16 @@ const useSpeechToTextExternal = (
     if (audioStream.current) {
       try {
         setAudioChunks([]);
-        mediaRecorderRef.current = new MediaRecorder(audioStream.current);
+        const bestMimeType = getBestSupportedMimeType();
+        // Only force a type the browser confirmed: an unsupported `mimeType`
+        // makes the MediaRecorder constructor throw, so otherwise use its default.
+        const recorderOptions = MediaRecorder.isTypeSupported(bestMimeType)
+          ? { mimeType: bestMimeType }
+          : undefined;
+
+        mediaRecorderRef.current = new MediaRecorder(audioStream.current, recorderOptions);
+        // With no requested type, `mimeType` may be empty until recording starts.
+        setAudioMimeType(mediaRecorderRef.current.mimeType || bestMimeType);
         mediaRecorderRef.current.addEventListener('dataavailable', (event: BlobEvent) => {
           audioChunks.push(event.data);
         });