From 1aad315de65f6316d5aba705a1fe6769410035db Mon Sep 17 00:00:00 2001
From: Marco Beretta <81851188+berry-13@users.noreply.github.com>
Date: Fri, 5 Jul 2024 17:13:34 +0300
Subject: [PATCH] =?UTF-8?q?=F0=9F=8E=A4=20feat:=20add=20custom=20speech=20?=
 =?UTF-8?q?config,=20browser=20TTS/STT=20features,=20and=20dynamic=20speec?=
 =?UTF-8?q?h=20tab=20settings=20(#2921)?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

* feat: update useTextToSpeech and useSpeechToText hooks to support external audio endpoints

This commit updates the useTextToSpeech and useSpeechToText hooks in the Input directory to support external audio endpoints. It introduces the useGetExternalTextToSpeech and useGetExternalSpeechToText hooks, which determine whether the audio endpoints should be set to 'browser' or 'external' based on the value of the endpointTTS and endpointSTT Recoil states. The useTextToSpeech and useSpeechToText hooks now use these new hooks to determine whether to use external audio endpoints.

* feat: add userSelect style to ConversationModeSwitch label

* fix: remove unused updateTokenWebsocket function and import

The updateTokenWebsocket function and its import are no longer used in the OpenAIClient module. This commit removes the function and import to clean up the codebase.

* feat: support external audio endpoints in useTextToSpeech and useSpeechToText hooks

This commit updates the useTextToSpeech and useSpeechToText hooks in the Input directory to support external audio endpoints. It introduces the useGetExternalTextToSpeech and useGetExternalSpeechToText hooks, which determine whether the audio endpoints should be set to 'browser' or 'external' based on the value of the endpointTTS and endpointSTT Recoil states. The useTextToSpeech and useSpeechToText hooks now use these new hooks to determine whether to use external audio endpoints.

* feat: update AutomaticPlayback component to AutomaticPlaybackSwitch; tests: added AutomaticPlaybackSwitch.spec

This commit renames the AutomaticPlayback component to AutomaticPlaybackSwitch in the Speech directory. The new name better reflects the purpose of the component and aligns with the naming convention used in the codebase.

* feat: update useSpeechToText hook to include interimTranscript

This commit updates the useSpeechToText usage in client/src/components/Chat/Input/AudioRecorder.tsx to include the interimTranscript state. This allows the speech-to-text transcription to be displayed in real time while the user is still speaking. The interimTranscript is now used to update the text area value during recording.

* feat: Add customConfigSpeech API endpoint for retrieving custom speech configuration

This commit adds a new API endpoint in customConfigSpeech.js under the api/server/routes/files/speech directory. This endpoint is responsible for retrieving the custom speech configuration using the getCustomConfigSpeech function from the ~/server/services/Files/Audio module.

* feat: update store var and ; fix: getCustomConfigSpeech

* fix: client tests, removed unused import

* feat: Update useCustomConfigSpeechQuery to return an array of custom speech configurations

This commit modifies the useCustomConfigSpeechQuery function in client/src/data-provider/queries.ts to return an array of custom speech configurations instead of a single object. This change allows for better handling and manipulation of the data in the application.
* feat: Update useCustomConfigSpeechQuery to return an array of custom speech configurations

* refactor: Update variable name in speechTab schema

* refactor: removed unused and nested code

* fix: using recoilState

* refactor: Update Speech component to use useCallback for setting settings

* fix: test

* fix: tests

* feature: ensure that the settings don't change after modifying them through the UI

* remove comment

* fix: Handle error gracefully in getCustomConfigSpeech and getVoices endpoints

* fix: Handle error

* fix: backend tests

* fix: invalid custom config logging

* chore: add back custom config info logging

* chore: revert loadCustomConfig spec

---------

Co-authored-by: Danny Avila
---
 api/app/clients/OpenAIClient.js | 2 -
 api/server/routes/files/index.js | 19 +---
 .../routes/files/speech/customConfigSpeech.js | 10 ++
 api/server/routes/files/speech/index.js | 17 +++
 api/server/routes/files/{ => speech}/stt.js | 0
 api/server/routes/files/{ => speech}/tts.js | 0
 .../services/Config/loadCustomConfig.js | 24 +++-
 .../Files/Audio/getCustomConfigSpeech.js | 50 ++++++++
 api/server/services/Files/Audio/getVoices.js | 8 +-
 api/server/services/Files/Audio/index.js | 4 +-
 .../services/Files/Audio/speechToText.js | 4 +-
 .../services/Files/Audio/textToSpeech.js | 4 +-
 api/server/services/Files/Audio/webSocket.js | 31 -----
 .../components/Chat/Input/AudioRecorder.tsx | 19 +++-
 client/src/components/Chat/Input/ChatForm.tsx | 4 +-
 .../Chat/Input/Files/Table/DataTable.tsx | 2 +-
 .../components/Chat/Messages/HoverButtons.tsx | 2 +-
 .../Files/FileList/DataTableFile.tsx | 2 +-
 .../Files/FileList/FileTableColumns.tsx | 25 ++--
 .../components/Messages/ScrollToBottom.tsx | 2 +-
 .../Speech/ConversationModeSwitch.tsx | 14 +--
 .../Speech/STT/AutoSendTextSwitch.tsx | 2 +-
 .../Speech/STT/AutoTranscribeAudioSwitch.tsx | 2 +-
 .../Speech/STT/DecibelSelector.tsx | 2 +-
 .../Speech/STT/EngineSTTDropdown.tsx | 6 +-
 .../Speech/STT/LanguageSTTDropdown.tsx | 107 ++++++++++++++++++
 .../Speech/STT/SpeechToTextSwitch.tsx | 2 +-
 .../Nav/SettingsTabs/Speech/STT/index.ts | 1 +
 .../Nav/SettingsTabs/Speech/Speech.tsx | 105 +++++++++++++++--
 ...ayback.tsx => AutomaticPlaybackSwitch.tsx} | 2 +-
 .../Speech/TTS/CacheTTSSwitch.tsx | 2 +-
 .../Speech/TTS/EngineTTSDropdown.tsx | 6 +-
 .../SettingsTabs/Speech/TTS/PlaybackRate.tsx | 2 +-
 .../Speech/TTS/TextToSpeechSwitch.tsx | 2 +-
 .../AutomaticPlaybackSwitch.spec.tsx | 38 +++++++
 .../Nav/SettingsTabs/Speech/TTS/index.ts | 2 +-
 client/src/data-provider/queries.ts | 7 ++
 client/src/hooks/Input/index.ts | 1 +
 .../src/hooks/Input/useGetAudioSettings.tsx | 19 ++++
 client/src/hooks/Input/useSpeechToText.ts | 28 ++---
 .../src/hooks/Input/useSpeechToTextBrowser.ts | 63 ++++++++---
 .../hooks/Input/useSpeechToTextExternal.ts | 7 +-
 client/src/hooks/Input/useTextToSpeech.ts | 12 +-
 client/src/localization/languages/Eng.ts | 2 +-
 client/src/store/settings.ts | 49 +++++---
 packages/data-provider/src/api-endpoints.ts | 8 +-
 packages/data-provider/src/config.ts | 44 ++++++-
 packages/data-provider/src/data-service.ts | 4 +
 packages/data-provider/src/keys.ts | 1 +
 packages/data-provider/src/types/files.ts | 8 ++
 50 files changed, 598 insertions(+), 179 deletions(-)
 create mode 100644 api/server/routes/files/speech/customConfigSpeech.js
 create mode 100644 api/server/routes/files/speech/index.js
 rename api/server/routes/files/{ => speech}/stt.js (100%)
 rename api/server/routes/files/{ => speech}/tts.js (100%)
 create mode
100644 api/server/services/Files/Audio/getCustomConfigSpeech.js delete mode 100644 api/server/services/Files/Audio/webSocket.js create mode 100644 client/src/components/Nav/SettingsTabs/Speech/STT/LanguageSTTDropdown.tsx rename client/src/components/Nav/SettingsTabs/Speech/TTS/{AutomaticPlayback.tsx => AutomaticPlaybackSwitch.tsx} (94%) create mode 100644 client/src/components/Nav/SettingsTabs/Speech/TTS/__tests__/AutomaticPlaybackSwitch.spec.tsx create mode 100644 client/src/hooks/Input/useGetAudioSettings.tsx diff --git a/api/app/clients/OpenAIClient.js b/api/app/clients/OpenAIClient.js index ced2387bd5..7520cbb897 100644 --- a/api/app/clients/OpenAIClient.js +++ b/api/app/clients/OpenAIClient.js @@ -27,7 +27,6 @@ const { createContextHandlers, } = require('./prompts'); const { encodeAndFormat } = require('~/server/services/Files/images/encode'); -const { updateTokenWebsocket } = require('~/server/services/Files/Audio'); const { isEnabled, sleep } = require('~/server/utils'); const { handleOpenAIErrors } = require('./tools/util'); const spendTokens = require('~/models/spendTokens'); @@ -595,7 +594,6 @@ class OpenAIClient extends BaseClient { payload, (progressMessage) => { if (progressMessage === '[DONE]') { - updateTokenWebsocket('[DONE]'); return; } diff --git a/api/server/routes/files/index.js b/api/server/routes/files/index.js index 2911ecb0b3..6317f4495f 100644 --- a/api/server/routes/files/index.js +++ b/api/server/routes/files/index.js @@ -1,19 +1,11 @@ const express = require('express'); -const { - uaParser, - checkBan, - requireJwtAuth, - createFileLimiters, - createTTSLimiters, - createSTTLimiters, -} = require('~/server/middleware'); +const { uaParser, checkBan, requireJwtAuth, createFileLimiters } = require('~/server/middleware'); const { createMulterInstance } = require('./multer'); const files = require('./files'); const images = require('./images'); const avatar = require('./avatar'); -const stt = require('./stt'); -const tts = require('./tts'); +const speech = require('./speech'); const initialize = async () => { const router = express.Router(); @@ -21,11 +13,8 @@ const initialize = async () => { router.use(checkBan); router.use(uaParser); - /* Important: stt/tts routes must be added before the upload limiters */ - const { sttIpLimiter, sttUserLimiter } = createSTTLimiters(); - const { ttsIpLimiter, ttsUserLimiter } = createTTSLimiters(); - router.use('/stt', sttIpLimiter, sttUserLimiter, stt); - router.use('/tts', ttsIpLimiter, ttsUserLimiter, tts); + /* Important: speech route must be added before the upload limiters */ + router.use('/speech', speech); const upload = await createMulterInstance(); const { fileUploadIpLimiter, fileUploadUserLimiter } = createFileLimiters(); diff --git a/api/server/routes/files/speech/customConfigSpeech.js b/api/server/routes/files/speech/customConfigSpeech.js new file mode 100644 index 0000000000..c3b1e2eb47 --- /dev/null +++ b/api/server/routes/files/speech/customConfigSpeech.js @@ -0,0 +1,10 @@ +const express = require('express'); +const router = express.Router(); + +const { getCustomConfigSpeech } = require('~/server/services/Files/Audio'); + +router.get('/get', async (req, res) => { + await getCustomConfigSpeech(req, res); +}); + +module.exports = router; diff --git a/api/server/routes/files/speech/index.js b/api/server/routes/files/speech/index.js new file mode 100644 index 0000000000..074ed553c9 --- /dev/null +++ b/api/server/routes/files/speech/index.js @@ -0,0 +1,17 @@ +const express = require('express'); +const { 
createTTSLimiters, createSTTLimiters } = require('~/server/middleware'); + +const stt = require('./stt'); +const tts = require('./tts'); +const customConfigSpeech = require('./customConfigSpeech'); + +const router = express.Router(); + +const { sttIpLimiter, sttUserLimiter } = createSTTLimiters(); +const { ttsIpLimiter, ttsUserLimiter } = createTTSLimiters(); +router.use('/stt', sttIpLimiter, sttUserLimiter, stt); +router.use('/tts', ttsIpLimiter, ttsUserLimiter, tts); + +router.use('/config', customConfigSpeech); + +module.exports = router; diff --git a/api/server/routes/files/stt.js b/api/server/routes/files/speech/stt.js similarity index 100% rename from api/server/routes/files/stt.js rename to api/server/routes/files/speech/stt.js diff --git a/api/server/routes/files/tts.js b/api/server/routes/files/speech/tts.js similarity index 100% rename from api/server/routes/files/tts.js rename to api/server/routes/files/speech/tts.js diff --git a/api/server/services/Config/loadCustomConfig.js b/api/server/services/Config/loadCustomConfig.js index 1b5b287066..2127ec239e 100644 --- a/api/server/services/Config/loadCustomConfig.js +++ b/api/server/services/Config/loadCustomConfig.js @@ -76,8 +76,28 @@ Please specify a correct \`imageOutputType\` value (case-sensitive). ); } if (!result.success) { - i === 0 && logger.error(`Invalid custom config file at ${configPath}`, result.error); - i === 0 && i++; + let errorMessage = `Invalid custom config file at ${configPath}: +${JSON.stringify(result.error, null, 2)}`; + + if (i === 0) { + logger.error(errorMessage); + const speechError = result.error.errors.find( + (err) => + err.code === 'unrecognized_keys' && + (err.message?.includes('stt') || err.message?.includes('tts')), + ); + + if (speechError) { + logger.warn(` +The Speech-to-text and Text-to-speech configuration format has recently changed. 
+If you're getting this error, please refer to the latest documentation: + +https://www.librechat.ai/docs/configuration/stt_tts`); + } + + i++; + } + return null; } else { logger.info('Custom config file loaded:'); diff --git a/api/server/services/Files/Audio/getCustomConfigSpeech.js b/api/server/services/Files/Audio/getCustomConfigSpeech.js new file mode 100644 index 0000000000..e9d185af2e --- /dev/null +++ b/api/server/services/Files/Audio/getCustomConfigSpeech.js @@ -0,0 +1,50 @@ +const getCustomConfig = require('~/server/services/Config/getCustomConfig'); + +/** + * This function retrieves the speechTab settings from the custom configuration + * It first fetches the custom configuration + * Then, it checks if the custom configuration and the speechTab schema exist + * If they do, it sends the speechTab settings as a JSON response + * If they don't, it throws an error + * + * @param {Object} req - The request object + * @param {Object} res - The response object + * @returns {Promise} + * @throws {Error} - If the custom configuration or the speechTab schema is missing, an error is thrown + */ +async function getCustomConfigSpeech(req, res) { + try { + const customConfig = await getCustomConfig(); + + if (!customConfig || !customConfig.speech?.speechTab) { + throw new Error('Configuration or speechTab schema is missing'); + } + + const ttsSchema = customConfig.speech?.speechTab; + let settings = {}; + + if (ttsSchema.advancedMode !== undefined) { + settings.advancedMode = ttsSchema.advancedMode; + } + if (ttsSchema.speechToText) { + for (const key in ttsSchema.speechToText) { + if (ttsSchema.speechToText[key] !== undefined) { + settings[key] = ttsSchema.speechToText[key]; + } + } + } + if (ttsSchema.textToSpeech) { + for (const key in ttsSchema.textToSpeech) { + if (ttsSchema.textToSpeech[key] !== undefined) { + settings[key] = ttsSchema.textToSpeech[key]; + } + } + } + + res.json(settings); + } catch (error) { + res.status(200).send(); + } +} + +module.exports = getCustomConfigSpeech; diff --git a/api/server/services/Files/Audio/getVoices.js b/api/server/services/Files/Audio/getVoices.js index b87cd363b2..56341cd2b0 100644 --- a/api/server/services/Files/Audio/getVoices.js +++ b/api/server/services/Files/Audio/getVoices.js @@ -1,4 +1,3 @@ -const { logger } = require('~/config'); const getCustomConfig = require('~/server/services/Config/getCustomConfig'); const { getProvider } = require('./textToSpeech'); @@ -16,11 +15,11 @@ async function getVoices(req, res) { try { const customConfig = await getCustomConfig(); - if (!customConfig || !customConfig?.tts) { + if (!customConfig || !customConfig?.speech?.tts) { throw new Error('Configuration or TTS schema is missing'); } - const ttsSchema = customConfig?.tts; + const ttsSchema = customConfig?.speech?.tts; const provider = getProvider(ttsSchema); let voices; @@ -40,8 +39,7 @@ async function getVoices(req, res) { res.json(voices); } catch (error) { - logger.error(`Failed to get voices: ${error.message}`); - res.status(500).json({ error: 'Failed to get voices' }); + res.status(500).json({ error: `Failed to get voices: ${error.message}` }); } } diff --git a/api/server/services/Files/Audio/index.js b/api/server/services/Files/Audio/index.js index a201ea556c..75882f2397 100644 --- a/api/server/services/Files/Audio/index.js +++ b/api/server/services/Files/Audio/index.js @@ -1,11 +1,11 @@ const getVoices = require('./getVoices'); +const getCustomConfigSpeech = require('./getCustomConfigSpeech'); const textToSpeech = require('./textToSpeech'); const 
speechToText = require('./speechToText'); -const { updateTokenWebsocket } = require('./webSocket'); module.exports = { getVoices, + getCustomConfigSpeech, speechToText, ...textToSpeech, - updateTokenWebsocket, }; diff --git a/api/server/services/Files/Audio/speechToText.js b/api/server/services/Files/Audio/speechToText.js index 96e70b76fe..7e0d2a2145 100644 --- a/api/server/services/Files/Audio/speechToText.js +++ b/api/server/services/Files/Audio/speechToText.js @@ -25,7 +25,7 @@ async function handleResponse(response) { } function getProvider(sttSchema) { - if (sttSchema.openai) { + if (sttSchema?.openai) { return 'openai'; } @@ -176,7 +176,7 @@ async function speechToText(req, res) { const audioReadStream = Readable.from(audioBuffer); audioReadStream.path = 'audio.wav'; - const provider = getProvider(customConfig.stt); + const provider = getProvider(customConfig.speech.stt); let [url, data, headers] = []; diff --git a/api/server/services/Files/Audio/textToSpeech.js b/api/server/services/Files/Audio/textToSpeech.js index 7778faabeb..49a0d4e2e7 100644 --- a/api/server/services/Files/Audio/textToSpeech.js +++ b/api/server/services/Files/Audio/textToSpeech.js @@ -191,8 +191,8 @@ function localAIProvider(ttsSchema, input, voice) { * @returns {Promise<[string, TProviderSchema]>} */ async function getProviderSchema(customConfig) { - const provider = getProvider(customConfig.tts); - return [provider, customConfig.tts[provider]]; + const provider = getProvider(customConfig.speech.tts); + return [provider, customConfig.speech.tts[provider]]; } /** diff --git a/api/server/services/Files/Audio/webSocket.js b/api/server/services/Files/Audio/webSocket.js deleted file mode 100644 index f2d96c7941..0000000000 --- a/api/server/services/Files/Audio/webSocket.js +++ /dev/null @@ -1,31 +0,0 @@ -let token = ''; - -function updateTokenWebsocket(newToken) { - console.log('Token:', newToken); - token = newToken; -} - -function sendTextToWebsocket(ws, onDataReceived) { - if (token === '[DONE]') { - ws.send(' '); - return; - } - - if (ws.readyState === WebSocket.OPEN) { - ws.send(token); - - ws.onmessage = function (event) { - console.log('Received:', event.data); - if (onDataReceived) { - onDataReceived(event.data); // Pass the received data to the callback function - } - }; - } else { - console.error('WebSocket is not open. 
Ready state is: ' + ws.readyState); - } -} - -module.exports = { - updateTokenWebsocket, - sendTextToWebsocket, -}; diff --git a/client/src/components/Chat/Input/AudioRecorder.tsx b/client/src/components/Chat/Input/AudioRecorder.tsx index 48d89c2c3f..dd088ea3c8 100644 --- a/client/src/components/Chat/Input/AudioRecorder.tsx +++ b/client/src/components/Chat/Input/AudioRecorder.tsx @@ -31,15 +31,26 @@ export default function AudioRecorder({ } }; - const { isListening, isLoading, startRecording, stopRecording, speechText, clearText } = - useSpeechToText(handleTranscriptionComplete); + const { + isListening, + isLoading, + startRecording, + stopRecording, + interimTranscript, + speechText, + clearText, + } = useSpeechToText(handleTranscriptionComplete); useEffect(() => { - if (textAreaRef.current) { + if (isListening && textAreaRef.current) { + methods.setValue('text', interimTranscript, { + shouldValidate: true, + }); + } else if (textAreaRef.current) { textAreaRef.current.value = speechText; methods.setValue('text', speechText, { shouldValidate: true }); } - }, [speechText, methods, textAreaRef]); + }, [interimTranscript, speechText, methods, textAreaRef]); const handleStartRecording = async () => { await startRecording(); diff --git a/client/src/components/Chat/Input/ChatForm.tsx b/client/src/components/Chat/Input/ChatForm.tsx index 63ac15b01b..2ad4580eea 100644 --- a/client/src/components/Chat/Input/ChatForm.tsx +++ b/client/src/components/Chat/Input/ChatForm.tsx @@ -38,8 +38,8 @@ const ChatForm = ({ index = 0 }) => { const submitButtonRef = useRef(null); const textAreaRef = useRef(null); - const SpeechToText = useRecoilValue(store.SpeechToText); - const TextToSpeech = useRecoilValue(store.TextToSpeech); + const SpeechToText = useRecoilValue(store.speechToText); + const TextToSpeech = useRecoilValue(store.textToSpeech); const automaticPlayback = useRecoilValue(store.automaticPlayback); const [showStopButton, setShowStopButton] = useRecoilState(store.showStopButtonByIndex(index)); diff --git a/client/src/components/Chat/Input/Files/Table/DataTable.tsx b/client/src/components/Chat/Input/Files/Table/DataTable.tsx index a61a41ed0e..c80cb33191 100644 --- a/client/src/components/Chat/Input/Files/Table/DataTable.tsx +++ b/client/src/components/Chat/Input/Files/Table/DataTable.tsx @@ -96,7 +96,7 @@ export default function DataTable({ columns, data }: DataTablePro deleteFiles({ files: filesToDelete as TFile[] }); setRowSelection({}); }} - className="dark:hover:bg-gray-850/25 ml-1 gap-2 sm:ml-0" + className="ml-1 gap-2 dark:hover:bg-gray-850/25 sm:ml-0" disabled={!table.getFilteredSelectedRowModel().rows.length || isDeleting} > {isDeleting ? ( diff --git a/client/src/components/Chat/Messages/HoverButtons.tsx b/client/src/components/Chat/Messages/HoverButtons.tsx index 3c3ad97890..163d5e9765 100644 --- a/client/src/components/Chat/Messages/HoverButtons.tsx +++ b/client/src/components/Chat/Messages/HoverButtons.tsx @@ -39,7 +39,7 @@ export default function HoverButtons({ const { endpoint: _endpoint, endpointType } = conversation ?? {}; const endpoint = endpointType ?? 
_endpoint; const [isCopied, setIsCopied] = useState(false); - const [TextToSpeech] = useRecoilState(store.TextToSpeech); + const [TextToSpeech] = useRecoilState(store.textToSpeech); const { hideEditButton, diff --git a/client/src/components/Files/FileList/DataTableFile.tsx b/client/src/components/Files/FileList/DataTableFile.tsx index 50cb855ae9..92e454016d 100644 --- a/client/src/components/Files/FileList/DataTableFile.tsx +++ b/client/src/components/Files/FileList/DataTableFile.tsx @@ -106,7 +106,7 @@ export default function DataTableFile({ deleteFiles({ files: filesToDelete as TFile[] }); setRowSelection({}); }} - className="dark:hover:bg-gray-850/25 ml-1 gap-2 sm:ml-0" + className="ml-1 gap-2 dark:hover:bg-gray-850/25 sm:ml-0" disabled={!table.getFilteredSelectedRowModel().rows.length || isDeleting} > {isDeleting ? ( diff --git a/client/src/components/Files/FileList/FileTableColumns.tsx b/client/src/components/Files/FileList/FileTableColumns.tsx index 6421cd8ec6..8e670aa805 100644 --- a/client/src/components/Files/FileList/FileTableColumns.tsx +++ b/client/src/components/Files/FileList/FileTableColumns.tsx @@ -75,18 +75,21 @@ export const fileTableColumns: ColumnDef[] = [ return ( <> {attachedVectorStores.map((vectorStore, index) => { - if (index === 4) - {return ( - - + if (index === 4) { + return ( + +   - {attachedVectorStores.length - index} more - - );} - if (index > 4) {return null;} + {attachedVectorStores.length - index} more + + ); + } + if (index > 4) { + return null; + } return ( {vectorStore.name} diff --git a/client/src/components/Messages/ScrollToBottom.tsx b/client/src/components/Messages/ScrollToBottom.tsx index 01434f5ecb..b5eef83735 100644 --- a/client/src/components/Messages/ScrollToBottom.tsx +++ b/client/src/components/Messages/ScrollToBottom.tsx @@ -8,7 +8,7 @@ export default function ScrollToBottom({ scrollHandler }: Props) { return (
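
For reference, a minimal sketch of how a client could call the new speech settings route added by this patch, assuming the files router is mounted at /api/files (so the GET route in api/server/routes/files/speech/customConfigSpeech.js would resolve to /api/files/speech/config/get). The fetchSpeechSettings helper below is hypothetical and only illustrates the endpoint's shape; the actual client consumes it through the data-provider's useCustomConfigSpeechQuery described in the commit message.

// Hypothetical helper, not part of the patch. Assumes the files router is mounted
// at `/api/files`, so the route added above resolves to GET /api/files/speech/config/get.
export async function fetchSpeechSettings(): Promise<Record<string, unknown>> {
  const res = await fetch('/api/files/speech/config/get', { credentials: 'include' });
  if (!res.ok) {
    throw new Error(`Failed to fetch speech settings: ${res.status}`);
  }
  // getCustomConfigSpeech responds with a flat object of speechTab settings,
  // or an empty 200 body when no custom speech configuration is present.
  const text = await res.text();
  return text ? (JSON.parse(text) as Record<string, unknown>) : {};
}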