From 6b90817ae015e6ed692ec1f574ec05632eb914fb Mon Sep 17 00:00:00 2001
From: Marco Beretta <81851188+berry-13@users.noreply.github.com>
Date: Thu, 19 Dec 2024 14:58:15 +0100
Subject: [PATCH] =?UTF-8?q?=E2=9C=A8=20WIP:=20Implement=20Realtime=20Ephem?=
 =?UTF-8?q?eral=20Token=20functionality=20and=20update=20related=20compone?=
 =?UTF-8?q?nts?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 .../routes/files/speech/customConfigSpeech.js |   2 +-
 api/server/routes/files/speech/index.js       |   3 +
 api/server/routes/files/speech/realtime.js    |  10 ++
 .../services/Files/Audio/getRealtimeConfig.js | 158 ++++++++++++++++++
 api/server/services/Files/Audio/index.js      |   2 +
 .../src/components/Chat/Input/SendButton.tsx  |  51 ++++--
 client/src/data-provider/mutations.ts         |  15 ++
 packages/data-provider/src/api-endpoints.ts   |   4 +-
 packages/data-provider/src/config.ts          |  20 +++
 packages/data-provider/src/data-service.ts    |   6 +
 packages/data-provider/src/keys.ts            |   1 +
 packages/data-provider/src/types.ts           |   9 +
 12 files changed, 268 insertions(+), 13 deletions(-)
 create mode 100644 api/server/routes/files/speech/realtime.js
 create mode 100644 api/server/services/Files/Audio/getRealtimeConfig.js

diff --git a/api/server/routes/files/speech/customConfigSpeech.js b/api/server/routes/files/speech/customConfigSpeech.js
index c3b1e2eb47..4ab380a2e6 100644
--- a/api/server/routes/files/speech/customConfigSpeech.js
+++ b/api/server/routes/files/speech/customConfigSpeech.js
@@ -3,7 +3,7 @@ const router = express.Router();
 
 const { getCustomConfigSpeech } = require('~/server/services/Files/Audio');
 
-router.get('/get', async (req, res) => {
+router.get('/', async (req, res) => {
   await getCustomConfigSpeech(req, res);
 });
 
diff --git a/api/server/routes/files/speech/index.js b/api/server/routes/files/speech/index.js
index 074ed553c9..082b18780c 100644
--- a/api/server/routes/files/speech/index.js
+++ b/api/server/routes/files/speech/index.js
@@ -4,6 +4,7 @@ const { createTTSLimiters, createSTTLimiters } = require('~/server/middleware');
 const stt = require('./stt');
 const tts = require('./tts');
 const customConfigSpeech = require('./customConfigSpeech');
+const realtime = require('./realtime');
 
 const router = express.Router();
 
@@ -14,4 +15,6 @@ router.use('/tts', ttsIpLimiter, ttsUserLimiter, tts);
 
 router.use('/config', customConfigSpeech);
 
+router.use('/realtime', realtime);
+
 module.exports = router;
diff --git a/api/server/routes/files/speech/realtime.js b/api/server/routes/files/speech/realtime.js
new file mode 100644
index 0000000000..79ec659379
--- /dev/null
+++ b/api/server/routes/files/speech/realtime.js
@@ -0,0 +1,10 @@
+const express = require('express');
+const router = express.Router();
+
+const { getRealtimeConfig } = require('~/server/services/Files/Audio');
+
+router.get('/', async (req, res) => {
+  await getRealtimeConfig(req, res);
+});
+
+module.exports = router;
diff --git a/api/server/services/Files/Audio/getRealtimeConfig.js b/api/server/services/Files/Audio/getRealtimeConfig.js
new file mode 100644
index 0000000000..4b71437e55
--- /dev/null
+++ b/api/server/services/Files/Audio/getRealtimeConfig.js
@@ -0,0 +1,158 @@
+const { extractEnvVariable, RealtimeVoiceProviders } = require('librechat-data-provider');
+const { getCustomConfig } = require('~/server/services/Config');
+const { logger } = require('~/config');
+
+/**
+ * Builds the client-facing realtime voice configuration from the
+ * `speech.realtime` section of the custom config.
+ */
+class RealtimeService {
+  /**
+   * @param {object} customConfig - Loaded custom config; `speech.realtime` is read from it.
+   */
+  constructor(customConfig) {
+    this.customConfig = customConfig;
+    // Maps a provider key found in the config to the method that builds its session config.
+    this.providerStrategies = {
+      [RealtimeVoiceProviders.OPENAI]: this.openaiProvider.bind(this),
+    };
+  }
+
+  /**
+   * Loads the custom config and wraps it in a service instance.
+   * @returns {Promise<RealtimeService>}
+   * @throws {Error} When no custom config is available.
+   */
+  static async getInstance() {
+    const customConfig = await getCustomConfig();
+    if (!customConfig) {
+      throw new Error('Custom config not found');
+    }
+    return new RealtimeService(customConfig);
+  }
+
+  /**
+   * Selects the single configured realtime provider.
+   * @returns {Promise<[string, object]>} `[providerName, providerSchema]`.
+   * @throws {Error} When the realtime section is missing or does not hold exactly one provider.
+   */
+  async getProviderSchema() {
+    // The `speech` section may be absent; avoid a TypeError and report the config problem instead.
+    const realtimeSchema = this.customConfig.speech?.realtime;
+    if (!realtimeSchema) {
+      throw new Error('No Realtime schema is set in config');
+    }
+
+    // A provider counts as configured only when its section is a non-empty object.
+    const providers = Object.entries(realtimeSchema).filter(
+      ([, value]) => value != null && Object.keys(value).length > 0,
+    );
+
+    if (providers.length !== 1) {
+      throw new Error(providers.length > 1 ? 'Multiple providers set' : 'No provider set');
+    }
+
+    return providers[0];
+  }
+
+  /**
+   * Requests a realtime session from OpenAI for the given voice.
+   * @param {object} schema - The `openai` provider section (`apiKey`, optional `url`).
+   * @param {string} voice - Requested voice; must be one of the allowed voices.
+   * @returns {Promise<{ provider: string, token: object, url: string }>} `token` is the parsed
+   *   JSON body of the session response.
+   * @throws {Error} On a missing/invalid voice, a missing API key, or a non-2xx response.
+   */
+  async openaiProvider(schema, voice) {
+    const defaultRealtimeUrl = 'https://api.openai.com/v1/realtime';
+    const allowedVoices = ['alloy', 'ash', 'ballad', 'coral', 'echo', 'sage', 'shimmer', 'verse'];
+
+    if (!voice) {
+      throw new Error('Voice not specified');
+    }
+
+    if (!allowedVoices.includes(voice)) {
+      throw new Error(`Invalid voice: ${voice}`);
+    }
+
+    const apiKey = extractEnvVariable(schema.apiKey);
+    if (!apiKey) {
+      throw new Error('OpenAI API key not configured');
+    }
+
+    const response = await fetch('https://api.openai.com/v1/realtime/sessions', {
+      method: 'POST',
+      headers: {
+        Authorization: `Bearer ${apiKey}`,
+        'Content-Type': 'application/json',
+      },
+      body: JSON.stringify({
+        model: 'gpt-4o-realtime-preview-2024-12-17',
+        modalities: ['audio', 'text'],
+        voice: voice,
+      }),
+    });
+
+    // fetch() only rejects on network failure, so HTTP error statuses are checked explicitly.
+    if (!response.ok) {
+      throw new Error(`OpenAI realtime session request failed with status ${response.status}`);
+    }
+
+    // response.json() returns a promise; it must be awaited before being placed in the payload.
+    const token = await response.json();
+
+    return {
+      provider: RealtimeVoiceProviders.OPENAI,
+      token: token,
+      url: schema.url || defaultRealtimeUrl,
+    };
+  }
+
+  /**
+   * Resolves the configured provider, builds its realtime config for `req.query.voice`,
+   * and writes it to the response as JSON; any failure is logged and answered with a 500.
+   * @param {import('express').Request} req
+   * @param {import('express').Response} res
+   * @returns {Promise<void>}
+   */
+  async getRealtimeConfig(req, res) {
+    try {
+      const [provider, schema] = await this.getProviderSchema();
+      const strategy = this.providerStrategies[provider];
+
+      if (!strategy) {
+        throw new Error(`Unsupported provider: ${provider}`);
+      }
+
+      const voice = req.query.voice;
+
+      // Strategies are async: awaiting keeps rejections inside this try/catch and sends the
+      // resolved config rather than a pending promise.
+      const config = await strategy(schema, voice);
+      res.json(config);
+    } catch (error) {
+      logger.error('[RealtimeService] Config generation failed:', error);
+      res.status(500).json({ error: error.message });
+    }
+  }
+}
+
+/**
+ * Route-level entry point: creates the service and delegates to it.
+ * Initialization failures (e.g. missing custom config) are answered with a 500 so the
+ * request does not hang on an unhandled rejection.
+ * @param {import('express').Request} req
+ * @param {import('express').Response} res
+ * @returns {Promise<void>}
+ */
+async function getRealtimeConfig(req, res) {
+  try {
+    const service = await RealtimeService.getInstance();
+    await service.getRealtimeConfig(req, res);
+  } catch (error) {
+    logger.error('[RealtimeService] Failed to initialize:', error);
+    res.status(500).json({ error: error.message });
+  }
+}
+
+module.exports = getRealtimeConfig;
diff --git a/api/server/services/Files/Audio/index.js b/api/server/services/Files/Audio/index.js
index 4378391e27..5e0cbd26bd 100644
--- a/api/server/services/Files/Audio/index.js
+++ b/api/server/services/Files/Audio/index.js
@@ -1,4 +1,5 @@
 const getCustomConfigSpeech = require('./getCustomConfigSpeech');
+const getRealtimeConfig = require('./getRealtimeConfig');
 const TTSService = require('./TTSService');
 const STTService = require('./STTService');
 const getVoices = require('./getVoices');
@@ -6,6 +7,7 @@ const getVoices = require('./getVoices');
 module.exports = {
   getVoices,
   getCustomConfigSpeech,
+  getRealtimeConfig,
   ...STTService,
   ...TTSService,
 };
diff --git a/client/src/components/Chat/Input/SendButton.tsx b/client/src/components/Chat/Input/SendButton.tsx
index c925b8c14a..6a17326e69 100644
--- a/client/src/components/Chat/Input/SendButton.tsx
+++ b/client/src/components/Chat/Input/SendButton.tsx
@@ -1,7 +1,10 @@
 import React, { forwardRef } from 'react';
 import { useWatch } from 'react-hook-form';
+import type { TRealtimeEphemeralTokenResponse } from 'librechat-data-provider';
 import type { Control } from 'react-hook-form';
+import { useRealtimeEphemeralTokenMutation } from '~/data-provider';
 import { TooltipAnchor, SendIcon, CallIcon } from '~/components';
+import { useToastContext } from '~/Providers/ToastContext';
 import { useLocalize } from '~/hooks';
 import { cn } from '~/utils';
 
@@ -17,6 +20,7 @@ const ActionButton = forwardRef(
       icon: React.ReactNode;
       tooltip: string;
       testId: string;
+      onClick?: () => void;
     },
     ref: React.ForwardedRef,
   ) => {
@@ -36,6 +40,7 @@ const ActionButton = forwardRef(
           )}
           data-testid={props.testId}
           type="submit"
+          onClick={props.onClick}
         >
           {props.icon}
@@ -49,19 +54,43 @@ const ActionButton = forwardRef(
 
 const SendButton = forwardRef((props: ButtonProps, ref: React.ForwardedRef) => {
   const localize = useLocalize();
-  const { text } = useWatch({ control: props.control });
+  const { showToast } = useToastContext();
+  const { text = '' } = useWatch({ control: props.control });
 
-  const buttonProps = text
-    ? {
-      icon: ,
-      tooltip: localize('com_nav_send_message'),
-      testId: 'send-button',
+  const { mutate: startCall, isLoading: isProcessing } = useRealtimeEphemeralTokenMutation({
+    onSuccess: async (data: TRealtimeEphemeralTokenResponse) => {
+      showToast({
+        message: 'IT WORKS!!',
+        status: 'success',
+      });
+    },
+    onError: (error: unknown) => {
+      showToast({
+        message: localize('com_nav_audio_process_error', (error as Error).message),
+        status: 'error',
+      });
+    },
+  });
+
+  const handleClick = () => {
+    if (text.trim() === '') {
+      startCall({ voice: 'verse' });
     }
-    : {
-      icon: ,
-      tooltip: localize('com_nav_call'),
-      testId: 'call-button',
-    };
+  };
+
+  const buttonProps =
+    text.trim() !== ''
+      ? {
+        icon: ,
+        tooltip: localize('com_nav_send_message'),
+        testId: 'send-button',
+      }
+      : {
+        icon: ,
+        tooltip: localize('com_nav_call'),
+        testId: 'call-button',
+        onClick: handleClick,
+      };
 
   return ;
 });
diff --git a/client/src/data-provider/mutations.ts b/client/src/data-provider/mutations.ts
index 60f65eeeec..357bd17a38 100644
--- a/client/src/data-provider/mutations.ts
+++ b/client/src/data-provider/mutations.ts
@@ -726,6 +726,21 @@ export const useTextToSpeechMutation = (
   });
 };
 
+export const useRealtimeEphemeralTokenMutation = (
+  options?: t.MutationOptions,
+): UseMutationResult<
+  t.TRealtimeEphemeralTokenResponse,
+  unknown,
+  t.TRealtimeEphemeralTokenRequest,
+  unknown
+> => {
+  return useMutation([MutationKeys.realtimeEphemeralToken], {
+    mutationFn: (data: t.TRealtimeEphemeralTokenRequest) =>
+      dataService.getRealtimeEphemeralToken(data),
+    ...(options || {}),
+  });
+};
+
 /**
  * ASSISTANTS
  */
diff --git a/packages/data-provider/src/api-endpoints.ts b/packages/data-provider/src/api-endpoints.ts
index 27cc221d72..b6714025a8 100644
--- a/packages/data-provider/src/api-endpoints.ts
+++ b/packages/data-provider/src/api-endpoints.ts
@@ -171,7 +171,9 @@ export const textToSpeechManual = () => `${textToSpeech()}/manual`;
 
 export const textToSpeechVoices = () => `${textToSpeech()}/voices`;
 
-export const getCustomConfigSpeech = () => `${speech()}/config/get`;
+export const getCustomConfigSpeech = () => `${speech()}/config`;
+
+export const getRealtimeEphemeralToken = () => `${speech()}/realtime`;
 
 export const getPromptGroup = (_id: string) => `${prompts()}/groups/${_id}`;
 
diff --git a/packages/data-provider/src/config.ts b/packages/data-provider/src/config.ts
index 82137e6157..164782f0b9 100644
--- a/packages/data-provider/src/config.ts
+++ b/packages/data-provider/src/config.ts
@@ -379,6 +379,18 @@ const speechTab = z
   })
   .optional();
 
+const realtime = z
+  .object({
+    openai: z
+      .object({
+        url: z.string().optional(),
+        apiKey: z.string().optional(),
+        voices: z.array(z.string()).optional(),
+      })
+      .optional(),
+  })
+  .optional();
+
 export enum RateLimitPrefix {
   FILE_UPLOAD = 'FILE_UPLOAD',
   IMPORT = 'IMPORT',
@@ -534,6 +546,7 @@ export const configSchema = z.object({
       tts: ttsSchema.optional(),
       stt: sttSchema.optional(),
       speechTab: speechTab.optional(),
+      realtime: realtime.optional(),
     })
     .optional(),
   rateLimits: rateLimitSchema.optional(),
@@ -1135,6 +1148,13 @@ export enum TTSProviders {
   LOCALAI = 'localai',
 }
 
+export enum RealtimeVoiceProviders {
+  /**
+   * Provider for OpenAI Realtime Voice API
+   */
+  OPENAI = 'openai',
+}
+
 /** Enum for app-wide constants */
 export enum Constants {
   /** Key for the app's version. */
diff --git a/packages/data-provider/src/data-service.ts b/packages/data-provider/src/data-service.ts
index 5af00fdcb9..80566488b2 100644
--- a/packages/data-provider/src/data-service.ts
+++ b/packages/data-provider/src/data-service.ts
@@ -576,6 +576,12 @@ export const getCustomConfigSpeech = (): Promise
   return request.get(endpoints.getCustomConfigSpeech());
 };
 
+export const getRealtimeEphemeralToken = (
+  data: t.TRealtimeEphemeralTokenRequest,
+): Promise => {
+  return request.get(endpoints.getRealtimeEphemeralToken(), { params: data });
+};
+
 /* conversations */
 
 export function duplicateConversation(
diff --git a/packages/data-provider/src/keys.ts b/packages/data-provider/src/keys.ts
index c1e0c24557..19fa908d67 100644
--- a/packages/data-provider/src/keys.ts
+++ b/packages/data-provider/src/keys.ts
@@ -67,4 +67,5 @@ export enum MutationKeys {
   deleteAgentAction = 'deleteAgentAction',
   deleteUser = 'deleteUser',
   updateRole = 'updateRole',
+  realtimeEphemeralToken = 'realtimeEphemeralToken',
 }
diff --git a/packages/data-provider/src/types.ts b/packages/data-provider/src/types.ts
index 6d9cd87c88..9fff3db971 100644
--- a/packages/data-provider/src/types.ts
+++ b/packages/data-provider/src/types.ts
@@ -472,3 +472,12 @@ export type TAcceptTermsResponse = {
 };
 
 export type TBannerResponse = TBanner | null;
+
+export type TRealtimeEphemeralTokenRequest = {
+  voice: string;
+};
+
+export type TRealtimeEphemeralTokenResponse = {
+  token: string;
+  url: string;
+};