⏯️ fix(tts): Resolve Voice Selection and Manual Playback Issues (#2845)

* fix: voice setting for autoplayback TTS
* fix(useTextToSpeechExternal): resolve stateful playback issues and consolidate state logic
* refactor: initialize tts voice and provider schema once per request
* fix(tts): edge case, longer text inputs. TODO: use continuous stream for longer text inputs
* fix(tts): pause global audio on conversation change
* refactor: keyvMongo ban cache to allow db updates for unbanning, to prevent server restart
* chore: eslint fix
* refactor: make ban cache exclusively keyvMongo
2026-02-27 12:54:09 +01:00 · 2024-05-23 16:27:36 -04:00 · 2024-05-23 16:27:36 -04:00 · 514a502b9c
commit 514a502b9c
parent 8e66683577
10 changed files with 332 additions and 178 deletions
--- a/packages/data-provider/src/config.ts
+++ b/packages/data-provider/src/config.ts
@@ -223,41 +223,41 @@ export const azureEndpointSchema = z
 export type TAzureConfig = Omit<z.infer<typeof azureEndpointSchema>, 'groups'> &
  TAzureConfigValidationResult;

+const ttsOpenaiSchema = z.object({
+  url: z.string().optional(),
+  apiKey: z.string(),
+  model: z.string(),
+  voices: z.array(z.string()),
+});
+
+const ttsElevenLabsSchema = z.object({
+  url: z.string().optional(),
+  websocketUrl: z.string().optional(),
+  apiKey: z.string(),
+  model: z.string(),
+  voices: z.array(z.string()),
+  voice_settings: z
+    .object({
+      similarity_boost: z.number().optional(),
+      stability: z.number().optional(),
+      style: z.number().optional(),
+      use_speaker_boost: z.boolean().optional(),
+    })
+    .optional(),
+  pronunciation_dictionary_locators: z.array(z.string()).optional(),
+});
+
+const ttsLocalaiSchema = z.object({
+  url: z.string(),
+  apiKey: z.string().optional(),
+  voices: z.array(z.string()),
+  backend: z.string(),
+});
+
 const ttsSchema = z.object({
-  openai: z
-    .object({
-      url: z.string().optional(),
-      apiKey: z.string(),
-      model: z.string(),
-      voices: z.array(z.string()),
-    })
-    .optional(),
-  elevenLabs: z
-    .object({
-      url: z.string().optional(),
-      websocketUrl: z.string().optional(),
-      apiKey: z.string(),
-      model: z.string(),
-      voices: z.array(z.string()),
-      voice_settings: z
-        .object({
-          similarity_boost: z.number().optional(),
-          stability: z.number().optional(),
-          style: z.number().optional(),
-          use_speaker_boost: z.boolean().optional(),
-        })
-        .optional(),
-      pronunciation_dictionary_locators: z.array(z.string()).optional(),
-    })
-    .optional(),
-  localai: z
-    .object({
-      url: z.string(),
-      apiKey: z.string().optional(),
-      voices: z.array(z.string()),
-      backend: z.string(),
-    })
-    .optional(),
+  openai: ttsOpenaiSchema.optional(),
+  elevenLabs: ttsElevenLabsSchema.optional(),
+  localai: ttsLocalaiSchema.optional(),
 });

 const sttSchema = z.object({
@@ -359,6 +359,12 @@ export const getConfigDefaults = () => getSchemaDefaults(configSchema);

 export type TCustomConfig = z.infer<typeof configSchema>;

+export type TProviderSchema =
+  | z.infer<typeof ttsOpenaiSchema>
+  | z.infer<typeof ttsElevenLabsSchema>
+  | z.infer<typeof ttsLocalaiSchema>
+  | undefined;
+
 export enum KnownEndpoints {
  anyscale = 'anyscale',
  apipie = 'apipie',