🗣️ fix(tts): Add Text Parser for Message Content Parts (#2840)

* fix: manual TTS trigger for message content parts
* ci(streamAudio): processChunks test
2025-12-17 08:50:15 +01:00 · 2024-05-22 23:27:37 -04:00 · 2024-05-22 23:27:37 -04:00 · 8e66683577
commit 8e66683577
parent dc1778b11f
4 changed files with 52 additions and 27 deletions
--- a/api/server/services/Files/Audio/streamAudio.spec.js
+++ b/api/server/services/Files/Audio/streamAudio.spec.js
@ -13,7 +13,7 @@ describe('processChunks', () => {
  let processChunks;
  beforeEach(() => {
-    processChunks = createChunkProcessor();
+    processChunks = createChunkProcessor('message-id');
    Message.findOne.mockClear();
    Message.findOne().lean.mockClear();
  });
@ -21,20 +21,17 @@ describe('processChunks', () => {
  it('should return an empty array when the message is not found', async () => {
    Message.findOne().lean.mockResolvedValueOnce(null);
-    const result = await processChunks('non-existent-id');
+    const result = await processChunks();
    expect(result).toEqual([]);
-    expect(Message.findOne).toHaveBeenCalledWith(
+    expect(Message.findOne).toHaveBeenCalledWith({ messageId: 'message-id' }, 'text unfinished');
      { messageId: 'non-existent-id' },
      'text unfinished',
    );
    expect(Message.findOne().lean).toHaveBeenCalled();
  });
  it('should return an empty array when the message does not have a text property', async () => {
    Message.findOne().lean.mockResolvedValueOnce({ unfinished: true });
-    const result = await processChunks('message-id');
+    const result = await processChunks();
    expect(result).toEqual([]);
    expect(Message.findOne).toHaveBeenCalledWith({ messageId: 'message-id' }, 'text unfinished');
@ -45,7 +42,7 @@ describe('processChunks', () => {
    const messageText = 'This is a long message. It should be split into chunks. Lol hi mom';
    Message.findOne().lean.mockResolvedValueOnce({ text: messageText, unfinished: true });
-    const result = await processChunks('message-id');
+    const result = await processChunks();
    expect(result).toEqual([
      { text: 'This is a long message. It should be split into chunks.', isFinished: false },
@ -58,7 +55,7 @@ describe('processChunks', () => {
    const messageText = 'This is a long message without separators hello there my friend';
    Message.findOne().lean.mockResolvedValueOnce({ text: messageText, unfinished: true });
-    const result = await processChunks('message-id');
+    const result = await processChunks();
    expect(result).toEqual([{ text: messageText, isFinished: false }]);
    expect(Message.findOne).toHaveBeenCalledWith({ messageId: 'message-id' }, 'text unfinished');
@ -69,7 +66,7 @@ describe('processChunks', () => {
    const messageText = 'This is a finished message.';
    Message.findOne().lean.mockResolvedValueOnce({ text: messageText, unfinished: false });
-    const result = await processChunks('message-id');
+    const result = await processChunks();
    expect(result).toEqual([{ text: messageText, isFinished: true }]);
    expect(Message.findOne).toHaveBeenCalledWith({ messageId: 'message-id' }, 'text unfinished');
@ -80,9 +77,9 @@ describe('processChunks', () => {
    const messageText = 'This is a finished message.';
    Message.findOne().lean.mockResolvedValueOnce({ text: messageText, unfinished: false });
-    await processChunks('message-id');
+    await processChunks();
    Message.findOne().lean.mockResolvedValueOnce({ text: messageText, unfinished: false });
-    const result = await processChunks('message-id');
+    const result = await processChunks();
    expect(result).toEqual([]);
    expect(Message.findOne).toHaveBeenCalledWith({ messageId: 'message-id' }, 'text unfinished');
--- a/client/src/components/Chat/Messages/HoverButtons.tsx
+++ b/client/src/components/Chat/Messages/HoverButtons.tsx
@ -50,7 +50,7 @@ export default function HoverButtons({
  const [TextToSpeech] = useRecoilState<boolean>(store.TextToSpeech);
  const { handleMouseDown, handleMouseUp, toggleSpeech, isSpeaking, isLoading } = useTextToSpeech(
-    message?.text ?? '',
+    message?.content ?? message?.text ?? '',
    isLast,
    index,
  );
--- a/client/src/hooks/Input/useTextToSpeech.ts
+++ b/client/src/hooks/Input/useTextToSpeech.ts
@ -1,11 +1,13 @@
 import { useRef } from 'react';
-import useTextToSpeechBrowser from './useTextToSpeechBrowser';
+import { parseTextParts } from 'librechat-data-provider';
 import type { TMessageContentParts } from 'librechat-data-provider';
 import useTextToSpeechExternal from './useTextToSpeechExternal';
 import useTextToSpeechBrowser from './useTextToSpeechBrowser';
 import { usePauseGlobalAudio } from '../Audio';
 import { useRecoilState } from 'recoil';
 import store from '~/store';
-const useTextToSpeech = (message: string, isLast: boolean, index = 0) => {
+const useTextToSpeech = (message: string | TMessageContentParts[], isLast: boolean, index = 0) => {
  const [endpointTTS] = useRecoilState<string>(store.endpointTTS);
  const useExternalTextToSpeech = endpointTTS === 'external';
@ -34,7 +36,8 @@ const useTextToSpeech = (message: string, isLast: boolean, index = 0) => {
    isMouseDownRef.current = true;
    timerRef.current = window.setTimeout(() => {
      if (isMouseDownRef.current) {
-        generateSpeech(message, true);
+        const parsedMessage = typeof message === 'string' ? message : parseTextParts(message);
        generateSpeech(parsedMessage, true);
      }
    }, 1000);
  };
@ -51,7 +54,8 @@ const useTextToSpeech = (message: string, isLast: boolean, index = 0) => {
      cancelSpeech();
      pauseGlobalAudio();
    } else {
-      generateSpeech(message, false);
+      const parsedMessage = typeof message === 'string' ? message : parseTextParts(message);
      generateSpeech(parsedMessage, false);
    }
  };
--- a/packages/data-provider/src/parsers.ts
+++ b/packages/data-provider/src/parsers.ts
@ -1,6 +1,8 @@
 import type { ZodIssue } from 'zod';
-import type { TConversation, TPreset } from './schemas';
+import type * as a from './types/assistants';
-import type { TConfig, TEndpointOption, TEndpointsConfig } from './types';
+import type * as s from './schemas';
 import type * as t from './types';
 import { ContentTypes } from './types/assistants';
 import {
  EModelEndpoint,
  openAISchema,
@ -71,7 +73,7 @@ export function getEnabledEndpoints() {
 }
 /** Orders an existing EndpointsConfig object based on enabled endpoint/custom ordering */
-export function orderEndpointsConfig(endpointsConfig: TEndpointsConfig) {
+export function orderEndpointsConfig(endpointsConfig: t.TEndpointsConfig) {
  if (!endpointsConfig) {
    return {};
  }
@ -79,7 +81,7 @@ export function orderEndpointsConfig(endpointsConfig: TEndpointsConfig) {
  const endpointKeys = Object.keys(endpointsConfig);
  const defaultCustomIndex = enabledEndpoints.indexOf(EModelEndpoint.custom);
  return endpointKeys.reduce(
-    (accumulatedConfig: Record<string, TConfig | null | undefined>, currentEndpointKey) => {
+    (accumulatedConfig: Record<string, t.TConfig | null | undefined>, currentEndpointKey) => {
      const isCustom = !(currentEndpointKey in EModelEndpoint);
      const isEnabled = enabledEndpoints.includes(currentEndpointKey);
      if (!isEnabled && !isCustom) {
@ -91,7 +93,7 @@ export function orderEndpointsConfig(endpointsConfig: TEndpointsConfig) {
      if (isCustom) {
        accumulatedConfig[currentEndpointKey] = {
          order: defaultCustomIndex >= 0 ? defaultCustomIndex : 9999,
-          ...(endpointsConfig[currentEndpointKey] as Omit<TConfig, 'order'> & { order?: number }),
+          ...(endpointsConfig[currentEndpointKey] as Omit<t.TConfig, 'order'> & { order?: number }),
        };
      } else if (endpointsConfig[currentEndpointKey]) {
        accumulatedConfig[currentEndpointKey] = {
@ -165,7 +167,7 @@ export const parseConvo = ({
 }: {
  endpoint: EModelEndpoint;
  endpointType?: EModelEndpoint;
-  conversation: Partial<TConversation | TPreset>;
+  conversation: Partial<s.TConversation | s.TPreset>;
  possibleValues?: TPossibleValues;
  // TODO: POC for default schema
  // defaultSchema?: Partial<EndpointSchema>,
@ -182,7 +184,7 @@ export const parseConvo = ({
  //   schema = schemaCreators[endpoint](defaultSchema);
  // }
-  const convo = schema.parse(conversation) as TConversation;
+  const convo = schema.parse(conversation) as s.TConversation;
  const { models, secondaryModels } = possibleValues ?? {};
  if (models && convo) {
@ -196,7 +198,7 @@ export const parseConvo = ({
  return convo;
 };
-export const getResponseSender = (endpointOption: TEndpointOption): string => {
+export const getResponseSender = (endpointOption: t.TEndpointOption): string => {
  const { model, endpoint, endpointType, modelDisplayLabel, chatGptLabel, modelLabel, jailbreak } =
    endpointOption;
@ -292,7 +294,7 @@ export const parseCompactConvo = ({
 }: {
  endpoint?: EModelEndpoint;
  endpointType?: EModelEndpoint;
-  conversation: Partial<TConversation | TPreset>;
+  conversation: Partial<s.TConversation | s.TPreset>;
  possibleValues?: TPossibleValues;
  // TODO: POC for default schema
  // defaultSchema?: Partial<EndpointSchema>,
@ -309,7 +311,7 @@ export const parseCompactConvo = ({
    schema = compactEndpointSchemas[endpointType];
  }
-  const convo = schema.parse(conversation) as TConversation;
+  const convo = schema.parse(conversation) as s.TConversation;
  // const { models, secondaryModels } = possibleValues ?? {};
  const { models } = possibleValues ?? {};
@ -323,3 +325,25 @@ export const parseCompactConvo = ({
  return convo;
 };
 /**
  * Concatenates the text of every `ContentTypes.TEXT` part into a single string.
  *
  * Adjacent text values are separated by exactly one space, unless the text
  * accumulated so far already ends with a space or the next value begins with
  * one. Parts of any other type are ignored. Missing entries, and text parts
  * without a non-empty string `text.value`, are skipped instead of throwing.
  *
  * @param contentParts - Message content parts to flatten.
  * @returns The joined text, or an empty string when there is no text content.
  */
 export function parseTextParts(contentParts: a.TMessageContentParts[]): string {
  let result = '';
  // `?? []` tolerates a missing array at runtime even though the type forbids it.
  for (const part of contentParts ?? []) {
    if (part == null || part.type !== ContentTypes.TEXT) {
      continue;
    }
    // Read defensively: a partially built part may not carry `text.value` yet.
    const textValue: unknown = part.text?.value;
    if (typeof textValue !== 'string' || textValue.length === 0) {
      // An empty value contributes nothing, so skipping it changes no output.
      continue;
    }
    // Insert a separator only when neither side of the join supplies one.
    const needsSeparator =
      result.length > 0 && !result.endsWith(' ') && !textValue.startsWith(' ');
    result += needsSeparator ? ` ${textValue}` : textValue;
  }
  return result;
 }