LibreChat/api/server/services/Files/Audio/streamAudio.js
Danny Avila 6655304753
🎙️ a11y: Screen Reader Support for Dynamic Content Updates (#3625)
* WIP: first pass, hooks

* wip: isStream arg

* feat: first pass, dynamic content updates, screen reader announcements

* chore: unrelated, styling redundancy
2024-08-13 03:04:27 -04:00

358 lines
9.3 KiB
JavaScript

const WebSocket = require('ws');
const { CacheKeys, findLastSeparatorIndex, SEPARATORS } = require('librechat-data-provider');
const { getLogStores } = require('~/cache');
/**
* @param {string[]} voiceIds - Array of voice IDs
* @returns {string}
*/
function getRandomVoiceId(voiceIds) {
const randomIndex = Math.floor(Math.random() * voiceIds.length);
return voiceIds[randomIndex];
}
/**
* @typedef {Object} VoiceSettings
* @property {number} similarity_boost
* @property {number} stability
* @property {boolean} use_speaker_boost
*/
/**
* @typedef {Object} GenerateAudioBulk
* @property {string} model_id
* @property {string} text
* @property {VoiceSettings} voice_settings
*/
/**
* @typedef {Object} TextToSpeechClient
* @property {function(Object): Promise<stream.Readable>} generate
*/
/**
* @typedef {Object} AudioChunk
* @property {string} audio
* @property {boolean} isFinal
* @property {Object} alignment
* @property {number[]} alignment.char_start_times_ms
* @property {number[]} alignment.chars_durations_ms
* @property {string[]} alignment.chars
* @property {Object} normalizedAlignment
* @property {number[]} normalizedAlignment.char_start_times_ms
* @property {number[]} normalizedAlignment.chars_durations_ms
* @property {string[]} normalizedAlignment.chars
*/
/**
*
* @param {Record<string, unknown | undefined>} parameters
* @returns
*/
function assembleQuery(parameters) {
let query = '';
let hasQuestionMark = false;
for (const [key, value] of Object.entries(parameters)) {
if (value == null) {
continue;
}
if (!hasQuestionMark) {
query += '?';
hasQuestionMark = true;
} else {
query += '&';
}
query += `${key}=${value}`;
}
return query;
}
const MAX_NOT_FOUND_COUNT = 6;
const MAX_NO_CHANGE_COUNT = 10;
/**
* @param {string} messageId
* @returns {() => Promise<{ text: string, isFinished: boolean }[]>}
*/
function createChunkProcessor(messageId) {
let notFoundCount = 0;
let noChangeCount = 0;
let processedText = '';
if (!messageId) {
throw new Error('Message ID is required');
}
const messageCache = getLogStores(CacheKeys.MESSAGES);
/**
* @returns {Promise<{ text: string, isFinished: boolean }[] | string>}
*/
async function processChunks() {
if (notFoundCount >= MAX_NOT_FOUND_COUNT) {
return `Message not found after ${MAX_NOT_FOUND_COUNT} attempts`;
}
if (noChangeCount >= MAX_NO_CHANGE_COUNT) {
return `No change in message after ${MAX_NO_CHANGE_COUNT} attempts`;
}
/** @type { string | { text: string; complete: boolean } } */
const message = await messageCache.get(messageId);
if (!message) {
notFoundCount++;
return [];
}
const text = typeof message === 'string' ? message : message.text;
const complete = typeof message === 'string' ? false : message.complete;
if (text === processedText) {
noChangeCount++;
}
const remainingText = text.slice(processedText.length);
const chunks = [];
if (!complete && remainingText.length >= 20) {
const separatorIndex = findLastSeparatorIndex(remainingText);
if (separatorIndex !== -1) {
const chunkText = remainingText.slice(0, separatorIndex + 1);
chunks.push({ text: chunkText, isFinished: false });
processedText += chunkText;
} else {
chunks.push({ text: remainingText, isFinished: false });
processedText = text;
}
} else if (complete && remainingText.trim().length > 0) {
chunks.push({ text: remainingText.trim(), isFinished: true });
processedText = text;
}
return chunks;
}
return processChunks;
}
/**
* @param {string} text
* @param {number} [chunkSize=4000]
* @returns {{ text: string, isFinished: boolean }[]}
*/
function splitTextIntoChunks(text, chunkSize = 4000) {
if (!text) {
throw new Error('Text is required');
}
const chunks = [];
let startIndex = 0;
const textLength = text.length;
while (startIndex < textLength) {
let endIndex = Math.min(startIndex + chunkSize, textLength);
let chunkText = text.slice(startIndex, endIndex);
if (endIndex < textLength) {
let lastSeparatorIndex = -1;
for (const separator of SEPARATORS) {
const index = chunkText.lastIndexOf(separator);
if (index !== -1) {
lastSeparatorIndex = Math.max(lastSeparatorIndex, index);
}
}
if (lastSeparatorIndex !== -1) {
endIndex = startIndex + lastSeparatorIndex + 1;
chunkText = text.slice(startIndex, endIndex);
} else {
const nextSeparatorIndex = text.slice(endIndex).search(/\S/);
if (nextSeparatorIndex !== -1) {
endIndex += nextSeparatorIndex;
chunkText = text.slice(startIndex, endIndex);
}
}
}
chunkText = chunkText.trim();
if (chunkText) {
chunks.push({
text: chunkText,
isFinished: endIndex >= textLength,
});
} else if (chunks.length > 0) {
chunks[chunks.length - 1].isFinished = true;
}
startIndex = endIndex;
while (startIndex < textLength && text[startIndex].trim() === '') {
startIndex++;
}
}
return chunks;
}
/**
* Input stream text to speech
* @param {Express.Response} res
* @param {AsyncIterable<string>} textStream
* @param {(token: string) => Promise<boolean>} callback - Whether to continue the stream or not
* @returns {AsyncGenerator<AudioChunk>}
*/
function inputStreamTextToSpeech(res, textStream, callback) {
const model = 'eleven_monolingual_v1';
const wsUrl = `wss://api.elevenlabs.io/v1/text-to-speech/${getRandomVoiceId()}/stream-input${assembleQuery(
{
model_id: model,
// flush: true,
// optimize_streaming_latency: this.settings.optimizeStreamingLatency,
optimize_streaming_latency: 1,
// output_format: this.settings.outputFormat,
},
)}`;
const socket = new WebSocket(wsUrl);
socket.onopen = function () {
const streamStart = {
text: ' ',
voice_settings: {
stability: 0.5,
similarity_boost: 0.8,
},
xi_api_key: process.env.ELEVENLABS_API_KEY,
// generation_config: { chunk_length_schedule: [50, 90, 120, 150, 200] },
};
socket.send(JSON.stringify(streamStart));
// send stream until done
const streamComplete = new Promise((resolve, reject) => {
(async () => {
let textBuffer = '';
let shouldContinue = true;
for await (const textDelta of textStream) {
textBuffer += textDelta;
// using ". " as separator: sending in full sentences improves the quality
// of the audio output significantly.
const separatorIndex = findLastSeparatorIndex(textBuffer);
// Callback for textStream (will return false if signal is aborted)
shouldContinue = await callback(textDelta);
if (separatorIndex === -1) {
continue;
}
if (!shouldContinue) {
break;
}
const textToProcess = textBuffer.slice(0, separatorIndex);
textBuffer = textBuffer.slice(separatorIndex + 1);
const request = {
text: textToProcess,
try_trigger_generation: true,
};
socket.send(JSON.stringify(request));
}
// send remaining text:
if (shouldContinue && textBuffer.length > 0) {
socket.send(
JSON.stringify({
text: `${textBuffer} `, // append space
try_trigger_generation: true,
}),
);
}
})()
.then(resolve)
.catch(reject);
});
streamComplete
.then(() => {
const endStream = {
text: '',
};
socket.send(JSON.stringify(endStream));
})
.catch((e) => {
console.error('Error streaming text to speech:', e);
throw e;
});
};
return (async function* audioStream() {
let isDone = false;
let chunks = [];
let resolve;
let waitForMessage = new Promise((r) => (resolve = r));
socket.onmessage = function (event) {
// console.log(event);
const audioChunk = JSON.parse(event.data);
if (audioChunk.audio && audioChunk.alignment) {
res.write(`event: audio\ndata: ${event.data}\n\n`);
chunks.push(audioChunk);
resolve(null);
waitForMessage = new Promise((r) => (resolve = r));
} else if (audioChunk.isFinal) {
isDone = true;
resolve(null);
} else if (audioChunk.message) {
console.warn('Received Elevenlabs message:', audioChunk.message);
resolve(null);
}
};
socket.onerror = function (error) {
console.error('WebSocket error:', error);
// throw error;
};
socket.onclose = function () {
isDone = true;
resolve(null);
};
while (!isDone) {
await waitForMessage;
yield* chunks;
chunks = [];
}
res.write('event: end\ndata: \n\n');
})();
}
/**
*
* @param {AsyncIterable<string>} llmStream
*/
async function* llmMessageSource(llmStream) {
for await (const chunk of llmStream) {
const message = chunk.choices[0].delta.content;
if (message) {
yield message;
}
}
}
module.exports = {
inputStreamTextToSpeech,
findLastSeparatorIndex,
createChunkProcessor,
splitTextIntoChunks,
llmMessageSource,
getRandomVoiceId,
};