// LibreChat/api/server/services/Files/Audio/speechToText.js

const axios = require('axios');
const { Readable } = require('stream');
const { logger } = require('~/config');
const getCustomConfig = require('~/server/services/Config/getCustomConfig');
const { extractEnvVariable } = require('librechat-data-provider');

/**
 * Validate an STT API response and extract the transcribed text.
 *
 * @param {Object} response - The HTTP response returned by the STT API call
 * @param {number} response.status - HTTP status code; only 200 is accepted
 * @param {Object} [response.data] - Response payload, expected to carry a `text` property
 *
 * @returns {Promise<string>} The transcription with surrounding whitespace trimmed
 *
 * @throws {Error} If the status is not 200, or if the payload or its `text` is missing/empty
 */
async function handleResponse(response) {
  const succeeded = response.status === 200;
  if (!succeeded) {
    throw new Error('Invalid response from the STT API');
  }

  // A falsy payload or a falsy `text` (including the empty string) both count as missing.
  const transcript = response.data?.text;
  if (!transcript) {
    throw new Error('Missing data in response from the STT API');
  }

  return transcript.trim();
}

/**
 * Determine which speech-to-text provider the schema configures.
 * Only an `openai` entry is recognised here.
 *
 * @param {Object} [sttSchema] - The speech-to-text section of the custom config
 *
 * @returns {string} The provider identifier (`'openai'`)
 *
 * @throws {Error} If the schema is missing or has no truthy `openai` entry
 */
function getProvider(sttSchema) {
  const hasOpenAI = Boolean(sttSchema?.openai);

  if (!hasOpenAI) {
    throw new Error('Invalid provider');
  }

  return 'openai';
}

/**
 * Strip `undefined` properties from an object in place, descending into nested objects.
 * A nested object (or array) that has no own enumerable keys after cleaning is removed too.
 * `null` values are left untouched.
 *
 * @param {Object} obj - The object to clean; it is mutated directly
 *
 * @returns {void}
 */
function removeUndefined(obj) {
  // Object.keys() yields a snapshot, so deleting while looping is safe.
  for (const key of Object.keys(obj)) {
    const value = obj[key];

    if (value === undefined) {
      delete obj[key];
      continue;
    }

    const isNested = Boolean(value) && typeof value === 'object';
    if (!isNested) {
      continue;
    }

    removeUndefined(value);

    if (Object.keys(value).length === 0) {
      delete obj[key];
    }
  }
}

/**
 * Build the URL, payload and headers for an OpenAI transcription request.
 *
 * The endpoint defaults to the public OpenAI transcriptions URL unless the schema
 * provides its own `url`. The API key, when configured, is passed through
 * `extractEnvVariable` and sent as a Bearer token.
 *
 * @param {Object} sttSchema - The speech-to-text schema containing the `openai` configuration
 * @param {Stream} audioReadStream - The audio data to be transcribed
 *
 * @returns {Array} `[url, data, headers]` for the request. If preparing the request
 * fails, the error is logged and `[null, null, null]` is returned instead.
 */
function openAIProvider(sttSchema, audioReadStream) {
  try {
    const { openai } = sttSchema;

    const url = openai?.url || 'https://api.openai.com/v1/audio/transcriptions';
    const apiKey = openai.apiKey ? extractEnvVariable(openai.apiKey) : '';

    const data = {
      file: audioReadStream,
      model: openai.model,
    };

    const headers = {
      'Content-Type': 'multipart/form-data',
    };

    removeUndefined(headers);

    // Only attach credentials when a key is actually configured.
    if (apiKey) {
      headers.Authorization = `Bearer ${apiKey}`;
    }

    return [url, data, headers];
  } catch (error) {
    logger.error('An error occurred while preparing the OpenAI API STT request: ', error);
    return [null, null, null];
  }
}

/**
 * Build the URL, payload and headers for an Azure OpenAI transcription request.
 *
 * The Azure configuration is looked up in `req.app.locals` under the endpoint name
 * taken from `req.body.endpoint`. The first group in its `groupMap` that declares a
 * model whose key starts with `whisper` supplies the API key, instance name,
 * API version and deployment name.
 *
 * @param {Object} req - The request object; `req.body.endpoint` names the config in `req.app.locals`
 * @param {Stream} audioReadStream - The audio data to be transcribed
 *
 * @returns {Array} `[url, data, headers]` for the request. If the configuration is
 * missing or incomplete, the error is logged and `[null, null, null]` is returned instead.
 */
function azureProvider(req, audioReadStream) {
  try {
    const { endpoint } = req.body;
    const azureConfig = req.app.locals[endpoint];

    if (!azureConfig) {
      throw new Error(`No configuration found for endpoint: ${endpoint}`);
    }

    // Default to an empty object so that "no whisper model configured" reaches the
    // explicit validation below instead of failing while destructuring `null`.
    let whisperConfig = {};

    for (const group of Object.values(azureConfig.groupMap ?? {})) {
      const models = group?.models ?? {};
      const whisperKey = Object.keys(models).find((modelKey) => modelKey.startsWith('whisper'));

      if (whisperKey) {
        whisperConfig = {
          apiVersion: group.version,
          apiKey: group.apiKey,
          instanceName: group.instanceName,
          whisperModel: models[whisperKey]?.deploymentName,
        };
        break; // the first matching group wins
      }
    }

    const { apiKey, instanceName, whisperModel, apiVersion } = whisperConfig;

    if (!apiKey || !instanceName || !whisperModel || !apiVersion) {
      throw new Error('Required Azure configuration values are missing');
    }

    const baseURL = `https://${instanceName}.openai.azure.com`;
    const url = `${baseURL}/openai/deployments/${whisperModel}/audio/transcriptions?api-version=${apiVersion}`;

    // Same payload shape that openAIProvider returns: a plain object holding the stream.
    // NOTE(review): this relies on the HTTP client turning the object into a multipart
    // body based on the Content-Type header below — confirm against the axios version in use.
    const data = {
      file: audioReadStream,
    };

    // `data` is a plain object with no header helpers of its own, so the headers are
    // spelled out explicitly.
    const headers = {
      'Content-Type': 'multipart/form-data',
      'api-key': apiKey,
    };

    return [url, data, headers];
  } catch (error) {
    logger.error('An error occurred while preparing the Azure API STT request: ', error);
    return [null, null, null];
  }
}

/**
 * Convert an uploaded audio file to text using the configured STT provider.
 *
 * Responds with:
 *  - 500 and a plain message when the custom config cannot be loaded,
 *  - 400 and a JSON message when no audio buffer was uploaded,
 *  - 200 and `{ text }` on success,
 *  - 500 (status only) when the provider is misconfigured, the request could not be
 *    prepared, or the STT API call fails; the error is logged.
 *
 * @param {Object} req - The request object; the audio is read from `req.file.buffer`
 * @param {Object} res - The response object
 *
 * @returns {Promise<Object|undefined>} The result of the early-exit response call, otherwise nothing
 */
async function speechToText(req, res) {
  const customConfig = await getCustomConfig();
  if (!customConfig) {
    return res.status(500).send('Custom config not found');
  }

  if (!req.file || !req.file.buffer) {
    return res.status(400).json({ message: 'No audio file provided in the FormData' });
  }

  try {
    const audioReadStream = Readable.from(req.file.buffer);
    // NOTE(review): presumably the multipart encoder derives the upload's filename from
    // the stream's `path` — confirm against the HTTP client in use.
    audioReadStream.path = 'audio.wav';

    // Provider resolution lives inside the try block so that a missing `speech.stt`
    // section or an unsupported provider produces a 500 response rather than an
    // unhandled promise rejection.
    const sttSchema = customConfig.speech?.stt;
    const provider = getProvider(sttSchema);

    let url;
    let data;
    let headers;

    switch (provider) {
      case 'openai':
        [url, data, headers] = openAIProvider(sttSchema, audioReadStream);
        break;
      case 'azure':
        [url, data, headers] = azureProvider(req, audioReadStream);
        break;
      default:
        throw new Error('Invalid provider');
    }

    // The provider helpers return [null, null, null] after logging their own failure.
    if (!url) {
      throw new Error(`Failed to prepare the STT request for provider: ${provider}`);
    }

    const response = await axios.post(url, data, { headers });
    const text = await handleResponse(response);

    res.json({ text });
  } catch (error) {
    logger.error('An error occurred while processing the audio:', error);
    res.sendStatus(500);
  }
}

// The `speechToText(req, res)` handler is this module's only export.
module.exports = speechToText;