🗣️ refactor: speech services; fix: OpenAI STT (#3431)

* fix: OpenAI STT * refactor: STT and TTS service, slightly imporve of performance * fix(DecibelSelector): update default value
2026-02-06 01:31:49 +01:00 · 2024-07-30 09:18:52 -04:00 · 2024-07-30 09:18:52 -04:00 · 51cd847606
commit 51cd847606
parent 4ffdefc2a8
8 changed files with 737 additions and 714 deletions
--- a/api/server/services/Files/Audio/STTService.js
+++ b/api/server/services/Files/Audio/STTService.js
@ -0,0 +1,248 @@
+const { Readable } = require('stream');
+const axios = require('axios');
+const { extractEnvVariable, STTProviders } = require('librechat-data-provider');
+const getCustomConfig = require('~/server/services/Config/getCustomConfig');
+const { genAzureEndpoint } = require('~/utils');
+const { logger } = require('~/config');
+
+/**
+ * Service class for handling Speech-to-Text (STT) operations.
+ * @class
+ */
+class STTService {
+  /**
+   * Creates an instance of STTService.
+   * @param {Object} customConfig - The custom configuration object.
+   */
+  constructor(customConfig) {
+    this.customConfig = customConfig;
+    this.providerStrategies = {
+      [STTProviders.OPENAI]: this.openAIProvider,
+      [STTProviders.AZURE_OPENAI]: this.azureOpenAIProvider,
+    };
+  }
+
+  /**
+   * Creates a singleton instance of STTService.
+   * @static
+   * @async
+   * @returns {Promise<STTService>} The STTService instance.
+   * @throws {Error} If the custom config is not found.
+   */
+  static async getInstance() {
+    const customConfig = await getCustomConfig();
+    if (!customConfig) {
+      throw new Error('Custom config not found');
+    }
+    return new STTService(customConfig);
+  }
+
+  /**
+   * Retrieves the configured STT provider and its schema.
+   * @returns {Promise<[string, Object]>} A promise that resolves to an array containing the provider name and its schema.
+   * @throws {Error} If no STT schema is set, multiple providers are set, or no provider is set.
+   */
+  async getProviderSchema() {
+    const sttSchema = this.customConfig.speech.stt;
+
+    if (!sttSchema) {
+      throw new Error(
+        'No STT schema is set. Did you configure STT in the custom config (librechat.yaml)?',
+      );
+    }
+
+    const providers = Object.entries(sttSchema).filter(
+      ([, value]) => Object.keys(value).length > 0,
+    );
+
+    if (providers.length !== 1) {
+      throw new Error(
+        providers.length > 1
+          ? 'Multiple providers are set. Please set only one provider.'
+          : 'No provider is set. Please set a provider.',
+      );
+    }
+
+    const [provider, schema] = providers[0];
+    return [provider, schema];
+  }
+
+  /**
+   * Recursively removes undefined properties from an object.
+   * @param {Object} obj - The object to clean.
+   * @returns {void}
+   */
+  removeUndefined(obj) {
+    Object.keys(obj).forEach((key) => {
+      if (obj[key] && typeof obj[key] === 'object') {
+        this.removeUndefined(obj[key]);
+        if (Object.keys(obj[key]).length === 0) {
+          delete obj[key];
+        }
+      } else if (obj[key] === undefined) {
+        delete obj[key];
+      }
+    });
+  }
+
+  /**
+   * Prepares the request for the OpenAI STT provider.
+   * @param {Object} sttSchema - The STT schema for OpenAI.
+   * @param {Stream} audioReadStream - The audio data to be transcribed.
+   * @returns {Array} An array containing the URL, data, and headers for the request.
+   */
+  openAIProvider(sttSchema, audioReadStream) {
+    const url = sttSchema?.url || 'https://api.openai.com/v1/audio/transcriptions';
+    const apiKey = extractEnvVariable(sttSchema.apiKey) || '';
+
+    const data = {
+      file: audioReadStream,
+      model: sttSchema.model,
+    };
+
+    const headers = {
+      'Content-Type': 'multipart/form-data',
+      ...(apiKey && { Authorization: `Bearer ${apiKey}` }),
+    };
+    [headers].forEach(this.removeUndefined);
+
+    return [url, data, headers];
+  }
+
+  /**
+   * Prepares the request for the Azure OpenAI STT provider.
+   * @param {Object} sttSchema - The STT schema for Azure OpenAI.
+   * @param {Buffer} audioBuffer - The audio data to be transcribed.
+   * @param {Object} audioFile - The audio file object containing originalname, mimetype, and size.
+   * @returns {Array} An array containing the URL, data, and headers for the request.
+   * @throws {Error} If the audio file size exceeds 25MB or the audio file format is not accepted.
+   */
+  azureOpenAIProvider(sttSchema, audioBuffer, audioFile) {
+    const url = `${genAzureEndpoint({
+      azureOpenAIApiInstanceName: sttSchema?.instanceName,
+      azureOpenAIApiDeploymentName: sttSchema?.deploymentName,
+    })}/audio/transcriptions?api-version=${sttSchema?.apiVersion}`;
+
+    const apiKey = sttSchema.apiKey ? extractEnvVariable(sttSchema.apiKey) : '';
+
+    if (audioBuffer.byteLength > 25 * 1024 * 1024) {
+      throw new Error('The audio file size exceeds the limit of 25MB');
+    }
+
+    const acceptedFormats = ['flac', 'mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'ogg', 'wav', 'webm'];
+    const fileFormat = audioFile.mimetype.split('/')[1];
+    if (!acceptedFormats.includes(fileFormat)) {
+      throw new Error(`The audio file format ${fileFormat} is not accepted`);
+    }
+
+    const formData = new FormData();
+    const audioBlob = new Blob([audioBuffer], { type: audioFile.mimetype });
+    formData.append('file', audioBlob, audioFile.originalname);
+
+    const headers = {
+      'Content-Type': 'multipart/form-data',
+      ...(apiKey && { 'api-key': apiKey }),
+    };
+
+    [headers].forEach(this.removeUndefined);
+
+    return [url, formData, headers];
+  }
+
+  /**
+   * Sends an STT request to the specified provider.
+   * @async
+   * @param {string} provider - The STT provider to use.
+   * @param {Object} sttSchema - The STT schema for the provider.
+   * @param {Object} requestData - The data required for the STT request.
+   * @param {Buffer} requestData.audioBuffer - The audio data to be transcribed.
+   * @param {Object} requestData.audioFile - The audio file object containing originalname, mimetype, and size.
+   * @returns {Promise<string>} A promise that resolves to the transcribed text.
+   * @throws {Error} If the provider is invalid, the response status is not 200, or the response data is missing.
+   */
+  async sttRequest(provider, sttSchema, { audioBuffer, audioFile }) {
+    const strategy = this.providerStrategies[provider];
+    if (!strategy) {
+      throw new Error('Invalid provider');
+    }
+
+    const audioReadStream = Readable.from(audioBuffer);
+    audioReadStream.path = 'audio.wav';
+
+    const [url, data, headers] = strategy.call(this, sttSchema, audioReadStream, audioFile);
+
+    if (!Readable.from && data instanceof FormData) {
+      const audioBlob = new Blob([audioBuffer], { type: audioFile.mimetype });
+      data.set('file', audioBlob, audioFile.originalname);
+    }
+
+    try {
+      const response = await axios.post(url, data, { headers });
+
+      if (response.status !== 200) {
+        throw new Error('Invalid response from the STT API');
+      }
+
+      if (!response.data || !response.data.text) {
+        throw new Error('Missing data in response from the STT API');
+      }
+
+      return response.data.text.trim();
+    } catch (error) {
+      logger.error(`STT request failed for provider ${provider}:`, error);
+      throw error;
+    }
+  }
+
+  /**
+   * Processes a speech-to-text request.
+   * @async
+   * @param {Object} req - The request object.
+   * @param {Object} res - The response object.
+   * @returns {Promise<void>}
+   */
+  async processTextToSpeech(req, res) {
+    if (!req.file || !req.file.buffer) {
+      return res.status(400).json({ message: 'No audio file provided in the FormData' });
+    }
+
+    const audioBuffer = req.file.buffer;
+    const audioFile = {
+      originalname: req.file.originalname,
+      mimetype: req.file.mimetype,
+      size: req.file.size,
+    };
+
+    try {
+      const [provider, sttSchema] = await this.getProviderSchema();
+      const text = await this.sttRequest(provider, sttSchema, { audioBuffer, audioFile });
+      res.json({ text });
+    } catch (error) {
+      logger.error('An error occurred while processing the audio:', error);
+      res.sendStatus(500);
+    }
+  }
+}
+
+/**
+ * Factory function to create an STTService instance.
+ * @async
+ * @returns {Promise<STTService>} A promise that resolves to an STTService instance.
+ */
+async function createSTTService() {
+  return STTService.getInstance();
+}
+
+/**
+ * Wrapper function for speech-to-text processing.
+ * @async
+ * @param {Object} req - The request object.
+ * @param {Object} res - The response object.
+ * @returns {Promise<void>}
+ */
+async function speechToText(req, res) {
+  const sttService = await createSTTService();
+  await sttService.processTextToSpeech(req, res);
+}
+
+module.exports = speechToText;