From f158f07ee0239c2e60f3f393e8d1447f6156810d Mon Sep 17 00:00:00 2001 From: Mieszko Makuch Date: Wed, 25 Mar 2026 11:38:57 +0100 Subject: [PATCH 1/3] feat(stt): add server-side language fallback and extraParams for OpenAI STT provider Add two optional fields to the OpenAI STT provider config schema: - `language`: server-side default language (ISO 639-1) sent to Whisper when the client doesn't provide one. Useful for non-English deployments where admins want to predefine the transcription language without requiring each user to configure it in the browser. - `extraParams`: arbitrary key-value pairs forwarded to the STT endpoint. Enables self-hosted Whisper servers (e.g. Speaches, faster-whisper-server) to receive provider-specific parameters like `vad_filter` (Voice Activity Detection) which filters silence and prevents hallucinations on empty audio clips. These params are ignored by the official OpenAI API. Example librechat.yaml configuration: ```yaml speech: stt: openai: url: 'http://whisper-server/v1/audio/transcriptions' apiKey: 'none' model: 'whisper-large-v3-turbo' language: 'pl' extraParams: vad_filter: true ``` --- api/server/services/Files/Audio/STTService.js | 6 +++++- packages/data-provider/src/config.ts | 2 ++ 2 files changed, 7 insertions(+), 1 deletion(-) diff --git a/api/server/services/Files/Audio/STTService.js b/api/server/services/Files/Audio/STTService.js index 4ba62a7eeb..3a7a51bd8d 100644 --- a/api/server/services/Files/Audio/STTService.js +++ b/api/server/services/Files/Audio/STTService.js @@ -206,6 +206,10 @@ class STTService { data.language = validLanguage; } + if (sttSchema?.extraParams) { + Object.assign(data, sttSchema.extraParams); + } + const headers = { 'Content-Type': 'multipart/form-data', ...(apiKey && { Authorization: `Bearer ${apiKey}` }), @@ -338,7 +342,7 @@ class STTService { try { const [provider, sttSchema] = await this.getProviderSchema(req); - const language = req.body?.language || ''; + const language = req.body?.language 
|| sttSchema?.language || ''; const text = await this.sttRequest(provider, sttSchema, { audioBuffer, audioFile, language }); res.json({ text }); } catch (error) { diff --git a/packages/data-provider/src/config.ts b/packages/data-provider/src/config.ts index 9bc3822c4b..fb89c7c486 100644 --- a/packages/data-provider/src/config.ts +++ b/packages/data-provider/src/config.ts @@ -478,6 +478,8 @@ const sttOpenaiSchema = z.object({ url: z.string().optional(), apiKey: z.string(), model: z.string(), + language: z.string().optional(), + extraParams: z.record(z.unknown()).optional(), }); const sttAzureOpenAISchema = z.object({ From dead7b9d6bb8bd1c17f28192513e2bda64e583e0 Mon Sep 17 00:00:00 2001 From: Mieszko Makuch Date: Thu, 26 Mar 2026 01:56:40 +0100 Subject: [PATCH 2/3] =?UTF-8?q?fix(stt):=20address=20review=20feedback=20?= =?UTF-8?q?=E2=80=94=20reserved=20fields=20guard,=20schema=20validation,?= =?UTF-8?q?=20docs?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit - Filter reserved fields (file, model, language) from extraParams before merge - Change extraParams schema to z.record(z.union([z.string(), z.number(), z.boolean()])) - Add regex validation for language field in Zod schema - Add JSDoc note about extraParams in openAIProvider - Add comment clarifying language/extraParams are OpenAI-only (not Azure) - Remove unnecessary optional chaining on sttSchema --- api/server/services/Files/Audio/STTService.js | 10 +++++++--- packages/data-provider/src/config.ts | 8 ++++++-- 2 files changed, 13 insertions(+), 5 deletions(-) diff --git a/api/server/services/Files/Audio/STTService.js b/api/server/services/Files/Audio/STTService.js index 3a7a51bd8d..abd9c2272b 100644 --- a/api/server/services/Files/Audio/STTService.js +++ b/api/server/services/Files/Audio/STTService.js @@ -186,7 +186,7 @@ class STTService { /** * Prepares the request for the OpenAI STT provider. - * @param {Object} sttSchema - The STT schema for OpenAI. 
+ * @param {Object} sttSchema - The STT schema for OpenAI (includes optional language and extraParams). * @param {Stream} audioReadStream - The audio data to be transcribed. * @param {Object} audioFile - The audio file object (unused in OpenAI provider). * @param {string} language - The language code for the transcription. @@ -206,8 +206,12 @@ class STTService { data.language = validLanguage; } - if (sttSchema?.extraParams) { - Object.assign(data, sttSchema.extraParams); + if (sttSchema.extraParams) { + const reservedFields = new Set(['file', 'model', 'language']); + const safeParams = Object.fromEntries( + Object.entries(sttSchema.extraParams).filter(([key]) => !reservedFields.has(key)), + ); + Object.assign(data, safeParams); } const headers = { diff --git a/packages/data-provider/src/config.ts b/packages/data-provider/src/config.ts index fb89c7c486..7ba0c99249 100644 --- a/packages/data-provider/src/config.ts +++ b/packages/data-provider/src/config.ts @@ -478,10 +478,14 @@ const sttOpenaiSchema = z.object({ url: z.string().optional(), apiKey: z.string(), model: z.string(), - language: z.string().optional(), - extraParams: z.record(z.unknown()).optional(), + language: z + .string() + .regex(/^[a-z]{2}(-[a-z]{2})?$/) + .optional(), + extraParams: z.record(z.union([z.string(), z.number(), z.boolean()])).optional(), }); +/** Note: language and extraParams are only supported for the OpenAI provider. 
*/ const sttAzureOpenAISchema = z.object({ instanceName: z.string(), apiKey: z.string(), From 5301ab4351ee5b009c8134e09bf4f68a2ae7e7fe Mon Sep 17 00:00:00 2001 From: Mieszko Makuch Date: Thu, 26 Mar 2026 03:16:05 +0100 Subject: [PATCH 3/3] test(stt): add tests for language fallback, extraParams, and schema validation - Schema: accepts valid language codes (pl, en-us), rejects invalid (Polish, xyz123) - Schema: accepts string/number/boolean extraParams, rejects null - openAIProvider: validated language included in request data - openAIProvider: extraParams forwarded, reserved fields (file, model, language) filtered - processSpeechToText: client language wins over schema, schema fallback when empty - processSpeechToText: temp file cleanup verified --- .../services/Files/Audio/STTService.spec.js | 161 ++++++++++++++++++ 1 file changed, 161 insertions(+) create mode 100644 api/server/services/Files/Audio/STTService.spec.js diff --git a/api/server/services/Files/Audio/STTService.spec.js b/api/server/services/Files/Audio/STTService.spec.js new file mode 100644 index 0000000000..db5ec625bc --- /dev/null +++ b/api/server/services/Files/Audio/STTService.spec.js @@ -0,0 +1,161 @@ +const { z } = require('zod'); +const { Readable } = require('stream'); +const { STTService } = require('./STTService'); + +jest.mock('axios'); +// Required: real import pulls in sharp via dependency chain, which isn't available in test env +jest.mock('@librechat/api', () => ({ genAzureEndpoint: jest.fn(), logAxiosError: jest.fn() })); +jest.mock('~/server/services/Config', () => ({ getAppConfig: jest.fn() })); + +const axios = require('axios'); +const fs = require('fs').promises; +const { getAppConfig } = require('~/server/services/Config'); + +// Helpers +const createStream = () => + Object.assign(Readable.from(Buffer.from('audio')), { path: 'audio.webm' }); + +const baseSchema = { + url: 'http://whisper/v1/audio/transcriptions', + apiKey: 'none', + model: 'whisper-1', +}; + +const createAppConfig 
= (extra = {}) => ({ + speech: { stt: { openai: { ...baseSchema, ...extra } } }, +}); + +// Mirror of sttOpenaiSchema from config.ts (not exported). +// This is a manual copy: it cannot detect upstream schema changes, so keep it in sync by hand. +const sttOpenaiSchema = z.object({ + url: z.string().optional(), + apiKey: z.string(), + model: z.string(), + language: z + .string() + .regex(/^[a-z]{2}(-[a-z]{2})?$/) + .optional(), + extraParams: z.record(z.union([z.string(), z.number(), z.boolean()])).optional(), +}); + +describe('sttOpenaiSchema', () => { + const base = { apiKey: 'none', model: 'whisper-1' }; + + it.each([ + { lang: 'pl', valid: true }, + { lang: 'en-us', valid: true }, + { lang: 'Polish', valid: false }, + { lang: 'xyz123', valid: false }, + ])('language "$lang" → valid=$valid', ({ lang, valid }) => { + const fn = () => sttOpenaiSchema.parse({ ...base, language: lang }); + valid ? expect(fn().language).toBe(lang) : expect(fn).toThrow(); + }); + + it.each([ + { desc: 'string/number/boolean', params: { vad_filter: true, beam_size: 5 }, valid: true }, + { desc: 'null value', params: { bad: null }, valid: false }, + ])('extraParams with $desc → valid=$valid', ({ params, valid }) => { + const fn = () => sttOpenaiSchema.parse({ ...base, extraParams: params }); + valid ? 
expect(fn().extraParams).toEqual(params) : expect(fn).toThrow(); + }); + + it('works without optional fields', () => { + const result = sttOpenaiSchema.parse(base); + expect(result.language).toBeUndefined(); + expect(result.extraParams).toBeUndefined(); + }); +}); + +describe('STTService — openAIProvider', () => { + let service; + beforeEach(() => { + jest.clearAllMocks(); + service = new STTService(); + }); + + it('includes validated language in request data', () => { + const [, data] = service.openAIProvider(baseSchema, createStream(), {}, 'pl'); + expect(data.language).toBe('pl'); + }); + + it('omits language when empty string passed', () => { + const [, data] = service.openAIProvider(baseSchema, createStream(), {}, ''); + expect(data.language).toBeUndefined(); + }); + + it('forwards extraParams to request data', () => { + const schema = { ...baseSchema, extraParams: { vad_filter: true, temperature: 0.5 } }; + const [, data] = service.openAIProvider(schema, createStream(), {}, ''); + expect(data.vad_filter).toBe(true); + expect(data.temperature).toBe(0.5); + }); + + it.each([ + { field: 'file', preserved: 'stream' }, + { field: 'model', preserved: 'whisper-1' }, + { field: 'language', preserved: 'pl' }, + ])('filters reserved field "$field" from extraParams', ({ field }) => { + const stream = createStream(); + const schema = { ...baseSchema, extraParams: { [field]: 'bad', vad_filter: true } }; + const [, data] = service.openAIProvider(schema, stream, {}, field === 'language' ? 
'pl' : ''); + + expect(data.vad_filter).toBe(true); + if (field === 'file') expect(data.file).toBe(stream); + if (field === 'model') expect(data.model).toBe('whisper-1'); + if (field === 'language') expect(data.language).toBe('pl'); + }); + + it('works without extraParams', () => { + const [, data] = service.openAIProvider(baseSchema, createStream(), {}, ''); + expect(data.file).toBeDefined(); + expect(data.model).toBe('whisper-1'); + }); +}); + +describe('STTService — processSpeechToText', () => { + let service; + const mockReq = (lang = '') => ({ + file: { + path: '/tmp/audio.webm', + originalname: 'audio.webm', + mimetype: 'audio/webm', + size: 1000, + }, + body: { language: lang }, + }); + const mockRes = () => ({ + json: jest.fn(), + status: jest.fn().mockReturnThis(), + sendStatus: jest.fn(), + }); + + beforeEach(() => { + jest.clearAllMocks(); + service = new STTService(); + jest.spyOn(fs, 'readFile').mockResolvedValue(Buffer.from('audio')); + jest.spyOn(fs, 'unlink').mockResolvedValue(); + axios.post.mockResolvedValue({ status: 200, data: { text: 'transcribed' } }); + }); + + it.each([ + { desc: 'client language over schema', clientLang: 'en', schemaLang: 'pl', expected: 'en' }, + { desc: 'schema fallback when client empty', clientLang: '', schemaLang: 'pl', expected: 'pl' }, + { + desc: 'no language when neither set', + clientLang: '', + schemaLang: undefined, + expected: undefined, + }, + ])('uses $desc', async ({ clientLang, schemaLang, expected }) => { + const extra = schemaLang ? { language: schemaLang } : {}; + getAppConfig.mockResolvedValue(createAppConfig(extra)); + await service.processSpeechToText(mockReq(clientLang), mockRes()); + expect(axios.post.mock.calls[0][1].language).toBe(expected); + }); + + it('cleans up temp file after processing', async () => { + getAppConfig.mockResolvedValue(createAppConfig()); + await service.processSpeechToText(mockReq(), mockRes()); + expect(fs.unlink).toHaveBeenCalledWith('/tmp/audio.webm'); + }); +});