| | const axios = require('axios'); |
| | const fs = require('fs').promises; |
| | const FormData = require('form-data'); |
| | const { Readable } = require('stream'); |
| | const { logger } = require('@librechat/data-schemas'); |
| | const { HttpsProxyAgent } = require('https-proxy-agent'); |
| | const { genAzureEndpoint, logAxiosError } = require('@librechat/api'); |
| | const { extractEnvVariable, STTProviders } = require('librechat-data-provider'); |
| | const { getAppConfig } = require('~/server/services/Config'); |
| |
|
| | |
| | |
| | |
| | |
/**
 * Maps audio MIME types to the file extensions expected by STT providers.
 * Frozen so the shared lookup table cannot be mutated at runtime.
 */
const MIME_TO_EXTENSION_MAP = Object.freeze({
  // MP4 / M4A
  'audio/mp4': 'm4a',
  'audio/x-m4a': 'm4a',
  // OGG
  'audio/ogg': 'ogg',
  'audio/vorbis': 'ogg',
  'application/ogg': 'ogg',
  // WAV
  'audio/wav': 'wav',
  'audio/x-wav': 'wav',
  'audio/wave': 'wav',
  // MP3
  'audio/mp3': 'mp3',
  'audio/mpeg': 'mp3',
  'audio/mpeg3': 'mp3',
  // WebM
  'audio/webm': 'webm',
  // FLAC
  'audio/flac': 'flac',
  'audio/x-flac': 'flac',
});
| |
|
| | |
| | |
| | |
| | |
| | |
/**
 * Normalizes a user-supplied language tag to a bare two-letter ISO-639-1 code.
 * Accepts "en" or "en-US" style tags (case-insensitive) and returns the
 * primary subtag; anything else logs a warning and yields null so callers
 * simply omit the language parameter.
 *
 * @param {string} language - Raw language value from the request body.
 * @returns {string|null} Lowercased two-letter code, or null when absent/invalid.
 */
function getValidatedLanguageCode(language) {
  try {
    if (!language) {
      return null;
    }

    const lowered = language.toLowerCase();

    // "xx" or "xx-yy" only; reject longer BCP-47 tags the providers won't take.
    if (/^[a-z]{2}(-[a-z]{2})?$/.test(lowered)) {
      const [primarySubtag] = lowered.split('-');
      return primarySubtag;
    }

    logger.warn(
      `[STT] Invalid language format "${language}". Expected ISO-639-1 locale code like "en-US" or "en". Skipping language parameter.`,
    );
    return null;
  } catch (error) {
    // Defensive: a non-string input (e.g. missing toLowerCase) must not crash the request.
    logger.error(`[STT] Error validating language code "${language}":`, error);
    return null;
  }
}
| |
|
| | |
| | |
| | |
| | |
| | |
/**
 * Derives an audio file extension from a MIME type for the upload filename.
 * Resolution order: exact lookup in MIME_TO_EXTENSION_MAP, then a subtype that
 * already is a valid extension, then a substring ("fuzzy") match, and finally
 * a "webm" default for missing or unrecognized types.
 *
 * @param {string} mimeType - e.g. "audio/webm" (may be empty/undefined).
 * @returns {string} File extension without the leading dot.
 */
function getFileExtensionFromMime(mimeType) {
  // Missing MIME type: default extension.
  if (!mimeType) {
    return 'webm';
  }

  // 1) Exact match against the known-type table.
  const mapped = MIME_TO_EXTENSION_MAP[mimeType];
  if (mapped) {
    return mapped;
  }

  const subtype = mimeType.split('/')[1]?.toLowerCase();

  // 2) Subtype is itself a usable extension ("mp4" normalizes to "m4a").
  const directExtensions = ['mp3', 'mp4', 'ogg', 'wav', 'webm', 'm4a', 'flac'];
  if (directExtensions.includes(subtype)) {
    return subtype === 'mp4' ? 'm4a' : subtype;
  }

  // 3) Fuzzy match: subtype merely contains a known format token
  //    (order matters — mp4/m4a is checked before mp3/mpeg, as before).
  if (subtype) {
    const fuzzyRules = [
      [['mp4', 'm4a'], 'm4a'],
      [['ogg'], 'ogg'],
      [['wav'], 'wav'],
      [['mp3', 'mpeg'], 'mp3'],
      [['webm'], 'webm'],
    ];
    for (const [tokens, extension] of fuzzyRules) {
      if (tokens.some((token) => subtype.includes(token))) {
        return extension;
      }
    }
  }

  // 4) Unrecognized type: default extension.
  return 'webm';
}
| |
|
| | |
| | |
| | |
| | |
/**
 * Service class for Speech-to-Text (STT) operations.
 * Builds and sends transcription requests to the single configured provider
 * (OpenAI or Azure OpenAI) and exposes an Express handler for audio uploads.
 */
class STTService {
  constructor() {
    /** Maps each supported provider to the method that builds its request tuple. */
    this.providerStrategies = {
      [STTProviders.OPENAI]: this.openAIProvider,
      [STTProviders.AZURE_OPENAI]: this.azureOpenAIProvider,
    };
  }

  /**
   * Creates an STTService instance.
   * NOTE(review): despite the name, this is not a singleton — every call
   * returns a fresh instance.
   * @returns {Promise<STTService>}
   */
  static async getInstance() {
    return new STTService();
  }

  /**
   * Resolves the single configured STT provider and its schema.
   * @param {object} req - Express request; uses `req.config` when present,
   *   otherwise loads app config scoped to the user's role.
   * @returns {Promise<[string, object]>} Tuple of [providerName, providerSchema].
   * @throws {Error} When STT is not configured, or zero/multiple providers are set.
   */
  async getProviderSchema(req) {
    const appConfig =
      req.config ??
      (await getAppConfig({
        role: req?.user?.role,
      }));
    const sttSchema = appConfig?.speech?.stt;
    if (!sttSchema) {
      throw new Error(
        'No STT schema is set. Did you configure STT in the custom config (librechat.yaml)?',
      );
    }

    // Only providers with a non-empty config object count as "configured".
    const providers = Object.entries(sttSchema).filter(
      ([, value]) => Object.keys(value).length > 0,
    );

    if (providers.length !== 1) {
      throw new Error(
        providers.length > 1
          ? 'Multiple providers are set. Please set only one provider.'
          : 'No provider is set. Please set a provider.',
      );
    }

    const [provider, schema] = providers[0];
    return [provider, schema];
  }

  /**
   * Recursively deletes `undefined` values — and any nested objects emptied
   * by that deletion — from `obj`, mutating it in place.
   * @param {object} obj
   */
  removeUndefined(obj) {
    Object.keys(obj).forEach((key) => {
      if (obj[key] && typeof obj[key] === 'object') {
        this.removeUndefined(obj[key]);
        if (Object.keys(obj[key]).length === 0) {
          delete obj[key];
        }
      } else if (obj[key] === undefined) {
        delete obj[key];
      }
    });
  }

  /**
   * Builds the request tuple for the OpenAI transcription API.
   * @param {object} sttSchema - Provider config (url, apiKey, model).
   * @param {import('stream').Readable} audioReadStream - Audio payload; its
   *   `path` property supplies the multipart filename.
   * @param {object} audioFile - Upload metadata (unused here; kept for strategy signature parity).
   * @param {string} [language] - Optional language hint from the client.
   * @returns {[string, object, object]} [url, data, headers] for axios.
   */
  openAIProvider(sttSchema, audioReadStream, audioFile, language) {
    const url = sttSchema?.url || 'https://api.openai.com/v1/audio/transcriptions';
    const apiKey = extractEnvVariable(sttSchema.apiKey) || '';

    const data = {
      file: audioReadStream,
      model: sttSchema.model,
    };

    const validLanguage = getValidatedLanguageCode(language);
    if (validLanguage) {
      data.language = validLanguage;
    }

    const headers = {
      'Content-Type': 'multipart/form-data',
      ...(apiKey && { Authorization: `Bearer ${apiKey}` }),
    };
    [headers].forEach(this.removeUndefined);

    return [url, data, headers];
  }

  /**
   * Builds the request tuple for the Azure OpenAI transcription API.
   * @param {object} sttSchema - Provider config (instanceName, deploymentName, apiVersion, apiKey).
   * @param {Buffer} audioBuffer - Raw audio bytes; validated against the 25MB limit.
   * @param {object} audioFile - Upload metadata (originalname, mimetype, size).
   * @param {string} [language] - Optional language hint from the client.
   * @returns {[string, FormData, object]} [url, formData, headers] for axios.
   * @throws {Error} When the file exceeds 25MB or its format is not accepted.
   */
  azureOpenAIProvider(sttSchema, audioBuffer, audioFile, language) {
    const url = `${genAzureEndpoint({
      azureOpenAIApiInstanceName: extractEnvVariable(sttSchema?.instanceName),
      azureOpenAIApiDeploymentName: extractEnvVariable(sttSchema?.deploymentName),
    })}/audio/transcriptions?api-version=${extractEnvVariable(sttSchema?.apiVersion)}`;

    const apiKey = sttSchema.apiKey ? extractEnvVariable(sttSchema.apiKey) : '';

    if (audioBuffer.byteLength > 25 * 1024 * 1024) {
      throw new Error('The audio file size exceeds the limit of 25MB');
    }

    // NOTE(review): this checks the raw MIME subtype, so vendor-prefixed types
    // (e.g. "audio/x-m4a") are rejected even though getFileExtensionFromMime
    // would normalize them — confirm whether that strictness is intended.
    const acceptedFormats = ['flac', 'mp3', 'mp4', 'mpeg', 'mpga', 'm4a', 'ogg', 'wav', 'webm'];
    const fileFormat = audioFile.mimetype.split('/')[1];
    if (!acceptedFormats.includes(fileFormat)) {
      throw new Error(`The audio file format ${fileFormat} is not accepted`);
    }

    const formData = new FormData();
    formData.append('file', audioBuffer, {
      filename: audioFile.originalname,
      contentType: audioFile.mimetype,
    });

    const validLanguage = getValidatedLanguageCode(language);
    if (validLanguage) {
      formData.append('language', validLanguage);
    }

    const headers = {
      ...(apiKey && { 'api-key': apiKey }),
    };

    [headers].forEach(this.removeUndefined);

    return [url, formData, { ...headers, ...formData.getHeaders() }];
  }

  /**
   * Sends a transcription request to the given provider and returns the text.
   * @param {string} provider - One of STTProviders.
   * @param {object} sttSchema - The provider's schema from app config.
   * @param {object} payload
   * @param {Buffer} payload.audioBuffer - Raw audio bytes.
   * @param {object} payload.audioFile - Upload metadata (originalname, mimetype, size).
   * @param {string} [payload.language] - Optional language hint.
   * @returns {Promise<string>} Trimmed transcription text.
   * @throws {Error} On unknown provider, non-200 response, missing text, or request failure.
   */
  async sttRequest(provider, sttSchema, { audioBuffer, audioFile, language }) {
    const strategy = this.providerStrategies[provider];
    if (!strategy) {
      throw new Error('Invalid provider');
    }

    const fileExtension = getFileExtensionFromMime(audioFile.mimetype);

    // OpenAI's multipart upload derives the filename from the stream's `path`.
    const audioReadStream = Readable.from(audioBuffer);
    audioReadStream.path = `audio.${fileExtension}`;

    // Bug fix: the Azure strategy checks `audioBuffer.byteLength`, but a
    // Readable stream has no `byteLength`, so passing the stream silently
    // disabled the 25MB size check. Route the raw Buffer to Azure (form-data
    // accepts Buffers with explicit filename/contentType) and keep the named
    // stream for OpenAI.
    const audioPayload = provider === STTProviders.AZURE_OPENAI ? audioBuffer : audioReadStream;

    const [url, data, headers] = strategy.call(this, sttSchema, audioPayload, audioFile, language);

    const options = { headers };

    if (process.env.PROXY) {
      options.httpsAgent = new HttpsProxyAgent(process.env.PROXY);
    }

    try {
      const response = await axios.post(url, data, options);

      if (response.status !== 200) {
        throw new Error('Invalid response from the STT API');
      }

      if (!response.data || !response.data.text) {
        throw new Error('Missing data in response from the STT API');
      }

      return response.data.text.trim();
    } catch (error) {
      logAxiosError({ message: `STT request failed for provider ${provider}:`, error });
      throw error;
    }
  }

  /**
   * Express handler: transcribes the uploaded audio file and responds with
   * `{ text }`. Always deletes the temporary upload file afterwards.
   * @param {object} req - Express request; expects `req.file` from multer.
   * @param {object} res - Express response.
   */
  async processSpeechToText(req, res) {
    if (!req.file) {
      return res.status(400).json({ message: 'No audio file provided in the FormData' });
    }

    const audioBuffer = await fs.readFile(req.file.path);
    const audioFile = {
      originalname: req.file.originalname,
      mimetype: req.file.mimetype,
      size: req.file.size,
    };

    try {
      const [provider, sttSchema] = await this.getProviderSchema(req);
      const language = req.body?.language || '';
      const text = await this.sttRequest(provider, sttSchema, { audioBuffer, audioFile, language });
      res.json({ text });
    } catch (error) {
      logAxiosError({ message: 'An error occurred while processing the audio:', error });
      res.sendStatus(500);
    } finally {
      // Best-effort cleanup; the file may already be gone.
      try {
        await fs.unlink(req.file.path);
        logger.debug('[/speech/stt] Temp. audio upload file deleted');
      } catch {
        logger.debug('[/speech/stt] Temp. audio upload file already deleted');
      }
    }
  }
}
| |
|
| | |
| | |
| | |
| | |
| | |
/**
 * Factory for STTService instances.
 * @returns {Promise<STTService>} A freshly constructed service.
 */
async function createSTTService() {
  const service = await STTService.getInstance();
  return service;
}
| |
|
| | |
| | |
| | |
| | |
| | |
| | |
| | |
/**
 * Express handler for POST /speech/stt: transcribes the uploaded audio file
 * and writes the result (or an error status) to the response.
 * @param {object} req - Express request carrying the multipart upload.
 * @param {object} res - Express response.
 */
async function speechToText(req, res) {
  const service = await createSTTService();
  await service.processSpeechToText(req, res);
}
| |
|
// Public API: the service class (for tests/extension) and the Express route handler.
module.exports = { STTService, speechToText };
| |
|