File size: 15,406 Bytes
f0743f4
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
335
336
337
338
339
340
341
342
343
344
345
346
347
348
349
350
351
352
353
354
355
356
357
358
359
360
361
362
363
364
365
366
367
368
369
370
371
372
373
374
375
376
377
378
379
380
381
382
383
384
385
386
387
388
389
390
391
392
393
394
395
396
397
398
399
400
401
402
403
404
405
406
407
408
409
410
411
412
413
414
415
416
417
418
419
420
421
422
423
424
425
426
427
428
429
430
431
432
433
434
435
436
437
438
439
440
441
442
443
444
445
446
447
448
449
450
451
452
453
454
455
456
457
458
459
460
461
462
463
464
465
466
467
468
469
470
471
472
473
474
475
476
477
478
479
480
481
482
483
484
485
486
487
488
489
490
491
492
493
494
495
const axios = require('axios');
const { logger } = require('@librechat/data-schemas');
const { HttpsProxyAgent } = require('https-proxy-agent');
const { genAzureEndpoint, logAxiosError } = require('@librechat/api');
const { extractEnvVariable, TTSProviders } = require('librechat-data-provider');
const { getRandomVoiceId, createChunkProcessor, splitTextIntoChunks } = require('./streamAudio');
const { getAppConfig } = require('~/server/services/Config');

/**
 * Service class for handling Text-to-Speech (TTS) operations.
 *
 * Each supported provider has a strategy method that turns the provider's
 * config section plus the text/voice into a `[url, data, headers]` triple;
 * `ttsRequest` dispatches to the matching strategy and performs the HTTP POST.
 * @class
 */
class TTSService {
  /**
   * Creates an instance of TTSService and registers one request-building
   * strategy per supported provider.
   */
  constructor() {
    this.providerStrategies = {
      [TTSProviders.OPENAI]: this.openAIProvider.bind(this),
      [TTSProviders.AZURE_OPENAI]: this.azureOpenAIProvider.bind(this),
      [TTSProviders.ELEVENLABS]: this.elevenLabsProvider.bind(this),
      [TTSProviders.LOCALAI]: this.localAIProvider.bind(this),
    };
  }

  /**
   * Creates a TTSService instance.
   *
   * Note: despite the name, a fresh instance is returned on every call; nothing
   * is cached.
   * @static
   * @async
   * @returns {Promise<TTSService>} A new TTSService instance.
   */
  static async getInstance() {
    return new TTSService();
  }

  /**
   * Retrieves the configured TTS provider.
   * @param {AppConfig | null | undefined} [appConfig] - The app configuration object.
   * @returns {string} The name of the configured provider.
   * @throws {Error} If no provider is set or multiple providers are set.
   */
  getProvider(appConfig) {
    const ttsSchema = appConfig?.speech?.tts;
    if (!ttsSchema) {
      throw new Error(
        'No TTS schema is set. Did you configure TTS in the custom config (librechat.yaml)?',
      );
    }
    // A provider counts as "set" only when its section has at least one key.
    // Null/undefined sections are treated as not set instead of crashing.
    const providers = Object.entries(ttsSchema).filter(
      ([, value]) => value != null && Object.keys(value).length > 0,
    );

    if (providers.length !== 1) {
      throw new Error(
        providers.length > 1
          ? 'Multiple providers are set. Please set only one provider.'
          : 'No provider is set. Please set a provider.',
      );
    }
    return providers[0][0];
  }

  /**
   * Selects a voice for TTS based on provider schema and request.
   *
   * The requested voice is used when it is one of the configured voices (the
   * 'ALL' marker is not itself a voice); otherwise `getRandomVoiceId` picks one
   * from the configured voices.
   * @async
   * @param {Object} providerSchema - The schema for the selected provider.
   * @param {string} requestVoice - The requested voice.
   * @returns {Promise<string>} The selected voice.
   */
  async getVoice(providerSchema, requestVoice) {
    // `voices` is treated as optional by the provider strategies, so tolerate
    // its absence here as well.
    const configured = Array.isArray(providerSchema?.voices) ? providerSchema.voices : [];
    const voices = configured.filter((voice) => voice && voice.toUpperCase() !== 'ALL');
    if (requestVoice && voices.includes(requestVoice)) {
      return requestVoice;
    }
    return getRandomVoiceId(voices);
  }

  /**
   * Removes undefined properties from an object, in place.
   *
   * Nested objects (including arrays) are cleaned recursively and are deleted
   * themselves when they end up with no keys. `null` values are kept.
   * @param {Object} obj - The object to clean.
   */
  removeUndefined(obj) {
    Object.keys(obj).forEach((key) => {
      if (obj[key] && typeof obj[key] === 'object') {
        this.removeUndefined(obj[key]);
        if (Object.keys(obj[key]).length === 0) {
          delete obj[key];
        }
      } else if (obj[key] === undefined) {
        delete obj[key];
      }
    });
  }

  /**
   * Validates a voice against a schema whose voice list is optional.
   *
   * A missing or empty list, or a list containing 'ALL', accepts any voice.
   * @param {Object} ttsSchema - The provider schema.
   * @param {string} voice - The selected voice.
   * @throws {Error} If the list restricts voices and `voice` is not in it.
   */
  assertVoiceAllowed(ttsSchema, voice) {
    const voices = ttsSchema?.voices;
    if (voices && voices.length > 0 && !voices.includes(voice) && !voices.includes('ALL')) {
      throw new Error(`Voice ${voice} is not available.`);
    }
  }

  /**
   * Pipes one upstream audio stream into the response and settles when the
   * transfer is over.
   *
   * Unlike waiting for 'end' alone, this also settles when the upstream stream
   * errors or when the response closes first, so callers never wait forever.
   * @param {Object} source - Readable stream holding the audio.
   * @param {ServerResponse} res - The response to write into.
   * @param {boolean} end - Passed to `pipe` as its `end` option.
   * @returns {Promise<boolean>} `true` when the upstream stream ended, `false`
   *   when the response was closed/ended before that.
   */
  pipeAudioToResponse(source, res, end) {
    return new Promise((resolve, reject) => {
      if (res.destroyed || res.writableEnded) {
        // Nothing can be written any more; release the upstream stream.
        source.destroy?.();
        resolve(false);
        return;
      }

      const detach = () => {
        source.removeListener('end', onEnd);
        source.removeListener('error', onError);
        res.removeListener('close', onClose);
      };
      const onEnd = () => {
        detach();
        resolve(true);
      };
      const onError = (error) => {
        detach();
        reject(error);
      };
      const onClose = () => {
        detach();
        // The stream is being abandoned; a late error from tearing it down
        // must not surface as an unhandled 'error' event.
        source.on('error', () => {});
        source.destroy?.();
        resolve(false);
      };

      source.on('end', onEnd);
      source.on('error', onError);
      res.on('close', onClose);
      source.pipe(res, { end });
    });
  }

  /**
   * Ends the response unless it has already been ended or destroyed.
   * @param {ServerResponse} res - The response object.
   */
  endResponseIfOpen(res) {
    if (!res.writableEnded && !res.destroyed) {
      res.end();
    }
  }

  /**
   * Prepares the request for OpenAI TTS provider.
   * @param {Object} ttsSchema - The TTS schema for OpenAI.
   * @param {string} input - The input text.
   * @param {string} voice - The selected voice.
   * @returns {Array} An array containing the URL, data, and headers for the request.
   * @throws {Error} If the selected voice is not available.
   */
  openAIProvider(ttsSchema, input, voice) {
    const url = ttsSchema?.url || 'https://api.openai.com/v1/audio/speech';

    this.assertVoiceAllowed(ttsSchema, voice);

    const data = {
      input,
      model: ttsSchema?.model,
      // Only send a voice when the schema configures voices at all.
      voice: ttsSchema?.voices && ttsSchema.voices.length > 0 ? voice : undefined,
      backend: ttsSchema?.backend,
    };

    const headers = {
      'Content-Type': 'application/json',
      Authorization: `Bearer ${extractEnvVariable(ttsSchema?.apiKey)}`,
    };

    return [url, data, headers];
  }

  /**
   * Prepares the request for Azure OpenAI TTS provider.
   * @param {Object} ttsSchema - The TTS schema for Azure OpenAI.
   * @param {string} input - The input text.
   * @param {string} voice - The selected voice.
   * @returns {Array} An array containing the URL, data, and headers for the request.
   * @throws {Error} If the selected voice is not available.
   */
  azureOpenAIProvider(ttsSchema, input, voice) {
    const url = `${genAzureEndpoint({
      azureOpenAIApiInstanceName: extractEnvVariable(ttsSchema?.instanceName),
      azureOpenAIApiDeploymentName: extractEnvVariable(ttsSchema?.deploymentName),
    })}/audio/speech?api-version=${extractEnvVariable(ttsSchema?.apiVersion)}`;

    this.assertVoiceAllowed(ttsSchema, voice);

    const data = {
      model: extractEnvVariable(ttsSchema?.model),
      input,
      voice: ttsSchema?.voices && ttsSchema.voices.length > 0 ? voice : undefined,
    };

    const headers = {
      'Content-Type': 'application/json',
      'api-key': ttsSchema?.apiKey ? extractEnvVariable(ttsSchema.apiKey) : '',
    };

    return [url, data, headers];
  }

  /**
   * Prepares the request for ElevenLabs TTS provider.
   *
   * Unlike the other providers, a voice is always validated here: an empty or
   * missing voice list rejects every voice unless 'ALL' is listed.
   * @param {Object} ttsSchema - The TTS schema for ElevenLabs.
   * @param {string} input - The input text.
   * @param {string} voice - The selected voice.
   * @param {boolean} stream - Whether to use streaming.
   * @returns {Array} An array containing the URL, data, and headers for the request.
   * @throws {Error} If the selected voice is not available.
   */
  elevenLabsProvider(ttsSchema, input, voice, stream) {
    // A configured URL is used verbatim; voice and stream only shape the default URL.
    const url =
      ttsSchema?.url ||
      `https://api.elevenlabs.io/v1/text-to-speech/${voice}${stream ? '/stream' : ''}`;

    const voices = ttsSchema?.voices ?? [];
    if (!voices.includes(voice) && !voices.includes('ALL')) {
      throw new Error(`Voice ${voice} is not available.`);
    }

    const data = {
      model_id: ttsSchema?.model,
      text: input,
      voice_settings: {
        similarity_boost: ttsSchema?.voice_settings?.similarity_boost,
        stability: ttsSchema?.voice_settings?.stability,
        style: ttsSchema?.voice_settings?.style,
        use_speaker_boost: ttsSchema?.voice_settings?.use_speaker_boost,
      },
      pronunciation_dictionary_locators: ttsSchema?.pronunciation_dictionary_locators,
    };

    const headers = {
      'Content-Type': 'application/json',
      'xi-api-key': extractEnvVariable(ttsSchema?.apiKey),
      Accept: 'audio/mpeg',
    };

    return [url, data, headers];
  }

  /**
   * Prepares the request for LocalAI TTS provider.
   * @param {Object} ttsSchema - The TTS schema for LocalAI.
   * @param {string} input - The input text.
   * @param {string} voice - The selected voice.
   * @returns {Array} An array containing the URL, data, and headers for the request.
   * @throws {Error} If the selected voice is not available.
   */
  localAIProvider(ttsSchema, input, voice) {
    const url = ttsSchema?.url;

    this.assertVoiceAllowed(ttsSchema, voice);

    const data = {
      input,
      // The selected voice is sent in the `model` field for this provider.
      model: ttsSchema?.voices && ttsSchema.voices.length > 0 ? voice : undefined,
      backend: ttsSchema?.backend,
    };

    const headers = {
      'Content-Type': 'application/json',
    };

    // Send no Authorization header at all when there is no usable key, rather
    // than a meaningless "Bearer undefined" / "Bearer ".
    const apiKey = extractEnvVariable(ttsSchema?.apiKey);
    if (apiKey) {
      headers.Authorization = `Bearer ${apiKey}`;
    }

    return [url, data, headers];
  }

  /**
   * Sends a TTS request to the specified provider.
   * @async
   * @param {string} provider - The TTS provider to use.
   * @param {Object} ttsSchema - The TTS schema for the provider.
   * @param {Object} options - The options for the TTS request.
   * @param {string} options.input - The input text.
   * @param {string} options.voice - The voice to use.
   * @param {boolean} [options.stream=true] - Whether to use streaming.
   * @returns {Promise<Object>} The axios response object.
   * @throws {Error} If the provider is invalid or the request fails.
   */
  async ttsRequest(provider, ttsSchema, { input, voice, stream = true }) {
    const strategy = this.providerStrategies[provider];
    if (!strategy) {
      throw new Error('Invalid provider');
    }

    const [url, data, headers] = strategy.call(this, ttsSchema, input, voice, stream);

    // Drop unset fields so they are not serialized into the request.
    this.removeUndefined(data);
    this.removeUndefined(headers);

    const options = { headers, responseType: stream ? 'stream' : 'arraybuffer' };

    if (process.env.PROXY) {
      options.httpsAgent = new HttpsProxyAgent(process.env.PROXY);
    }

    try {
      return await axios.post(url, data, options);
    } catch (error) {
      logAxiosError({ message: `TTS request failed for provider ${provider}:`, error });
      throw error;
    }
  }

  /**
   * Processes a text-to-speech request.
   *
   * Inputs shorter than 4096 characters are synthesized with one request;
   * longer inputs are split into chunks whose audio is written to the response
   * back to back. The returned promise settles once the audio has been written
   * (or the attempt failed).
   * @async
   * @param {ServerRequest} req - The request object.
   * @param {ServerResponse} res - The response object.
   * @returns {Promise<void>}
   */
  async processTextToSpeech(req, res) {
    const { input, voice: requestVoice } = req.body ?? {};

    if (!input) {
      return res.status(400).send('Missing text in request body');
    }

    const appConfig =
      req.config ??
      (await getAppConfig({
        role: req.user?.role,
      }));
    try {
      res.setHeader('Content-Type', 'audio/mpeg');
      const provider = this.getProvider(appConfig);
      const ttsSchema = appConfig?.speech?.tts?.[provider];
      const voice = await this.getVoice(ttsSchema, requestVoice);

      if (input.length < 4096) {
        const response = await this.ttsRequest(provider, ttsSchema, { input, voice });
        // Awaited so an upstream stream failure lands in the catch below
        // instead of being an unhandled 'error' event.
        await this.pipeAudioToResponse(response.data, res, true);
        return;
      }

      const textChunks = splitTextIntoChunks(input, 1000);

      for (const chunk of textChunks) {
        try {
          const response = await this.ttsRequest(provider, ttsSchema, {
            voice,
            input: chunk.text,
            stream: true,
          });

          logger.debug(`[textToSpeech] user: ${req?.user?.id} | writing audio stream`);
          // Only the final chunk is allowed to end the response.
          const completed = await this.pipeAudioToResponse(response.data, res, chunk.isFinished);

          if (!completed || chunk.isFinished) {
            break;
          }
        } catch (innerError) {
          logAxiosError({
            message: `[TTS] Error processing manual update for chunk: ${chunk?.text?.substring(0, 50)}...`,
            error: innerError,
          });
          if (!res.headersSent) {
            return res.status(500).end();
          }
          // Audio was already partially sent: close the response so the
          // client is not left waiting on a stream that will never finish.
          this.endResponseIfOpen(res);
          return;
        }
      }

      this.endResponseIfOpen(res);
    } catch (error) {
      logAxiosError({ message: '[TTS] Error creating the audio stream:', error });
      if (!res.headersSent) {
        return res.status(500).send('An error occurred');
      }
      this.endResponseIfOpen(res);
    }
  }

  /**
   * Streams audio data from the TTS provider.
   *
   * Repeatedly asks the chunk processor (created for the user and message id)
   * for new text updates, synthesizes each update and writes its audio to the
   * response, until an update is marked finished, the request closes, or an
   * error occurs.
   * @async
   * @param {ServerRequest} req - The request object.
   * @param {ServerResponse} res - The response object.
   * @returns {Promise<void>}
   */
  async streamAudio(req, res) {
    res.setHeader('Content-Type', 'audio/mpeg');
    const appConfig =
      req.config ??
      (await getAppConfig({
        role: req.user?.role,
      }));
    const provider = this.getProvider(appConfig);
    const ttsSchema = appConfig?.speech?.tts?.[provider];
    const voice = await this.getVoice(ttsSchema, req.body.voice);

    let shouldContinue = true;

    req.on('close', () => {
      logger.warn('[streamAudio] Audio Stream Request closed by client');
      shouldContinue = false;
    });

    const processChunks = createChunkProcessor(req.user.id, req.body.messageId);

    try {
      while (shouldContinue) {
        const updates = await processChunks();
        // A string result is an error description from the chunk processor.
        if (typeof updates === 'string') {
          logger.error(`Error processing audio stream updates: ${updates}`);
          return res.status(500).end();
        }

        if (updates.length === 0) {
          // Nothing new yet; wait before polling again.
          await new Promise((resolve) => setTimeout(resolve, 1250));
          continue;
        }

        for (const update of updates) {
          try {
            const response = await this.ttsRequest(provider, ttsSchema, {
              voice,
              input: update.text,
              stream: true,
            });

            if (!shouldContinue) {
              // The request closed while synthesizing; release the unused stream.
              response.data?.destroy?.();
              break;
            }

            logger.debug(`[streamAudio] user: ${req?.user?.id} | writing audio stream`);
            const completed = await this.pipeAudioToResponse(
              response.data,
              res,
              update.isFinished,
            );

            if (!completed || update.isFinished) {
              shouldContinue = false;
              break;
            }
          } catch (innerError) {
            logAxiosError({
              message: `[TTS] Error processing audio stream update: ${update?.text?.substring(0, 50)}...`,
              error: innerError,
            });
            if (!res.headersSent) {
              return res.status(500).end();
            }
            // Audio was already partially sent: close the response so the
            // client is not left waiting.
            this.endResponseIfOpen(res);
            return;
          }
        }
      }

      this.endResponseIfOpen(res);
    } catch (error) {
      logAxiosError({ message: '[TTS] Failed to fetch audio:', error });
      if (!res.headersSent) {
        res.status(500).end();
        return;
      }
      this.endResponseIfOpen(res);
    }
  }
}

/**
 * Obtains a TTSService via `TTSService.getInstance()`.
 * @async
 * @returns {Promise<TTSService>} Resolves to the TTSService instance.
 */
async function createTTSService() {
  const instance = await TTSService.getInstance();
  return instance;
}

/**
 * Handles a text-to-speech request by delegating to a TTSService obtained
 * from `createTTSService`.
 * @async
 * @param {ServerRequest} req - The request object.
 * @param {ServerResponse} res - The response object.
 * @returns {Promise<void>}
 */
async function textToSpeech(req, res) {
  await (await createTTSService()).processTextToSpeech(req, res);
}

/**
 * Handles an audio-streaming request by delegating to a TTSService obtained
 * from `createTTSService`.
 * @async
 * @param {Object} req - The request object.
 * @param {Object} res - The response object.
 * @returns {Promise<void>}
 */
async function streamAudio(req, res) {
  await (await createTTSService()).streamAudio(req, res);
}

/**
 * Resolves the configured TTS provider by delegating to a TTSService obtained
 * from `createTTSService`.
 * @async
 * @param {AppConfig | null | undefined} appConfig - The app configuration object.
 * @returns {Promise<string>} Resolves to the name of the configured provider.
 */
async function getProvider(appConfig) {
  return (await createTTSService()).getProvider(appConfig);
}

// Public API: the two (req, res) handler wrappers and the provider lookup wrapper.
module.exports = {
  textToSpeech,
  streamAudio,
  getProvider,
};