|
|
import { event_types, eventSource, getRequestHeaders } from '../../../script.js'; |
|
|
import { SECRET_KEYS, secret_state } from '../../secrets.js'; |
|
|
import { getPreviewString, saveTtsProviderSettings, initVoiceMap } from './index.js'; |
|
|
|
|
|
export { ElectronHubTtsProvider }; |
|
|
|
|
|
/**
 * TTS provider that talks to the Electron Hub speech endpoints through the
 * local `/api/openai/electronhub/*` routes.
 *
 * The provider keeps the list of speech-capable models returned by the
 * backend, renders the settings controls for the selected model (including
 * controls generated from the model's declared `parameters`) and builds the
 * generation request from the current settings.
 */
class ElectronHubTtsProvider {
    /** Keys handled by dedicated controls; never rendered or sent as dynamic parameters. */
    static BUILTIN_PARAM_KEYS = Object.freeze([
        'input', 'response_format', 'model', 'speed', 'temperature', 'top_p',
        'instructions', 'speaker_transcript', 'cfg_scale', 'cfg_filter_top_k',
        'speech_rate', 'pitch_adjustment', 'emotional_style',
    ]);

    /** jQuery event namespace so re-binding only replaces this provider's handlers. */
    static EVENT_NAMESPACE = 'electronhub_tts';

    /** Active settings: `defaultSettings` overlaid with the saved values (assigned by `loadSettings`). */
    settings;

    /** Voice objects (`{ name, voice_id, lang }`) available for the selected model. */
    voices = [];

    /** Models from the backend that expose an audio/speech endpoint. */
    models = [];

    separator = ' . ';

    /** Audio element used only for voice previews. */
    audioElement = document.createElement('audio');

    /** Object URL currently assigned to `audioElement`, or null when none is live. */
    previewUrl = null;

    defaultSettings = {
        voiceMap: {},
        model: 'tts-1',
        speed: 1,
        temperature: 1,
        top_p: 1,

        // Shown for models that accept free-form speaking instructions.
        instructions: '',

        // Dia-style controls.
        speaker_transcript: '',
        cfg_filter_top_k: 25,
        cfg_scale: 3,

        // Microsoft-style controls.
        speech_rate: 0,
        pitch_adjustment: 0,
        emotional_style: '',
    };

    /**
     * Static markup for the provider's settings panel. Model-specific blocks
     * start hidden and are toggled by `updateConditionalBlocks`.
     * @returns {string} HTML for the settings panel.
     */
    get settingsHtml() {
        return `
        <div>Electron Hub unified TTS API.</div>
        <div class="flex-container alignItemsCenter">
            <div class="flex1"></div>
            <div id="electronhub_tts_key" class="menu_button menu_button_icon manage-api-keys" data-key="api_key_electronhub">
                <i class="fa-solid fa-key"></i>
                <span>API Key</span>
            </div>
        </div>
        <div class="flex-container flexGap10 wrap">
            <div class="flex1">
                <label for="electronhub_tts_model">Model</label>
                <select id="electronhub_tts_model" class="text_pole"></select>
            </div>
            <div>
                <label for="electronhub_tts_speed">Speed <span id="electronhub_tts_speed_output"></span></label>
                <input type="range" id="electronhub_tts_speed" value="1" min="0.25" max="4" step="0.05">
            </div>
            <div>
                <label for="electronhub_tts_temperature">Temperature</label>
                <input id="electronhub_tts_temperature" class="text_pole" type="number" min="0" max="2" step="0.1" value="1" />
            </div>
            <div id="electronhub_block_top_p" style="display:none;">
                <label for="electronhub_tts_top_p">Top-p</label>
                <input id="electronhub_tts_top_p" class="text_pole" type="number" min="0" max="1" step="0.01" value="1" />
            </div>
        </div>

        <div id="electronhub_block_instructions" style="display:none;">
            <label for="electronhub_tts_instructions">Instructions (GPT-4o Mini TTS):</label>
            <textarea id="electronhub_tts_instructions" class="textarea_compact autoSetHeight" placeholder="e.g., 'Speak cheerfully and energetically'"></textarea>
        </div>

        <div id="electronhub_block_dia" style="display:none;">
            <label for="electronhub_tts_speaker_transcript">Speaker transcript (Dia):</label>
            <textarea id="electronhub_tts_speaker_transcript" class="textarea_compact autoSetHeight" maxlength="1000"></textarea>
            <label for="electronhub_tts_cfg_scale">CFG scale (1-5):</label>
            <input id="electronhub_tts_cfg_scale" type="number" min="1" max="5" step="1" />
            <label for="electronhub_tts_cfg_topk">CFG filter top_k (15-50):</label>
            <input id="electronhub_tts_cfg_topk" type="number" min="15" max="50" step="1" />
        </div>

        <div id="electronhub_block_msft" style="display:none;">
            <div class="flex-container flexGap10 wrap">
                <div>
                    <label for="electronhub_tts_speech_rate">Speech rate (-100..100)</label>
                    <input id="electronhub_tts_speech_rate" class="text_pole" type="number" min="-100" max="100" step="1" style="width:120px;" />
                </div>
                <div>
                    <label for="electronhub_tts_pitch_adjustment">Pitch adjustment (-100..100)</label>
                    <input id="electronhub_tts_pitch_adjustment" class="text_pole" type="number" min="-100" max="100" step="1" style="width:120px;" />
                </div>
            </div>
            <div class="flex-container flexGap10">
                <div class="flex1">
                    <label for="electronhub_tts_emotional_style">Emotional style</label>
                    <input id="electronhub_tts_emotional_style" class="text_pole" type="text" placeholder="cheerful, sad, angry, gentle..." />
                </div>
            </div>
        </div>

        <div id="electronhub_dynamic_params" class="flex-container flexGap10 wrap" style="display:none;"></div>`;
    }

    /**
     * Creates the secret-change handler. It is stored on the instance so that
     * `loadSettings` and `dispose` register and remove the same function.
     */
    constructor() {
        this.handler = async (key) => {
            if (key !== SECRET_KEYS.ELECTRONHUB) return;
            $('#electronhub_tts_key').toggleClass('success', !!secret_state[SECRET_KEYS.ELECTRONHUB]);
            await this.onRefreshClick();
        };
    }

    /** Unsubscribes the secret-change handler from every event it was registered for. */
    dispose() {
        for (const event of this.getSecretEvents()) {
            eventSource.removeListener(event, this.handler);
        }
    }

    /**
     * Secret events that should trigger a refresh of the key indicator and model list.
     * @returns {Array} Event identifiers taken from `event_types`.
     */
    getSecretEvents() {
        return [event_types.SECRET_WRITTEN, event_types.SECRET_DELETED, event_types.SECRET_ROTATED];
    }

    /**
     * Applies saved settings on top of the defaults, loads the model list,
     * fills the controls and wires their change handlers.
     * @param {object} settings Saved provider settings; may be empty or nullish.
     * @returns {Promise<void>}
     */
    async loadSettings(settings) {
        const saved = settings ?? {};
        if (Object.keys(saved).length === 0) {
            console.info('Using default Electron Hub TTS settings');
        }

        this.settings = { ...this.defaultSettings, ...saved };

        // loadModels may replace settings.model when the saved id no longer exists,
        // so control values are read from this.settings only after it has run.
        await this.loadModels();
        this.populateModelSelect();

        const bindings = [
            ['#electronhub_tts_model', 'change', this.settings.model],
            ['#electronhub_tts_speed', 'input', this.settings.speed],
            ['#electronhub_tts_temperature', 'input', this.settings.temperature],
            ['#electronhub_tts_top_p', 'input', this.settings.top_p],
            ['#electronhub_tts_instructions', 'input', this.settings.instructions],
            ['#electronhub_tts_speaker_transcript', 'input', this.settings.speaker_transcript],
            ['#electronhub_tts_cfg_scale', 'input', this.settings.cfg_scale],
            ['#electronhub_tts_cfg_topk', 'input', this.settings.cfg_filter_top_k],
            ['#electronhub_tts_speech_rate', 'input', this.settings.speech_rate],
            ['#electronhub_tts_pitch_adjustment', 'input', this.settings.pitch_adjustment],
            ['#electronhub_tts_emotional_style', 'input', this.settings.emotional_style],
        ];
        for (const [selector, eventName, value] of bindings) {
            const namespaced = `${eventName}.${ElectronHubTtsProvider.EVENT_NAMESPACE}`;
            // off() before on(): calling loadSettings again must not stack handlers.
            $(selector).val(value).off(namespaced).on(namespaced, () => {
                this.onSettingsChange().catch(err => console.error('Electron Hub TTS: settings update failed', err));
            });
        }
        $('#electronhub_tts_speed_output').text(this.settings.speed);

        $('#electronhub_tts_key').toggleClass('success', !!secret_state[SECRET_KEYS.ELECTRONHUB]);
        for (const event of this.getSecretEvents()) {
            // Remove first so a repeated loadSettings does not register the handler twice.
            eventSource.removeListener(event, this.handler);
            eventSource.on(event, this.handler);
        }

        await this.checkReady();
        this.updateConditionalBlocks();
        this.renderDynamicParams();
        console.debug('Electron Hub TTS: Settings loaded');
    }

    /**
     * Reads a numeric control, keeping `fallback` when the control is missing,
     * empty or not a finite number (so a cleared field does not silently become 0).
     * @param {string} selector jQuery selector of the input.
     * @param {number} fallback Value to keep when nothing usable was entered.
     * @returns {number} The parsed number or `fallback`.
     */
    readNumberInput(selector, fallback) {
        const raw = $(selector).val();
        if (raw == null || String(raw).trim() === '') return fallback;
        const value = Number(raw);
        return Number.isFinite(value) ? value : fallback;
    }

    /**
     * Copies every fixed control into `this.settings`, refreshes the
     * model-dependent UI and persists the settings. When the model changed,
     * the voice list is rebuilt and the voice map re-initialised.
     * @returns {Promise<void>}
     */
    async onSettingsChange() {
        const current = this.settings;
        const previousModel = current.model;
        const readText = (selector) => String($(selector).val() || '');

        current.model = String($('#electronhub_tts_model').find(':selected').val() || current.model);
        current.speed = this.readNumberInput('#electronhub_tts_speed', current.speed);
        $('#electronhub_tts_speed_output').text(current.speed);
        current.temperature = this.readNumberInput('#electronhub_tts_temperature', current.temperature);
        current.top_p = this.readNumberInput('#electronhub_tts_top_p', current.top_p);
        current.instructions = readText('#electronhub_tts_instructions');
        current.speaker_transcript = readText('#electronhub_tts_speaker_transcript');
        current.cfg_scale = this.readNumberInput('#electronhub_tts_cfg_scale', current.cfg_scale);
        current.cfg_filter_top_k = this.readNumberInput('#electronhub_tts_cfg_topk', current.cfg_filter_top_k);
        current.speech_rate = this.readNumberInput('#electronhub_tts_speech_rate', current.speech_rate);
        current.pitch_adjustment = this.readNumberInput('#electronhub_tts_pitch_adjustment', current.pitch_adjustment);
        current.emotional_style = readText('#electronhub_tts_emotional_style');

        this.updateConditionalBlocks();
        this.renderDynamicParams();
        saveTtsProviderSettings();

        if (previousModel !== current.model) {
            this.voices = await this.fetchTtsVoiceObjects();
            await initVoiceMap();
        }
    }

    /**
     * Fetches the model list and keeps the entries that advertise an
     * audio/speech endpoint. If the selected model is not among them, the
     * first available model is selected and saved. On failure the list is
     * emptied and a warning is logged; the error is not rethrown.
     * @returns {Promise<void>}
     */
    async loadModels() {
        const isSpeechEndpoint = (ep) =>
            typeof ep === 'string' && (ep === 'audio/speech' || ep.endsWith('/audio/speech'));

        try {
            const response = await fetch('/api/openai/electronhub/models', {
                method: 'POST',
                headers: getRequestHeaders(),
            });
            if (!response.ok) {
                throw new Error(`HTTP ${response.status}: ${await response.text()}`);
            }

            const data = await response.json();
            const allModels = Array.isArray(data) ? data : [];
            this.models = allModels.filter(m => Array.isArray(m?.endpoints) && m.endpoints.some(isSpeechEndpoint));

            if (this.models.length > 0 && !this.models.some(m => m.id === this.settings.model)) {
                this.settings.model = this.models[0].id;
                saveTtsProviderSettings();
            }
        } catch (err) {
            console.warn('Electron Hub models fetch failed', err);
            this.models = [];
        }
    }

    /** Rebuilds the model dropdown, one `<optgroup>` per vendor, and re-selects the current model if present. */
    populateModelSelect() {
        const select = $('#electronhub_tts_model');
        select.empty();

        for (const [vendor, models] of this.groupByVendor(this.models)) {
            const optgroup = document.createElement('optgroup');
            optgroup.label = vendor;
            for (const m of models) {
                const opt = document.createElement('option');
                opt.value = m.id;
                opt.text = m.name || m.id;
                optgroup.appendChild(opt);
            }
            select.append(optgroup);
        }

        if (this.models.some(m => m.id === this.settings.model)) {
            select.val(this.settings.model);
        }
    }

    /**
     * Groups models by the text before the first ':' of their name (or id
     * when there is no name). Models without a usable prefix go to 'Other'.
     * @param {Array<object>} array Models to group.
     * @returns {Map<string, Array<object>>} Vendor label to models, in first-seen order.
     */
    groupByVendor(array) {
        const groups = new Map();
        for (const model of array) {
            const name = String(model?.name || model?.id || 'Other');
            const vendor = name.split(':')[0].trim() || 'Other';
            if (!groups.has(vendor)) groups.set(vendor, []);
            groups.get(vendor).push(model);
        }
        return groups;
    }

    /**
     * @returns {object|undefined} The model entry whose id equals the selected model id.
     */
    getSelectedModel() {
        return this.models.find(m => m.id === this.settings.model);
    }

    /**
     * @param {object} [model] Model entry.
     * @returns {object} The model's declared `parameters`, or an empty object when absent or not an object.
     */
    getModelParams(model) {
        const params = model?.parameters;
        return (params && typeof params === 'object') ? params : {};
    }

    /**
     * Decides which optional setting groups apply to the selected model.
     * Used both for showing the controls and for building the request, so a
     * visible control always has its value sent.
     * @param {object} [model] Model entry; defaults to the selected model.
     * @returns {{instructions: boolean, dia: boolean, msft: boolean, topP: boolean}}
     */
    getModelCapabilities(model = this.getSelectedModel()) {
        const params = this.getModelParams(model);
        const modelId = String(this.settings.model || '').toLowerCase();
        const vendorName = String(model?.name || '').split(':')[0].trim().toLowerCase();
        const declares = (...keys) => keys.some(key => key in params);

        return {
            instructions: declares('instructions') || modelId === 'gpt-4o-mini-tts',
            dia: declares('speaker_transcript', 'cfg_scale', 'cfg_filter_top_k') || modelId.includes('dia'),
            msft: declares('speech_rate', 'pitch_adjustment', 'emotional_style')
                || vendorName === 'microsoft'
                || modelId.includes('microsoft-tts'),
            topP: declares('top_p'),
        };
    }

    /**
     * Lists the model parameters that have no dedicated control. `voice` is
     * left out when the model publishes its own voice list.
     * @param {object} [model] Model entry.
     * @returns {Array<[string, *]>} `[key, spec]` pairs in declaration order.
     */
    getDynamicParamEntries(model) {
        const exclude = new Set(ElectronHubTtsProvider.BUILTIN_PARAM_KEYS);
        if (Array.isArray(model?.voices) && model.voices.length > 0) exclude.add('voice');
        return Object.entries(this.getModelParams(model)).filter(([key]) => !exclude.has(key));
    }

    /** Shows or hides the model-specific setting blocks for the selected model. */
    updateConditionalBlocks() {
        const caps = this.getModelCapabilities();
        $('#electronhub_block_instructions').toggle(caps.instructions);
        $('#electronhub_block_dia').toggle(caps.dia);
        $('#electronhub_block_msft').toggle(caps.msft);
        $('#electronhub_block_top_p').toggle(caps.topP);
    }

    /**
     * Regenerates the controls for parameters the selected model declares
     * beyond the built-in ones. The container is hidden when there are none.
     */
    renderDynamicParams() {
        const container = $('#electronhub_dynamic_params');
        container.empty();

        const entries = this.getDynamicParamEntries(this.getSelectedModel());
        container.toggle(entries.length > 0);

        for (const [key, spec] of entries) {
            container.append(this.buildDynamicParamControl(key, spec));
        }
    }

    /**
     * Builds one labelled control for a dynamic parameter and wires it to
     * `this.settings[key]`. Parameter names and limits come from the API
     * response, so they are inserted with `.text()` / `.attr()` rather than
     * being interpolated into markup.
     * @param {string} key Parameter name as declared by the model.
     * @param {*} rawSpec Parameter description (`type`, `enum`, `default`, `minimum`, `maximum`, `step`).
     * @returns {object} jQuery element ready to be appended.
     */
    buildDynamicParamControl(key, rawSpec) {
        // A null or primitive spec is treated as "plain string, no default".
        const spec = (rawSpec && typeof rawSpec === 'object') ? rawSpec : {};
        const nice = key.replace(/_/g, ' ').replace(/\b\w/g, c => c.toUpperCase());
        const type = String(spec.type || 'string');
        const id = `electronhub_dyn_${key.replace(/[^a-zA-Z0-9_-]/g, '_')}`;
        const stored = this.settings[key] ?? spec.default;

        const makeLabel = (text) => $('<label></label>').attr('for', id).text(text);
        const save = (value) => {
            this.settings[key] = value;
            saveTtsProviderSettings();
        };

        if (Array.isArray(spec.enum) && spec.enum.length) {
            const el = $('<select class="text_pole"></select>').attr('id', id);
            for (const opt of spec.enum) el.append(new Option(String(opt), String(opt)));
            el.val(String(stored ?? spec.enum[0]));
            el.on('change', () => save(String(el.val() || '')));
            return $('<div></div>').append(makeLabel(nice), el);
        }

        if (type === 'boolean') {
            const el = $('<input type="checkbox">').attr('id', id);
            el.prop('checked', !!(stored ?? false));
            el.on('change', () => save(!!el.is(':checked')));
            return $('<label class="checkbox_label"></label>')
                .attr('for', id)
                .append(el, document.createTextNode(' '), $('<small></small>').text(nice));
        }

        if (type === 'number' || type === 'integer') {
            const min = spec.minimum ?? undefined;
            const max = spec.maximum ?? undefined;
            const step = type === 'integer' ? 1 : (spec.step ?? 0.01);
            const range = (min != null || max != null) ? ` (${min ?? ''}..${max ?? ''})` : '';

            const el = $('<input type="number" class="text_pole">').attr('id', id).attr('step', step);
            if (min != null) el.attr('min', min);
            if (max != null) el.attr('max', max);

            const initial = stored ?? '';
            if (initial !== '') el.val(initial);
            el.on('input', () => {
                const raw = el.val();
                // An empty field is stored as '' so the request builder skips the parameter.
                save(raw === '' ? '' : Number(raw));
            });
            return $('<div></div>').append(makeLabel(`${nice}${range}:`), el);
        }

        // Free text: names that suggest longer content get a textarea.
        const isLong = /instructions|transcript|style|prompt|description/i.test(key);
        const el = isLong
            ? $('<textarea class="textarea_compact autoSetHeight"></textarea>')
            : $('<input type="text" class="text_pole" />');
        el.attr('id', id).val(String(stored ?? ''));
        el.on('input', () => save(String(el.val() || '')));
        return $('<div></div>').append(makeLabel(nice), el);
    }

    /**
     * Refreshes the voice list for the selected model.
     * @returns {Promise<void>}
     */
    async checkReady() {
        this.voices = await this.fetchTtsVoiceObjects();
    }

    /**
     * Reloads models and voices, refreshes the model-dependent UI and saves the settings.
     * @returns {Promise<void>}
     */
    async onRefreshClick() {
        await this.loadModels();
        this.populateModelSelect();
        this.voices = await this.fetchTtsVoiceObjects();
        this.updateConditionalBlocks();
        this.renderDynamicParams();
        saveTtsProviderSettings();
    }

    /**
     * Finds a voice by name, loading the voice list first when it is empty.
     * @param {string} voiceName Name of the voice to look up.
     * @returns {Promise<{name: string, voice_id: string, lang: string}>} The matching voice.
     * @throws {string} A message string when no voice has that name.
     */
    async getVoice(voiceName) {
        if (this.voices.length === 0) {
            this.voices = await this.fetchTtsVoiceObjects();
        }
        // Loose equality is kept on purpose: it preserves the existing matching rules.
        const match = this.voices.find(v => v.name == voiceName);
        if (!match) {
            // Thrown as a plain string to keep what callers currently receive.
            throw `TTS Voice name ${voiceName} not found`;
        }
        return match;
    }

    /**
     * Requests speech audio for the given text.
     * @param {string} text Text to synthesise.
     * @param {string} voiceId Voice identifier to use.
     * @returns {Promise<Response>} The successful fetch response.
     */
    async generateTts(text, voiceId) {
        return this.fetchTtsGeneration(text, voiceId);
    }

    /**
     * Builds the voice list for the selected model from the model's own
     * `voices` array, or from a fixed fallback list when it has none.
     * @returns {Promise<Array<{name: string, voice_id: string, lang: string}>>}
     */
    async fetchTtsVoiceObjects() {
        const model = this.getSelectedModel();
        const names = (Array.isArray(model?.voices) && model.voices.length)
            ? model.voices
            : ['alloy', 'ash', 'ballad', 'coral', 'echo', 'fable', 'onyx', 'nova', 'sage', 'shimmer', 'verse'];
        return names.map(name => ({ name, voice_id: name, lang: 'en-US' }));
    }

    /**
     * Revokes a preview object URL and forgets it if it is the current one.
     * @param {string|null} [url] URL to revoke; defaults to the current preview URL.
     */
    releasePreviewUrl(url = this.previewUrl) {
        if (!url) return;
        URL.revokeObjectURL(url);
        if (this.previewUrl === url) this.previewUrl = null;
    }

    /**
     * Generates the preview sentence with the given voice and plays it.
     * @param {string} voiceId Voice identifier to preview.
     * @returns {Promise<void>}
     * @throws {Error} When the generation request fails.
     */
    async previewTtsVoice(voiceId) {
        this.audioElement.pause();
        this.audioElement.currentTime = 0;

        const text = getPreviewString('en-US');
        // fetchTtsGeneration already throws on a non-OK status, so no second check here.
        const response = await this.fetchTtsGeneration(text, voiceId);
        const audio = await response.blob();
        const url = URL.createObjectURL(audio);

        // An interrupted preview never fires 'ended', so free its URL before replacing it.
        this.releasePreviewUrl();
        this.previewUrl = url;
        this.audioElement.src = url;
        this.audioElement.onended = () => this.releasePreviewUrl(url);

        try {
            await this.audioElement.play();
        } catch (err) {
            console.warn('Electron Hub TTS: preview playback failed', err);
            this.releasePreviewUrl(url);
        }
    }

    /**
     * Sends the generation request. The body always carries input, voice,
     * speed, temperature and model; optional groups are added according to
     * `getModelCapabilities`, followed by any saved dynamic parameters.
     * @param {string} inputText Text to synthesise.
     * @param {string} voiceId Voice identifier to use.
     * @returns {Promise<Response>} The successful fetch response.
     * @throws {Error} `HTTP <status>: <body>` when the response is not OK.
     */
    async fetchTtsGeneration(inputText, voiceId) {
        console.info(`Generating Electron Hub TTS for voice_id ${voiceId}`);

        const { settings } = this;
        const modelObj = this.getSelectedModel();
        const caps = this.getModelCapabilities(modelObj);
        const trimmed = (value) => String(value ?? '').trim();

        const body = {
            input: inputText,
            voice: voiceId,
            speed: settings.speed,
            temperature: settings.temperature,
            model: settings.model,
        };

        if (caps.instructions) {
            const instructions = trimmed(settings.instructions);
            if (instructions) body.instructions = instructions;
        }

        if (caps.dia) {
            const transcript = trimmed(settings.speaker_transcript);
            if (transcript) body.speaker_transcript = transcript;
            if (Number.isFinite(settings.cfg_scale)) body.cfg_scale = Number(settings.cfg_scale);
            if (Number.isFinite(settings.cfg_filter_top_k)) body.cfg_filter_top_k = Number(settings.cfg_filter_top_k);
        }

        if (caps.msft) {
            if (Number.isFinite(settings.speech_rate)) body.speech_rate = Number(settings.speech_rate);
            if (Number.isFinite(settings.pitch_adjustment)) body.pitch_adjustment = Number(settings.pitch_adjustment);
            const style = trimmed(settings.emotional_style);
            if (style) body.emotional_style = style;
        }

        // top_p is sent whenever it is a finite number, independent of the model's declared parameters.
        if (Number.isFinite(settings.top_p)) {
            body.top_p = Number(settings.top_p);
        }

        for (const [key] of this.getDynamicParamEntries(modelObj)) {
            const value = settings[key];
            if (value === undefined || value === '') continue;
            body[key] = value;
        }

        const response = await fetch('/api/openai/electronhub/generate-voice', {
            method: 'POST',
            headers: getRequestHeaders(),
            body: JSON.stringify(body),
        });

        if (!response.ok) {
            throw new Error(`HTTP ${response.status}: ${await response.text()}`);
        }

        return response;
    }
}
|
|
|