Merge branch 'more_radio'
Changed files:
- app.py: +105 -513
- gr_client.py: +394 -287
app.py CHANGED

Old file (lines removed in this commit are prefixed with -):

@@ -1,16 +1,14 @@
import os
import sys
- import time
import requests
import json
- from subprocess import Popen, PIPE
- import threading
from huggingface_hub import HfApi
- import gradio as gr

# start xVASynth service (no HTTP)
import resources.app.no_server as xvaserver

# model
hf_model_name = "Pendrokar/xvapitch_nvidia"
model_repo = HfApi()
@@ -19,117 +17,9 @@ latest_commit_sha = commits[0].commit_id
hf_cache_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/{latest_commit_sha}/'
models_path = hf_cache_models_path

- # ordered from most emotional and respects pauses to ones that do the least
- voice_models = [
-     ("👨‍🦳 #6671", "ccby_nvidia_hifi_6671_M"),
-     ("👱‍♀️ 🇬🇧 #92", "ccby_nvidia_hifi_92_F"),
-     ("🧒 #6670", "ccby_nvidia_hifi_6670_M"),
-     ("Male #9017", "ccby_nvidia_hifi_9017_M"),
-     ("Male #6097", "ccby_nvidia_hifi_6097_M"),
-     ("👩‍🦱 #12787", "ccby_nvidia_hifi_12787_F"),
-     ("👵 #11614", "ccby_nv_hifi_11614_F"),
-     ("Female #8051", "ccby_nvidia_hifi_8051_F"),
-     ("👩‍🦳 #11697", "ccby_nvidia_hifi_11697_F"),
-     ("Female #9136", "ccby_nvidia_hifi_9136_F"),
- ]
-
current_voice_model = None
base_speaker_emb = ''

- # order ranked by similarity to English due to the xVASynth's use of ARPAbet instead of IPA
- languages = [
-     ("🇺🇸 EN", "en"),
-     ("🇩🇪 DE", "de"),
-     ("🇪🇸 ES", "es"),
-     ("🇮🇹 IT", "it"),
-     ("🇳🇱 NL", "nl"),
-     ("🇧🇷 PT", "pt"),
-     ("🇵🇱 PL", "pl"),
-     ("🇷🇴 RO", "ro"),
-     ("🇸🇪 SV", "sv"),
-     ("🇩🇰 DA", "da"),
-     ("🇫🇮 FI", "fi"),
-     ("🇭🇺 HU", "hu"),
-     ("🇬🇷 EL", "el"),
-     ("🇫🇷 FR", "fr"),
-     ("🇷🇺 RU", "ru"),
-     ("🇺🇦 UA", "uk"),
-     ("🇹🇷 TR", "tr"),
-     ("🇸🇦 AR", "ar"),
-     ("🇮🇳 HI", "hi"),
-     ("🇯🇵 JP", "jp"),
-     ("🇰🇷 KO", "ko"),
-     ("🇨🇳 ZH", "zh"),
-     ("🇻🇳 VI", "vi"),
-     ("🇻🇦 LA", "la"),
-     ("🇳🇬 YO", "yo"),
-     ("Swahili", "sw"),
-     ("Hausa", "ha"),
-     ("Wolof", "wo"),
- ]
-
- # Translated from English by DeepMind's Gemini Pro
- default_text = {
-     "ar": "هذا هو صوتي.",
-     "da": "Sådan lyder min stemme.",
-     "de": "So klingt meine Stimme.",
-     "el": "Έτσι ακούγεται η φωνή μου.",
-     "en": "This is what my voice sounds like.",
-     "es": "Así suena mi voz.",
-     "fi": "Näin ääneni kuulostaa.",
-     "fr": "Voici à quoi ressemble ma voix.",
-     "ha": "Wannan ne muryata ke.",
-     "hi": "यह मेरी आवाज़ जैसी लगती है।",
-     "hu": "Így hangzik a hangom.",
-     "it": "Così suona la mia voce.",
-     "jp": "これが私の声です。",
-     "ko": "여기 제 목소리가 어떤지 들어보세요.",
-     "la": "Haec est vox mea sonans.",
-     "nl": "Dit is hoe mijn stem klinkt.",
-     "pl": "Tak brzmi mój głos.",
-     "pt": "É assim que minha voz soa.",
-     "ro": "Așa sună vocea mea.",
-     "ru": "Вот как звучит мой голос.",
-     "sv": "Såhär låter min röst.",
-     "sw": "Baba, yetu, yetu, uliye. Mbinguni, yetu, yetu. Amiiinaa!!", #civ4
-     "tr": "Benim sesimin sesi böyle.",
-     "uk": "Ось як звучить мій голос.",
-     "vi": "Đây là giọng nói của tôi.",
-     "wo": "Ndox li neen xewnaal ma.",
-     "yo": "Èyí ni ohùn mi ńlá.",
-     "zh": "这是我的声音。",
- }
-
- def run_xvaserver():
-     # start the process without waiting for a response
-     print('Running xVAServer subprocess...\n')
-     xvaserver = Popen(['python', f'{os.path.dirname(os.path.abspath(__file__))}/resources/app/server.py'], stdout=PIPE, stderr=PIPE, cwd=f'{os.path.dirname(os.path.abspath(__file__))}/resources/app/')
-
-     # Wait for a moment to ensure the server starts up
-     time.sleep(10)
-
-     # Check if the server is running
-     if xvaserver.poll() is not None:
-         print("Web server failed to start.")
-         sys.exit(0)
-
-     # contact local xVASynth server
-     print('Attempting to connect to xVASynth...')
-     try:
-         response = requests.get('http://0.0.0.0:8008')
-         response.raise_for_status()  # If the response contains an HTTP error status code, raise an exception
-     except requests.exceptions.RequestException as err:
-         print('Failed to connect!')
-         return
-
-     print('xVAServer running on port 8008')
-
-     # load default model
-     load_model("ccby_nvidia_hifi_6671_M")
-
-     # Wait for the process to exit
-     xvaserver.wait()
-
def load_model(voice_model_name):
    model_path = models_path + voice_model_name

@@ -160,413 +50,115 @@ def load_model(voice_model_name):

    return embs

- def predict(
-     input_text,
-     voice,
-     lang,
-     pacing,
-     pitch,
-     energy,
-     anger,
-     happy,
-     sad,
-     surprise,
-     use_deepmoji
- ):
-     # grab only the first 1000 characters
-     input_text = input_text[:1000]
-
-     # load voice model if not the current model
-     if (current_voice_model != voice):
-         base_speaker_emb = load_model(voice)

[… removed lines 183–218 are not captured in this view …]
-         # json_data = json.loads(response.text)
-     except requests.exceptions.RequestException as err:
-         print('FAILED to synthesize: {err}')
-         save_path = ''
-         response = {'text': '{"message": "Failed"}'}
-         json_data = {
-             'arpabet': ['Failed'],
-             'durations': [0],
-             'em_anger': anger,
-             'em_happy': happy,
-             'em_sad': sad,
-             'em_surprise': surprise,
        }

-     # print('server.log contents:')
-     # with open('resources/app/server.log', 'r') as f:
-     #     print(f.read())
-
-     arpabet_html = '<h6>ARPAbet & Phoneme lengths</h6>'
-     arpabet_symbols = json_data['arpabet'].split('|')
-     utter_time = 0
-     for symb_i in range(len(json_data['durations'])):
-         # skip PAD symbol
-         if (arpabet_symbols[symb_i] == '<PAD>'):
-             continue
-
-         length = float(json_data['durations'][symb_i])
-         arpa_length = str(round(length/2, 1))
-         arpabet_html += '<strong\
-             class="arpabet"\
-             style="padding: 0 '\
-             + str(arpa_length)\
-             +'em"'\
-             +f" title=\"{utter_time} + {length}\""\
-             +'>'\
-             + arpabet_symbols[symb_i]\
-             + '</strong> '
-         utter_time += round(length, 1)

[… removed lines 258–270 are not captured in this view …]
-         info="Also accepts ARPAbet symbols placed within {} brackets.",
-         lines=1,
-         max_lines=5,
-         autofocus=True
-     )
-     pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
-     pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
-     energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
-     anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger", info="Tread lightly beyond 0.9")
-     happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness", info="Tread lightly beyond 0.7")
-     sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness", info="Duration increased when beyond 0.2")
-     surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise", info="Does not play well with Happiness with either being beyond 0.3")
-     voice_radio = gr.Radio(
-         voice_models,
-         value="ccby_nvidia_hifi_6671_M",
-         label="Voice",
-         info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
-     )
-
- def set_default_text(lang, deepmoji_checked):
-     # DeepMoji only works on English Text
-     # checkbox_enabled = True
-     # if lang != 'en':
-     #     checkbox_enabled = False
-
-     if lang == 'en':
-         checkbox_enabled = gr.Checkbox(
-             label="Use DeepMoji",
-             info="Auto adjust emotional values",
-             value=deepmoji_checked,
-             interactive=True
-         )
-     else:
-         checkbox_enabled = gr.Checkbox(
-             label="Use DeepMoji",
-             info="Works only with English!",
-             value=False,
-             interactive=False
-         )
-
-     return default_text[lang], checkbox_enabled  # Return the modified textbox (important for Blocks)
-
- en_examples = [
-     "This is what my voice sounds like.",
-     "If there is anything else you need, feel free to ask.",
-     "Amazing! Could you do that again?",
-     "Why, I would be more than happy to help you!",
-     "That was unexpected.",
-     "How dare you! . You have no right.",
-     "Ahh, well, you see. There is more to it.",
-     "I can't believe she is gone.",
-     "Stay out of my way!!!",
-     # ARPAbet example
-     "This { IH1 Z } { W AH1 T } { M AY1 } { V OY1 S } { S AW1 N D Z } like.",
- ]
-
- def set_example_as_input(example_text):
-     return example_text
-
- def reset_em_sliders(
-     deepmoji_enabled,
-     anger,
-     happy,
-     sad,
-     surprise
- ):
-     if (deepmoji_enabled):
-         return (0, 0, 0, 0)
-     else:
-         return (
-             anger,
-             happy,
-             sad,
-             surprise
-         )
-
- def set_default_audio(voice_id):
-     return models_path + voice_id + '.wav'
-
- def toggle_deepmoji(
-     checked,
-     anger,
-     happy,
-     sad,
-     surprise
- ):
-     if checked:
-         return (0, 0, 0, 0)
-     else:
-         return (
-             anger,
-             happy,
-             sad,
-             surprise
-         )
-
- language_radio = gr.Radio(
-     languages,
-     value="en",
-     label="Language",
-     info="Will be more monotone and have an English accent. Tested mostly by a native Briton."
- )
-
- _DESCRIPTION = '''
- <div>
-     <a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
-     <a style="display:inline-block;" href="https://www.nexusmods.com/skyrimspecialedition/mods/44184"><img src='https://img.shields.io/badge/Endorsements-3.3k-blue?logo=nexusmods'/></a>
-     <a style="display:inline-block; margin-left: .5em" href="https://discord.gg/nv7c6E2TzV"><img src='https://img.shields.io/discord/794590496202293278.svg?label=&logo=discord&logoColor=ffffff&color=7389D8&labelColor=6A7EC2'/></a>
-     <span style="display: inline-block;margin-left: .5em;vertical-align: top;"><a href="https://huggingface.co/spaces/Pendrokar/xVASynth?duplicate=true" style="" target="_blank"><img style="margin-bottom: 0em;display: inline;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for a personal CPU-run one</span>
- </div>
- '''
-
- with gr.Blocks(css=".arpabet {display: inline-block; background-color: gray; border-radius: 5px; font-size: 120%; margin: 0.1em 0}") as demo:
-     gr.Markdown("# xVASynth TTS")
-
-     gr.HTML(label="description", value=_DESCRIPTION)
-
-     with gr.Row(): # Main row for inputs and language selection
-         with gr.Column(): # Input column
-             input_textbox = gr.Textbox(
-                 label="Input Text",
-                 value="This is what my voice sounds like.",
-                 info="Also accepts ARPAbet symbols placed within {} brackets.",
-                 lines=1,
-                 max_lines=5,
-                 autofocus=True
-             )
-             language_radio = gr.Radio(
-                 languages,
-                 value="en",
-                 label="Language",
-                 info="Will be more monotone and have an English accent. Tested mostly by a native Briton."
-             )
-             with gr.Row():
-                 with gr.Column():
-                     en_examples_dropdown = gr.Dropdown(
-                         en_examples,
-                         value=en_examples[0],
-                         label="Example dropdown",
-                         show_label=False,
-                         info="English Examples",
-                         visible=(language_radio.value == 'en')
-                     )
-                 with gr.Column():
-                     pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
-         with gr.Column(): # Control column
-             voice_radio = gr.Radio(
-                 voice_models,
-                 value="ccby_nvidia_hifi_6671_M",
-                 label="Voice",
-                 info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
-             )
-             pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
-             energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
-     with gr.Row(): # Main row for inputs and language selection
-         with gr.Column(): # Input column
-             anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger", info="Tread lightly beyond 0.9")
-             sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness", info="Duration increased when beyond 0.2")
-         with gr.Column(): # Input column
-             happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness", info="Tread lightly beyond 0.7")
-             surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise", info="Can oversaturate Happiness")
-             deepmoji_checkbox = gr.Checkbox(label="Use DeepMoji", info="Auto adjust emotional values", value=True)
-
-     # Event handling using click
-     btn = gr.Button("Generate", variant="primary")
-
-     with gr.Row(): # Main row for inputs and language selection
-         with gr.Column(): # Input column
-             output_wav = gr.Audio(
-                 label="22kHz audio output (autoplay enabled)",
-                 type="filepath",
-                 editable=False,
-                 autoplay=True
-             )
-         with gr.Column(): # Input column
-             output_arpabet = gr.HTML(label="ARPAbet")
-
-     btn.click(
-         fn=predict,
-         inputs=[
-             input_textbox,
-             voice_radio,
-             language_radio,
-             pacing_slider,
-             pitch_slider,
-             energy_slider,
-             anger_slider,
-             happy_slider,
-             sad_slider,
-             surprise_slider,
-             deepmoji_checkbox
-         ],
-         outputs=[
-             output_wav,
-             output_arpabet,
-             anger_slider,
-             happy_slider,
-             sad_slider,
-             surprise_slider,
-             # xVAServer response
-             gr.Textbox(visible=False)
-         ]
-     )
-     input_textbox.submit(
-         fn=predict,
-         inputs=[
-             input_textbox,
-             voice_radio,
-             language_radio,
-             pacing_slider,
-             pitch_slider,
-             energy_slider,
-             anger_slider,
-             happy_slider,
-             sad_slider,
-             surprise_slider,
-             deepmoji_checkbox
-         ],
-         outputs=[
-             output_wav,
-             output_arpabet,
-             anger_slider,
-             happy_slider,
-             sad_slider,
-             surprise_slider,
-             # xVAServer response
-             gr.Textbox(visible=False)
-         ]
-     )
-
-     language_radio.change(
-         set_default_text,
-         inputs=[language_radio, deepmoji_checkbox],
-         outputs=[input_textbox, deepmoji_checkbox]
-     )
-
-     en_examples_dropdown.change(
-         set_example_as_input,
-         inputs=[en_examples_dropdown],
-         outputs=[input_textbox]
-     )
-
-     deepmoji_checkbox.change(
-         toggle_deepmoji,
-         inputs=[
-             deepmoji_checkbox,
-             anger_slider,
-             happy_slider,
-             sad_slider,
-             surprise_slider
-         ],
-         outputs=[
-             anger_slider,
-             happy_slider,
-             sad_slider,
-             surprise_slider
-         ]
-     )
-
-     input_textbox.change(
-         reset_em_sliders,
-         inputs=[
-             deepmoji_checkbox,
-             anger_slider,
-             happy_slider,
-             sad_slider,
-             surprise_slider
-         ],
-         outputs=[
-             anger_slider,
-             happy_slider,
-             sad_slider,
-             surprise_slider
-         ]
-     )
-
[… removed lines 547–560 are not captured in this view …]
-         ]
-     )
-
-     voice_radio.change(
-         set_default_audio,
-         inputs=voice_radio,
-         outputs=output_wav
-     )

if __name__ == "__main__":
    print('running custom Gradio interface')
-     demo
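The removed run_xvaserver() helper above spawned resources/app/server.py with Popen and then spoke to it over HTTP on port 8008; the rewritten app.py keeps only the in-process path through resources.app.no_server. A minimal sketch of the difference follows, assuming the xvaserver.synthesize(data) call and payload fields shown in this diff (they are copied from the commit, not from separate xVASynth documentation), so it only runs inside this Space's repository layout:

# Old approach (removed): separate server process plus an HTTP round-trip.
#   server = Popen(['python', 'resources/app/server.py'], ...)
#   requests.post('http://0.0.0.0:8008/synthesize', json=data, timeout=60)
#
# New approach (this commit): import the service and call it directly.
import json
import resources.app.no_server as xvaserver  # import path taken from this diff

data = {
    'modelType': 'xVAPitch',
    'sequence': 'This is what my voice sounds like.',
    'pace': 1.0,
    'outfile': '/tmp/xvapitch_audio_sample.wav',
    'vocoder': 'n/a',
    'base_lang': 'en',
    'base_emb': '',                      # speaker embedding returned by load_model()
    'useSR': 0,
    'useCleanup': 0,
    'pluginsContext': json.dumps({}),
}
json_data = xvaserver.synthesize(data)   # same call the new predict() makes below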
New file (lines added in this commit are prefixed with +):

import os
import sys
import requests
import json
from huggingface_hub import HfApi

# start xVASynth service (no HTTP)
import resources.app.no_server as xvaserver

+ from gr_client import BlocksDemo
+
# model
hf_model_name = "Pendrokar/xvapitch_nvidia"
model_repo = HfApi()
…
hf_cache_models_path = f'/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvidia/snapshots/{latest_commit_sha}/'
models_path = hf_cache_models_path

current_voice_model = None
base_speaker_emb = ''

def load_model(voice_model_name):
    model_path = models_path + voice_model_name

…

    return embs


+ class LocalBlocksDemo(BlocksDemo):
+     def predict(
+         self,
+         input_text,
+         voice,
+         lang,
+         pacing,
+         pitch,
+         energy,
+         anger,
+         happy,
+         sad,
+         surprise,
+         use_deepmoji
+     ):
+         # grab only the first 1000 characters
+         input_text = input_text[:1000]
+
+         # load voice model if not the current model
+         if (current_voice_model != voice):
+             base_speaker_emb = load_model(voice)
+
+         model_type = 'xVAPitch'
+         pace = pacing if pacing else 1.0
+         save_path = '/tmp/xvapitch_audio_sample.wav'
+         language = lang
+         use_sr = 0
+         use_cleanup = 0
+
+         pluginsContext = {}
+         pluginsContext["mantella_settings"] = {
+             "emAngry": (anger if anger > 0 else 0),
+             "emHappy": (happy if happy > 0 else 0),
+             "emSad": (sad if sad > 0 else 0),
+             "emSurprise": (surprise if surprise > 0 else 0),
+             "run_model": use_deepmoji
        }

+         data = {
+             'pluginsContext': json.dumps(pluginsContext),
+             'modelType': model_type,
+             # pad with whitespaces as a workaround to avoid cutoffs
+             'sequence': input_text.center(len(input_text) + 2, ' '),
+             'pace': pace,
+             'outfile': save_path,
+             'vocoder': 'n/a',
+             'base_lang': language,
+             'base_emb': base_speaker_emb,
+             'useSR': use_sr,
+             'useCleanup': use_cleanup,
+         }

+         print('Synthesizing...')
+         try:
+             json_data = xvaserver.synthesize(data)
+             # response = requests.post('http://0.0.0.0:8008/synthesize', json=data, timeout=60)
+             # response.raise_for_status()  # If the response contains an HTTP error status code, raise an exception
+             # json_data = json.loads(response.text)
+         except requests.exceptions.RequestException as err:
+             print('FAILED to synthesize: {err}')
+             save_path = ''
+             response = {'text': '{"message": "Failed"}'}
+             json_data = {
+                 'arpabet': ['Failed'],
+                 'durations': [0],
+                 'em_anger': anger,
+                 'em_happy': happy,
+                 'em_sad': sad,
+                 'em_surprise': surprise,
+             }
+
+         # print('server.log contents:')
+         # with open('resources/app/server.log', 'r') as f:
+         #     print(f.read())
+
+         arpabet_html = '<h6>ARPAbet & Phoneme lengths</h6>'
+         arpabet_symbols = json_data['arpabet'].split('|')
+         utter_time = 0
+         for symb_i in range(len(json_data['durations'])):
+             # skip PAD symbol
+             if (arpabet_symbols[symb_i] == '<PAD>'):
+                 continue
+
+             length = float(json_data['durations'][symb_i])
+             arpa_length = str(round(length/2, 1))
+             arpabet_html += '<strong\
+                 class="arpabet"\
+                 style="padding: 0 '\
+                 + str(arpa_length)\
+                 +'em"'\
+                 +f" title=\"{utter_time} + {length}\""\
+                 +'>'\
+                 + arpabet_symbols[symb_i]\
+                 + '</strong> '
+             utter_time += round(length, 1)
+
+         return [
+             save_path,
+             arpabet_html,
+             round(json_data['em_angry'][0], 2),
+             round(json_data['em_happy'][0], 2),
+             round(json_data['em_sad'][0], 2),
+             round(json_data['em_surprise'][0], 2),
+             json_data
        ]

if __name__ == "__main__":
    print('running custom Gradio interface')
+     demo = LocalBlocksDemo()
+     demo.block.launch()
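After this commit the UI lives in gr_client.BlocksDemo and app.py only supplies the synthesis backend by overriding predict(). A minimal sketch of that split, assuming gr_client.py is importable; the DummyDemo subclass and its canned return value are illustrative only and are not part of the commit:

from gr_client import BlocksDemo

class DummyDemo(BlocksDemo):
    """Hypothetical stub backend: returns a fixed file instead of synthesizing."""
    def predict(self, input_text, voice, lang, pacing, pitch, energy,
                anger, happy, sad, surprise, use_deepmoji):
        # Matches the six outputs BlocksDemo wires to btn.click:
        # audio path, ARPAbet HTML, and the four emotion slider values.
        return ['/tmp/sample.wav', '<h6>ARPAbet</h6>', anger, happy, sad, surprise]

if __name__ == "__main__":
    DummyDemo().block.launch()  # BlocksDemo.__init__ stores the Blocks app in .block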
gr_client.py CHANGED

Old file (lines removed in this commit are prefixed with -; several removed lines were truncated by the diff viewer and are left as captured):

@@ -1,34 +1,35 @@
import os
- import sys
- import time
- import requests
import json
- from huggingface_hub import hf_hub_download
import gradio as gr
from gradio_client import Client

voice_models = [
-     ("
-     ("
    ("Male #9017", "ccby_nvidia_hifi_9017_M"),
    ("Male #6097", "ccby_nvidia_hifi_6097_M"),
-     ("
-     ("
-     ("Female #12787", "ccby_nvidia_hifi_12787_F"),
-     ("Female #11614", "ccby_nv_hifi_11614_F"),
    ("Female #8051", "ccby_nvidia_hifi_8051_F"),
    ("Female #9136", "ccby_nvidia_hifi_9136_F"),
]
- current_voice_model = None

# order ranked by similarity to English due to the xVASynth's use of ARPAbet instead of IPA
languages = [
-     ("
    ("🇩🇪 DE", "de"),
    ("🇪🇸 ES", "es"),
-     ("
    ("🇳🇱 NL", "nl"),
-     ("
    ("🇵🇱 PL", "pl"),
    ("🇷🇴 RO", "ro"),
    ("🇸🇪 SV", "sv"),

@@ -38,19 +39,17 @@ languages = [
    ("🇬🇷 EL", "el"),
    ("🇫🇷 FR", "fr"),
    ("🇷🇺 RU", "ru"),
-     ("🇺🇦
    ("🇹🇷 TR", "tr"),
    ("🇸🇦 AR", "ar"),
-     ("🇮🇳 HI", "hi"),
    ("🇯🇵 JP", "jp"),
    ("🇰🇷 KO", "ko"),
-     ("🇨🇳 ZH", "zh"),
    ("🇻🇳 VI", "vi"),
    ("🇻🇦 LA", "la"),
-     ("HA", "ha"),
-     ("SW", "sw"),
    ("🇳🇬 YO", "yo"),
-     ("
]

# Translated from English by DeepMind's Gemini Pro

@@ -85,112 +84,118 @@ default_text = {
    "zh": "这是我的声音。",
}

-
-
-
-     lang,
-     pacing,
-     pitch,
-     energy,
-     anger,
-     happy,
-     sad,
-     surprise,
-     deepmoji_checked
- ):
-     wav_path, arpabet_html, angry, happy, sad, surprise, response = client.predict(
-         input_text, # str in 'Input Text' Textbox component
-         voice, # Literal['ccby_nvidia_hifi_6670_M', 'ccby_nv_hifi_11614_F', 'ccby_nvidia_hifi_11697_F', 'ccby_nvidia_hifi_12787_F', 'ccby_nvidia_hifi_6097_M', 'ccby_nvidia_hifi_6671_M', 'ccby_nvidia_hifi_8051_F', 'ccby_nvidia_hifi_9017_M', 'ccby_nvidia_hifi_9136_F', 'ccby_nvidia_hifi_92_F'] in 'Voice' Radio component
-         lang, # Literal['en', 'de', 'es', 'it', 'fr', 'ru', 'tr', 'la', 'ro', 'da', 'vi', 'ha', 'nl', 'zh', 'ar', 'uk', 'hi', 'ko', 'pl', 'sw', 'fi', 'hu', 'pt', 'yo', 'sv', 'el', 'wo', 'jp'] in 'Language' Radio component
-         pacing, # float (numeric value between 0.5 and 2.0) in 'Duration' Slider component
-         pitch, # float (numeric value between 0 and 1.0) in 'Pitch' Slider component
-         energy, # float (numeric value between 0.1 and 1.0) in 'Energy' Slider component
-         anger, # float (numeric value between 0 and 1.0) in '😠 Anger' Slider component
-         happy, # float (numeric value between 0 and 1.0) in '😃 Happiness' Slider component
-         sad, # float (numeric value between 0 and 1.0) in '😭 Sadness' Slider component
-         surprise, # float (numeric value between 0 and 1.0) in '😮 Surprise' Slider component
-         deepmoji_checked, # bool
-         api_name="/predict"
-     )
-
-     json_data = json.loads(response.replace("'", '"'))
-
-     arpabet_html = '<h6>ARPAbet & Durations</h6>'
-     arpabet_html += '<table style="margin: 0 var(--size-2)"><tbody><tr>'
-     arpabet_nopad = json_data['arpabet'].split('|PAD|')
-     arpabet_symbols = json_data['arpabet'].split('|')
-     wpad_len = len(arpabet_symbols)
-     nopad_len = len(arpabet_nopad)
-     total_dur_length = 0
-     for symb_i in range(wpad_len):
-         if (arpabet_symbols[symb_i] == '<PAD>'):
-             continue
-         total_dur_length += float(json_data['durations'][symb_i])
-
-     for symb_i in range(wpad_len):
-         if (arpabet_symbols[symb_i] == '<PAD>'):
-             continue
-
-         arpabet_length = float(json_data['durations'][symb_i])
-         cell_width = round(arpabet_length / total_dur_length * 100, 2)
-         arpabet_html += '<td class="arpabet" style="width: '\
-             + str(cell_width)\
-             +'%">'\
-             + arpabet_symbols[symb_i]\
-             + '</td> '
-     arpabet_html += '<tr></tbody></table>'
-
-     return [
-         wav_path,
-         arpabet_html,
-         round(json_data['em_angry'][0], 2),
-         round(json_data['em_happy'][0], 2),
-         round(json_data['em_sad'][0], 2),
-         round(json_data['em_surprise'][0], 2)
-     ]
-
- input_textbox = gr.Textbox(
-     label="Input Text",
-     value="This is what my voice sounds like.",
-     info="Also accepts ARPAbet symbols placed within {} brackets.",
-     lines=1,
-     max_lines=5,
-     autofocus=True
- )
- pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
- pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
- energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
- anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger", info="Tread lightly beyond 0.9")
- happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness", info="Tread lightly beyond 0.7")
- sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness", info="Duration increased when beyond 0.2")
- surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise", info="Does not play well with Happiness with either being beyond 0.3")
- voice_radio = gr.Radio(
-     voice_models,
-     value="ccby_nvidia_hifi_6671_M",
-     label="Voice",
-     info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
- )

def set_default_text(lang, deepmoji_checked):
    # DeepMoji only works on English Text
    if lang == 'en':
-
-
-             info="Auto adjust emotional values",
-             value=deepmoji_checked,
-             interactive=True
-         )
    else:
[… removed lines 185–190 are not captured in this view …]

-     return

en_examples = [
    "This is what my voice sounds like.",
    "If there is anything else you need, feel free to ask.",

@@ -204,22 +209,37 @@ en_examples = [
    # ARPAbet example
    "This { IH1 Z } { W AH1 T } { M AY1 } { V OY1 S } { S AW1 N D Z } like.",
]

def set_example_as_input(example_text):
    return example_text

def toggle_example_dropdown(lang):
    if lang == 'en':
-
-             en_examples,
-             value=en_examples[0],
-             label="Example dropdown",
-             show_label=False,
-             info="English Examples",
-             visible=True
-         )
    else:
-

def reset_em_sliders(
    deepmoji_enabled,

@@ -228,6 +248,7 @@ def reset_em_sliders(
    sad,
    surprise
):
    if (deepmoji_enabled):
        return (0, 0, 0, 0)
    else:

@@ -245,6 +266,7 @@ def toggle_deepmoji(
    sad,
    surprise
):
    if checked:
        return (0, 0, 0, 0)
    else:

@@ -255,183 +277,268 @@ def toggle_deepmoji(
        surprise
    )

[… removed lines 258–263 are not captured in this view …]

_DESCRIPTION = '''
<div>
    <a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
    <a style="display:inline-block; margin-left: .5em" href="https://discord.gg/nv7c6E2TzV"><img src='https://img.shields.io/discord/794590496202293278.svg?label=&logo=discord&logoColor=ffffff&color=7389D8&labelColor=6A7EC2'/></a>
-     <span style="display: inline-block;margin-left: .5em;vertical-align: top;"><a href="https://huggingface.co/spaces/Pendrokar/xVASynth?duplicate=true" style="" target="_blank"><img style="margin-bottom: 0em;display: inline;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for a personal CPU-run</span>
</div>
'''

- with gr.Blocks(css=".arpabet {background-color: gray; border-radius: 5px; font-size: 120%; padding: 0 0.1em; margin: 0 0.1em; text-align: center}") as demo:
-     gr.Markdown("# xVASynth TTS")

-

[… removed lines 278–301 are mostly not captured in this view …]
    )
    )

-             info="English Examples"
-         )
-     with gr.Column():
-         pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
-     with gr.Column(): # Control column
-         voice_radio = gr.Radio(
-             voice_models,
-             value="ccby_nvidia_hifi_6671_M",
-             label="Voice",
-             info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
    )
[… removed lines 313–428 are not captured in this view …]
    ]
- )

if __name__ == "__main__":
    print('running Gradio interface')
-     # gradio_app.launch()
    client = Client("Pendrokar/xVASynth")

-     demo
New file (lines added in this commit are prefixed with +):

import os
import json
import gradio as gr
from gradio_client import Client

voice_models = [
+     ("👨‍🦳 #6671", "ccby_nvidia_hifi_6671_M"),
+     ("👱‍♀️ 🇬🇧 #92", "ccby_nvidia_hifi_92_F"),
+ ]
+ voice_models_more = [
+     ("🧒 #6670", "ccby_nvidia_hifi_6670_M"),
    ("Male #9017", "ccby_nvidia_hifi_9017_M"),
    ("Male #6097", "ccby_nvidia_hifi_6097_M"),
+     ("👩‍🦱 #12787", "ccby_nvidia_hifi_12787_F"),
+     ("👵 #11614", "ccby_nv_hifi_11614_F"),
    ("Female #8051", "ccby_nvidia_hifi_8051_F"),
+     ("👩‍🦳 #11697", "ccby_nvidia_hifi_11697_F"),
    ("Female #9136", "ccby_nvidia_hifi_9136_F"),
]

# order ranked by similarity to English due to the xVASynth's use of ARPAbet instead of IPA
languages = [
+     ("🇺🇸 EN", "en"),
    ("🇩🇪 DE", "de"),
    ("🇪🇸 ES", "es"),
+     ("🇮🇳 HI", "hi"),
+     ("🇨🇳 ZH", "zh"),
+ ]
+ languages_more = [
    ("🇳🇱 NL", "nl"),
+     ("🇧🇷 PT", "pt"),
+     ("🇮🇹 IT", "it"),
    ("🇵🇱 PL", "pl"),
    ("🇷🇴 RO", "ro"),
    ("🇸🇪 SV", "sv"),
…
    ("🇬🇷 EL", "el"),
    ("🇫🇷 FR", "fr"),
    ("🇷🇺 RU", "ru"),
+     ("🇺🇦 UA", "uk"),
    ("🇹🇷 TR", "tr"),
    ("🇸🇦 AR", "ar"),
    ("🇯🇵 JP", "jp"),
    ("🇰🇷 KO", "ko"),
    ("🇻🇳 VI", "vi"),
    ("🇻🇦 LA", "la"),
    ("🇳🇬 YO", "yo"),
+     ("Swahili", "sw"),
+     ("Hausa", "ha"),
+     ("Wolof", "wo"),
]

# Translated from English by DeepMind's Gemini Pro
…
    "zh": "这是我的声音。",
}

+ # Component defaults
+ input_textbox_init = {
+     'label': "Input Text",
+     'value': "This is what my voice sounds like.",
+     'info': "Also accepts ARPAbet symbols placed within {} brackets.",
+     'lines': 1,
+     'max_lines': 5,
+     'autofocus': True,
+ }
+ pacing_slider_init = {
+     'value': 1.0,
+     'minimum': 0.5,
+     'maximum': 2.0,
+     'step': 0.1,
+     'label': "Duration",
+ }
+ pitch_slider_init = {
+     'minimum': 0,
+     'maximum': 1.0,
+     'value': 0.5,
+     'step': 0.05,
+     'label': "Pitch",
+     'visible': False,
+ }
+ energy_slider_init = {
+     'minimum': 0.1,
+     'maximum': 1.0,
+     'value': 1.0,
+     'step': 0.05,
+     'label': "Energy",
+     'visible': False,
+ }
+ anger_slider_init = {
+     'minimum': 0,
+     'maximum': 1.0,
+     'value': 0,
+     'step': 0.05,
+     'label': "😠 Anger",
+     'info': "Tread lightly beyond 0.9",
+ }
+ happy_slider_init = {
+     'minimum': 0,
+     'maximum': 1.0,
+     'value': 0,
+     'step': 0.05,
+     'label': "😃 Happiness",
+     'info': "Tread lightly beyond 0.7",
+ }
+ sad_slider_init = {
+     'minimum': 0,
+     'maximum': 1.0,
+     'value': 0,
+     'step': 0.05,
+     'label': "😭 Sadness",
+     'info': "Duration increased when beyond 0.2",
+ }
+ surprise_slider_init = {
+     'minimum': 0,
+     'maximum': 1.0,
+     'value': 0,
+     'step': 0.05,
+     'label': "😮 Surprise",
+     'info': "Does not play well with Happiness with either being beyond 0.3",
+ }
+ voice_radio_init = {
+     'choices': [*voice_models, (f'+{len(voice_models_more)}', 'more')],
+     'value': "ccby_nvidia_hifi_6671_M",
+     'label': "Voice",
+     'info': "NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
+ }
+ deepmoji_checkbox_init = {
+     'label': "Use DeepMoji",
+     'info': "Auto adjust emotional values for English",
+     'value': True,
+     'interactive': True
+ }
+
+ def more_lang_options(lang):
+     # print('more_lang_options')
+     if lang != 'more':
+         return lang

+     radio_init = {**language_radio_init}
+     radio_init['choices'] = [*languages, *languages_more]
+     return gr.Radio(**radio_init)

def set_default_text(lang, deepmoji_checked):
+     # print('set_default_text')
+     textbox_init = {**input_textbox_init}
+     if lang == 'more':
+         textbox_init['value'] = default_text['en']
+         # return default_text['en'], deepmoji_checked
+         return gr.Textbox(**textbox_init), deepmoji_checked
+
+     textbox_init['value'] = default_text[lang]
+
    # DeepMoji only works on English Text
+     checkbox_init = {**deepmoji_checkbox_init}
    if lang == 'en':
+         checkbox_init['value'] = deepmoji_checked,
+         # checkbox_init['interactive'] = True
    else:
+         deepmoji_checked = False
+         # FIXME: event listener conflict with toggle_deepmoji
+         # checkbox_init['info'] = "Works only with English!",
+         # checkbox_init['value'] = False,
+         # checkbox_init['interactive'] = False
+         # gr.Checkbox(**checkbox_init)

+     return gr.Textbox(**textbox_init), deepmoji_checked

+ # examples component
en_examples = [
    "This is what my voice sounds like.",
    "If there is anything else you need, feel free to ask.",
…
    # ARPAbet example
    "This { IH1 Z } { W AH1 T } { M AY1 } { V OY1 S } { S AW1 N D Z } like.",
]
+ en_examples_dropdown_init = {
+     'choices': en_examples,
+     'value': en_examples[0],
+     'label': "Example dropdown",
+     'show_label': False,
+     'info': "English Examples",
+     'visible': True
+ }

def set_example_as_input(example_text):
+     # print('set_example_as_input')
    return example_text

def toggle_example_dropdown(lang):
+     # print('toggle_example_dropdown')
+     dropdown_init = {**en_examples_dropdown_init}
    if lang == 'en':
+         dropdown_init['visible'] = True
    else:
+         dropdown_init['visible'] = False
+
+     return gr.Dropdown(**dropdown_init)
+
+ def more_voice_options(voice):
+     # print('more_voice_options')
+     if voice != 'more':
+         return voice
+
+     radio_init = {**voice_radio_init}
+     radio_init['choices'] = [*voice_models, *voice_models_more]
+     return gr.Radio(**radio_init)

def reset_em_sliders(
    deepmoji_enabled,
…
    sad,
    surprise
):
+     # print('reset_em_sliders')
    if (deepmoji_enabled):
        return (0, 0, 0, 0)
    else:
…
    sad,
    surprise
):
+     # print('toggle_deepmoji')
    if checked:
        return (0, 0, 0, 0)
    else:
…
        surprise
    )

+ # languages component
+ language_radio_init = {
+     'choices': [*languages, *[(f'+{len(languages_more)}', 'more')]],
+     'value': "en",
+     'label': "Language",
+     'info': "Will be more monotone and have an English accent."
+ }

_DESCRIPTION = '''
<div>
    <a style="display:inline-block;" href="https://github.com/DanRuta/xVA-Synth"><img src='https://img.shields.io/github/stars/DanRuta/xVA-Synth?style=social'/></a>
+     <a style="display:inline-block;" href="https://www.nexusmods.com/skyrimspecialedition/mods/44184"><img src='https://img.shields.io/badge/Endorsements-3.4k-blue?logo=nexusmods'/></a>
    <a style="display:inline-block; margin-left: .5em" href="https://discord.gg/nv7c6E2TzV"><img src='https://img.shields.io/discord/794590496202293278.svg?label=&logo=discord&logoColor=ffffff&color=7389D8&labelColor=6A7EC2'/></a>
+     <span style="display: inline-block;margin-left: .5em;vertical-align: top;"><a href="https://huggingface.co/spaces/Pendrokar/xVASynth?duplicate=true" style="" target="_blank"><img style="margin-bottom: 0em;display: inline;" src="https://bit.ly/3gLdBN6" alt="Duplicate Space"></a> for a personal CPU-run one</span>
</div>
'''

+ class BlocksDemo:
+     def __init__(self):
+         with gr.Blocks(css=".arpabet {background-color: gray; border-radius: 5px; font-size: 120%; padding: 0 0.1em; margin: 0 0.1em; text-align: center}") as demo:
+             gr.Markdown("# xVASynth TTS")
+
+             gr.HTML(label="description", value=_DESCRIPTION)

+             with gr.Row(): # Main row for inputs and language selection
+                 with gr.Column(): # Input column
+                     input_textbox = gr.Textbox(**input_textbox_init)
+                     language_radio = gr.Radio(**language_radio_init)
+
+                     # remove autofocus
+                     input_textbox_init['autofocus'] = False
+
+                     with gr.Row():
+                         with gr.Column():
+                             en_examples_dropdown = gr.Dropdown(**en_examples_dropdown_init)
+                         with gr.Column():
+                             pacing_slider = gr.Slider(**pacing_slider_init)
+                 with gr.Column(): # Control column
+                     voice_radio = gr.Radio(**voice_radio_init)
+                     pitch_slider = gr.Slider(**pitch_slider_init)
+                     energy_slider = gr.Slider(**energy_slider_init)
+             with gr.Row(): # Main row for inputs and language selection
+                 with gr.Column(): # Input column
+                     anger_slider = gr.Slider(**anger_slider_init)
+                     sad_slider = gr.Slider(**sad_slider_init)
+                 with gr.Column(): # Input column
+                     happy_slider = gr.Slider(**happy_slider_init)
+                     surprise_slider = gr.Slider(**surprise_slider_init)
+                     deepmoji_checkbox = gr.Checkbox(**deepmoji_checkbox_init)
+
+             # Event handling using click
+             btn = gr.Button("Generate", variant="primary")
+
+             # with gr.Row(): # Main row for inputs and language selection
+             #     with gr.Column(): # Input column
+             output_wav = gr.Audio(
+                 label="22kHz audio output",
+                 type="filepath",
+                 editable=False,
+                 autoplay=True
            )
+             # with gr.Column(): # Input column
+             output_arpabet = gr.HTML(label="ARPAbet")
+
+             btn.click(
+                 fn=self.predict,
+                 inputs=[
+                     input_textbox,
+                     voice_radio,
+                     language_radio,
+                     pacing_slider,
+                     pitch_slider,
+                     energy_slider,
+                     anger_slider,
+                     happy_slider,
+                     sad_slider,
+                     surprise_slider,
+                     deepmoji_checkbox
+                 ],
+                 outputs=[
+                     output_wav,
+                     output_arpabet,
+                     anger_slider,
+                     happy_slider,
+                     sad_slider,
+                     surprise_slider
+                 ]
            )

+             # more languages option
+             language_radio.change(
+                 more_lang_options,
+                 inputs=language_radio,
+                 outputs=language_radio,
+                 trigger_mode='once',
+                 show_progress='hidden',
            )
+
+             # more voices option
+             voice_radio.change(
+                 more_voice_options,
+                 inputs=voice_radio,
+                 outputs=voice_radio,
+                 trigger_mode='once',
+                 show_progress='hidden',
+                 queue=False,
+             )
+
+             # set default text
+             language_radio.change(
+                 set_default_text,
+                 inputs=[language_radio, deepmoji_checkbox],
+                 outputs=[input_textbox, deepmoji_checkbox],
+                 show_progress='hidden',
+                 queue=False,
+             )
+
+             # toggle en examples
+             language_radio.change(
+                 toggle_example_dropdown,
+                 inputs=language_radio,
+                 outputs=en_examples_dropdown,
+                 show_progress='hidden',
+                 queue=False,
+             )
+
+             en_examples_dropdown.change(
+                 set_example_as_input,
+                 inputs=[en_examples_dropdown],
+                 outputs=[input_textbox],
+                 show_progress='hidden',
+                 queue=False,
+             )
+
+             deepmoji_checkbox.change(
+                 toggle_deepmoji,
+                 inputs=[
+                     deepmoji_checkbox,
+                     anger_slider,
+                     happy_slider,
+                     sad_slider,
+                     surprise_slider
+                 ],
+                 outputs=[
+                     anger_slider,
+                     happy_slider,
+                     sad_slider,
+                     surprise_slider
+                 ],
+                 show_progress='hidden',
+                 queue=False,
+             )
+
+             input_textbox.change(
+                 reset_em_sliders,
+                 inputs=[
+                     deepmoji_checkbox,
+                     anger_slider,
+                     happy_slider,
+                     sad_slider,
+                     surprise_slider
+                 ],
+                 outputs=[
+                     anger_slider,
+                     happy_slider,
+                     sad_slider,
+                     surprise_slider
+                 ],
+                 show_progress='hidden',
+                 queue=False,
+             )
+
+             voice_radio.change(
+                 reset_em_sliders,
+                 inputs=[
+                     deepmoji_checkbox,
+                     anger_slider,
+                     happy_slider,
+                     sad_slider,
+                     surprise_slider
+                 ],
+                 outputs=[
+                     anger_slider,
+                     happy_slider,
+                     sad_slider,
+                     surprise_slider
+                 ],
+                 show_progress='hidden',
+                 queue=False,
+             )
+
+         self.block = demo
+
+     def predict(
+         self,
+         input_text,
+         voice,
+         lang,
+         pacing,
+         pitch,
+         energy,
+         anger,
+         happy,
+         sad,
+         surprise,
+         deepmoji_checked
+     ):
+         wav_path, arpabet_html, angry, happy, sad, surprise, response = client.predict(
+             input_text, # str in 'Input Text' Textbox component
+             voice, # Literal['ccby_nvidia_hifi_6670_M', 'ccby_nv_hifi_11614_F', 'ccby_nvidia_hifi_11697_F', 'ccby_nvidia_hifi_12787_F', 'ccby_nvidia_hifi_6097_M', 'ccby_nvidia_hifi_6671_M', 'ccby_nvidia_hifi_8051_F', 'ccby_nvidia_hifi_9017_M', 'ccby_nvidia_hifi_9136_F', 'ccby_nvidia_hifi_92_F'] in 'Voice' Radio component
+             lang, # Literal['en', 'de', 'es', 'it', 'fr', 'ru', 'tr', 'la', 'ro', 'da', 'vi', 'ha', 'nl', 'zh', 'ar', 'uk', 'hi', 'ko', 'pl', 'sw', 'fi', 'hu', 'pt', 'yo', 'sv', 'el', 'wo', 'jp'] in 'Language' Radio component
+             pacing, # float (numeric value between 0.5 and 2.0) in 'Duration' Slider component
+             pitch, # float (numeric value between 0 and 1.0) in 'Pitch' Slider component
+             energy, # float (numeric value between 0.1 and 1.0) in 'Energy' Slider component
+             anger, # float (numeric value between 0 and 1.0) in '😠 Anger' Slider component
+             happy, # float (numeric value between 0 and 1.0) in '😃 Happiness' Slider component
+             sad, # float (numeric value between 0 and 1.0) in '😭 Sadness' Slider component
+             surprise, # float (numeric value between 0 and 1.0) in '😮 Surprise' Slider component
+             deepmoji_checked, # bool
+             api_name="/predict"
+         )
+
+         json_data = json.loads(response.replace("'", '"'))
+
+         arpabet_html = '<h6>ARPAbet & Durations</h6>'
+         arpabet_html += '<table style="margin: 0 var(--size-2)"><tbody><tr>'
+         arpabet_nopad = json_data['arpabet'].split('|PAD|')
+         arpabet_symbols = json_data['arpabet'].split('|')
+         wpad_len = len(arpabet_symbols)
+         nopad_len = len(arpabet_nopad)
+         total_dur_length = 0
+         for symb_i in range(wpad_len):
+             if (arpabet_symbols[symb_i] == '<PAD>'):
+                 continue
+             total_dur_length += float(json_data['durations'][symb_i])
+
+         for symb_i in range(wpad_len):
+             if (arpabet_symbols[symb_i] == '<PAD>'):
+                 continue
+
+             arpabet_length = float(json_data['durations'][symb_i])
+             cell_width = round(arpabet_length / total_dur_length * 100, 2)
+             arpabet_html += '<td class="arpabet" style="width: '\
+                 + str(cell_width)\
+                 +'%">'\
+                 + arpabet_symbols[symb_i]\
+                 + '</td> '
+         arpabet_html += '<tr></tbody></table>'
+
+         return [
+             wav_path,
+             arpabet_html,
+             round(json_data['em_angry'][0], 2),
+             round(json_data['em_happy'][0], 2),
+             round(json_data['em_sad'][0], 2),
+             round(json_data['em_surprise'][0], 2)
        ]

if __name__ == "__main__":
    print('running Gradio interface')
    client = Client("Pendrokar/xVASynth")

+     demo = BlocksDemo()
+     demo.block.launch()
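The pattern this branch is named after: the Voice and Language radios start with a short list plus a sentinel ('+N', 'more') choice, and selecting the sentinel swaps the component for one carrying the full list. A minimal self-contained sketch of that idea, assuming nothing beyond gradio itself (the component and variable names here are illustrative, not the ones in gr_client.py):

import gradio as gr

short = [("🇺🇸 EN", "en"), ("🇩🇪 DE", "de")]
extra = [("🇪🇸 ES", "es"), ("🇫🇷 FR", "fr")]

def expand(choice):
    # Anything but the sentinel passes through unchanged.
    if choice != 'more':
        return choice
    # Re-render the radio with every option once '+N' is clicked.
    return gr.Radio(choices=[*short, *extra], value="en", label="Language")

with gr.Blocks() as demo:
    lang = gr.Radio(
        choices=[*short, (f'+{len(extra)}', 'more')],
        value="en",
        label="Language",
    )
    # trigger_mode='once' keeps rapid clicks from queuing repeated swaps,
    # mirroring the change() wiring in the new gr_client.py.
    lang.change(expand, inputs=lang, outputs=lang, trigger_mode='once')

demo.launch()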