Spaces:
Running
on
CPU Upgrade
Running
on
CPU Upgrade
language & voice reorder
Browse files
app.py
CHANGED
|
@@ -15,79 +15,81 @@ models_path = '/home/user/.cache/huggingface/hub/models--Pendrokar--xvapitch_nvi
|
|
| 15 |
|
| 16 |
|
| 17 |
voice_models = [
|
|
|
|
| 18 |
("Male #6670", "ccby_nvidia_hifi_6670_M"),
|
| 19 |
-
("
|
|
|
|
|
|
|
| 20 |
("Female #11697", "ccby_nvidia_hifi_11697_F"),
|
| 21 |
("Female #12787", "ccby_nvidia_hifi_12787_F"),
|
| 22 |
-
("
|
| 23 |
-
("Male #6671", "ccby_nvidia_hifi_6671_M"),
|
| 24 |
("Female #8051", "ccby_nvidia_hifi_8051_F"),
|
| 25 |
-
("Male #9017", "ccby_nvidia_hifi_9017_M"),
|
| 26 |
("Female #9136", "ccby_nvidia_hifi_9136_F"),
|
| 27 |
-
("Female #92", "ccby_nvidia_hifi_92_F"),
|
| 28 |
]
|
| 29 |
current_voice_model = None
|
| 30 |
|
|
|
|
| 31 |
languages = [
|
| 32 |
("🇬🇧 EN", "en"),
|
| 33 |
("🇩🇪 DE", "de"),
|
| 34 |
("🇪🇸 ES", "es"),
|
| 35 |
("🇮🇹 IT", "it"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 36 |
("🇫🇷 FR", "fr"),
|
| 37 |
("🇷🇺 RU", "ru"),
|
|
|
|
| 38 |
("🇹🇷 TR", "tr"),
|
| 39 |
-
("🇻🇦 LA", "la"),
|
| 40 |
-
("🇷🇴 RO", "ro"),
|
| 41 |
-
("🇩🇰 DA", "da"),
|
| 42 |
-
("🇻🇳 VI", "vi"),
|
| 43 |
-
("🇳🇬 HA", "ha"),
|
| 44 |
-
("🇳🇱 NL", "nl"),
|
| 45 |
-
("🇨🇳 ZH", "zh"),
|
| 46 |
("🇸🇦 AR", "ar"),
|
| 47 |
-
("🇺🇦 UK", "uk"),
|
| 48 |
("🇮🇳 HI", "hi"),
|
|
|
|
| 49 |
("🇰🇷 KO", "ko"),
|
| 50 |
-
("
|
| 51 |
-
("
|
| 52 |
-
("
|
| 53 |
-
("
|
| 54 |
-
("🇵🇹 PT", "pt"),
|
| 55 |
("🇳🇬 YO", "yo"),
|
| 56 |
-
("
|
| 57 |
-
("🇬🇷 EL", "el"),
|
| 58 |
-
("🇸🇳 WO", "wo"),
|
| 59 |
-
("🇯🇵 JP", "jp"),
|
| 60 |
]
|
| 61 |
|
|
|
|
| 62 |
default_text = {
|
| 63 |
-
"
|
|
|
|
| 64 |
"de": "So klingt meine Stimme.",
|
|
|
|
|
|
|
| 65 |
"es": "Así suena mi voz.",
|
| 66 |
-
"
|
| 67 |
"fr": "Voici à quoi ressemble ma voix.",
|
| 68 |
-
"ru": "Вот как звучит мой голос.",
|
| 69 |
-
"tr": "Benim sesimin sesi böyle.",
|
| 70 |
-
"la": "Haec est vox mea sonans.",
|
| 71 |
-
"ro": "Așa sună vocea mea.",
|
| 72 |
-
"da": "Sådan lyder min stemme.",
|
| 73 |
-
"vi": "Đây là giọng nói của tôi.",
|
| 74 |
"ha": "Wannan ne muryata ke.",
|
| 75 |
-
"nl": "Dit is hoe mijn stem klinkt.",
|
| 76 |
-
"zh": "这是我的声音。",
|
| 77 |
-
"ar": "هذا هو صوتي.",
|
| 78 |
-
"uk": "Ось як звучить мій голос.",
|
| 79 |
"hi": "यह मेरी आवाज़ कैसी लगती है।",
|
|
|
|
|
|
|
|
|
|
| 80 |
"ko": "여기 제 목소리가 어떤지 들어보세요.",
|
|
|
|
|
|
|
| 81 |
"pl": "Tak brzmi mój głos.",
|
| 82 |
-
"sw": "Sauti yangu inasikika hivi.",
|
| 83 |
-
"fi": "Näin ääneni kuulostaa.",
|
| 84 |
-
"hu": "Így hangzik a hangom.",
|
| 85 |
"pt": "É assim que minha voz soa.",
|
| 86 |
-
"
|
|
|
|
| 87 |
"sv": "Såhär låter min röst.",
|
| 88 |
-
"
|
|
|
|
|
|
|
|
|
|
| 89 |
"wo": "Ndox li neen xewnaal ma.",
|
| 90 |
-
"
|
|
|
|
| 91 |
}
|
| 92 |
|
| 93 |
def run_xvaserver():
|
|
@@ -115,7 +117,7 @@ def run_xvaserver():
|
|
| 115 |
print('xVAServer running on port 8008')
|
| 116 |
|
| 117 |
# load default model
|
| 118 |
-
load_model("
|
| 119 |
|
| 120 |
# Wait for the process to exit
|
| 121 |
xvaserver.wait()
|
|
@@ -207,6 +209,7 @@ def predict(
|
|
| 207 |
input_textbox = gr.Textbox(
|
| 208 |
label="Input Text",
|
| 209 |
value="This is what my voice sounds like.",
|
|
|
|
| 210 |
lines=1,
|
| 211 |
max_lines=5,
|
| 212 |
autofocus=True
|
|
@@ -214,15 +217,15 @@ input_textbox = gr.Textbox(
|
|
| 214 |
pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
|
| 215 |
pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
|
| 216 |
energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
|
| 217 |
-
anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger")
|
| 218 |
-
happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness")
|
| 219 |
-
sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness")
|
| 220 |
-
surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise")
|
| 221 |
voice_radio = gr.Radio(
|
| 222 |
voice_models,
|
| 223 |
-
value="
|
| 224 |
label="Voice",
|
| 225 |
-
info="NVIDIA HIFI CC-BY-4.0 xVAPitch
|
| 226 |
)
|
| 227 |
|
| 228 |
def set_default_text(lang):
|
|
|
|
| 15 |
|
| 16 |
|
| 17 |
voice_models = [
|
| 18 |
+
("Male #6671", "ccby_nvidia_hifi_6671_M"),
|
| 19 |
("Male #6670", "ccby_nvidia_hifi_6670_M"),
|
| 20 |
+
("Male #9017", "ccby_nvidia_hifi_9017_M"),
|
| 21 |
+
("Male #6097", "ccby_nvidia_hifi_6097_M"),
|
| 22 |
+
("Female #92", "ccby_nvidia_hifi_92_F"),
|
| 23 |
("Female #11697", "ccby_nvidia_hifi_11697_F"),
|
| 24 |
("Female #12787", "ccby_nvidia_hifi_12787_F"),
|
| 25 |
+
("Female #11614", "ccby_nv_hifi_11614_F"),
|
|
|
|
| 26 |
("Female #8051", "ccby_nvidia_hifi_8051_F"),
|
|
|
|
| 27 |
("Female #9136", "ccby_nvidia_hifi_9136_F"),
|
|
|
|
| 28 |
]
|
| 29 |
current_voice_model = None
|
| 30 |
|
| 31 |
+
# Ordered by similarity to English, because xVASynth uses ARPAbet rather than IPA
|
| 32 |
languages = [
|
| 33 |
("🇬🇧 EN", "en"),
|
| 34 |
("🇩🇪 DE", "de"),
|
| 35 |
("🇪🇸 ES", "es"),
|
| 36 |
("🇮🇹 IT", "it"),
|
| 37 |
+
("🇳🇱 NL", "nl"),
|
| 38 |
+
("🇵🇹 PT", "pt"),
|
| 39 |
+
("🇵🇱 PL", "pl"),
|
| 40 |
+
("🇷🇴 RO", "ro"),
|
| 41 |
+
("🇸🇪 SV", "sv"),
|
| 42 |
+
("SW", "sw"),
|
| 43 |
+
("🇩🇰 DA", "da"),
|
| 44 |
+
("🇫🇮 FI", "fi"),
|
| 45 |
+
("🇭🇺 HU", "hu"),
|
| 46 |
+
("🇬🇷 EL", "el"),
|
| 47 |
("🇫🇷 FR", "fr"),
|
| 48 |
("🇷🇺 RU", "ru"),
|
| 49 |
+
("🇺🇦 UK", "uk"),
|
| 50 |
("🇹🇷 TR", "tr"),
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 51 |
("🇸🇦 AR", "ar"),
|
|
|
|
| 52 |
("🇮🇳 HI", "hi"),
|
| 53 |
+
("🇯🇵 JP", "jp"),
|
| 54 |
("🇰🇷 KO", "ko"),
|
| 55 |
+
("🇨🇳 ZH", "zh"),
|
| 56 |
+
("🇻🇳 VI", "vi"),
|
| 57 |
+
("🇻🇦 LA", "la"),
|
| 58 |
+
("HA", "ha"),
|
|
|
|
| 59 |
("🇳🇬 YO", "yo"),
|
| 60 |
+
("WO", "wo"),
|
|
|
|
|
|
|
|
|
|
| 61 |
]
|
| 62 |
|
| 63 |
+
# Translated from English by DeepMind's Gemini Pro
|
| 64 |
default_text = {
|
| 65 |
+
"ar": "هذا هو صوتي.",
|
| 66 |
+
"da": "Sådan lyder min stemme.",
|
| 67 |
"de": "So klingt meine Stimme.",
|
| 68 |
+
"el": "Έτσι ακούγεται η φωνή μου.",
|
| 69 |
+
"en": "This is what my voice sounds like.",
|
| 70 |
"es": "Así suena mi voz.",
|
| 71 |
+
"fi": "Näin ääneni kuulostaa.",
|
| 72 |
"fr": "Voici à quoi ressemble ma voix.",
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 73 |
"ha": "Wannan ne muryata ke.",
|
|
|
|
|
|
|
|
|
|
|
|
|
| 74 |
"hi": "यह मेरी आवाज़ कैसी लगती है।",
|
| 75 |
+
"hu": "Így hangzik a hangom.",
|
| 76 |
+
"it": "Così suona la mia voce.",
|
| 77 |
+
"jp": "これが私の声です。",
|
| 78 |
"ko": "여기 제 목소리가 어떤지 들어보세요.",
|
| 79 |
+
"la": "Haec est vox mea sonans.",
|
| 80 |
+
"nl": "Dit is hoe mijn stem klinkt.",
|
| 81 |
"pl": "Tak brzmi mój głos.",
|
|
|
|
|
|
|
|
|
|
| 82 |
"pt": "É assim que minha voz soa.",
|
| 83 |
+
"ro": "Așa sună vocea mea.",
|
| 84 |
+
"ru": "Вот как звучит мой голос.",
|
| 85 |
"sv": "Såhär låter min röst.",
|
| 86 |
+
"sw": "Sauti yangu inasikika hivi.",
|
| 87 |
+
"tr": "Benim sesimin sesi böyle.",
|
| 88 |
+
"uk": "Ось як звучить мій голос.",
|
| 89 |
+
"vi": "Đây là giọng nói của tôi.",
|
| 90 |
"wo": "Ndox li neen xewnaal ma.",
|
| 91 |
+
"yo": "Ìyí ni ohùn mi ńlá.",
|
| 92 |
+
"zh": "这是我的声音。",
|
| 93 |
}
|
| 94 |
|
| 95 |
def run_xvaserver():
|
|
|
|
| 117 |
print('xVAServer running on port 8008')
|
| 118 |
|
| 119 |
# load default model
|
| 120 |
+
load_model("ccby_nvidia_hifi_6671_M")
|
| 121 |
|
| 122 |
# Wait for the process to exit
|
| 123 |
xvaserver.wait()
|
|
|
|
| 209 |
input_textbox = gr.Textbox(
|
| 210 |
label="Input Text",
|
| 211 |
value="This is what my voice sounds like.",
|
| 212 |
+
info="Also accepts ARPAbet symbols placed within {} brackets.",
|
| 213 |
lines=1,
|
| 214 |
max_lines=5,
|
| 215 |
autofocus=True
|
|
|
|
| 217 |
pacing_slider = gr.Slider(0.5, 2.0, value=1.0, step=0.1, label="Duration")
|
| 218 |
pitch_slider = gr.Slider(0, 1.0, value=0.5, step=0.05, label="Pitch", visible=False)
|
| 219 |
energy_slider = gr.Slider(0.1, 1.0, value=1.0, step=0.05, label="Energy", visible=False)
|
| 220 |
+
anger_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😠 Anger", info="Tread lightly beyond 0.9")
|
| 221 |
+
happy_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😃 Happiness", info="Tread lightly beyond 0.7")
|
| 222 |
+
sad_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😭 Sadness", info="Duration increased when beyond 0.2")
|
| 223 |
+
surprise_slider = gr.Slider(0, 1.0, value=0, step=0.05, label="😮 Surprise", info="Does not play well with Happiness with either being beyond 0.3")
|
| 224 |
voice_radio = gr.Radio(
|
| 225 |
voice_models,
|
| 226 |
+
value="ccby_nvidia_hifi_6671_M",
|
| 227 |
label="Voice",
|
| 228 |
+
info="NVIDIA HIFI CC-BY-4.0 xVAPitch voice model"
|
| 229 |
)
|
| 230 |
|
| 231 |
def set_default_text(lang):
|