Spaces:
Running
Running
Contenders tab: query relevant TTS models
Browse files
app.py
CHANGED
|
@@ -117,6 +117,7 @@ HF_SPACES = {
|
|
| 117 |
'function': '1',
|
| 118 |
'text_param_index': 0,
|
| 119 |
'return_audio_index': 1,
|
|
|
|
| 120 |
},
|
| 121 |
# WhisperSpeech
|
| 122 |
'collabora/WhisperSpeech': {
|
|
@@ -124,6 +125,7 @@ HF_SPACES = {
|
|
| 124 |
'function': '/whisper_speech_demo',
|
| 125 |
'text_param_index': 0,
|
| 126 |
'return_audio_index': 0,
|
|
|
|
| 127 |
},
|
| 128 |
# OpenVoice (MyShell.ai)
|
| 129 |
'myshell-ai/OpenVoice': {
|
|
@@ -131,6 +133,7 @@ HF_SPACES = {
|
|
| 131 |
'function': '1',
|
| 132 |
'text_param_index': 0,
|
| 133 |
'return_audio_index': 1,
|
|
|
|
| 134 |
},
|
| 135 |
# OpenVoice v2 (MyShell.ai)
|
| 136 |
'myshell-ai/OpenVoiceV2': {
|
|
@@ -138,13 +141,15 @@ HF_SPACES = {
|
|
| 138 |
'function': '1',
|
| 139 |
'text_param_index': 0,
|
| 140 |
'return_audio_index': 1,
|
|
|
|
| 141 |
},
|
| 142 |
# MetaVoice
|
| 143 |
'mrfakename/MetaVoice-1B-v0.1': {
|
| 144 |
-
'name':'MetaVoice',
|
| 145 |
'function': '/tts',
|
| 146 |
'text_param_index': 0,
|
| 147 |
'return_audio_index': 0,
|
|
|
|
| 148 |
},
|
| 149 |
# xVASynth (CPU)
|
| 150 |
'Pendrokar/xVASynth-TTS': {
|
|
@@ -152,6 +157,7 @@ HF_SPACES = {
|
|
| 152 |
'function': '/predict',
|
| 153 |
'text_param_index': 0,
|
| 154 |
'return_audio_index': 0,
|
|
|
|
| 155 |
},
|
| 156 |
# CoquiTTS (CPU)
|
| 157 |
'coqui/CoquiTTS': {
|
|
@@ -159,6 +165,7 @@ HF_SPACES = {
|
|
| 159 |
'function': '0',
|
| 160 |
'text_param_index': 0,
|
| 161 |
'return_audio_index': 0,
|
|
|
|
| 162 |
},
|
| 163 |
# HierSpeech_TTS
|
| 164 |
'LeeSangHoon/HierSpeech_TTS': {
|
|
@@ -166,6 +173,7 @@ HF_SPACES = {
|
|
| 166 |
'function': '/predict',
|
| 167 |
'text_param_index': 0,
|
| 168 |
'return_audio_index': 0,
|
|
|
|
| 169 |
},
|
| 170 |
# MeloTTS (MyShell.ai)
|
| 171 |
'mrfakename/MeloTTS': {
|
|
@@ -173,6 +181,7 @@ HF_SPACES = {
|
|
| 173 |
'function': '/synthesize',
|
| 174 |
'text_param_index': 0,
|
| 175 |
'return_audio_index': 0,
|
|
|
|
| 176 |
},
|
| 177 |
|
| 178 |
# Parler
|
|
@@ -182,6 +191,7 @@ HF_SPACES = {
|
|
| 182 |
'text_param_index': 0,
|
| 183 |
'return_audio_index': 0,
|
| 184 |
'is_zero_gpu_space': True,
|
|
|
|
| 185 |
},
|
| 186 |
# Parler Mini
|
| 187 |
# 'parler-tts/parler_tts': {
|
|
@@ -190,6 +200,7 @@ HF_SPACES = {
|
|
| 190 |
# 'text_param_index': 0,
|
| 191 |
# 'return_audio_index': 0,
|
| 192 |
# 'is_zero_gpu_space': True,
|
|
|
|
| 193 |
# },
|
| 194 |
# Parler Mini which uses the Expresso dataset
|
| 195 |
'parler-tts/parler-tts-expresso': {
|
|
@@ -198,6 +209,7 @@ HF_SPACES = {
|
|
| 198 |
'text_param_index': 0,
|
| 199 |
'return_audio_index': 0,
|
| 200 |
'is_zero_gpu_space': True,
|
|
|
|
| 201 |
},
|
| 202 |
|
| 203 |
# Microsoft Edge TTS
|
|
@@ -207,6 +219,7 @@ HF_SPACES = {
|
|
| 207 |
'text_param_index': 0,
|
| 208 |
'return_audio_index': 0,
|
| 209 |
'is_proprietary': True,
|
|
|
|
| 210 |
},
|
| 211 |
|
| 212 |
# Fish Speech
|
|
@@ -215,6 +228,7 @@ HF_SPACES = {
|
|
| 215 |
'function': '/inference_wrapper',
|
| 216 |
'text_param_index': 0,
|
| 217 |
'return_audio_index': 1,
|
|
|
|
| 218 |
},
|
| 219 |
|
| 220 |
# E2/F5 TTS
|
|
@@ -224,6 +238,7 @@ HF_SPACES = {
|
|
| 224 |
'text_param_index': 2,
|
| 225 |
'return_audio_index': 0,
|
| 226 |
'is_zero_gpu_space': True,
|
|
|
|
| 227 |
},
|
| 228 |
|
| 229 |
# TTS w issues
|
|
@@ -543,6 +558,7 @@ Generated audio clips cannot be redistributed and may be used for personal, non-
|
|
| 543 |
|
| 544 |
Random sentences are sourced from a filtered subset of the [Harvard Sentences](https://www.cs.columbia.edu/~hgs/audio/harvard.html).
|
| 545 |
""".strip()
|
|
|
|
| 546 |
LDESC = f"""
|
| 547 |
## 🏆 Leaderboard
|
| 548 |
|
|
@@ -552,19 +568,25 @@ The leaderboard displays models in descending order of how natural they sound (b
|
|
| 552 |
|
| 553 |
Important: In order to help keep results fair, the leaderboard hides results by default until the number of votes passes a threshold. Tick the `Reveal preliminary results` to show models without sufficient votes. Please note that preliminary results may be inaccurate. [This dataset is public](https://huggingface.co/datasets/{DB_DATASET_ID}) and only saves the hardcoded sentences while keeping the voters anonymous.
|
| 554 |
""".strip()
|
|
|
|
| 555 |
TTS_INFO = f"""
|
| 556 |
## 🗣 Contenders
|
| 557 |
|
| 558 |
### Open Source TTS capabilities table
|
| 559 |
|
| 560 |
-
See the dataset itself for the legend and more in depth information for each model.
|
| 561 |
""".strip()
|
| 562 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 563 |
<iframe
|
| 564 |
-
|
| 565 |
-
|
| 566 |
-
|
| 567 |
-
|
| 568 |
></iframe>
|
| 569 |
""".strip()
|
| 570 |
|
|
@@ -1576,7 +1598,7 @@ with gr.Blocks() as about:
|
|
| 1576 |
gr.Markdown(ABOUT)
|
| 1577 |
with gr.Blocks() as tts_info:
|
| 1578 |
gr.Markdown(TTS_INFO)
|
| 1579 |
-
gr.HTML(
|
| 1580 |
# with gr.Blocks() as admin:
|
| 1581 |
# rdb = gr.Button("Reload Audio Dataset")
|
| 1582 |
# # rdb.click(reload_audio_dataset, outputs=rdb)
|
|
|
|
| 117 |
'function': '1',
|
| 118 |
'text_param_index': 0,
|
| 119 |
'return_audio_index': 1,
|
| 120 |
+
'series': 'XTTS',
|
| 121 |
},
|
| 122 |
# WhisperSpeech
|
| 123 |
'collabora/WhisperSpeech': {
|
|
|
|
| 125 |
'function': '/whisper_speech_demo',
|
| 126 |
'text_param_index': 0,
|
| 127 |
'return_audio_index': 0,
|
| 128 |
+
'series': 'WhisperSpeech',
|
| 129 |
},
|
| 130 |
# OpenVoice (MyShell.ai)
|
| 131 |
'myshell-ai/OpenVoice': {
|
|
|
|
| 133 |
'function': '1',
|
| 134 |
'text_param_index': 0,
|
| 135 |
'return_audio_index': 1,
|
| 136 |
+
'series': 'OpenVoice',
|
| 137 |
},
|
| 138 |
# OpenVoice v2 (MyShell.ai)
|
| 139 |
'myshell-ai/OpenVoiceV2': {
|
|
|
|
| 141 |
'function': '1',
|
| 142 |
'text_param_index': 0,
|
| 143 |
'return_audio_index': 1,
|
| 144 |
+
'series': 'OpenVoice',
|
| 145 |
},
|
| 146 |
# MetaVoice
|
| 147 |
'mrfakename/MetaVoice-1B-v0.1': {
|
| 148 |
+
'name':'MetaVoice-1B',
|
| 149 |
'function': '/tts',
|
| 150 |
'text_param_index': 0,
|
| 151 |
'return_audio_index': 0,
|
| 152 |
+
'series': 'MetaVoice-1B',
|
| 153 |
},
|
| 154 |
# xVASynth (CPU)
|
| 155 |
'Pendrokar/xVASynth-TTS': {
|
|
|
|
| 157 |
'function': '/predict',
|
| 158 |
'text_param_index': 0,
|
| 159 |
'return_audio_index': 0,
|
| 160 |
+
'series': 'xVASynth',
|
| 161 |
},
|
| 162 |
# CoquiTTS (CPU)
|
| 163 |
'coqui/CoquiTTS': {
|
|
|
|
| 165 |
'function': '0',
|
| 166 |
'text_param_index': 0,
|
| 167 |
'return_audio_index': 0,
|
| 168 |
+
'series': 'CoquiTTS',
|
| 169 |
},
|
| 170 |
# HierSpeech_TTS
|
| 171 |
'LeeSangHoon/HierSpeech_TTS': {
|
|
|
|
| 173 |
'function': '/predict',
|
| 174 |
'text_param_index': 0,
|
| 175 |
'return_audio_index': 0,
|
| 176 |
+
'series': 'HierSpeech++',
|
| 177 |
},
|
| 178 |
# MeloTTS (MyShell.ai)
|
| 179 |
'mrfakename/MeloTTS': {
|
|
|
|
| 181 |
'function': '/synthesize',
|
| 182 |
'text_param_index': 0,
|
| 183 |
'return_audio_index': 0,
|
| 184 |
+
'series': 'MeloTTS',
|
| 185 |
},
|
| 186 |
|
| 187 |
# Parler
|
|
|
|
| 191 |
'text_param_index': 0,
|
| 192 |
'return_audio_index': 0,
|
| 193 |
'is_zero_gpu_space': True,
|
| 194 |
+
'series': 'Parler',
|
| 195 |
},
|
| 196 |
# Parler Mini
|
| 197 |
# 'parler-tts/parler_tts': {
|
|
|
|
| 200 |
# 'text_param_index': 0,
|
| 201 |
# 'return_audio_index': 0,
|
| 202 |
# 'is_zero_gpu_space': True,
|
| 203 |
+
# 'series': 'Parler',
|
| 204 |
# },
|
| 205 |
# Parler Mini which uses the Expresso dataset
|
| 206 |
'parler-tts/parler-tts-expresso': {
|
|
|
|
| 209 |
'text_param_index': 0,
|
| 210 |
'return_audio_index': 0,
|
| 211 |
'is_zero_gpu_space': True,
|
| 212 |
+
'series': 'Parler',
|
| 213 |
},
|
| 214 |
|
| 215 |
# Microsoft Edge TTS
|
|
|
|
| 219 |
'text_param_index': 0,
|
| 220 |
'return_audio_index': 0,
|
| 221 |
'is_proprietary': True,
|
| 222 |
+
'series': 'Edge TTS',
|
| 223 |
},
|
| 224 |
|
| 225 |
# Fish Speech
|
|
|
|
| 228 |
'function': '/inference_wrapper',
|
| 229 |
'text_param_index': 0,
|
| 230 |
'return_audio_index': 1,
|
| 231 |
+
'series': 'Fish Speech',
|
| 232 |
},
|
| 233 |
|
| 234 |
# E2/F5 TTS
|
|
|
|
| 238 |
'text_param_index': 2,
|
| 239 |
'return_audio_index': 0,
|
| 240 |
'is_zero_gpu_space': True,
|
| 241 |
+
'series': 'E2/F5 TTS',
|
| 242 |
},
|
| 243 |
|
| 244 |
# TTS w issues
|
|
|
|
| 558 |
|
| 559 |
Random sentences are sourced from a filtered subset of the [Harvard Sentences](https://www.cs.columbia.edu/~hgs/audio/harvard.html).
|
| 560 |
""".strip()
|
| 561 |
+
|
| 562 |
LDESC = f"""
|
| 563 |
## 🏆 Leaderboard
|
| 564 |
|
|
|
|
| 568 |
|
| 569 |
Important: In order to help keep results fair, the leaderboard hides results by default until the number of votes passes a threshold. Tick the `Reveal preliminary results` to show models without sufficient votes. Please note that preliminary results may be inaccurate. [This dataset is public](https://huggingface.co/datasets/{DB_DATASET_ID}) and only saves the hardcoded sentences while keeping the voters anonymous.
|
| 570 |
""".strip()
|
| 571 |
+
|
| 572 |
TTS_INFO = f"""
|
| 573 |
## 🗣 Contenders
|
| 574 |
|
| 575 |
### Open Source TTS capabilities table
|
| 576 |
|
| 577 |
+
See [the below dataset itself](https://huggingface.co/datasets/Pendrokar/open_tts_tracker) for the legend and more in depth information for each model.
|
| 578 |
""".strip()
|
| 579 |
+
|
| 580 |
+
from urllib.parse import quote_plus

# Build the URL-encoded list of SQL string literals ('Series A', 'Series B', ...)
# that is spliced into the dataset-viewer query of TTS_DATASET_IFRAME, where it
# feeds the `"Name" IN (...)` test that produces the "In arena" column.
model_series = []
for model in HF_SPACES.values():
    series = model.get('series')
    if not series:
        # An entry without a 'series' has nothing to match against the
        # dataset's "Name" column; skip it instead of raising KeyError
        # while the module is being imported.
        continue
    # Double any single quote so the name stays a valid SQL string literal,
    # then percent-encode the whole literal. quote_plus escapes every
    # reserved character (space, '/', '&', '#', '+', quotes), not only '+'.
    sql_literal = "'" + series.replace("'", "''") + "'"
    encoded_literal = quote_plus(sql_literal)
    # Several spaces can share one series (e.g. both OpenVoice versions);
    # listing a name once is enough for the SQL IN (...) test.
    if encoded_literal not in model_series:
        model_series.append(encoded_literal)
# '%2C+' is the URL-encoded ", " separator between the SQL literals.
TTS_DATASET_IFRAME_ORDER = '%2C+'.join(model_series)
|
| 584 |
+
TTS_DATASET_IFRAME = f"""
|
| 585 |
<iframe
|
| 586 |
+
src="https://huggingface.co/datasets/Pendrokar/open_tts_tracker/embed/viewer/default/train?sql_console=true&sql=--+The+SQL+console+is+powered+by+DuckDB+WASM+and+runs+entirely+in+the+browser.%0A--+Get+started+by+typing+a+query+or+selecting+a+view+from+the+options+below.%0ASELECT+*%2C+%22Name%22+IN+%28{TTS_DATASET_IFRAME_ORDER}%29+AS+%22In+arena%22+FROM+train+WHERE+%22Insta-clone+%F0%9F%91%A5%22+IS+NOT+NULL+ORDER+BY+%22In+arena%22+DESC+LIMIT+50%3B&views%5B%5D=train"
|
| 587 |
+
frameborder="0"
|
| 588 |
+
width="100%"
|
| 589 |
+
height="650px"
|
| 590 |
></iframe>
|
| 591 |
""".strip()
|
| 592 |
|
|
|
|
| 1598 |
gr.Markdown(ABOUT)
|
| 1599 |
with gr.Blocks() as tts_info:
|
| 1600 |
gr.Markdown(TTS_INFO)
|
| 1601 |
+
gr.HTML(TTS_DATASET_IFRAME)
|
| 1602 |
# with gr.Blocks() as admin:
|
| 1603 |
# rdb = gr.Button("Reload Audio Dataset")
|
| 1604 |
# # rdb.click(reload_audio_dataset, outputs=rdb)
|