save samples to cache; load cached; a way to fix #4
app.py
CHANGED

@@ -13,6 +13,20 @@ from detoxify import Detoxify
 import os
 import tempfile
 from pydub import AudioSegment
+import itertools
+from typing import List, Tuple, Set, Dict
+from hashlib import sha1
+
+class User:
+    def __init__(self, user_id: str):
+        self.user_id = user_id
+        self.voted_pairs: Set[Tuple[str, str]] = set()
+
+class Sample:
+    def __init__(self, filename: str, transcript: str, modelName: str):
+        self.filename = filename
+        self.transcript = transcript
+        self.modelName = modelName
 
 def match_target_amplitude(sound, target_dBFS):
     change_in_dBFS = target_dBFS - sound.dBFS

@@ -257,6 +271,27 @@ OVERRIDE_INPUTS = {
 }
 
 hf_clients = {}
+# cache audio samples for quick voting
+cached_samples: List[Sample] = []
+voting_users = {
+    # userid as the key and USER() as the value
+}
+
+def generate_matching_pairs(samples: List[Sample]) -> List[Tuple[Sample, Sample]]:
+    transcript_groups: Dict[str, List[Sample]] = {}
+    for sample in samples:
+        if sample.transcript not in transcript_groups:
+            transcript_groups[sample.transcript] = []
+        transcript_groups[sample.transcript].append(sample)
+
+    matching_pairs: List[Tuple[Sample, Sample]] = []
+    for group in transcript_groups.values():
+        matching_pairs.extend(list(itertools.combinations(group, 2)))
+
+    return matching_pairs
+
+# List[Tuple[Sample, Sample]]
+all_pairs = []
 
 SPACE_ID = os.getenv('SPACE_ID')
 MAX_SAMPLE_TXT_LENGTH = 300

@@ -378,7 +413,9 @@ Vote to help the community find the best available text-to-speech model!
 INSTR = """
 ## 🗳️ Vote
 
-* Input text (English only) to synthesize audio
+* Input text (English only) to synthesize audio.
+* Press ⚡ to select a cached sample you have yet to vote on. Fast.
+* Press 🎲 to randomly select text for a list. Slow.
 * Listen to the two audio clips, one after the other.
 * Vote on which audio sounds more natural to you.
 * _Note: Model names are revealed after the vote is cast._

@@ -611,7 +648,7 @@ def make_link_to_space(model_name):
     model_basename = HF_SPACES[model_name]['name']
 
     if '/' in model_name:
-        return '🤗 <a style="'+ style +'" title="'+ title +'" href="'+ 'https://huggingface.co/spaces/'+ model_name +'">'+ model_basename +'</a>'
+        return '🤗 <a target="_top" style="'+ style +'" title="'+ title +'" href="'+ 'https://huggingface.co/spaces/'+ model_name +'">'+ model_basename +'</a>'
 
     # otherwise just return the model name
     return model_name

@@ -989,6 +1026,26 @@ def synthandreturn(text):
     #debug
     # outputs = [text, btn, r2, model1, model2, aud1, aud2, abetter, bbetter, prevmodel1, prevmodel2, nxtroundbtn]
 
+    # cache the result
+    for model in [mdl1k, mdl2k]:
+        already_cached = False
+        # check if already cached
+        for cached_sample in cached_samples:
+            # TODO:replace cached
+            if (cached_sample.transcript == text and cached_sample.modelName == model):
+                already_cached = True
+                break
+
+        if (already_cached):
+            continue
+
+        print(f"Cached {model}")
+        cached_samples.append(Sample(results[model], text, model))
+        # print(cached_samples)
+
+    all_pairs = generate_matching_pairs(cached_samples)
+    # print(all_pairs)
+
     print(f"Retrieving models {mdl1k} and {mdl2k} from API")
     return (
         text,

@@ -1046,26 +1103,94 @@ def unlock_vote(btn_index, aplayed, bplayed):
 
     return [gr.update(), gr.update(), aplayed, bplayed]
 
+def cachedsent(request: gr.Request):
+    # add new userid to voting_users from Browser session hash
+    # stored only in RAM
+    if request.username:
+        print('auth by username')
+        # by HuggingFace username
+        userid = sha1(bytes(request.username.encode('ascii'))).hexdigest()
+    else:
+        print('auth by ip')
+        # by IP address
+        userid = sha1(bytes(request.client.host.encode('ascii'))).hexdigest()
+        # by browser session hash
+        # userid = sha1(bytes(request.session_hash.encode('ascii')), usedforsecurity=False).hexdigest() # Session hash changes on page reload
+
+    if userid not in voting_users:
+        voting_users[userid] = User(userid)
+
+    def get_next_pair(user: User):
+        # all_pairs = generate_matching_pairs(cached_samples)
+
+        # for pair in all_pairs:
+        for pair in generate_matching_pairs(cached_samples):
+            pair_key = (pair[0].filename, pair[1].filename)
+            if pair_key not in user.voted_pairs and (pair_key[1], pair_key[0]) not in user.voted_pairs:
+                return pair
+        return None
+
+    pair = get_next_pair(voting_users[userid])
+    if pair is None:
+        return [*clear_stuff(), gr.update(interactive=False)]
+
+    # TODO: move to abisbetter
+    voting_users[userid].voted_pairs.add((pair[0].filename, pair[1].filename))
+    return (
+        pair[0].transcript,
+        "Synthesize",
+        gr.update(visible=True), # r2
+        pair[0].modelName, # model1
+        pair[1].modelName, # model2
+        gr.update(visible=True, value=pair[0].filename), # aud1
+        gr.update(visible=True, value=pair[1].filename), # aud2
+        gr.update(visible=True, interactive=False), #abetter
+        gr.update(visible=True, interactive=False), #bbetter
+        gr.update(visible=False), #prevmodel1
+        gr.update(visible=False), #prevmodel2
+        gr.update(visible=False), #nxt round btn
+        # reset aplayed, bplayed audio playback events
+        gr.update(value=False), #aplayed
+        gr.update(value=False), #bplayed
+        # fetch cached btn
+        gr.update(interactive=True)
+    )
 def randomsent():
-    return random.choice(sents), '🎲'
+    return '⚡', random.choice(sents), '🎲'
 def clear_stuff():
-    return
+    return [
+        '',
+        "Synthesize",
+        gr.update(visible=True), # r2
+        '', # model1
+        '', # model2
+        gr.update(visible=False), # aud1
+        gr.update(visible=False), # aud2
+        gr.update(visible=False), #abetter
+        gr.update(visible=False), #bbetter
+        gr.update(visible=False), #prevmodel1
+        gr.update(visible=False), #prevmodel2
+        gr.update(visible=False), #nxt round btn
+        gr.update(value=False), #aplayed
+        gr.update(value=False), #bplayed
+    ]
 def disable():
     return [gr.update(interactive=False), gr.update(interactive=False), gr.update(interactive=False)]
 def enable():
     return [gr.update(interactive=True), gr.update(interactive=True), gr.update(interactive=True)]
 with gr.Blocks() as vote:
     # sample played
-
-
+    aplayed = gr.State(value=False)
+    bplayed = gr.State(value=False)
     # voter ID
     useridstate = gr.State()
     gr.Markdown(INSTR)
     with gr.Group():
         with gr.Row():
+            cachedt = gr.Button('⚡', scale=0, min_width=0, variant='tool', interactive=len(cached_samples)>0)
             text = gr.Textbox(container=False, show_label=False, placeholder="Enter text to synthesize", lines=1, max_lines=1, scale=9999999, min_width=0)
             randomt = gr.Button('🎲', scale=0, min_width=0, variant='tool')
-        randomt.click(randomsent, outputs=[text, randomt])
+        randomt.click(randomsent, outputs=[cachedt, text, randomt])
         btn = gr.Button("Synthesize", variant='primary')
     model1 = gr.Textbox(interactive=False, lines=1, max_lines=1, visible=False)
     #model1 = gr.Textbox(interactive=False, lines=1, max_lines=1, visible=True)

@@ -1096,7 +1221,9 @@ with gr.Blocks() as vote:
         bbetter,
         prevmodel1,
         prevmodel2,
-        nxtroundbtn
+        nxtroundbtn,
+        aplayed,
+        bplayed,
     ]
     """
     text,

@@ -1111,12 +1238,15 @@ with gr.Blocks() as vote:
         gr.update(visible=False), #prevmodel1
         gr.update(visible=False), #prevmodel2
         gr.update(visible=False), #nxt round btn"""
-    btn.click(disable, outputs=[btn, abetter, bbetter]).then(synthandreturn, inputs=[text], outputs=outputs).then(enable, outputs=[btn,
-    nxtroundbtn.click(
+    btn.click(disable, outputs=[btn, abetter, bbetter]).then(synthandreturn, inputs=[text], outputs=outputs).then(enable, outputs=[btn, gr.State(), gr.State()])
+    nxtroundbtn.click(cachedsent, outputs=[*outputs, cachedt])
+
+    # fetch a comparison pair from cache
+    cachedt.click(disable, outputs=[cachedt, abetter, bbetter]).then(cachedsent, outputs=[*outputs, cachedt]).then(enable, outputs=[btn, gr.State(), gr.State()])
 
     # Allow interaction with the vote buttons only when both audio samples have finished playing
-
-
+    aud1.stop(unlock_vote, outputs=[abetter, bbetter, aplayed, bplayed], inputs=[gr.State(value=0), aplayed, bplayed])
+    aud2.stop(unlock_vote, outputs=[abetter, bbetter, aplayed, bplayed], inputs=[gr.State(value=1), aplayed, bplayed])
 
     # nxt_outputs = [prevmodel1, prevmodel2, abetter, bbetter]
     nxt_outputs = [abetter, bbetter, prevmodel1, prevmodel2, nxtroundbtn]