IFMedTechdemo committed on
Commit
be6bdf9
·
verified ·
1 Parent(s): 3012249

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +7 -293
app.py CHANGED
@@ -6,9 +6,6 @@ import subprocess
6
  import numpy as np
7
  import gradio as gr
8
  import soundfile as sf
9
- import random
10
- import torch
11
- from src.chatterbox.mtl_tts import ChatterboxMultilingualTTS, SUPPORTED_LANGUAGES
12
 
13
  # Clone NeuTTS-Air repository if not present
14
  NEUTTS_DIR = "neutts-air"
@@ -25,7 +22,6 @@ sys.path.append(NEUTTS_DIR)
25
  # Global variables for lazy loading
26
  kokoro_pipe = None
27
  neutts_model = None
28
- chatterbox_model = None
29
 
30
  # NeuTTS-Air configuration - aligned with official neutts-air/app.py
31
  SAMPLES_PATH = os.path.join(os.getcwd(), NEUTTS_DIR, "samples")
@@ -33,104 +29,6 @@ DEFAULT_REF_TEXT = "So I'm live on radio. And I say, well, my dear friend James
33
  DEFAULT_REF_PATH = os.path.join(SAMPLES_PATH, "dave.wav")
34
  DEFAULT_GEN_TEXT = "My name is Dave, and um, I'm from London."
35
 
36
- DEVICE = "cuda" if torch.cuda.is_available() else "cpu"
37
- print(f"🚀 Running on device: {DEVICE}")
38
-
39
- LANGUAGE_CONFIG = {
40
- "ar": {
41
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ar_f/ar_prompts2.flac",
42
- "text": "في الشهر الماضي، وصلنا إلى معلم جديد بمليارين من المشاهدات على قناتنا على يوتيوب."
43
- },
44
- "da": {
45
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/da_m1.flac",
46
- "text": "Sidste måned nåede vi en ny milepæl med to milliarder visninger på vores YouTube-kanal."
47
- },
48
- "de": {
49
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/de_f1.flac",
50
- "text": "Letzten Monat haben wir einen neuen Meilenstein erreicht: zwei Milliarden Aufrufe auf unserem YouTube-Kanal."
51
- },
52
- "el": {
53
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/el_m.flac",
54
- "text": "Τον περασμένο μήνα, φτάσαμε σε ένα νέο ορόσημο με δύο δισεκατομμύρια προβολές στο κανάλι μας στο YouTube."
55
- },
56
- "en": {
57
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/en_f1.flac",
58
- "text": "Last month, we reached a new milestone with two billion views on our YouTube channel."
59
- },
60
- "es": {
61
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/es_f1.flac",
62
- "text": "El mes pasado alcanzamos un nuevo hito: dos mil millones de visualizaciones en nuestro canal de YouTube."
63
- },
64
- "fi": {
65
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/fi_m.flac",
66
- "text": "Viime kuussa saavutimme uuden virstanpylvään kahden miljardin katselukerran kanssa YouTube-kanavallamme."
67
- },
68
- "fr": {
69
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/fr_f1.flac",
70
- "text": "Le mois dernier, nous avons atteint un nouveau jalon avec deux milliards de vues sur notre chaîne YouTube."
71
- },
72
- "he": {
73
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/he_m1.flac",
74
- "text": "בחודש שעבר הגענו לאבן דרך חדשה עם שני מיליארד צפיות בערוץ היוטיוב שלנו."
75
- },
76
- "hi": {
77
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/hi_f1.flac",
78
- "text": "पिछले महीने हमने एक नया मील का पत्थर छुआ: हमारे YouTube चैनल पर दो अरब व्यूज़।"
79
- },
80
- "it": {
81
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/it_m1.flac",
82
- "text": "Il mese scorso abbiamo raggiunto un nuovo traguardo: due miliardi di visualizzazioni sul nostro canale YouTube."
83
- },
84
- "ja": {
85
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ja/ja_prompts1.flac",
86
- "text": "先月、私たちのYouTubeチャンネルで二十億回の再生回数という新たなマイルストーンに到達しました。"
87
- },
88
- "ko": {
89
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ko_f.flac",
90
- "text": "지난달 우리는 유튜브 채널에서 이십억 조회수라는 새로운 이정표에 도달했습니다."
91
- },
92
- "ms": {
93
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ms_f.flac",
94
- "text": "Bulan lepas, kami mencapai pencapaian baru dengan dua bilion tontonan di saluran YouTube kami."
95
- },
96
- "nl": {
97
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/nl_m.flac",
98
- "text": "Vorige maand bereikten we een nieuwe mijlpaal met twee miljard weergaven op ons YouTube-kanaal."
99
- },
100
- "no": {
101
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/no_f1.flac",
102
- "text": "Forrige måned nådde vi en ny milepæl med to milliarder visninger på YouTube-kanalen vår."
103
- },
104
- "pl": {
105
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/pl_m.flac",
106
- "text": "W zeszłym miesiącu osiągnęliśmy nowy kamień milowy z dwoma miliardami wyświetleń na naszym kanale YouTube."
107
- },
108
- "pt": {
109
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/pt_m1.flac",
110
- "text": "No mês passado, alcançámos um novo marco: dois mil milhões de visualizações no nosso canal do YouTube."
111
- },
112
- "ru": {
113
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/ru_m.flac",
114
- "text": "В прошлом месяце мы достигли нового рубежа: два миллиарда просмотров на нашем YouTube-канале."
115
- },
116
- "sv": {
117
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/sv_f.flac",
118
- "text": "Förra månaden nådde vi en ny milstolpe med två miljarder visningar på vår YouTube-kanal."
119
- },
120
- "sw": {
121
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/sw_m.flac",
122
- "text": "Mwezi uliopita, tulifika hatua mpya ya maoni ya bilioni mbili kweny kituo chetu cha YouTube."
123
- },
124
- "tr": {
125
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/tr_m.flac",
126
- "text": "Geçen ay YouTube kanalımızda iki milyar görüntüleme ile yeni bir dönüm noktasına ulaştık."
127
- },
128
- "zh": {
129
- "audio": "https://storage.googleapis.com/chatterbox-demo-samples/mtl_prompts/zh_f2.flac",
130
- "text": "上个月,我们达到了一个新的里程碑。 我们的YouTube频道观看次数达到了二十亿次,这绝对令人难以置信。"
131
- },
132
- }
133
-
134
  # ------------------------------------------------------------------
135
  # 1. Lazy loaders
136
  # ------------------------------------------------------------------
@@ -155,28 +53,6 @@ def load_neutts():
155
  )
156
  return neutts_model
157
 
158
- def get_or_load_chatterbox_model():
159
- """Loads the ChatterboxMultilingualTTS model if it hasn't been loaded already,
160
- and ensures it's on the correct device."""
161
- global chatterbox_model
162
- if chatterbox_model is None:
163
- print("Chatterbox model not loaded, initializing...")
164
- try:
165
- chatterbox_model = ChatterboxMultilingualTTS.from_pretrained(DEVICE)
166
- if hasattr(chatterbox_model, 'to') and str(chatterbox_model.device) != DEVICE:
167
- chatterbox_model.to(DEVICE)
168
- print(f"Chatterbox model loaded successfully. Internal device: {getattr(chatterbox_model, 'device', 'N/A')}")
169
- except Exception as e:
170
- print(f"Error loading Chatterbox model: {e}")
171
- raise
172
- return chatterbox_model
173
-
174
- # Attempt to load the Chatterbox model at startup.
175
- try:
176
- get_or_load_chatterbox_model()
177
- except Exception as e:
178
- print(f"CRITICAL: Failed to load Chatterbox model on startup. Application may not function. Error: {e}")
179
-
180
  # ------------------------------------------------------------------
181
  # 2. Kokoro TTS inference
182
  # ------------------------------------------------------------------
@@ -233,100 +109,17 @@ def neutts_infer(ref_text: str, ref_audio_path: str, gen_text: str) -> tuple[int
233
  return (24_000, wav)
234
 
235
  # ------------------------------------------------------------------
236
- # 4. Chatterbox Multilingual inference
237
- # ------------------------------------------------------------------
238
- def default_audio_for_ui(lang: str) -> str | None:
239
- return LANGUAGE_CONFIG.get(lang, {}).get("audio")
240
-
241
- def default_text_for_ui(lang: str) -> str:
242
- return LANGUAGE_CONFIG.get(lang, {}).get("text", "")
243
-
244
- def get_supported_languages_display() -> str:
245
- """Generate a formatted display of all supported languages."""
246
- language_items = []
247
- for code, name in sorted(SUPPORTED_LANGUAGES.items()):
248
- language_items.append(f"**{name}** (`{code}`)")
249
-
250
- # Split into 2 lines
251
- mid = len(language_items) // 2
252
- line1 = " • ".join(language_items[:mid])
253
- line2 = " • ".join(language_items[mid:])
254
-
255
- return f"""
256
- ### 🌍 Supported Languages ({len(SUPPORTED_LANGUAGES)} total)
257
- {line1}
258
- {line2}
259
- """
260
-
261
- def set_seed(seed: int):
262
- """Sets the random seed for reproducibility across torch, numpy, and random."""
263
- torch.manual_seed(seed)
264
- if DEVICE == "cuda":
265
- torch.cuda.manual_seed(seed)
266
- torch.cuda.manual_seed_all(seed)
267
- random.seed(seed)
268
- np.random.seed(seed)
269
-
270
- @spaces.GPU
271
- def chatterbox_infer(
272
- text_input: str,
273
- language_id: str,
274
- audio_prompt_path_input: str = None,
275
- exaggeration_input: float = 0.5,
276
- temperature_input: float = 0.8,
277
- seed_num_input: int = 0,
278
- cfgw_input: float = 0.5
279
- ) -> tuple[int, np.ndarray]:
280
- """
281
- Generate high-quality speech audio from text using Chatterbox Multilingual model with optional reference audio styling.
282
- """
283
- current_model = get_or_load_chatterbox_model()
284
-
285
- if current_model is None:
286
- raise RuntimeError("TTS model is not loaded.")
287
-
288
- if seed_num_input != 0:
289
- set_seed(int(seed_num_input))
290
-
291
- print(f"Generating audio for text: '{text_input[:50]}...'")
292
-
293
- # Handle optional audio prompt
294
- chosen_prompt = audio_prompt_path_input or default_audio_for_ui(language_id)
295
-
296
- generate_kwargs = {
297
- "exaggeration": exaggeration_input,
298
- "temperature": temperature_input,
299
- "cfg_weight": cfgw_input,
300
- }
301
- if chosen_prompt:
302
- generate_kwargs["audio_prompt_path"] = chosen_prompt
303
- print(f"Using audio prompt: {chosen_prompt}")
304
- else:
305
- print("No audio prompt provided; using default voice.")
306
-
307
- wav = current_model.generate(
308
- text_input[:300], # Truncate text to max chars
309
- language_id=language_id,
310
- **generate_kwargs
311
- )
312
- print("Audio generation complete.")
313
- return (current_model.sr, wav.squeeze(0).numpy())
314
-
315
- def on_language_change(lang, current_ref, current_text):
316
- return default_audio_for_ui(lang), default_text_for_ui(lang)
317
-
318
- # ------------------------------------------------------------------
319
- # 5. Gradio UI with model selection
320
  # ------------------------------------------------------------------
321
  css = """footer {visibility: hidden}"""
322
 
323
- with gr.Blocks(css=css, title="Text2Audio - Kokoro, NeuTTS-Air & Chatterbox") as demo:
324
  gr.Markdown("# 🎙️ Text-to-Audio Generation")
325
- gr.Markdown("Choose between **Kokoro TTS** (fast English TTS), **NeuTTS-Air** (voice cloning with reference audio), or **Chatterbox Multilingual** (23 languages with styling)")
326
 
327
  # Model selection
328
  model_choice = gr.Radio(
329
- choices=["Kokoro TTS", "NeuTTS-Air", "Chatterbox Multilingual"],
330
  value="Kokoro TTS",
331
  label="Select TTS Engine",
332
  interactive=True
@@ -380,96 +173,17 @@ with gr.Blocks(css=css, title="Text2Audio - Kokoro, NeuTTS-Air & Chatterbox") as
380
 
381
  gr.Markdown("**NeuTTS-Air** – Upload a reference audio sample, provide the reference text, and enter new text to synthesize.")
382
 
383
- # Chatterbox Multilingual Interface
384
- with gr.Group(visible=False) as chatterbox_group:
385
- gr.Markdown("### 🗣️ Chatterbox Multilingual Settings")
386
- gr.Markdown(
387
- """
388
- Generate high-quality multilingual speech from text with reference audio styling, supporting 23 languages.
389
-
390
- For a hosted version of Chatterbox Multilingual and for finetuning, please visit [resemble.ai](https://app.resemble.ai)
391
- """
392
- )
393
- # Display supported languages
394
- gr.Markdown(get_supported_languages_display())
395
- with gr.Row():
396
- with gr.Column():
397
- initial_lang = "en"
398
- chatterbox_text = gr.Textbox(
399
- value=default_text_for_ui(initial_lang),
400
- label="Text to synthesize (max chars 300)",
401
- max_lines=5
402
- )
403
-
404
- chatterbox_language_id = gr.Dropdown(
405
- choices=list(SUPPORTED_LANGUAGES.keys()),
406
- value=initial_lang,
407
- label="Language",
408
- info="Select the language for text-to-speech synthesis"
409
- )
410
-
411
- chatterbox_ref_wav = gr.Audio(
412
- sources=["upload", "microphone"],
413
- type="filepath",
414
- label="Reference Audio File (Optional)",
415
- value=default_audio_for_ui(initial_lang)
416
- )
417
-
418
- gr.Markdown(
419
- "💡 **Note**: Ensure that the reference clip matches the specified language tag. Otherwise, language transfer outputs may inherit the accent of the reference clip's language. To mitigate this, set the CFG weight to 0.",
420
- elem_classes=["audio-note"]
421
- )
422
-
423
- chatterbox_exaggeration = gr.Slider(
424
- 0.25, 2, step=.05, label="Exaggeration (Neutral = 0.5, extreme values can be unstable)", value=.5
425
- )
426
- chatterbox_cfg_weight = gr.Slider(
427
- 0.2, 1, step=.05, label="CFG/Pace", value=0.5
428
- )
429
-
430
- with gr.Accordion("More options", open=False):
431
- chatterbox_seed_num = gr.Number(value=0, label="Random seed (0 for random)")
432
- chatterbox_temp = gr.Slider(0.05, 5, step=.05, label="Temperature", value=.8)
433
-
434
- chatterbox_btn = gr.Button("🗣️ Generate with Chatterbox", variant="primary")
435
-
436
- with gr.Column():
437
- chatterbox_audio_output = gr.Audio(label="Output Audio")
438
-
439
- chatterbox_language_id.change(
440
- fn=on_language_change,
441
- inputs=[chatterbox_language_id, chatterbox_ref_wav, chatterbox_text],
442
- outputs=[chatterbox_ref_wav, chatterbox_text],
443
- show_progress=False
444
- )
445
-
446
- chatterbox_btn.click(
447
- fn=chatterbox_infer,
448
- inputs=[
449
- chatterbox_text,
450
- chatterbox_language_id,
451
- chatterbox_ref_wav,
452
- chatterbox_exaggeration,
453
- chatterbox_temp,
454
- chatterbox_seed_num,
455
- chatterbox_cfg_weight,
456
- ],
457
- outputs=[chatterbox_audio_output],
458
- )
459
-
460
  # Event handlers
461
  def toggle_interface(choice):
462
  if choice == "Kokoro TTS":
463
- return gr.update(visible=True), gr.update(visible=False), gr.update(visible=False)
464
- elif choice == "NeuTTS-Air":
465
- return gr.update(visible=False), gr.update(visible=True), gr.update(visible=False)
466
  else:
467
- return gr.update(visible=False), gr.update(visible=False), gr.update(visible=True)
468
 
469
  model_choice.change(
470
  fn=toggle_interface,
471
  inputs=[model_choice],
472
- outputs=[kokoro_group, neutts_group, chatterbox_group]
473
  )
474
 
475
  kokoro_btn.click(
 
6
  import numpy as np
7
  import gradio as gr
8
  import soundfile as sf
 
 
 
9
 
10
  # Clone NeuTTS-Air repository if not present
11
  NEUTTS_DIR = "neutts-air"
 
22
  # Global variables for lazy loading
23
  kokoro_pipe = None
24
  neutts_model = None
 
25
 
26
  # NeuTTS-Air configuration - aligned with official neutts-air/app.py
27
  SAMPLES_PATH = os.path.join(os.getcwd(), NEUTTS_DIR, "samples")
 
29
  DEFAULT_REF_PATH = os.path.join(SAMPLES_PATH, "dave.wav")
30
  DEFAULT_GEN_TEXT = "My name is Dave, and um, I'm from London."
31
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
32
  # ------------------------------------------------------------------
33
  # 1. Lazy loaders
34
  # ------------------------------------------------------------------
 
53
  )
54
  return neutts_model
55
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
56
  # ------------------------------------------------------------------
57
  # 2. Kokoro TTS inference
58
  # ------------------------------------------------------------------
 
109
  return (24_000, wav)
110
 
111
  # ------------------------------------------------------------------
112
+ # 4. Gradio UI with model selection
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  # ------------------------------------------------------------------
114
  css = """footer {visibility: hidden}"""
115
 
116
+ with gr.Blocks(css=css, title="Text2Audio - Kokoro & NeuTTS-Air") as demo:
117
  gr.Markdown("# 🎙️ Text-to-Audio Generation")
118
+ gr.Markdown("Choose between **Kokoro TTS** (fast English TTS) or **NeuTTS-Air** (voice cloning with reference audio)")
119
 
120
  # Model selection
121
  model_choice = gr.Radio(
122
+ choices=["Kokoro TTS", "NeuTTS-Air"],
123
  value="Kokoro TTS",
124
  label="Select TTS Engine",
125
  interactive=True
 
173
 
174
  gr.Markdown("**NeuTTS-Air** – Upload a reference audio sample, provide the reference text, and enter new text to synthesize.")
175
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
176
  # Event handlers
177
  def toggle_interface(choice):
178
  if choice == "Kokoro TTS":
179
+ return gr.update(visible=True), gr.update(visible=False)
 
 
180
  else:
181
+ return gr.update(visible=False), gr.update(visible=True)
182
 
183
  model_choice.change(
184
  fn=toggle_interface,
185
  inputs=[model_choice],
186
+ outputs=[kokoro_group, neutts_group]
187
  )
188
 
189
  kokoro_btn.click(