Opera8 committed
Commit 3fbd4a0 · verified · 1 Parent(s): 619d3cc

Update app.py

Files changed (1): app.py (+101 -894)
app.py CHANGED
@@ -13,13 +13,10 @@ import re
 import spaces
 import soundfile as sf  # Importing soundfile directly
 
-# Create a global variable to track downloaded resources
+# Only download the resources needed for Timbre
 downloaded_resources = {
     "configs": False,
-    "tokenizer_vq32": False,
     "tokenizer_vq8192": False,
-    "ar_Vq32ToVq8192": False,
-    "ar_PhoneToVq8192": False,
     "fmt_Vq8192ToMels": False,
     "vocoder": False
 }
@@ -27,118 +24,57 @@ downloaded_resources = {
 def install_espeak():
     """Detect and install espeak-ng dependency"""
     try:
-        # Check if espeak-ng is already installed
         result = subprocess.run(["which", "espeak-ng"], capture_output=True, text=True)
         if result.returncode != 0:
-            print("Detected espeak-ng not installed in the system, attempting to install...")
-            # Try to install espeak-ng and its data using apt-get
+            print("Installing espeak-ng...")
             subprocess.run(["apt-get", "update"], check=True)
-            # Install espeak-ng and the corresponding language data package
             subprocess.run(["apt-get", "install", "-y", "espeak-ng", "espeak-ng-data"], check=True)
-            print("espeak-ng and its data packages installed successfully!")
         else:
-            print("espeak-ng is already installed in the system.")
-
-        # Verify Chinese support (optional)
-        try:
-            voices_result = subprocess.run(["espeak-ng", "--voices=cmn"], capture_output=True, text=True, check=True)
-            if "cmn" in voices_result.stdout:
-                print("espeak-ng supports 'cmn' language.")
-            else:
-                print("Warning: espeak-ng is installed, but 'cmn' language still seems unavailable.")
-        except Exception as e:
-            print(f"Error verifying espeak-ng Chinese support (may not affect functionality): {e}")
-
+            print("espeak-ng is already installed.")
     except Exception as e:
         print(f"Error installing espeak-ng: {e}")
-        print("Please try to run manually: apt-get update && apt-get install -y espeak-ng espeak-ng-data")
 
-# Install espeak before all other operations
 install_espeak()
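As an aside, the PATH check above can also be done without spawning a process; a minimal sketch using only the standard library:

import shutil

# shutil.which mirrors the `which espeak-ng` subprocess call above
if shutil.which("espeak-ng") is None:
    print("espeak-ng is not on PATH")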
 
 def patch_langsegment_init():
     try:
-        # Try to find the location of the LangSegment package
         spec = importlib.util.find_spec("LangSegment")
-        if spec is None or spec.origin is None:
-            print("Unable to locate LangSegment package.")
-            return
-
-        # Build the path to __init__.py
+        if spec is None or spec.origin is None: return
         init_path = os.path.join(os.path.dirname(spec.origin), '__init__.py')
-
         if not os.path.exists(init_path):
-            print(f"LangSegment __init__.py file not found at: {init_path}")
-            # Try to find in site-packages, applicable in some environments
             for site_pkg_path in site.getsitepackages():
                 potential_path = os.path.join(site_pkg_path, 'LangSegment', '__init__.py')
                 if os.path.exists(potential_path):
                     init_path = potential_path
-                    print(f"Found __init__.py in site-packages: {init_path}")
                     break
-            else:  # If the loop ends normally (no break)
-                print(f"Also unable to find __init__.py in site-packages")
-                return
-
-        print(f"Attempting to read LangSegment __init__.py: {init_path}")
-        with open(init_path, 'r') as f:
-            lines = f.readlines()
+            else: return
 
+        with open(init_path, 'r') as f: lines = f.readlines()
         modified = False
         new_lines = []
         target_line_prefix = "from .LangSegment import"
 
         for line in lines:
-            stripped_line = line.strip()
-            if stripped_line.startswith(target_line_prefix):
-                if 'setLangfilters' in stripped_line or 'getLangfilters' in stripped_line:
-                    print(f"Found line that needs modification: {stripped_line}")
-                    # Remove setLangfilters and getLangfilters
-                    modified_line = stripped_line.replace(',setLangfilters', '')
-                    modified_line = modified_line.replace(',getLangfilters', '')
-                    # Ensure comma handling is correct (e.g., if they are the last items)
-                    modified_line = modified_line.replace('setLangfilters,', '')
-                    modified_line = modified_line.replace('getLangfilters,', '')
-                    # If they are the only extra imports, remove any redundant commas
-                    modified_line = modified_line.rstrip(',')
-                    new_lines.append(modified_line + '\n')
-                    modified = True
-                    print(f"Modified line: {modified_line.strip()}")
-                else:
-                    new_lines.append(line)  # Line is fine, keep as is
+            if line.strip().startswith(target_line_prefix) and ('setLangfilters' in line or 'getLangfilters' in line):
+                mod_line = line.replace(',setLangfilters', '').replace(',getLangfilters', '')
+                mod_line = mod_line.replace('setLangfilters,', '').replace('getLangfilters,', '').rstrip(',')
+                new_lines.append(mod_line + '\n')
+                modified = True
             else:
-                new_lines.append(line)  # Non-target line, keep as is
+                new_lines.append(line)
 
         if modified:
-            print(f"Attempting to write back modified LangSegment __init__.py to: {init_path}")
+            with open(init_path, 'w') as f: f.writelines(new_lines)
             try:
-                with open(init_path, 'w') as f:
-                    f.writelines(new_lines)
-                print("LangSegment __init__.py modified successfully.")
-                # Try to reload the module to make changes effective (may not work, depending on import chain)
-                try:
-                    import LangSegment
-                    importlib.reload(LangSegment)
-                    print("LangSegment module has been attempted to reload.")
-                except Exception as reload_e:
-                    print(f"Error reloading LangSegment (may have no impact): {reload_e}")
-            except PermissionError:
-                print(f"Error: Insufficient permissions to modify {init_path}. Consider modifying requirements.txt.")
-            except Exception as write_e:
-                print(f"Other error occurred when writing LangSegment __init__.py: {write_e}")
-        else:
-            print("LangSegment __init__.py doesn't need modification.")
+                import LangSegment
+                importlib.reload(LangSegment)
+            except: pass
 
-    except ImportError:
-        print("LangSegment package not found, unable to fix.")
     except Exception as e:
-        print(f"Unexpected error occurred when fixing LangSegment package: {e}")
+        print(f"Error patching LangSegment: {e}")
 
-# Execute the fix before all other imports (especially Amphion) that might trigger LangSegment
 patch_langsegment_init()
 
-# Clone Amphion repository
 if not os.path.exists("Amphion"):
     subprocess.run(["git", "clone", "https://github.com/open-mmlab/Amphion.git"])
     os.chdir("Amphion")
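For reference, the LangSegment patch boils down to a string rewrite of a single import line; a minimal sketch of that rewrite in isolation (the sample line is hypothetical):

# Sketch: strip setLangfilters/getLangfilters from a LangSegment-style import line.
line = "from .LangSegment import LangSegment,setLangfilters,getLangfilters"
mod_line = line.replace(',setLangfilters', '').replace(',getLangfilters', '')
mod_line = mod_line.replace('setLangfilters,', '').replace('getLangfilters,', '').rstrip(',')
print(mod_line)  # -> from .LangSegment import LangSegment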
@@ -146,24 +82,19 @@ else:
     if not os.getcwd().endswith("Amphion"):
         os.chdir("Amphion")
 
-# Add Amphion to the path
 if os.path.dirname(os.path.abspath("Amphion")) not in sys.path:
     sys.path.append(os.path.dirname(os.path.abspath("Amphion")))
 
-# Ensure needed directories exist
 os.makedirs("wav", exist_ok=True)
 os.makedirs("ckpts/Vevo", exist_ok=True)
 
-# IMPORTANT: Do NOT import save_audio from vevo_utils because it uses torchaudio.save which crashes
-from models.vc.vevo.vevo_utils import VevoInferencePipeline, load_wav
+from models.vc.vevo.vevo_utils import VevoInferencePipeline
 
-# Define a custom save_audio function using soundfile directly to avoid TorchCodec errors
+# Custom save function to avoid the TorchCodec error
 def my_save_audio(waveform, output_path, sample_rate=24000):
     try:
-        # Move to CPU and detach
         if isinstance(waveform, torch.Tensor):
             waveform = waveform.detach().cpu()
-        # Handle shapes [1, T] -> [T]
         if waveform.dim() == 2 and waveform.shape[0] == 1:
             waveform = waveform.squeeze(0)
         waveform = waveform.numpy()
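The shape handling matters because soundfile expects a 1-D array (or [frames, channels]), while the pipeline returns [1, T] tensors; a minimal sketch, assuming torch and soundfile are available:

import torch
import soundfile as sf

wave = torch.rand(1, 24000) * 2 - 1  # one second of [1, T] audio in [-1, 1]
sf.write("check.wav", wave.squeeze(0).numpy(), 24000)  # squeeze [1, T] -> [T] before writing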
@@ -174,482 +105,83 @@ def my_save_audio(waveform, output_path, sample_rate=24000):
         print(f"Failed to save audio with soundfile: {e}")
         raise e
 
-# Download and setup config files
 def setup_configs():
-    if downloaded_resources["configs"]:
-        print("Config files already downloaded, skipping...")
-        return
-
+    if downloaded_resources["configs"]: return
     config_path = "models/vc/vevo/config"
     os.makedirs(config_path, exist_ok=True)
-
-    config_files = [
-        "PhoneToVq8192.json",
-        "Vocoder.json",
-        "Vq32ToVq8192.json",
-        "Vq8192ToMels.json",
-        "hubert_large_l18_c32.yaml",
-    ]
+    config_files = ["Vq8192ToMels.json", "Vocoder.json"]  # only the Timbre configs
 
     for file in config_files:
         file_path = f"{config_path}/{file}"
         if not os.path.exists(file_path):
             try:
-                file_data = hf_hub_download(
-                    repo_id="amphion/Vevo",
-                    filename=f"config/{file}",
-                    repo_type="model",
-                )
-                os.makedirs(os.path.dirname(file_path), exist_ok=True)
-                # Copy file to target location
+                file_data = hf_hub_download(repo_id="amphion/Vevo", filename=f"config/{file}", repo_type="model")
                 subprocess.run(["cp", file_data, file_path])
-            except Exception as e:
-                print(f"Error downloading config file {file}: {e}")
-
+            except Exception as e: print(f"Error downloading config {file}: {e}")
     downloaded_resources["configs"] = True
 
 setup_configs()
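A portability note on the copy step: subprocess.run(["cp", ...]) assumes a Unix userland (fine on Spaces); shutil.copy is the portable equivalent. A sketch using the same repo and file as above:

import os
import shutil
from huggingface_hub import hf_hub_download

# Fetch one config from amphion/Vevo into the HF cache, then copy it into place.
file_data = hf_hub_download(repo_id="amphion/Vevo", filename="config/Vocoder.json", repo_type="model")
os.makedirs("models/vc/vevo/config", exist_ok=True)
shutil.copy(file_data, "models/vc/vevo/config/Vocoder.json")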
 
-# Device configuration
 device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
 print(f"Using device: {device}")
 
-# Initialize pipeline dictionary
 inference_pipelines = {}
 
-# Download all necessary model resources at startup
+# Download resources (only the parts Timbre needs)
 def preload_all_resources():
-    print("Preloading all model resources...")
-    # Download configuration files
+    print("Preloading Timbre resources...")
    setup_configs()
 
-    # Store the downloaded model paths
-    global downloaded_content_tokenizer_path
     global downloaded_content_style_tokenizer_path
-    global downloaded_ar_vq32_path
-    global downloaded_ar_phone_path
     global downloaded_fmt_path
     global downloaded_vocoder_path
 
-    # Download Content Tokenizer (vq32)
-    if not downloaded_resources["tokenizer_vq32"]:
-        print("Preloading Content Tokenizer (vq32)...")
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["tokenizer/vq32/*"],
-        )
-        downloaded_content_tokenizer_path = local_dir
-        downloaded_resources["tokenizer_vq32"] = True
-        print("Content Tokenizer (vq32) download completed")
-
-    # Download Content-Style Tokenizer (vq8192)
     if not downloaded_resources["tokenizer_vq8192"]:
-        print("Preloading Content-Style Tokenizer (vq8192)...")
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["tokenizer/vq8192/*"],
-        )
+        local_dir = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["tokenizer/vq8192/*"])
         downloaded_content_style_tokenizer_path = local_dir
         downloaded_resources["tokenizer_vq8192"] = True
-        print("Content-Style Tokenizer (vq8192) download completed")
-
-    # Download Autoregressive Transformer (Vq32ToVq8192)
-    if not downloaded_resources["ar_Vq32ToVq8192"]:
-        print("Preloading Autoregressive Transformer (Vq32ToVq8192)...")
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
-        )
-        downloaded_ar_vq32_path = local_dir
-        downloaded_resources["ar_Vq32ToVq8192"] = True
-        print("Autoregressive Transformer (Vq32ToVq8192) download completed")
-
-    # Download Autoregressive Transformer (PhoneToVq8192)
-    if not downloaded_resources["ar_PhoneToVq8192"]:
-        print("Preloading Autoregressive Transformer (PhoneToVq8192)...")
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
-        )
-        downloaded_ar_phone_path = local_dir
-        downloaded_resources["ar_PhoneToVq8192"] = True
-        print("Autoregressive Transformer (PhoneToVq8192) download completed")
 
-    # Download Flow Matching Transformer
     if not downloaded_resources["fmt_Vq8192ToMels"]:
-        print("Preloading Flow Matching Transformer (Vq8192ToMels)...")
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
-        )
+        local_dir = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["acoustic_modeling/Vq8192ToMels/*"])
         downloaded_fmt_path = local_dir
         downloaded_resources["fmt_Vq8192ToMels"] = True
-        print("Flow Matching Transformer (Vq8192ToMels) download completed")
 
-    # Download Vocoder
     if not downloaded_resources["vocoder"]:
-        print("Preloading Vocoder...")
-        local_dir = snapshot_download(
-            repo_id="amphion/Vevo",
-            repo_type="model",
-            cache_dir="./ckpts/Vevo",
-            allow_patterns=["acoustic_modeling/Vocoder/*"],
-        )
+        local_dir = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["acoustic_modeling/Vocoder/*"])
         downloaded_vocoder_path = local_dir
         downloaded_resources["vocoder"] = True
-        print("Vocoder download completed")
 
-    print("All model resources preloading completed!")
+    print("Timbre resources ready!")
 
-# Initialize path variables to store downloaded model paths
-downloaded_content_tokenizer_path = None
 downloaded_content_style_tokenizer_path = None
-downloaded_ar_vq32_path = None
-downloaded_ar_phone_path = None
 downloaded_fmt_path = None
 downloaded_vocoder_path = None
 
-# Preload all resources before creating the Gradio interface
 preload_all_resources()
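Each preload step above is the same pattern: a run-once flag guarding a scoped snapshot_download. The pattern in isolation (the helper name is ours; the repo and glob come from the code above):

from huggingface_hub import snapshot_download

_downloaded = {}

def ensure_downloaded(name, patterns):
    """Fetch a subset of the amphion/Vevo repo once per process; return the cache dir."""
    if name not in _downloaded:
        _downloaded[name] = snapshot_download(
            repo_id="amphion/Vevo",
            repo_type="model",
            cache_dir="./ckpts/Vevo",
            allow_patterns=patterns,  # only files matching these globs are downloaded
        )
    return _downloaded[name]

vocoder_dir = ensure_downloaded("vocoder", ["acoustic_modeling/Vocoder/*"])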
 
-def get_pipeline(pipeline_type):
-    if pipeline_type in inference_pipelines:
-        return inference_pipelines[pipeline_type]
-
-    # Initialize pipeline based on the required pipeline type
-    if pipeline_type == "style" or pipeline_type == "voice":
-        # Use already downloaded Content Tokenizer
-        if downloaded_resources["tokenizer_vq32"]:
-            content_tokenizer_ckpt_path = os.path.join(
-                downloaded_content_tokenizer_path, "tokenizer/vq32/hubert_large_l18_c32.pkl"
-            )
-        else:
-            # Fallback to direct download
-            local_dir = snapshot_download(
-                repo_id="amphion/Vevo",
-                repo_type="model",
-                cache_dir="./ckpts/Vevo",
-                allow_patterns=["tokenizer/vq32/*"],
-            )
-            content_tokenizer_ckpt_path = os.path.join(
-                local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
-            )
-
-        # Use already downloaded Content-Style Tokenizer
-        if downloaded_resources["tokenizer_vq8192"]:
-            content_style_tokenizer_ckpt_path = os.path.join(
-                downloaded_content_style_tokenizer_path, "tokenizer/vq8192"
-            )
-        else:
-            # Fallback to direct download
-            local_dir = snapshot_download(
-                repo_id="amphion/Vevo",
-                repo_type="model",
-                cache_dir="./ckpts/Vevo",
-                allow_patterns=["tokenizer/vq8192/*"],
-            )
-            content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
-
-        # Use already downloaded Autoregressive Transformer
-        ar_cfg_path = "./models/vc/vevo/config/Vq32ToVq8192.json"
-        if downloaded_resources["ar_Vq32ToVq8192"]:
-            ar_ckpt_path = os.path.join(
-                downloaded_ar_vq32_path, "contentstyle_modeling/Vq32ToVq8192"
-            )
-        else:
-            # Fallback to direct download
-            local_dir = snapshot_download(
-                repo_id="amphion/Vevo",
-                repo_type="model",
-                cache_dir="./ckpts/Vevo",
-                allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
-            )
-            ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")
-
-        # Use already downloaded Flow Matching Transformer
-        fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
-        if downloaded_resources["fmt_Vq8192ToMels"]:
-            fmt_ckpt_path = os.path.join(
-                downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels"
-            )
-        else:
-            # Fallback to direct download
-            local_dir = snapshot_download(
-                repo_id="amphion/Vevo",
-                repo_type="model",
-                cache_dir="./ckpts/Vevo",
-                allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
-            )
-            fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
-
-        # Use already downloaded Vocoder
-        vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
-        if downloaded_resources["vocoder"]:
-            vocoder_ckpt_path = os.path.join(
-                downloaded_vocoder_path, "acoustic_modeling/Vocoder"
-            )
-        else:
-            # Fallback to direct download
-            local_dir = snapshot_download(
-                repo_id="amphion/Vevo",
-                repo_type="model",
-                cache_dir="./ckpts/Vevo",
-                allow_patterns=["acoustic_modeling/Vocoder/*"],
-            )
-            vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
-
-        # Initialize pipeline
-        inference_pipeline = VevoInferencePipeline(
-            content_tokenizer_ckpt_path=content_tokenizer_ckpt_path,
-            content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
-            ar_cfg_path=ar_cfg_path,
-            ar_ckpt_path=ar_ckpt_path,
-            fmt_cfg_path=fmt_cfg_path,
-            fmt_ckpt_path=fmt_ckpt_path,
-            vocoder_cfg_path=vocoder_cfg_path,
-            vocoder_ckpt_path=vocoder_ckpt_path,
-            device=device,
-        )
-
-    elif pipeline_type == "timbre":
-        # Use already downloaded Content-Style Tokenizer
-        if downloaded_resources["tokenizer_vq8192"]:
-            content_style_tokenizer_ckpt_path = os.path.join(
-                downloaded_content_style_tokenizer_path, "tokenizer/vq8192"
-            )
-        else:
-            # Fallback to direct download
-            local_dir = snapshot_download(
-                repo_id="amphion/Vevo",
-                repo_type="model",
-                cache_dir="./ckpts/Vevo",
-                allow_patterns=["tokenizer/vq8192/*"],
-            )
-            content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
-
-        # Use already downloaded Flow Matching Transformer
-        fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
-        if downloaded_resources["fmt_Vq8192ToMels"]:
-            fmt_ckpt_path = os.path.join(
-                downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels"
-            )
-        else:
-            # Fallback to direct download
-            local_dir = snapshot_download(
-                repo_id="amphion/Vevo",
-                repo_type="model",
-                cache_dir="./ckpts/Vevo",
-                allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
-            )
-            fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
-
-        # Use already downloaded Vocoder
-        vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
-        if downloaded_resources["vocoder"]:
-            vocoder_ckpt_path = os.path.join(
-                downloaded_vocoder_path, "acoustic_modeling/Vocoder"
-            )
-        else:
-            # Fallback to direct download
-            local_dir = snapshot_download(
-                repo_id="amphion/Vevo",
-                repo_type="model",
-                cache_dir="./ckpts/Vevo",
-                allow_patterns=["acoustic_modeling/Vocoder/*"],
-            )
-            vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
-
-        # Initialize pipeline
-        inference_pipeline = VevoInferencePipeline(
-            content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
-            fmt_cfg_path=fmt_cfg_path,
-            fmt_ckpt_path=fmt_ckpt_path,
-            vocoder_cfg_path=vocoder_cfg_path,
-            vocoder_ckpt_path=vocoder_ckpt_path,
-            device=device,
-        )
-
-    elif pipeline_type == "tts":
-        # Use already downloaded Content-Style Tokenizer
-        if downloaded_resources["tokenizer_vq8192"]:
-            content_style_tokenizer_ckpt_path = os.path.join(
-                downloaded_content_style_tokenizer_path, "tokenizer/vq8192"
-            )
-        else:
-            # Fallback to direct download
-            local_dir = snapshot_download(
-                repo_id="amphion/Vevo",
-                repo_type="model",
-                cache_dir="./ckpts/Vevo",
-                allow_patterns=["tokenizer/vq8192/*"],
-            )
-            content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
-
-        # Use already downloaded Autoregressive Transformer (TTS specific)
-        ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json"
-        if downloaded_resources["ar_PhoneToVq8192"]:
-            ar_ckpt_path = os.path.join(
-                downloaded_ar_phone_path, "contentstyle_modeling/PhoneToVq8192"
-            )
-        else:
-            # Fallback to direct download
-            local_dir = snapshot_download(
-                repo_id="amphion/Vevo",
-                repo_type="model",
-                cache_dir="./ckpts/Vevo",
-                allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
-            )
-            ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/PhoneToVq8192")
-
-        # Use already downloaded Flow Matching Transformer
-        fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
-        if downloaded_resources["fmt_Vq8192ToMels"]:
-            fmt_ckpt_path = os.path.join(
-                downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels"
-            )
-        else:
-            # Fallback to direct download
-            local_dir = snapshot_download(
-                repo_id="amphion/Vevo",
-                repo_type="model",
-                cache_dir="./ckpts/Vevo",
-                allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
-            )
-            fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
-
-        # Use already downloaded Vocoder
-        vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
-        if downloaded_resources["vocoder"]:
-            vocoder_ckpt_path = os.path.join(
-                downloaded_vocoder_path, "acoustic_modeling/Vocoder"
-            )
-        else:
-            # Fallback to direct download
-            local_dir = snapshot_download(
-                repo_id="amphion/Vevo",
-                repo_type="model",
-                cache_dir="./ckpts/Vevo",
-                allow_patterns=["acoustic_modeling/Vocoder/*"],
-            )
-            vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
-
-        # Initialize pipeline
-        inference_pipeline = VevoInferencePipeline(
-            content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
-            ar_cfg_path=ar_cfg_path,
-            ar_ckpt_path=ar_ckpt_path,
-            fmt_cfg_path=fmt_cfg_path,
-            fmt_ckpt_path=fmt_ckpt_path,
-            vocoder_cfg_path=vocoder_cfg_path,
-            vocoder_ckpt_path=vocoder_ckpt_path,
-            device=device,
-        )
-
-    # Cache pipeline instance
-    inference_pipelines[pipeline_type] = inference_pipeline
-    return inference_pipeline
-
-# Implement VEVO functionality functions
-@spaces.GPU()
-def vevo_style(content_wav, style_wav):
-    temp_content_path = "wav/temp_content.wav"
-    temp_style_path = "wav/temp_style.wav"
-    output_path = "wav/output_vevostyle.wav"
-
-    # Check and process audio data
-    if content_wav is None or style_wav is None:
-        raise ValueError("Please upload audio files")
-
-    # Process audio format
-    if isinstance(content_wav, tuple) and len(content_wav) == 2:
-        if isinstance(content_wav[0], np.ndarray):
-            content_data, content_sr = content_wav
-        else:
-            content_sr, content_data = content_wav
-
-        # Ensure single channel
-        if len(content_data.shape) > 1 and content_data.shape[1] > 1:
-            content_data = np.mean(content_data, axis=1)
-
-        # Resample to 24kHz
-        if content_sr != 24000:
-            content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
-            content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
-            content_sr = 24000
-        else:
-            content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
-
-        # Normalize volume
-        content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
-    else:
-        raise ValueError("Invalid content audio format")
-
-    if isinstance(style_wav[0], np.ndarray):
-        style_data, style_sr = style_wav
-    else:
-        style_sr, style_data = style_wav
-
-    # Ensure single channel
-    if len(style_data.shape) > 1 and style_data.shape[1] > 1:
-        style_data = np.mean(style_data, axis=1)
-
-    # Resample to 24kHz
-    if style_sr != 24000:
-        style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
-        style_tensor = torchaudio.functional.resample(style_tensor, style_sr, 24000)
-        style_sr = 24000
-    else:
-        style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
-
-    # Normalize volume
-    style_tensor = style_tensor / (torch.max(torch.abs(style_tensor)) + 1e-6) * 0.95
-
-    # Print debug information
-    print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
-    print(f"Style audio shape: {style_tensor.shape}, sample rate: {style_sr}")
-
-    # Save audio DIRECTLY using soundfile (bypassing torchaudio to avoid TorchCodec error)
-    sf.write(temp_content_path, content_tensor.squeeze().cpu().numpy(), content_sr)
-    sf.write(temp_style_path, style_tensor.squeeze().cpu().numpy(), style_sr)
-
-    try:
-        # Get pipeline
-        pipeline = get_pipeline("style")
-
-        # Inference
-        gen_audio = pipeline.inference_ar_and_fm(
-            src_wav_path=temp_content_path,
-            src_text=None,
-            style_ref_wav_path=temp_style_path,
-            timbre_ref_wav_path=temp_content_path,
-        )
-
-        # Check if generated audio is numerical anomaly
-        if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
-            print("Warning: Generated audio contains NaN or Inf values")
-            gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
-
-        print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
-
-        # Save generated audio using custom function
-        my_save_audio(gen_audio, output_path=output_path)
-
-        return output_path
-    except Exception as e:
-        print(f"Error during processing: {e}")
-        import traceback
-        traceback.print_exc()
-        raise e
+def get_pipeline():
+    if "timbre" in inference_pipelines:
+        return inference_pipelines["timbre"]
+
+    # Paths
+    content_style_tokenizer_ckpt_path = os.path.join(downloaded_content_style_tokenizer_path, "tokenizer/vq8192")
+    fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
+    fmt_ckpt_path = os.path.join(downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels")
+    vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
+    vocoder_ckpt_path = os.path.join(downloaded_vocoder_path, "acoustic_modeling/Vocoder")
+
+    # Build the pipeline for Timbre only
+    pipeline = VevoInferencePipeline(
+        content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
+        fmt_cfg_path=fmt_cfg_path,
+        fmt_ckpt_path=fmt_ckpt_path,
+        vocoder_cfg_path=vocoder_cfg_path,
+        vocoder_ckpt_path=vocoder_ckpt_path,
+        device=device,
+    )
+
+    inference_pipelines["timbre"] = pipeline
+    return pipeline
 
 @spaces.GPU()
 def vevo_timbre(content_wav, reference_wav):
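The new get_pipeline is a memoized factory: the heavyweight VevoInferencePipeline is built on the first request and reused afterwards. The same pattern in miniature (ExpensiveModel is a stand-in):

_cache = {}

class ExpensiveModel:            # stand-in for VevoInferencePipeline
    def __init__(self):
        print("loading weights...")

def get_model():
    if "model" not in _cache:    # construct on first call only
        _cache["model"] = ExpensiveModel()
    return _cache["model"]

get_model(); get_model()         # "loading weights..." is printed once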
@@ -657,407 +189,82 @@ def vevo_timbre(content_wav, reference_wav):
     temp_reference_path = "wav/temp_reference.wav"
     output_path = "wav/output_vevotimbre.wav"
 
-    # Check and process audio data
     if content_wav is None or reference_wav is None:
         raise ValueError("Please upload audio files")
 
-    # Process content audio format
-    if isinstance(content_wav, tuple) and len(content_wav) == 2:
-        if isinstance(content_wav[0], np.ndarray):
-            content_data, content_sr = content_wav
-        else:
-            content_sr, content_data = content_wav
-
-        # Ensure single channel
-        if len(content_data.shape) > 1 and content_data.shape[1] > 1:
-            content_data = np.mean(content_data, axis=1)
-
-        # Resample to 24kHz
-        if content_sr != 24000:
-            content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
-            content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
-            content_sr = 24000
-        else:
-            content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
-
-        # Normalize volume
-        content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
+    # Process the source audio
+    if isinstance(content_wav, tuple):
+        content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
     else:
-        raise ValueError("Invalid content audio format")
+        content_sr, content_data = content_wav
+
+    if len(content_data.shape) > 1 and content_data.shape[1] > 1:
+        content_data = np.mean(content_data, axis=1)
+
+    # Resample to 24 kHz
+    content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
+    if content_sr != 24000:
+        content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
+        content_sr = 24000
+
+    content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
 
-    # Process reference audio format
-    if isinstance(reference_wav, tuple) and len(reference_wav) == 2:
-        if isinstance(reference_wav[0], np.ndarray):
-            reference_data, reference_sr = reference_wav
-        else:
-            reference_sr, reference_data = reference_wav
-
-        # Ensure single channel
-        if len(reference_data.shape) > 1 and reference_data.shape[1] > 1:
-            reference_data = np.mean(reference_data, axis=1)
-
-        # Resample to 24kHz
-        if reference_sr != 24000:
-            reference_tensor = torch.FloatTensor(reference_data).unsqueeze(0)
-            reference_tensor = torchaudio.functional.resample(reference_tensor, reference_sr, 24000)
-            reference_sr = 24000
-        else:
-            reference_tensor = torch.FloatTensor(reference_data).unsqueeze(0)
-
-        # Normalize volume
-        reference_tensor = reference_tensor / (torch.max(torch.abs(reference_tensor)) + 1e-6) * 0.95
+    # Process the reference (Timbre) audio
+    if isinstance(reference_wav, tuple):
+        ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
     else:
-        raise ValueError("Invalid reference audio format")
+        ref_sr, ref_data = reference_wav
+
+    if len(ref_data.shape) > 1 and ref_data.shape[1] > 1:
+        ref_data = np.mean(ref_data, axis=1)
+
+    ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
+    if ref_sr != 24000:
+        ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
+        ref_sr = 24000
+
+    ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
 
-    # Print debug information
-    print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
-    print(f"Reference audio shape: {reference_tensor.shape}, sample rate: {reference_sr}")
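Both inputs above go through the same downmix/resample/peak-normalize recipe; factored into a standalone helper (the function name is ours) it looks roughly like:

import numpy as np
import torch
import torchaudio

def prep_audio(data: np.ndarray, sr: int, target_sr: int = 24000) -> torch.Tensor:
    """Mono downmix, resample to target_sr, peak-normalize to 0.95."""
    if data.ndim > 1 and data.shape[1] > 1:       # [T, C] -> [T]
        data = np.mean(data, axis=1)
    wav = torch.FloatTensor(data).unsqueeze(0)    # [T] -> [1, T]
    if sr != target_sr:
        wav = torchaudio.functional.resample(wav, sr, target_sr)
    return wav / (torch.max(torch.abs(wav)) + 1e-6) * 0.95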
 
-    # Save uploaded audio DIRECTLY using soundfile
+    print(f"Processing Timbre Swap... Content Length: {content_tensor.shape[-1]/24000:.2f}s")
+
+    # Save the files temporarily
     sf.write(temp_content_path, content_tensor.squeeze().cpu().numpy(), content_sr)
-    sf.write(temp_reference_path, reference_tensor.squeeze().cpu().numpy(), reference_sr)
+    sf.write(temp_reference_path, ref_tensor.squeeze().cpu().numpy(), ref_sr)
 
     try:
-        # Get pipeline
-        pipeline = get_pipeline("timbre")
+        pipeline = get_pipeline()
 
-        # Inference
         gen_audio = pipeline.inference_fm(
             src_wav_path=temp_content_path,
             timbre_ref_wav_path=temp_reference_path,
             flow_matching_steps=32,
         )
 
-        # Check if generated audio is numerical anomaly
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
-            print("Warning: Generated audio contains NaN or Inf values")
+            print("Warning: NaN detected, fixing...")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
 
-        print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
-
-        # Save generated audio using custom function
+        # Save the output
         my_save_audio(gen_audio, output_path=output_path)
-
         return output_path
-    except Exception as e:
-        print(f"Error during processing: {e}")
-        import traceback
-        traceback.print_exc()
-        raise e
 
-@spaces.GPU()
-def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
-    temp_content_path = "wav/temp_content.wav"
-    temp_style_path = "wav/temp_style.wav"
-    temp_timbre_path = "wav/temp_timbre.wav"
-    output_path = "wav/output_vevovoice.wav"
-
-    # Check and process audio data
-    if content_wav is None or style_reference_wav is None or timbre_reference_wav is None:
-        raise ValueError("Please upload all required audio files")
-
-    # Process content audio format
-    if isinstance(content_wav, tuple) and len(content_wav) == 2:
-        if isinstance(content_wav[0], np.ndarray):
-            content_data, content_sr = content_wav
-        else:
-            content_sr, content_data = content_wav
-
-        # Ensure single channel
-        if len(content_data.shape) > 1 and content_data.shape[1] > 1:
-            content_data = np.mean(content_data, axis=1)
-
-        # Resample to 24kHz
-        if content_sr != 24000:
-            content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
-            content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
-            content_sr = 24000
-        else:
-            content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
-
-        # Normalize volume
-        content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
-    else:
-        raise ValueError("Invalid content audio format")
-
-    # Process style reference audio format
-    if isinstance(style_reference_wav, tuple) and len(style_reference_wav) == 2:
-        if isinstance(style_reference_wav[0], np.ndarray):
-            style_data, style_sr = style_reference_wav
-        else:
-            style_sr, style_data = style_reference_wav
-
-        # Ensure single channel
-        if len(style_data.shape) > 1 and style_data.shape[1] > 1:
-            style_data = np.mean(style_data, axis=1)
-
-        # Resample to 24kHz
-        if style_sr != 24000:
-            style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
-            style_tensor = torchaudio.functional.resample(style_tensor, style_sr, 24000)
-            style_sr = 24000
-        else:
-            style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
-
-        # Normalize volume
-        style_tensor = style_tensor / (torch.max(torch.abs(style_tensor)) + 1e-6) * 0.95
-    else:
-        raise ValueError("Invalid style reference audio format")
-
-    # Process timbre reference audio format
-    if isinstance(timbre_reference_wav, tuple) and len(timbre_reference_wav) == 2:
-        if isinstance(timbre_reference_wav[0], np.ndarray):
-            timbre_data, timbre_sr = timbre_reference_wav
-        else:
-            timbre_sr, timbre_data = timbre_reference_wav
-
-        # Ensure single channel
-        if len(timbre_data.shape) > 1 and timbre_data.shape[1] > 1:
-            timbre_data = np.mean(timbre_data, axis=1)
-
-        # Resample to 24kHz
-        if timbre_sr != 24000:
-            timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
-            timbre_tensor = torchaudio.functional.resample(timbre_tensor, timbre_sr, 24000)
-            timbre_sr = 24000
-        else:
-            timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
-
-        # Normalize volume
-        timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
-    else:
-        raise ValueError("Invalid timbre reference audio format")
-
-    # Print debug information
-    print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
-    print(f"Style reference audio shape: {style_tensor.shape}, sample rate: {style_sr}")
-    print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
-
-    # Save uploaded audio DIRECTLY using soundfile
-    sf.write(temp_content_path, content_tensor.squeeze().cpu().numpy(), content_sr)
-    sf.write(temp_style_path, style_tensor.squeeze().cpu().numpy(), style_sr)
-    sf.write(temp_timbre_path, timbre_tensor.squeeze().cpu().numpy(), timbre_sr)
-
-    try:
-        # Get pipeline
-        pipeline = get_pipeline("voice")
-
-        # Inference
-        gen_audio = pipeline.inference_ar_and_fm(
-            src_wav_path=temp_content_path,
-            src_text=None,
-            style_ref_wav_path=temp_style_path,
-            timbre_ref_wav_path=temp_timbre_path,
-        )
-
-        # Check if generated audio is numerical anomaly
-        if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
-            print("Warning: Generated audio contains NaN or Inf values")
-            gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
-
-        print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
-
-        # Save generated audio using custom function
-        my_save_audio(gen_audio, output_path=output_path)
-
-        return output_path
     except Exception as e:
-        print(f"Error during processing: {e}")
-        import traceback
-        traceback.print_exc()
+        print(f"Error: {e}")
         raise e
 
-@spaces.GPU()
-def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en", style_ref_text_language="en"):
-    temp_ref_path = "wav/temp_ref.wav"
-    temp_timbre_path = "wav/temp_timbre.wav"
-    output_path = "wav/output_vevotts.wav"
-
-    # Check and process audio data
-    if ref_wav is None:
-        raise ValueError("Please upload a reference audio file")
-
-    # Process reference audio format
-    if isinstance(ref_wav, tuple) and len(ref_wav) == 2:
-        if isinstance(ref_wav[0], np.ndarray):
-            ref_data, ref_sr = ref_wav
-        else:
-            ref_sr, ref_data = ref_wav
-
-        # Ensure single channel
-        if len(ref_data.shape) > 1 and ref_data.shape[1] > 1:
-            ref_data = np.mean(ref_data, axis=1)
-
-        # Resample to 24kHz
-        if ref_sr != 24000:
-            ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
-            ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
-            ref_sr = 24000
-        else:
-            ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
-
-        # Normalize volume
-        ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
-    else:
-        raise ValueError("Invalid reference audio format")
-
-    # Print debug information
-    print(f"Reference audio shape: {ref_tensor.shape}, sample rate: {ref_sr}")
-    if style_ref_text:
-        print(f"Style reference text: {style_ref_text}, language: {style_ref_text_language}")
-
-    # Save uploaded audio DIRECTLY using soundfile
-    sf.write(temp_ref_path, ref_tensor.squeeze().cpu().numpy(), ref_sr)
-
-    if timbre_ref_wav is not None:
-        if isinstance(timbre_ref_wav, tuple) and len(timbre_ref_wav) == 2:
-            if isinstance(timbre_ref_wav[0], np.ndarray):
-                timbre_data, timbre_sr = timbre_ref_wav
-            else:
-                timbre_sr, timbre_data = timbre_ref_wav
-
-            # Ensure single channel
-            if len(timbre_data.shape) > 1 and timbre_data.shape[1] > 1:
-                timbre_data = np.mean(timbre_data, axis=1)
-
-            # Resample to 24kHz
-            if timbre_sr != 24000:
-                timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
-                timbre_tensor = torchaudio.functional.resample(timbre_tensor, timbre_sr, 24000)
-                timbre_sr = 24000
-            else:
-                timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
-
-            # Normalize volume
-            timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
-
-            print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
-            sf.write(temp_timbre_path, timbre_tensor.squeeze().cpu().numpy(), timbre_sr)
-        else:
-            raise ValueError("Invalid timbre reference audio format")
-    else:
-        temp_timbre_path = temp_ref_path
-
-    try:
-        # Get pipeline
-        pipeline = get_pipeline("tts")
-
-        # Inference
-        gen_audio = pipeline.inference_ar_and_fm(
-            src_wav_path=None,
-            src_text=text,
-            style_ref_wav_path=temp_ref_path,
-            timbre_ref_wav_path=temp_timbre_path,
-            style_ref_wav_text=style_ref_text,
-            src_text_language=src_language,
-            style_ref_wav_text_language=style_ref_text_language,
-        )
-
-        # Check if generated audio is numerical anomaly
-        if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
-            print("Warning: Generated audio contains NaN or Inf values")
-            gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
-
-        print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
-
-        # Save generated audio using custom function
-        my_save_audio(gen_audio, output_path=output_path)
-
-        return output_path
-    except Exception as e:
-        print(f"Error during processing: {e}")
-        import traceback
-        traceback.print_exc()
-        raise e
-
-# Create Gradio interface
-with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
-    gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")
-    # Add link tag line
-    with gr.Row(elem_id="links_row"):
-        gr.HTML("""
-        <div style="display: flex; justify-content: flex-start; gap: 8px; margin: 0 0; padding-left: 0px;">
-            <a href="https://arxiv.org/abs/2502.07243" target="_blank" style="text-decoration: none;">
-                <img alt="arXiv Paper" src="https://img.shields.io/badge/arXiv-Paper-red">
-            </a>
-            <a href="https://openreview.net/pdf?id=anQDiQZhDP" target="_blank" style="text-decoration: none;">
-                <img alt="ICLR Paper" src="https://img.shields.io/badge/ICLR-Paper-64b63a">
-            </a>
-            <a href="https://huggingface.co/amphion/Vevo" target="_blank" style="text-decoration: none;">
-                <img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow">
-            </a>
-            <a href="https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo" target="_blank" style="text-decoration: none;">
-                <img alt="GitHub Repo" src="https://img.shields.io/badge/GitHub-Repo-blue">
-            </a>
-        </div>
-        """)
-
-    with gr.Tab("Vevo-Timbre"):
-        gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
-        with gr.Row():
-            with gr.Column():
-                timbre_content = gr.Audio(label="Source Audio", type="numpy")
-                timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
-                timbre_button = gr.Button("Generate")
-            with gr.Column():
-                timbre_output = gr.Audio(label="Result")
-        timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
-
-    with gr.Tab("Vevo-Style"):
-        gr.Markdown("### Vevo-Style: Maintain timbre but transfer style (accent, emotion, etc.)")
-        with gr.Row():
-            with gr.Column():
-                style_content = gr.Audio(label="Source Audio", type="numpy")
-                style_reference = gr.Audio(label="Style Reference", type="numpy")
-                style_button = gr.Button("Generate")
-            with gr.Column():
-                style_output = gr.Audio(label="Result")
-        style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
-
-    with gr.Tab("Vevo-Voice"):
-        gr.Markdown("### Vevo-Voice: Transfers both style and timbre with separate references")
-        with gr.Row():
-            with gr.Column():
-                voice_content = gr.Audio(label="Source Audio", type="numpy")
-                voice_style_reference = gr.Audio(label="Style Reference", type="numpy")
-                voice_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
-                voice_button = gr.Button("Generate")
-            with gr.Column():
-                voice_output = gr.Audio(label="Result")
-        voice_button.click(vevo_voice, inputs=[voice_content, voice_style_reference, voice_timbre_reference], outputs=voice_output)
-
-    with gr.Tab("Vevo-TTS"):
-        gr.Markdown("### Vevo-TTS: Text-to-speech with separate style and timbre references")
-        with gr.Row():
-            with gr.Column():
-                tts_text = gr.Textbox(label="Target Text", placeholder="Enter text to synthesize...", lines=3)
-                tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Text Language", value="en")
-                tts_reference = gr.Audio(label="Style Reference", type="numpy")
-                tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
-                tts_style_ref_text_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Style Reference Text Language", value="en")
-                tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
-                tts_button = gr.Button("Generate")
-            with gr.Column():
-                tts_output = gr.Audio(label="Result")
-
-        tts_button.click(
-            vevo_tts,
-            inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_style_ref_text_language],
-            outputs=tts_output
-        )
-
-    gr.Markdown("""
-    ## About VEVO
-    VEVO is a versatile voice synthesis and conversion model that offers four main functionalities:
-    1. **Vevo-Style**: Maintains timbre but transfers style (accent, emotion, etc.)
-    2. **Vevo-Timbre**: Maintains style but transfers timbre
-    3. **Vevo-Voice**: Transfers both style and timbre with separate references
-    4. **Vevo-TTS**: Text-to-speech with separate style and timbre references
-
-    For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
-    """)
 
+# Simple UI for Vevo-Timbre only
+with gr.Blocks(title="Vevo-Timbre Only") as demo:
+    gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
+    gr.Markdown("**Note:** For best quality, use audio files under 20 seconds. Longer files may lose quality.")
+
+    with gr.Row():
+        with gr.Column():
+            timbre_content = gr.Audio(label="Source Audio", type="numpy")
+            timbre_reference = gr.Audio(label="Target Timbre", type="numpy")
+            timbre_button = gr.Button("Generate", variant="primary")
+        with gr.Column():
+            timbre_output = gr.Audio(label="Result")
+
+    timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
 
-# Launch application
 demo.launch()
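Taken together, the post-commit app reduces to one cached pipeline and one conversion path. A sketch of the resulting call flow, using the names defined in the diff (paths illustrative):

pipeline = get_pipeline()                          # cached VevoInferencePipeline
gen_audio = pipeline.inference_fm(
    src_wav_path="wav/temp_content.wav",           # keeps the source content and style
    timbre_ref_wav_path="wav/temp_reference.wav",  # supplies the target timbre
    flow_matching_steps=32,
)
my_save_audio(gen_audio, output_path="wav/output_vevotimbre.wav")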
 