Opera8 committed on
Commit 6eac6e1 · verified · 1 Parent(s): d60cfcb

Upload 3 files

Files changed (3)
  1. README (1).md +14 -0
  2. app (1) (1).py +1049 -0
  3. requirements (2).txt +30 -0
README (1).md ADDED
@@ -0,0 +1,14 @@
+ ---
+ title: Vevo for Zero-shot VC, TTS, and More
+ emoji: 🐠
+ colorFrom: indigo
+ colorTo: yellow
+ sdk: gradio
+ sdk_version: 5.25.2
+ app_file: app.py
+ pinned: false
+ license: mit
+ short_description: Controllable Zero-Shot Voice Imitation
+ ---
+
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app (1) (1).py ADDED
@@ -0,0 +1,1049 @@
1
+ import os
2
+ import sys
3
+ import importlib.util
4
+ import site
5
+ import json
6
+ import torch
7
+ import gradio as gr
8
+ import torchaudio
9
+ import numpy as np
10
+ from huggingface_hub import snapshot_download, hf_hub_download
11
+ import subprocess
12
+ import re
13
+ import spaces
14
+
15
+ # Create a global variable to track downloaded resources
17
+ downloaded_resources = {
18
+ "configs": False,
19
+ "tokenizer_vq32": False,
20
+ "tokenizer_vq8192": False,
21
+ "ar_Vq32ToVq8192": False,
22
+ "ar_PhoneToVq8192": False,
23
+ "fmt_Vq8192ToMels": False,
24
+ "vocoder": False
25
+ }
26
+
27
+ def install_espeak():
28
+ """Detect and install espeak-ng dependency"""
29
+ try:
30
+ # Check if espeak-ng is already installed
31
+ result = subprocess.run(["which", "espeak-ng"], capture_output=True, text=True)
32
+ if result.returncode != 0:
33
+ print("Detected espeak-ng not installed in the system, attempting to install...")
34
+ # Try to install espeak-ng and its data using apt-get
35
+ subprocess.run(["apt-get", "update"], check=True)
36
+ # Install espeak-ng and the corresponding language data package
37
+ subprocess.run(["apt-get", "install", "-y", "espeak-ng", "espeak-ng-data"], check=True)
38
+ print("espeak-ng and its data packages installed successfully!")
39
+ else:
40
+ print("espeak-ng is already installed in the system.")
41
+ # Even if already installed, try to update data to ensure integrity (optional but sometimes helpful)
42
+ # print("Attempting to update espeak-ng data...")
43
+ # subprocess.run(["apt-get", "update"], check=True)
44
+ # subprocess.run(["apt-get", "install", "--only-upgrade", "-y", "espeak-ng-data"], check=True)
45
+
46
+ # Verify Chinese support (optional)
47
+ try:
48
+ voices_result = subprocess.run(["espeak-ng", "--voices=cmn"], capture_output=True, text=True, check=True)
49
+ if "cmn" in voices_result.stdout:
50
+ print("espeak-ng supports 'cmn' language.")
51
+ else:
52
+ print("Warning: espeak-ng is installed, but 'cmn' language still seems unavailable.")
53
+ except Exception as e:
54
+ print(f"Error verifying espeak-ng Chinese support (may not affect functionality): {e}")
55
+
56
+ except Exception as e:
57
+ print(f"Error installing espeak-ng: {e}")
58
+ print("Please try to run manually: apt-get update && apt-get install -y espeak-ng espeak-ng-data")
59
+
60
+ # Install espeak before all other operations
61
+ install_espeak()
62
+
63
+ def patch_langsegment_init():
64
+ try:
65
+ # Try to find the location of the LangSegment package
66
+ spec = importlib.util.find_spec("LangSegment")
67
+ if spec is None or spec.origin is None:
68
+ print("Unable to locate LangSegment package.")
69
+ return
70
+
71
+ # Build the path to __init__.py
72
+ init_path = os.path.join(os.path.dirname(spec.origin), '__init__.py')
73
+
74
+ if not os.path.exists(init_path):
75
+ print(f"LangSegment __init__.py file not found at: {init_path}")
76
+ # Try to find in site-packages, applicable in some environments
77
+ for site_pkg_path in site.getsitepackages():
78
+ potential_path = os.path.join(site_pkg_path, 'LangSegment', '__init__.py')
79
+ if os.path.exists(potential_path):
80
+ init_path = potential_path
81
+ print(f"Found __init__.py in site-packages: {init_path}")
82
+ break
83
+ else: # If the loop ends normally (no break)
84
+ print(f"Also unable to find __init__.py in site-packages")
85
+ return
86
+
87
+
88
+ print(f"Attempting to read LangSegment __init__.py: {init_path}")
89
+ with open(init_path, 'r') as f:
90
+ lines = f.readlines()
91
+
92
+ modified = False
93
+ new_lines = []
94
+ target_line_prefix = "from .LangSegment import"
95
+
96
+ for line in lines:
97
+ stripped_line = line.strip()
98
+ if stripped_line.startswith(target_line_prefix):
99
+ if 'setLangfilters' in stripped_line or 'getLangfilters' in stripped_line:
100
+ print(f"Found line that needs modification: {stripped_line}")
101
+ # Remove setLangfilters and getLangfilters
102
+ modified_line = stripped_line.replace(',setLangfilters', '')
103
+ modified_line = modified_line.replace(',getLangfilters', '')
104
+ # Ensure comma handling is correct (e.g., if they are the last items)
105
+ modified_line = modified_line.replace('setLangfilters,', '')
106
+ modified_line = modified_line.replace('getLangfilters,', '')
107
+ # If they are the only extra imports, remove any redundant commas
108
+ modified_line = modified_line.rstrip(',')
109
+ new_lines.append(modified_line + '\n')
110
+ modified = True
111
+ print(f"Modified line: {modified_line.strip()}")
112
+ else:
113
+ new_lines.append(line) # Line is fine, keep as is
114
+ else:
115
+ new_lines.append(line) # Non-target line, keep as is
116
+
117
+ if modified:
118
+ print(f"Attempting to write back modified LangSegment __init__.py to: {init_path}")
119
+ try:
120
+ with open(init_path, 'w') as f:
121
+ f.writelines(new_lines)
122
+ print("LangSegment __init__.py modified successfully.")
123
+ # Try to reload the module to make changes effective (may not work, depending on import chain)
124
+ try:
125
+ import LangSegment
126
+ importlib.reload(LangSegment)
127
+ print("LangSegment module has been attempted to reload.")
128
+ except Exception as reload_e:
129
+ print(f"Error reloading LangSegment (may have no impact): {reload_e}")
130
+ except PermissionError:
131
+ print(f"Error: Insufficient permissions to modify {init_path}. Consider modifying requirements.txt.")
132
+ except Exception as write_e:
133
+ print(f"Other error occurred when writing LangSegment __init__.py: {write_e}")
134
+ else:
135
+ print("LangSegment __init__.py doesn't need modification.")
136
+
137
+ except ImportError:
138
+ print("LangSegment package not found, unable to fix.")
139
+ except Exception as e:
140
+ print(f"Unexpected error occurred when fixing LangSegment package: {e}")
141
+
142
+ # Execute the fix before all other imports (especially Amphion) that might trigger LangSegment
143
+ patch_langsegment_init()
144
+
145
+ # Clone Amphion repository
146
+ if not os.path.exists("Amphion"):
147
+ subprocess.run(["git", "clone", "https://github.com/open-mmlab/Amphion.git"])
148
+ os.chdir("Amphion")
149
+ else:
150
+ if not os.getcwd().endswith("Amphion"):
151
+ os.chdir("Amphion")
152
+
153
+ # Add Amphion to the path
154
+ if os.path.dirname(os.path.abspath("Amphion")) not in sys.path:
155
+ sys.path.append(os.path.dirname(os.path.abspath("Amphion")))
156
+
157
+ # Ensure needed directories exist
158
+ os.makedirs("wav", exist_ok=True)
159
+ os.makedirs("ckpts/Vevo", exist_ok=True)
160
+
161
+ from models.vc.vevo.vevo_utils import VevoInferencePipeline, save_audio, load_wav
162
+
163
+ # Download and setup config files
164
+ def setup_configs():
165
+ if downloaded_resources["configs"]:
166
+ print("Config files already downloaded, skipping...")
167
+ return
168
+
169
+ config_path = "models/vc/vevo/config"
170
+ os.makedirs(config_path, exist_ok=True)
171
+
172
+ config_files = [
173
+ "PhoneToVq8192.json",
174
+ "Vocoder.json",
175
+ "Vq32ToVq8192.json",
176
+ "Vq8192ToMels.json",
177
+ "hubert_large_l18_c32.yaml",
178
+ ]
179
+
180
+ for file in config_files:
181
+ file_path = f"{config_path}/{file}"
182
+ if not os.path.exists(file_path):
183
+ try:
184
+ file_data = hf_hub_download(
185
+ repo_id="amphion/Vevo",
186
+ filename=f"config/{file}",
187
+ repo_type="model",
188
+ )
189
+ os.makedirs(os.path.dirname(file_path), exist_ok=True)
190
+ # Copy file to target location
191
+ subprocess.run(["cp", file_data, file_path])
192
+ except Exception as e:
193
+ print(f"Error downloading config file {file}: {e}")
194
+
195
+ downloaded_resources["configs"] = True
196
+
197
+ setup_configs()
198
+
199
+ # Device configuration
200
+ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
201
+ print(f"Using device: {device}")
202
+
203
+ # Initialize pipeline dictionary
204
+ inference_pipelines = {}
205
+
206
+ # Download all necessary model resources at startup
207
+ def preload_all_resources():
208
+ print("Preloading all model resources...")
209
+ # Download configuration files
210
+ setup_configs()
211
+
212
+ # Store the downloaded model paths
213
+ global downloaded_content_tokenizer_path
214
+ global downloaded_content_style_tokenizer_path
215
+ global downloaded_ar_vq32_path
216
+ global downloaded_ar_phone_path
217
+ global downloaded_fmt_path
218
+ global downloaded_vocoder_path
219
+
220
+ # Download Content Tokenizer (vq32)
221
+ if not downloaded_resources["tokenizer_vq32"]:
222
+ print("Preloading Content Tokenizer (vq32)...")
223
+ local_dir = snapshot_download(
224
+ repo_id="amphion/Vevo",
225
+ repo_type="model",
226
+ cache_dir="./ckpts/Vevo",
227
+ allow_patterns=["tokenizer/vq32/*"],
228
+ )
229
+ downloaded_content_tokenizer_path = local_dir
230
+ downloaded_resources["tokenizer_vq32"] = True
231
+ print("Content Tokenizer (vq32) download completed")
232
+
233
+ # Download Content-Style Tokenizer (vq8192)
234
+ if not downloaded_resources["tokenizer_vq8192"]:
235
+ print("Preloading Content-Style Tokenizer (vq8192)...")
236
+ local_dir = snapshot_download(
237
+ repo_id="amphion/Vevo",
238
+ repo_type="model",
239
+ cache_dir="./ckpts/Vevo",
240
+ allow_patterns=["tokenizer/vq8192/*"],
241
+ )
242
+ downloaded_content_style_tokenizer_path = local_dir
243
+ downloaded_resources["tokenizer_vq8192"] = True
244
+ print("Content-Style Tokenizer (vq8192) download completed")
245
+
246
+ # Download Autoregressive Transformer (Vq32ToVq8192)
247
+ if not downloaded_resources["ar_Vq32ToVq8192"]:
248
+ print("Preloading Autoregressive Transformer (Vq32ToVq8192)...")
249
+ local_dir = snapshot_download(
250
+ repo_id="amphion/Vevo",
251
+ repo_type="model",
252
+ cache_dir="./ckpts/Vevo",
253
+ allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
254
+ )
255
+ downloaded_ar_vq32_path = local_dir
256
+ downloaded_resources["ar_Vq32ToVq8192"] = True
257
+ print("Autoregressive Transformer (Vq32ToVq8192) download completed")
258
+
259
+ # Download Autoregressive Transformer (PhoneToVq8192)
260
+ if not downloaded_resources["ar_PhoneToVq8192"]:
261
+ print("Preloading Autoregressive Transformer (PhoneToVq8192)...")
262
+ local_dir = snapshot_download(
263
+ repo_id="amphion/Vevo",
264
+ repo_type="model",
265
+ cache_dir="./ckpts/Vevo",
266
+ allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
267
+ )
268
+ downloaded_ar_phone_path = local_dir
269
+ downloaded_resources["ar_PhoneToVq8192"] = True
270
+ print("Autoregressive Transformer (PhoneToVq8192) download completed")
271
+
272
+ # Download Flow Matching Transformer
273
+ if not downloaded_resources["fmt_Vq8192ToMels"]:
274
+ print("Preloading Flow Matching Transformer (Vq8192ToMels)...")
275
+ local_dir = snapshot_download(
276
+ repo_id="amphion/Vevo",
277
+ repo_type="model",
278
+ cache_dir="./ckpts/Vevo",
279
+ allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
280
+ )
281
+ downloaded_fmt_path = local_dir
282
+ downloaded_resources["fmt_Vq8192ToMels"] = True
283
+ print("Flow Matching Transformer (Vq8192ToMels) download completed")
284
+
285
+ # Download Vocoder
286
+ if not downloaded_resources["vocoder"]:
287
+ print("Preloading Vocoder...")
288
+ local_dir = snapshot_download(
289
+ repo_id="amphion/Vevo",
290
+ repo_type="model",
291
+ cache_dir="./ckpts/Vevo",
292
+ allow_patterns=["acoustic_modeling/Vocoder/*"],
293
+ )
294
+ downloaded_vocoder_path = local_dir
295
+ downloaded_resources["vocoder"] = True
296
+ print("Vocoder download completed")
297
+
298
+ print("All model resources preloading completed!")
299
+
300
+ # Initialize path variables to store downloaded model paths
301
+ downloaded_content_tokenizer_path = None
302
+ downloaded_content_style_tokenizer_path = None
303
+ downloaded_ar_vq32_path = None
304
+ downloaded_ar_phone_path = None
305
+ downloaded_fmt_path = None
306
+ downloaded_vocoder_path = None
307
+
308
+ # Preload all resources before creating the Gradio interface
309
+ preload_all_resources()
310
+
311
+ def get_pipeline(pipeline_type):
312
+ if pipeline_type in inference_pipelines:
313
+ return inference_pipelines[pipeline_type]
314
+
315
+ # Initialize pipeline based on the required pipeline type
316
+ if pipeline_type == "style" or pipeline_type == "voice":
317
+ # Use already downloaded Content Tokenizer
318
+ if downloaded_resources["tokenizer_vq32"]:
319
+ content_tokenizer_ckpt_path = os.path.join(
320
+ downloaded_content_tokenizer_path, "tokenizer/vq32/hubert_large_l18_c32.pkl"
321
+ )
322
+ else:
323
+ # Fallback to direct download
324
+ local_dir = snapshot_download(
325
+ repo_id="amphion/Vevo",
326
+ repo_type="model",
327
+ cache_dir="./ckpts/Vevo",
328
+ allow_patterns=["tokenizer/vq32/*"],
329
+ )
330
+ content_tokenizer_ckpt_path = os.path.join(
331
+ local_dir, "tokenizer/vq32/hubert_large_l18_c32.pkl"
332
+ )
333
+
334
+ # Use already downloaded Content-Style Tokenizer
335
+ if downloaded_resources["tokenizer_vq8192"]:
336
+ content_style_tokenizer_ckpt_path = os.path.join(
337
+ downloaded_content_style_tokenizer_path, "tokenizer/vq8192"
338
+ )
339
+ else:
340
+ # Fallback to direct download
341
+ local_dir = snapshot_download(
342
+ repo_id="amphion/Vevo",
343
+ repo_type="model",
344
+ cache_dir="./ckpts/Vevo",
345
+ allow_patterns=["tokenizer/vq8192/*"],
346
+ )
347
+ content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
348
+
349
+ # Use already downloaded Autoregressive Transformer
350
+ ar_cfg_path = "./models/vc/vevo/config/Vq32ToVq8192.json"
351
+ if downloaded_resources["ar_Vq32ToVq8192"]:
352
+ ar_ckpt_path = os.path.join(
353
+ downloaded_ar_vq32_path, "contentstyle_modeling/Vq32ToVq8192"
354
+ )
355
+ else:
356
+ # Fallback to direct download
357
+ local_dir = snapshot_download(
358
+ repo_id="amphion/Vevo",
359
+ repo_type="model",
360
+ cache_dir="./ckpts/Vevo",
361
+ allow_patterns=["contentstyle_modeling/Vq32ToVq8192/*"],
362
+ )
363
+ ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/Vq32ToVq8192")
364
+
365
+ # Use already downloaded Flow Matching Transformer
366
+ fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
367
+ if downloaded_resources["fmt_Vq8192ToMels"]:
368
+ fmt_ckpt_path = os.path.join(
369
+ downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels"
370
+ )
371
+ else:
372
+ # Fallback to direct download
373
+ local_dir = snapshot_download(
374
+ repo_id="amphion/Vevo",
375
+ repo_type="model",
376
+ cache_dir="./ckpts/Vevo",
377
+ allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
378
+ )
379
+ fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
380
+
381
+ # Use already downloaded Vocoder
382
+ vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
383
+ if downloaded_resources["vocoder"]:
384
+ vocoder_ckpt_path = os.path.join(
385
+ downloaded_vocoder_path, "acoustic_modeling/Vocoder"
386
+ )
387
+ else:
388
+ # Fallback to direct download
389
+ local_dir = snapshot_download(
390
+ repo_id="amphion/Vevo",
391
+ repo_type="model",
392
+ cache_dir="./ckpts/Vevo",
393
+ allow_patterns=["acoustic_modeling/Vocoder/*"],
394
+ )
395
+ vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
396
+
397
+ # Initialize pipeline
398
+ inference_pipeline = VevoInferencePipeline(
399
+ content_tokenizer_ckpt_path=content_tokenizer_ckpt_path,
400
+ content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
401
+ ar_cfg_path=ar_cfg_path,
402
+ ar_ckpt_path=ar_ckpt_path,
403
+ fmt_cfg_path=fmt_cfg_path,
404
+ fmt_ckpt_path=fmt_ckpt_path,
405
+ vocoder_cfg_path=vocoder_cfg_path,
406
+ vocoder_ckpt_path=vocoder_ckpt_path,
407
+ device=device,
408
+ )
409
+
410
+ elif pipeline_type == "timbre":
411
+ # Use already downloaded Content-Style Tokenizer
412
+ if downloaded_resources["tokenizer_vq8192"]:
413
+ content_style_tokenizer_ckpt_path = os.path.join(
414
+ downloaded_content_style_tokenizer_path, "tokenizer/vq8192"
415
+ )
416
+ else:
417
+ # Fallback to direct download
418
+ local_dir = snapshot_download(
419
+ repo_id="amphion/Vevo",
420
+ repo_type="model",
421
+ cache_dir="./ckpts/Vevo",
422
+ allow_patterns=["tokenizer/vq8192/*"],
423
+ )
424
+ content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
425
+
426
+ # Use already downloaded Flow Matching Transformer
427
+ fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
428
+ if downloaded_resources["fmt_Vq8192ToMels"]:
429
+ fmt_ckpt_path = os.path.join(
430
+ downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels"
431
+ )
432
+ else:
433
+ # Fallback to direct download
434
+ local_dir = snapshot_download(
435
+ repo_id="amphion/Vevo",
436
+ repo_type="model",
437
+ cache_dir="./ckpts/Vevo",
438
+ allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
439
+ )
440
+ fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
441
+
442
+ # Use already downloaded Vocoder
443
+ vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
444
+ if downloaded_resources["vocoder"]:
445
+ vocoder_ckpt_path = os.path.join(
446
+ downloaded_vocoder_path, "acoustic_modeling/Vocoder"
447
+ )
448
+ else:
449
+ # Fallback to direct download
450
+ local_dir = snapshot_download(
451
+ repo_id="amphion/Vevo",
452
+ repo_type="model",
453
+ cache_dir="./ckpts/Vevo",
454
+ allow_patterns=["acoustic_modeling/Vocoder/*"],
455
+ )
456
+ vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
457
+
458
+ # Initialize pipeline
459
+ inference_pipeline = VevoInferencePipeline(
460
+ content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
461
+ fmt_cfg_path=fmt_cfg_path,
462
+ fmt_ckpt_path=fmt_ckpt_path,
463
+ vocoder_cfg_path=vocoder_cfg_path,
464
+ vocoder_ckpt_path=vocoder_ckpt_path,
465
+ device=device,
466
+ )
467
+
468
+ elif pipeline_type == "tts":
469
+ # Use already downloaded Content-Style Tokenizer
470
+ if downloaded_resources["tokenizer_vq8192"]:
471
+ content_style_tokenizer_ckpt_path = os.path.join(
472
+ downloaded_content_style_tokenizer_path, "tokenizer/vq8192"
473
+ )
474
+ else:
475
+ # Fallback to direct download
476
+ local_dir = snapshot_download(
477
+ repo_id="amphion/Vevo",
478
+ repo_type="model",
479
+ cache_dir="./ckpts/Vevo",
480
+ allow_patterns=["tokenizer/vq8192/*"],
481
+ )
482
+ content_style_tokenizer_ckpt_path = os.path.join(local_dir, "tokenizer/vq8192")
483
+
484
+ # Use already downloaded Autoregressive Transformer (TTS specific)
485
+ ar_cfg_path = "./models/vc/vevo/config/PhoneToVq8192.json"
486
+ if downloaded_resources["ar_PhoneToVq8192"]:
487
+ ar_ckpt_path = os.path.join(
488
+ downloaded_ar_phone_path, "contentstyle_modeling/PhoneToVq8192"
489
+ )
490
+ else:
491
+ # Fallback to direct download
492
+ local_dir = snapshot_download(
493
+ repo_id="amphion/Vevo",
494
+ repo_type="model",
495
+ cache_dir="./ckpts/Vevo",
496
+ allow_patterns=["contentstyle_modeling/PhoneToVq8192/*"],
497
+ )
498
+ ar_ckpt_path = os.path.join(local_dir, "contentstyle_modeling/PhoneToVq8192")
499
+
500
+ # Use already downloaded Flow Matching Transformer
501
+ fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
502
+ if downloaded_resources["fmt_Vq8192ToMels"]:
503
+ fmt_ckpt_path = os.path.join(
504
+ downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels"
505
+ )
506
+ else:
507
+ # Fallback to direct download
508
+ local_dir = snapshot_download(
509
+ repo_id="amphion/Vevo",
510
+ repo_type="model",
511
+ cache_dir="./ckpts/Vevo",
512
+ allow_patterns=["acoustic_modeling/Vq8192ToMels/*"],
513
+ )
514
+ fmt_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vq8192ToMels")
515
+
516
+ # Use already downloaded Vocoder
517
+ vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
518
+ if downloaded_resources["vocoder"]:
519
+ vocoder_ckpt_path = os.path.join(
520
+ downloaded_vocoder_path, "acoustic_modeling/Vocoder"
521
+ )
522
+ else:
523
+ # Fallback to direct download
524
+ local_dir = snapshot_download(
525
+ repo_id="amphion/Vevo",
526
+ repo_type="model",
527
+ cache_dir="./ckpts/Vevo",
528
+ allow_patterns=["acoustic_modeling/Vocoder/*"],
529
+ )
530
+ vocoder_ckpt_path = os.path.join(local_dir, "acoustic_modeling/Vocoder")
531
+
532
+ # Initialize pipeline
533
+ inference_pipeline = VevoInferencePipeline(
534
+ content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
535
+ ar_cfg_path=ar_cfg_path,
536
+ ar_ckpt_path=ar_ckpt_path,
537
+ fmt_cfg_path=fmt_cfg_path,
538
+ fmt_ckpt_path=fmt_ckpt_path,
539
+ vocoder_cfg_path=vocoder_cfg_path,
540
+ vocoder_ckpt_path=vocoder_ckpt_path,
541
+ device=device,
542
+ )
543
+
544
+ # Cache pipeline instance
545
+ inference_pipelines[pipeline_type] = inference_pipeline
546
+ return inference_pipeline
547
+
548
+ # Implement VEVO functionality functions
549
+ @spaces.GPU()
550
+ def vevo_style(content_wav, style_wav):
551
+ temp_content_path = "wav/temp_content.wav"
552
+ temp_style_path = "wav/temp_style.wav"
553
+ output_path = "wav/output_vevostyle.wav"
554
+
555
+ # Check and process audio data
556
+ if content_wav is None or style_wav is None:
557
+ raise ValueError("Please upload audio files")
558
+
559
+ # Process audio format
560
+ if isinstance(content_wav, tuple) and len(content_wav) == 2:
561
+ if isinstance(content_wav[0], np.ndarray):
562
+ content_data, content_sr = content_wav
563
+ else:
564
+ content_sr, content_data = content_wav
565
+
566
+ # Ensure single channel
567
+ if len(content_data.shape) > 1 and content_data.shape[1] > 1:
568
+ content_data = np.mean(content_data, axis=1)
569
+
570
+ # Resample to 24kHz
571
+ if content_sr != 24000:
572
+ content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
573
+ content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
574
+ content_sr = 24000
575
+ else:
576
+ content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
577
+
578
+ # Normalize volume
579
+ content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
580
+ else:
581
+ raise ValueError("Invalid content audio format")
582
+
583
+ if isinstance(style_wav[0], np.ndarray):
584
+ style_data, style_sr = style_wav
585
+ else:
586
+ style_sr, style_data = style_wav
587
+
588
+ # Ensure single channel
589
+ if len(style_data.shape) > 1 and style_data.shape[1] > 1:
590
+ style_data = np.mean(style_data, axis=1)
591
+
592
+ # Resample to 24kHz
593
+ if style_sr != 24000:
594
+ style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
595
+ style_tensor = torchaudio.functional.resample(style_tensor, style_sr, 24000)
596
+ style_sr = 24000
597
+ else:
598
+ style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
599
+
600
+ # Normalize volume
601
+ style_tensor = style_tensor / (torch.max(torch.abs(style_tensor)) + 1e-6) * 0.95
602
+
603
+ # Print debug information
604
+ print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
605
+ print(f"Style audio shape: {style_tensor.shape}, sample rate: {style_sr}")
606
+
607
+ # Save audio
608
+ torchaudio.save(temp_content_path, content_tensor, content_sr)
609
+ torchaudio.save(temp_style_path, style_tensor, style_sr)
610
+
611
+ try:
612
+ # Get pipeline
613
+ pipeline = get_pipeline("style")
614
+
615
+ # Inference
616
+ gen_audio = pipeline.inference_ar_and_fm(
617
+ src_wav_path=temp_content_path,
618
+ src_text=None,
619
+ style_ref_wav_path=temp_style_path,
620
+ timbre_ref_wav_path=temp_content_path,
621
+ )
622
+
623
+ # Check whether the generated audio contains NaN or Inf values
624
+ if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
625
+ print("Warning: Generated audio contains NaN or Inf values")
626
+ gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
627
+
628
+ print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
629
+
630
+ # Save generated audio
631
+ save_audio(gen_audio, output_path=output_path)
632
+
633
+ return output_path
634
+ except Exception as e:
635
+ print(f"Error during processing: {e}")
636
+ import traceback
637
+ traceback.print_exc()
638
+ raise e
639
+
640
+ @spaces.GPU()
641
+ def vevo_timbre(content_wav, reference_wav):
642
+ temp_content_path = "wav/temp_content.wav"
643
+ temp_reference_path = "wav/temp_reference.wav"
644
+ output_path = "wav/output_vevotimbre.wav"
645
+
646
+ # Check and process audio data
647
+ if content_wav is None or reference_wav is None:
648
+ raise ValueError("Please upload audio files")
649
+
650
+ # Process content audio format
651
+ if isinstance(content_wav, tuple) and len(content_wav) == 2:
652
+ if isinstance(content_wav[0], np.ndarray):
653
+ content_data, content_sr = content_wav
654
+ else:
655
+ content_sr, content_data = content_wav
656
+
657
+ # Ensure single channel
658
+ if len(content_data.shape) > 1 and content_data.shape[1] > 1:
659
+ content_data = np.mean(content_data, axis=1)
660
+
661
+ # Resample to 24kHz
662
+ if content_sr != 24000:
663
+ content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
664
+ content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
665
+ content_sr = 24000
666
+ else:
667
+ content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
668
+
669
+ # Normalize volume
670
+ content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
671
+ else:
672
+ raise ValueError("Invalid content audio format")
673
+
674
+ # Process reference audio format
675
+ if isinstance(reference_wav, tuple) and len(reference_wav) == 2:
676
+ if isinstance(reference_wav[0], np.ndarray):
677
+ reference_data, reference_sr = reference_wav
678
+ else:
679
+ reference_sr, reference_data = reference_wav
680
+
681
+ # Ensure single channel
682
+ if len(reference_data.shape) > 1 and reference_data.shape[1] > 1:
683
+ reference_data = np.mean(reference_data, axis=1)
684
+
685
+ # Resample to 24kHz
686
+ if reference_sr != 24000:
687
+ reference_tensor = torch.FloatTensor(reference_data).unsqueeze(0)
688
+ reference_tensor = torchaudio.functional.resample(reference_tensor, reference_sr, 24000)
689
+ reference_sr = 24000
690
+ else:
691
+ reference_tensor = torch.FloatTensor(reference_data).unsqueeze(0)
692
+
693
+ # Normalize volume
694
+ reference_tensor = reference_tensor / (torch.max(torch.abs(reference_tensor)) + 1e-6) * 0.95
695
+ else:
696
+ raise ValueError("Invalid reference audio format")
697
+
698
+ # Print debug information
699
+ print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
700
+ print(f"Reference audio shape: {reference_tensor.shape}, sample rate: {reference_sr}")
701
+
702
+ # Save uploaded audio
703
+ torchaudio.save(temp_content_path, content_tensor, content_sr)
704
+ torchaudio.save(temp_reference_path, reference_tensor, reference_sr)
705
+
706
+ try:
707
+ # Get pipeline
708
+ pipeline = get_pipeline("timbre")
709
+
710
+ # Inference
711
+ gen_audio = pipeline.inference_fm(
712
+ src_wav_path=temp_content_path,
713
+ timbre_ref_wav_path=temp_reference_path,
714
+ flow_matching_steps=32,
715
+ )
716
+
717
+ # Check whether the generated audio contains NaN or Inf values
718
+ if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
719
+ print("Warning: Generated audio contains NaN or Inf values")
720
+ gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
721
+
722
+ print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
723
+
724
+ # Save generated audio
725
+ save_audio(gen_audio, output_path=output_path)
726
+
727
+ return output_path
728
+ except Exception as e:
729
+ print(f"Error during processing: {e}")
730
+ import traceback
731
+ traceback.print_exc()
732
+ raise e
733
+
734
+ @spaces.GPU()
735
+ def vevo_voice(content_wav, style_reference_wav, timbre_reference_wav):
736
+ temp_content_path = "wav/temp_content.wav"
737
+ temp_style_path = "wav/temp_style.wav"
738
+ temp_timbre_path = "wav/temp_timbre.wav"
739
+ output_path = "wav/output_vevovoice.wav"
740
+
741
+ # Check and process audio data
742
+ if content_wav is None or style_reference_wav is None or timbre_reference_wav is None:
743
+ raise ValueError("Please upload all required audio files")
744
+
745
+ # Process content audio format
746
+ if isinstance(content_wav, tuple) and len(content_wav) == 2:
747
+ if isinstance(content_wav[0], np.ndarray):
748
+ content_data, content_sr = content_wav
749
+ else:
750
+ content_sr, content_data = content_wav
751
+
752
+ # Ensure single channel
753
+ if len(content_data.shape) > 1 and content_data.shape[1] > 1:
754
+ content_data = np.mean(content_data, axis=1)
755
+
756
+ # Resample to 24kHz
757
+ if content_sr != 24000:
758
+ content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
759
+ content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
760
+ content_sr = 24000
761
+ else:
762
+ content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
763
+
764
+ # Normalize volume
765
+ content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
766
+ else:
767
+ raise ValueError("Invalid content audio format")
768
+
769
+ # Process style reference audio format
770
+ if isinstance(style_reference_wav, tuple) and len(style_reference_wav) == 2:
771
+ if isinstance(style_reference_wav[0], np.ndarray):
772
+ style_data, style_sr = style_reference_wav
773
+ else:
774
+ style_sr, style_data = style_reference_wav
775
+
776
+ # Ensure single channel
777
+ if len(style_data.shape) > 1 and style_data.shape[1] > 1:
778
+ style_data = np.mean(style_data, axis=1)
779
+
780
+ # Resample to 24kHz
781
+ if style_sr != 24000:
782
+ style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
783
+ style_tensor = torchaudio.functional.resample(style_tensor, style_sr, 24000)
784
+ style_sr = 24000
785
+ else:
786
+ style_tensor = torch.FloatTensor(style_data).unsqueeze(0)
787
+
788
+ # Normalize volume
789
+ style_tensor = style_tensor / (torch.max(torch.abs(style_tensor)) + 1e-6) * 0.95
790
+ else:
791
+ raise ValueError("Invalid style reference audio format")
792
+
793
+ # Process timbre reference audio format
794
+ if isinstance(timbre_reference_wav, tuple) and len(timbre_reference_wav) == 2:
795
+ if isinstance(timbre_reference_wav[0], np.ndarray):
796
+ timbre_data, timbre_sr = timbre_reference_wav
797
+ else:
798
+ timbre_sr, timbre_data = timbre_reference_wav
799
+
800
+ # Ensure single channel
801
+ if len(timbre_data.shape) > 1 and timbre_data.shape[1] > 1:
802
+ timbre_data = np.mean(timbre_data, axis=1)
803
+
804
+ # Resample to 24kHz
805
+ if timbre_sr != 24000:
806
+ timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
807
+ timbre_tensor = torchaudio.functional.resample(timbre_tensor, timbre_sr, 24000)
808
+ timbre_sr = 24000
809
+ else:
810
+ timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
811
+
812
+ # Normalize volume
813
+ timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
814
+ else:
815
+ raise ValueError("Invalid timbre reference audio format")
816
+
817
+ # Print debug information
818
+ print(f"Content audio shape: {content_tensor.shape}, sample rate: {content_sr}")
819
+ print(f"Style reference audio shape: {style_tensor.shape}, sample rate: {style_sr}")
820
+ print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
821
+
822
+ # Save uploaded audio
823
+ torchaudio.save(temp_content_path, content_tensor, content_sr)
824
+ torchaudio.save(temp_style_path, style_tensor, style_sr)
825
+ torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr)
826
+
827
+ try:
828
+ # Get pipeline
829
+ pipeline = get_pipeline("voice")
830
+
831
+ # Inference
832
+ gen_audio = pipeline.inference_ar_and_fm(
833
+ src_wav_path=temp_content_path,
834
+ src_text=None,
835
+ style_ref_wav_path=temp_style_path,
836
+ timbre_ref_wav_path=temp_timbre_path,
837
+ )
838
+
839
+ # Check whether the generated audio contains NaN or Inf values
840
+ if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
841
+ print("Warning: Generated audio contains NaN or Inf values")
842
+ gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
843
+
844
+ print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
845
+
846
+ # Save generated audio
847
+ save_audio(gen_audio, output_path=output_path)
848
+
849
+ return output_path
850
+ except Exception as e:
851
+ print(f"Error during processing: {e}")
852
+ import traceback
853
+ traceback.print_exc()
854
+ raise e
855
+
856
+ @spaces.GPU()
857
+ def vevo_tts(text, ref_wav, timbre_ref_wav=None, style_ref_text=None, src_language="en", ref_language="en", style_ref_text_language="en"):
858
+ temp_ref_path = "wav/temp_ref.wav"
859
+ temp_timbre_path = "wav/temp_timbre.wav"
860
+ output_path = "wav/output_vevotts.wav"
861
+
862
+ # Check and process audio data
863
+ if ref_wav is None:
864
+ raise ValueError("Please upload a reference audio file")
865
+
866
+ # Process reference audio format
867
+ if isinstance(ref_wav, tuple) and len(ref_wav) == 2:
868
+ if isinstance(ref_wav[0], np.ndarray):
869
+ ref_data, ref_sr = ref_wav
870
+ else:
871
+ ref_sr, ref_data = ref_wav
872
+
873
+ # Ensure single channel
874
+ if len(ref_data.shape) > 1 and ref_data.shape[1] > 1:
875
+ ref_data = np.mean(ref_data, axis=1)
876
+
877
+ # Resample to 24kHz
878
+ if ref_sr != 24000:
879
+ ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
880
+ ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
881
+ ref_sr = 24000
882
+ else:
883
+ ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
884
+
885
+ # Normalize volume
886
+ ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
887
+ else:
888
+ raise ValueError("Invalid reference audio format")
889
+
890
+ # Print debug information
891
+ print(f"Reference audio shape: {ref_tensor.shape}, sample rate: {ref_sr}")
892
+ if style_ref_text:
893
+ print(f"Style reference text: {style_ref_text}, language: {style_ref_text_language}")
894
+
895
+ # Save uploaded audio
896
+ torchaudio.save(temp_ref_path, ref_tensor, ref_sr)
897
+
898
+ if timbre_ref_wav is not None:
899
+ if isinstance(timbre_ref_wav, tuple) and len(timbre_ref_wav) == 2:
900
+ if isinstance(timbre_ref_wav[0], np.ndarray):
901
+ timbre_data, timbre_sr = timbre_ref_wav
902
+ else:
903
+ timbre_sr, timbre_data = timbre_ref_wav
904
+
905
+ # Ensure single channel
906
+ if len(timbre_data.shape) > 1 and timbre_data.shape[1] > 1:
907
+ timbre_data = np.mean(timbre_data, axis=1)
908
+
909
+ # Resample to 24kHz
910
+ if timbre_sr != 24000:
911
+ timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
912
+ timbre_tensor = torchaudio.functional.resample(timbre_tensor, timbre_sr, 24000)
913
+ timbre_sr = 24000
914
+ else:
915
+ timbre_tensor = torch.FloatTensor(timbre_data).unsqueeze(0)
916
+
917
+ # Normalize volume
918
+ timbre_tensor = timbre_tensor / (torch.max(torch.abs(timbre_tensor)) + 1e-6) * 0.95
919
+
920
+ print(f"Timbre reference audio shape: {timbre_tensor.shape}, sample rate: {timbre_sr}")
921
+ torchaudio.save(temp_timbre_path, timbre_tensor, timbre_sr)
922
+ else:
923
+ raise ValueError("Invalid timbre reference audio format")
924
+ else:
925
+ temp_timbre_path = temp_ref_path
926
+
927
+ try:
928
+ # Get pipeline
929
+ pipeline = get_pipeline("tts")
930
+
931
+ # Inference
932
+ gen_audio = pipeline.inference_ar_and_fm(
933
+ src_wav_path=None,
934
+ src_text=text,
935
+ style_ref_wav_path=temp_ref_path,
936
+ timbre_ref_wav_path=temp_timbre_path,
937
+ style_ref_wav_text=style_ref_text,
938
+ src_text_language=src_language,
939
+ style_ref_wav_text_language=style_ref_text_language,
940
+ )
941
+
942
+ # Check whether the generated audio contains NaN or Inf values
943
+ if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
944
+ print("Warning: Generated audio contains NaN or Inf values")
945
+ gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
946
+
947
+ print(f"Generated audio shape: {gen_audio.shape}, max: {torch.max(gen_audio)}, min: {torch.min(gen_audio)}")
948
+
949
+ # Save generated audio
950
+ save_audio(gen_audio, output_path=output_path)
951
+
952
+ return output_path
953
+ except Exception as e:
954
+ print(f"Error during processing: {e}")
955
+ import traceback
956
+ traceback.print_exc()
957
+ raise e
958
+
959
+ # Create Gradio interface
960
+ with gr.Blocks(title="Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement") as demo:
961
+ gr.Markdown("# Vevo: Controllable Zero-Shot Voice Imitation with Self-Supervised Disentanglement")
962
+ # Add link tag line
963
+ with gr.Row(elem_id="links_row"):
964
+ gr.HTML("""
965
+ <div style="display: flex; justify-content: flex-start; gap: 8px; margin: 0 0; padding-left: 0px;">
966
+ <a href="https://arxiv.org/abs/2502.07243" target="_blank" style="text-decoration: none;">
967
+ <img alt="arXiv Paper" src="https://img.shields.io/badge/arXiv-Paper-red">
968
+ </a>
969
+ <a href="https://openreview.net/pdf?id=anQDiQZhDP" target="_blank" style="text-decoration: none;">
970
+ <img alt="ICLR Paper" src="https://img.shields.io/badge/ICLR-Paper-64b63a">
971
+ </a>
972
+ <a href="https://huggingface.co/amphion/Vevo" target="_blank" style="text-decoration: none;">
973
+ <img alt="HuggingFace Model" src="https://img.shields.io/badge/%F0%9F%A4%97%20HuggingFace-Model-yellow">
974
+ </a>
975
+ <a href="https://github.com/open-mmlab/Amphion/tree/main/models/vc/vevo" target="_blank" style="text-decoration: none;">
976
+ <img alt="GitHub Repo" src="https://img.shields.io/badge/GitHub-Repo-blue">
977
+ </a>
978
+ </div>
979
+ """)
980
+
981
+ with gr.Tab("Vevo-Timbre"):
982
+ gr.Markdown("### Vevo-Timbre: Maintain style but transfer timbre")
983
+ with gr.Row():
984
+ with gr.Column():
985
+ timbre_content = gr.Audio(label="Source Audio", type="numpy")
986
+ timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
987
+ timbre_button = gr.Button("Generate")
988
+ with gr.Column():
989
+ timbre_output = gr.Audio(label="Result")
990
+ timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
991
+
992
+ with gr.Tab("Vevo-Style"):
993
+ gr.Markdown("### Vevo-Style: Maintain timbre but transfer style (accent, emotion, etc.)")
994
+ with gr.Row():
995
+ with gr.Column():
996
+ style_content = gr.Audio(label="Source Audio", type="numpy")
997
+ style_reference = gr.Audio(label="Style Reference", type="numpy")
998
+ style_button = gr.Button("Generate")
999
+ with gr.Column():
1000
+ style_output = gr.Audio(label="Result")
1001
+ style_button.click(vevo_style, inputs=[style_content, style_reference], outputs=style_output)
1002
+
1003
+ with gr.Tab("Vevo-Voice"):
1004
+ gr.Markdown("### Vevo-Voice: Transfers both style and timbre with separate references")
1005
+ with gr.Row():
1006
+ with gr.Column():
1007
+ voice_content = gr.Audio(label="Source Audio", type="numpy")
1008
+ voice_style_reference = gr.Audio(label="Style Reference", type="numpy")
1009
+ voice_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
1010
+ voice_button = gr.Button("Generate")
1011
+ with gr.Column():
1012
+ voice_output = gr.Audio(label="Result")
1013
+ voice_button.click(vevo_voice, inputs=[voice_content, voice_style_reference, voice_timbre_reference], outputs=voice_output)
1014
+
1015
+
1016
+
1017
+ with gr.Tab("Vevo-TTS"):
1018
+ gr.Markdown("### Vevo-TTS: Text-to-speech with separate style and timbre references")
1019
+ with gr.Row():
1020
+ with gr.Column():
1021
+ tts_text = gr.Textbox(label="Target Text", placeholder="Enter text to synthesize...", lines=3)
1022
+ tts_src_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Text Language", value="en")
1023
+ tts_reference = gr.Audio(label="Style Reference", type="numpy")
1024
+ tts_style_ref_text = gr.Textbox(label="Style Reference Text", placeholder="Enter style reference text...", lines=3)
1025
+ tts_style_ref_text_language = gr.Dropdown(["en", "zh", "de", "fr", "ja", "ko"], label="Style Reference Text Language", value="en")
1026
+ tts_timbre_reference = gr.Audio(label="Timbre Reference", type="numpy")
1027
+ tts_button = gr.Button("Generate")
1028
+ with gr.Column():
1029
+ tts_output = gr.Audio(label="Result")
1030
+
1031
+ tts_button.click(
1032
+ vevo_tts,
1033
+ inputs=[tts_text, tts_reference, tts_timbre_reference, tts_style_ref_text, tts_src_language, tts_style_ref_text_language],
1034
+ outputs=tts_output
1035
+ )
1036
+
1037
+ gr.Markdown("""
1038
+ ## About VEVO
1039
+ VEVO is a versatile voice synthesis and conversion model that offers four main functionalities:
1040
+ 1. **Vevo-Style**: Maintains timbre but transfers style (accent, emotion, etc.)
1041
+ 2. **Vevo-Timbre**: Maintains style but transfers timbre
1042
+ 3. **Vevo-Voice**: Transfers both style and timbre with separate references
1043
+ 4. **Vevo-TTS**: Text-to-speech with separate style and timbre references
1044
+
1045
+ For more information, visit the [Amphion project](https://github.com/open-mmlab/Amphion)
1046
+ """)
1047
+
1048
+ # Launch application
1049
+ demo.launch()
requirements (2).txt ADDED
@@ -0,0 +1,30 @@
+ gradio>=3.50.2
+ torch>=2.0.0
+ torchaudio>=2.0.0
+ numpy>=1.20.0
+ huggingface_hub>=0.14.1
+ librosa>=0.9.2
+ PyYAML>=6.0
+ accelerate>=0.20.3
+ safetensors>=0.3.1
+ phonemizer>=3.2.0
+ setuptools
+ onnxruntime
+ transformers==4.41.2
+ unidecode
+ scipy>=1.12.0
+ encodec
+ g2p_en
+ jieba
+ cn2an
+ pypinyin
+ langsegment==0.2.0
+ pyopenjtalk
+ pykakasi
+ json5
+ black>=24.1.1
+ ruamel.yaml
+ tqdm
+ openai-whisper
+ ipython
+ pyworld