Opera8 committed on
Commit
d7672e1
·
verified ·
1 Parent(s): ec6e509

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +75 -136
app.py CHANGED
@@ -11,9 +11,9 @@ from huggingface_hub import snapshot_download, hf_hub_download
11
  import subprocess
12
  import re
13
  import spaces
14
- import soundfile as sf # Importing soundfile directly
15
 
16
- # فقط منابع مورد نیاز برای Timbre را دانلود میکنیم
17
  downloaded_resources = {
18
  "configs": False,
19
  "tokenizer_vq8192": False,
@@ -22,7 +22,6 @@ downloaded_resources = {
22
  }
23
 
24
  def install_espeak():
25
- """Detect and install espeak-ng dependency"""
26
  try:
27
  result = subprocess.run(["which", "espeak-ng"], capture_output=True, text=True)
28
  if result.returncode != 0:
@@ -30,7 +29,7 @@ def install_espeak():
30
  subprocess.run(["apt-get", "update"], check=True)
31
  subprocess.run(["apt-get", "install", "-y", "espeak-ng", "espeak-ng-data"], check=True)
32
  else:
33
- print("espeak-ng is already installed.")
34
  except Exception as e:
35
  print(f"Error installing espeak-ng: {e}")
36
 
@@ -69,9 +68,7 @@ def patch_langsegment_init():
69
  import LangSegment
70
  importlib.reload(LangSegment)
71
  except: pass
72
-
73
- except Exception as e:
74
- print(f"Error patching LangSegment: {e}")
75
 
76
  patch_langsegment_init()
77
 
@@ -88,22 +85,8 @@ if os.path.dirname(os.path.abspath("Amphion")) not in sys.path:
88
  os.makedirs("wav", exist_ok=True)
89
  os.makedirs("ckpts/Vevo", exist_ok=True)
90
 
91
- from models.vc.vevo.vevo_utils import VevoInferencePipeline
92
-
93
- # تابع ذخیره سازی اختصاصی
94
- def my_save_audio(waveform, output_path, sample_rate=24000):
95
- try:
96
- if isinstance(waveform, torch.Tensor):
97
- waveform = waveform.detach().cpu()
98
- if waveform.dim() == 2 and waveform.shape[0] == 1:
99
- waveform = waveform.squeeze(0)
100
- waveform = waveform.numpy()
101
-
102
- sf.write(output_path, waveform, sample_rate)
103
- print(f"Audio saved successfully to {output_path}")
104
- except Exception as e:
105
- print(f"Failed to save audio with soundfile: {e}")
106
- raise e
107
 
108
  def setup_configs():
109
  if downloaded_resources["configs"]: return
@@ -128,7 +111,7 @@ print(f"Using device: {device}")
128
  inference_pipelines = {}
129
 
130
  def preload_all_resources():
131
- print("Preloading Timbre resources...")
132
  setup_configs()
133
 
134
  global downloaded_content_style_tokenizer_path
@@ -149,8 +132,7 @@ def preload_all_resources():
149
  local_dir = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["acoustic_modeling/Vocoder/*"])
150
  downloaded_vocoder_path = local_dir
151
  downloaded_resources["vocoder"] = True
152
-
153
- print("Timbre resources ready!")
154
 
155
  downloaded_content_style_tokenizer_path = None
156
  downloaded_fmt_path = None
@@ -162,18 +144,12 @@ def get_pipeline():
162
  if "timbre" in inference_pipelines:
163
  return inference_pipelines["timbre"]
164
 
165
- content_style_tokenizer_ckpt_path = os.path.join(downloaded_content_style_tokenizer_path, "tokenizer/vq8192")
166
- fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
167
- fmt_ckpt_path = os.path.join(downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels")
168
- vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
169
- vocoder_ckpt_path = os.path.join(downloaded_vocoder_path, "acoustic_modeling/Vocoder")
170
-
171
  pipeline = VevoInferencePipeline(
172
- content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
173
- fmt_cfg_path=fmt_cfg_path,
174
- fmt_ckpt_path=fmt_ckpt_path,
175
- vocoder_cfg_path=vocoder_cfg_path,
176
- vocoder_ckpt_path=vocoder_ckpt_path,
177
  device=device,
178
  )
179
 
@@ -182,122 +158,85 @@ def get_pipeline():
182
 
183
  @spaces.GPU()
184
  def vevo_timbre(content_wav, reference_wav):
185
- temp_content_path = "wav/temp_content.wav"
186
- temp_reference_path = "wav/temp_reference.wav"
187
- output_path = "wav/output_vevotimbre.wav"
 
 
188
 
189
  if content_wav is None or reference_wav is None:
190
  raise ValueError("Please upload audio files")
191
 
192
- # --- بارگذاری و پردازش صدای اصلی (Content) ---
193
- if isinstance(content_wav, tuple):
194
- content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
195
- else:
196
- content_sr, content_data = content_wav
 
197
 
198
- if len(content_data.shape) > 1 and content_data.shape[1] > 1:
199
- content_data = np.mean(content_data, axis=1)
200
-
201
- content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
202
- if content_sr != 24000:
203
- content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
204
- content_sr = 24000
205
-
206
- content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
207
 
208
- # --- بارگذاری و پردازش صدای رفرنس (Reference) ---
209
- if isinstance(reference_wav, tuple):
210
- ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
211
- else:
212
- ref_sr, ref_data = reference_wav
213
 
214
- if len(ref_data.shape) > 1 and ref_data.shape[1] > 1:
215
- ref_data = np.mean(ref_data, axis=1)
216
 
217
- ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
218
- if ref_sr != 24000:
219
- ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
220
- ref_sr = 24000
221
-
222
- ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
223
-
224
- # برش زدن صدای رفرنس به 20 ثانیه اول (برای جلوگیری از گیج شدن مدل)
225
- # صدای رفرنس فقط برای برداشتن "رنگ صدا" استفاده میشه و 20 ثانیه کافیه
226
- if ref_tensor.shape[1] > 24000 * 20:
227
- ref_tensor = ref_tensor[:, :24000 * 20]
228
-
229
- # ذخیره موقت صدای رفرنس
230
- sf.write(temp_reference_path, ref_tensor.squeeze().cpu().numpy(), ref_sr)
231
-
232
- print(f"Total Duration: {content_tensor.shape[1]/24000:.2f}s")
233
-
234
- # --- تکه تکه کردن صدای اصلی (Chunking Logic) ---
235
- pipeline = get_pipeline()
236
-
237
- CHUNK_DURATION = 15 # ثانیه (اندازه هر تکه)
238
- CHUNK_SAMPLES = CHUNK_DURATION * 24000
239
- total_samples = content_tensor.shape[1]
240
-
241
- generated_chunks = []
242
-
243
- # حلقه برای پردازش تکه تکه
244
- for i in range(0, total_samples, CHUNK_SAMPLES):
245
- end = min(i + CHUNK_SAMPLES, total_samples)
246
- chunk = content_tensor[:, i:end]
247
 
248
- print(f"Processing Chunk: {i/24000:.1f}s to {end/24000:.1f}s")
249
 
250
- # ذخیره تکه جاری
251
- sf.write(temp_content_path, chunk.squeeze().cpu().numpy(), 24000)
 
252
 
253
- try:
254
- # پردازش تکه
255
- gen_chunk = pipeline.inference_fm(
256
- src_wav_path=temp_content_path,
257
- timbre_ref_wav_path=temp_reference_path,
258
- flow_matching_steps=32,
259
- )
260
-
261
- # بررسی خرابی احتمالی
262
- if torch.isnan(gen_chunk).any() or torch.isinf(gen_chunk).any():
263
- print("Warning: NaN in chunk, fixing...")
264
- gen_chunk = torch.nan_to_num(gen_chunk, nan=0.0, posinf=0.95, neginf=-0.95)
265
-
266
- # اضافه کردن به لیست خروجی‌ها (مطمئن میشیم دوبعدی باشه [1, T])
267
- if gen_chunk.dim() == 1:
268
- gen_chunk = gen_chunk.unsqueeze(0)
269
- generated_chunks.append(gen_chunk.cpu())
270
-
271
- except Exception as e:
272
- print(f"Error processing chunk starting at {i}: {e}")
273
- # در صورت خطا در یک تکه، سکوت جایگزین میکنیم تا کل فایل خراب نشه
274
- silence = torch.zeros_like(chunk)
275
- generated_chunks.append(silence)
276
-
277
- # --- چسباندن تکه‌ها به هم ---
278
- if not generated_chunks:
279
- raise ValueError("No audio generated")
280
 
281
- final_audio = torch.cat(generated_chunks, dim=1)
282
-
283
- print(f"Final Audio Duration: {final_audio.shape[1]/24000:.2f}s")
284
-
285
- # ذخیره خروجی نهایی
286
- my_save_audio(final_audio, output_path=output_path)
287
- return output_path
 
 
 
 
 
 
 
 
 
 
 
 
 
288
 
289
- # رابط کاربری
290
- with gr.Blocks(title="Vevo-Timbre (Long Audio Fix)") as demo:
291
- gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion (Unlimited Length)")
292
- gr.Markdown("این نسخه فایل‌های طولانی را به صورت اتوماتیک به تکه‌های ۱۵ ثانیه‌ای تقسیم کرده و پردازش می‌کند تا صدا خراب نشود.")
293
 
294
  with gr.Row():
295
  with gr.Column():
296
- timbre_content = gr.Audio(label="Source Audio (صدای اصلی - هر چقدر طولانی باشد مشکلی نیست)", type="numpy")
297
- timbre_reference = gr.Audio(label="Target Timbre (صدای هدف - ۲۰ ثانیه اول استفاده میشود)", type="numpy")
298
- timbre_button = gr.Button("Generate (ساخت صدا)", variant="primary")
299
  with gr.Column():
300
- timbre_output = gr.Audio(label="Result (خروجی نهایی)")
301
 
302
  timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
303
 
 
11
  import subprocess
12
  import re
13
  import spaces
14
+ import uuid
15
 
16
+ # دانلود فقط منابع ضروری
17
  downloaded_resources = {
18
  "configs": False,
19
  "tokenizer_vq8192": False,
 
22
  }
23
 
24
  def install_espeak():
 
25
  try:
26
  result = subprocess.run(["which", "espeak-ng"], capture_output=True, text=True)
27
  if result.returncode != 0:
 
29
  subprocess.run(["apt-get", "update"], check=True)
30
  subprocess.run(["apt-get", "install", "-y", "espeak-ng", "espeak-ng-data"], check=True)
31
  else:
32
+ print("espeak-ng is installed.")
33
  except Exception as e:
34
  print(f"Error installing espeak-ng: {e}")
35
 
 
68
  import LangSegment
69
  importlib.reload(LangSegment)
70
  except: pass
71
+ except: pass
 
 
72
 
73
  patch_langsegment_init()
74
 
 
85
  os.makedirs("wav", exist_ok=True)
86
  os.makedirs("ckpts/Vevo", exist_ok=True)
87
 
88
+ # اینجا دیگر مشکلی ندارد چون نسخه torchaudio را درست کردیم
89
+ from models.vc.vevo.vevo_utils import VevoInferencePipeline, save_audio
 
 
 
 
 
 
 
 
 
 
 
 
 
 
90
 
91
  def setup_configs():
92
  if downloaded_resources["configs"]: return
 
111
  inference_pipelines = {}
112
 
113
  def preload_all_resources():
114
+ print("Preloading resources...")
115
  setup_configs()
116
 
117
  global downloaded_content_style_tokenizer_path
 
132
  local_dir = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["acoustic_modeling/Vocoder/*"])
133
  downloaded_vocoder_path = local_dir
134
  downloaded_resources["vocoder"] = True
135
+ print("Resources ready.")
 
136
 
137
  downloaded_content_style_tokenizer_path = None
138
  downloaded_fmt_path = None
 
144
  if "timbre" in inference_pipelines:
145
  return inference_pipelines["timbre"]
146
 
 
 
 
 
 
 
147
  pipeline = VevoInferencePipeline(
148
+ content_style_tokenizer_ckpt_path=os.path.join(downloaded_content_style_tokenizer_path, "tokenizer/vq8192"),
149
+ fmt_cfg_path="./models/vc/vevo/config/Vq8192ToMels.json",
150
+ fmt_ckpt_path=os.path.join(downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels"),
151
+ vocoder_cfg_path="./models/vc/vevo/config/Vocoder.json",
152
+ vocoder_ckpt_path=os.path.join(downloaded_vocoder_path, "acoustic_modeling/Vocoder"),
153
  device=device,
154
  )
155
 
 
158
 
159
  @spaces.GPU()
160
  def vevo_timbre(content_wav, reference_wav):
161
+ # ایجاد نام فایل منحصر به فرد برای جلوگیری از تداخل کاربران
162
+ session_id = str(uuid.uuid4())[:8]
163
+ temp_content_path = f"wav/c_{session_id}.wav"
164
+ temp_reference_path = f"wav/r_{session_id}.wav"
165
+ output_path = f"wav/out_{session_id}.wav"
166
 
167
  if content_wav is None or reference_wav is None:
168
  raise ValueError("Please upload audio files")
169
 
170
+ try:
171
+ # --- پردازش صدای اصلی ---
172
+ if isinstance(content_wav, tuple):
173
+ content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
174
+ else:
175
+ content_sr, content_data = content_wav
176
 
177
+ if len(content_data.shape) > 1 and content_data.shape[1] > 1:
178
+ content_data = np.mean(content_data, axis=1)
179
+
180
+ content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
181
+ if content_sr != 24000:
182
+ content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
183
+ content_sr = 24000
184
+
185
+ content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
186
 
187
+ # --- پردازش صدای رفرنس ---
188
+ if isinstance(reference_wav, tuple):
189
+ ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
190
+ else:
191
+ ref_sr, ref_data = reference_wav
192
 
193
+ if len(ref_data.shape) > 1 and ref_data.shape[1] > 1:
194
+ ref_data = np.mean(ref_data, axis=1)
195
 
196
+ ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
197
+ if ref_sr != 24000:
198
+ ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
199
+ ref_sr = 24000
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
200
 
201
+ ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
202
 
203
+ # ذخیره فایل‌ها با torchaudio (چون نسخه قدیمی است، بدون ارور کار می‌کند و فرمت دقیق را حفظ می‌کند)
204
+ torchaudio.save(temp_content_path, content_tensor, content_sr)
205
+ torchaudio.save(temp_reference_path, ref_tensor, ref_sr)
206
 
207
+ print(f"[{session_id}] Processing Audio...")
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
208
 
209
+ pipeline = get_pipeline()
210
+
211
+ # اجرای مدل روی کل فایل (بدون تکه تکه کردن - چون قبلاً اینطوری کار می‌کرد)
212
+ gen_audio = pipeline.inference_fm(
213
+ src_wav_path=temp_content_path,
214
+ timbre_ref_wav_path=temp_reference_path,
215
+ flow_matching_steps=32,
216
+ )
217
+
218
+ if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
219
+ print("Warning: NaN fixed")
220
+ gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
221
+
222
+ save_audio(gen_audio, output_path=output_path)
223
+ return output_path
224
+
225
+ finally:
226
+ # پاکسازی فایل‌های موقت
227
+ if os.path.exists(temp_content_path): os.remove(temp_content_path)
228
+ if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
229
 
230
+ with gr.Blocks(title="Vevo-Timbre (Stable)") as demo:
231
+ gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
 
 
232
 
233
  with gr.Row():
234
  with gr.Column():
235
+ timbre_content = gr.Audio(label="Source Audio", type="numpy")
236
+ timbre_reference = gr.Audio(label="Target Timbre", type="numpy")
237
+ timbre_button = gr.Button("Generate", variant="primary")
238
  with gr.Column():
239
+ timbre_output = gr.Audio(label="Result")
240
 
241
  timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)
242