Opera8 committed on
Commit
97b11e9
·
verified ·
1 Parent(s): 06310a1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +14 -12
app.py CHANGED
@@ -86,15 +86,19 @@ os.makedirs("ckpts/Vevo", exist_ok=True)
86
 
87
  from models.vc.vevo.vevo_utils import VevoInferencePipeline
88
 
89
# Safe save helper (used in place of torchaudio's saving routine).
def my_save_audio(waveform, output_path, sample_rate=24000):
    """Write ``waveform`` to ``output_path`` using soundfile.

    Args:
        waveform: Audio samples. A ``torch.Tensor`` is detached, moved to
            the CPU, squeezed to 1-D when its shape is ``(1, N)``, and
            converted to a NumPy array. Any other object is handed to
            ``sf.write`` unchanged.
        output_path: Destination passed straight to ``sf.write``.
        sample_rate: Sample rate passed to ``sf.write``; defaults to 24000.

    Raises:
        Exception: Whatever the conversion or ``sf.write`` raised; the
            error is printed first and then re-raised unchanged.
    """
    try:
        if isinstance(waveform, torch.Tensor):
            waveform = waveform.detach().cpu()
            # Drop the leading singleton dimension of a (1, N) tensor.
            # NOTE(review): 2-D tensors with shape[0] > 1 are passed through
            # as-is -- confirm callers never send multi-channel audio.
            if waveform.dim() == 2 and waveform.shape[0] == 1:
                waveform = waveform.squeeze(0)
            waveform = waveform.numpy()
        sf.write(output_path, waveform, sample_rate)
    except Exception as e:
        print(f"Save error: {e}")
        # Bare ``raise`` re-raises the active exception with its original
        # traceback instead of re-binding it through ``raise e``.
        raise
@@ -169,7 +173,6 @@ def get_pipeline():
169
 
170
  @spaces.GPU()
171
  def vevo_timbre(content_wav, reference_wav):
172
- # تولید نام فایل امن
173
  session_id = str(uuid.uuid4())[:8]
174
  temp_content_path = f"wav/c_{session_id}.wav"
175
  temp_reference_path = f"wav/r_{session_id}.wav"
@@ -190,7 +193,6 @@ def vevo_timbre(content_wav, reference_wav):
190
 
191
  content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
192
 
193
- # ریسمپل با torchaudio (اینجا ارور نمیده چون ذخیره نمیکنیم، فقط پردازش میکنیم)
194
  if content_sr != 24000:
195
  content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
196
  content_sr = 24000
@@ -213,14 +215,15 @@ def vevo_timbre(content_wav, reference_wav):
213
 
214
  ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
215
 
216
- # ذخیره موقت با soundfile (برای جلوگیری از ارور TorchCodec)
217
- sf.write(temp_content_path, content_tensor.squeeze().cpu().numpy(), content_sr)
218
- sf.write(temp_reference_path, ref_tensor.squeeze().cpu().numpy(), ref_sr)
219
 
220
  print(f"[{session_id}] Processing...")
221
 
222
  pipeline = get_pipeline()
223
 
 
224
  gen_audio = pipeline.inference_fm(
225
  src_wav_path=temp_content_path,
226
  timbre_ref_wav_path=temp_reference_path,
@@ -228,18 +231,17 @@ def vevo_timbre(content_wav, reference_wav):
228
  )
229
 
230
  if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
231
- print("Warning: NaN fixed")
232
  gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
233
 
234
- # ذخیره نهایی با soundfile
235
- my_save_audio(gen_audio, output_path=output_path)
236
  return output_path
237
 
238
  finally:
239
  if os.path.exists(temp_content_path): os.remove(temp_content_path)
240
  if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
241
 
242
- with gr.Blocks(title="Vevo-Timbre (Secure)") as demo:
243
  gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
244
 
245
  with gr.Row():
 
86
 
87
  from models.vc.vevo.vevo_utils import VevoInferencePipeline
88
 
89
# --- Precise save helper (16-bit PCM) ---
# Per the original author, this function is the key to fixing the audio noise
# problem: it stores the file as a standard 16-bit PCM WAV.
def save_audio_pcm16(waveform, output_path, sample_rate=24000):
    """Write ``waveform`` to ``output_path`` with the ``PCM_16`` subtype.

    Args:
        waveform: Audio samples. A ``torch.Tensor`` is detached, moved to
            the CPU, squeezed to 1-D when its shape is ``(1, N)``, and
            converted to a NumPy array. Any other object is handed to
            ``sf.write`` unchanged.
        output_path: Destination passed straight to ``sf.write``.
        sample_rate: Sample rate passed to ``sf.write``; defaults to 24000.

    Raises:
        Exception: Whatever the conversion or ``sf.write`` raised; the
            error is printed first and then re-raised unchanged.
    """
    try:
        if isinstance(waveform, torch.Tensor):
            waveform = waveform.detach().cpu()
            # Drop the leading singleton dimension of a (1, N) tensor.
            # NOTE(review): 2-D tensors with shape[0] > 1 are passed through
            # as-is -- confirm callers never send multi-channel audio.
            if waveform.dim() == 2 and waveform.shape[0] == 1:
                waveform = waveform.squeeze(0)
            waveform = waveform.numpy()

        # Store as 16-bit PCM; the original author added this to prevent
        # noise in the written file.
        sf.write(output_path, waveform, sample_rate, subtype='PCM_16')

    except Exception as e:
        print(f"Save error: {e}")
        # Bare ``raise`` re-raises the active exception with its original
        # traceback instead of re-binding it through ``raise e``.
        raise
 
173
 
174
  @spaces.GPU()
175
  def vevo_timbre(content_wav, reference_wav):
 
176
  session_id = str(uuid.uuid4())[:8]
177
  temp_content_path = f"wav/c_{session_id}.wav"
178
  temp_reference_path = f"wav/r_{session_id}.wav"
 
193
 
194
  content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
195
 
 
196
  if content_sr != 24000:
197
  content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
198
  content_sr = 24000
 
215
 
216
  ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
217
 
218
+ # *** ذخیره با فرمت PCM_16 (کلید حل مشکل نویز) ***
219
+ save_audio_pcm16(content_tensor, temp_content_path, content_sr)
220
+ save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
221
 
222
  print(f"[{session_id}] Processing...")
223
 
224
  pipeline = get_pipeline()
225
 
226
+ # اجرای مدل
227
  gen_audio = pipeline.inference_fm(
228
  src_wav_path=temp_content_path,
229
  timbre_ref_wav_path=temp_reference_path,
 
231
  )
232
 
233
  if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
 
234
  gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
235
 
236
+ # ذخیره خروجی نهایی
237
+ save_audio_pcm16(gen_audio, output_path, 24000)
238
  return output_path
239
 
240
  finally:
241
  if os.path.exists(temp_content_path): os.remove(temp_content_path)
242
  if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
243
 
244
+ with gr.Blocks(title="Vevo-Timbre (High Quality)") as demo:
245
  gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
246
 
247
  with gr.Row():