Spaces:

Opera8
/

Sada

Running on Zero

App Files Files Community

Opera8 committed 21 days ago

Commit

97b11e9

verified ·

1 Parent(s): 06310a1

Update app.py

Browse files

Files changed (1) hide show

app.py +14 -12

app.py CHANGED Viewed

@@ -86,15 +86,19 @@ os.makedirs("ckpts/Vevo", exist_ok=True)
 from models.vc.vevo.vevo_utils import VevoInferencePipeline
-# تابع ذخیره سازی امن (جایگزین torchaudio)
-def my_save_audio(waveform, output_path, sample_rate=24000):
     try:
         if isinstance(waveform, torch.Tensor):
             waveform = waveform.detach().cpu()
             if waveform.dim() == 2 and waveform.shape[0] == 1:
                 waveform = waveform.squeeze(0)
             waveform = waveform.numpy()
-        sf.write(output_path, waveform, sample_rate)
     except Exception as e:
         print(f"Save error: {e}")
         raise e
@@ -169,7 +173,6 @@ def get_pipeline():
 @spaces.GPU()
 def vevo_timbre(content_wav, reference_wav):
-    # تولید نام فایل امن
     session_id = str(uuid.uuid4())[:8]
     temp_content_path = f"wav/c_{session_id}.wav"
     temp_reference_path = f"wav/r_{session_id}.wav"
@@ -190,7 +193,6 @@ def vevo_timbre(content_wav, reference_wav):
         content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
-        # ریسمپل با torchaudio (اینجا ارور نمیده چون ذخیره نمیکنیم، فقط پردازش میکنیم)
         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
@@ -213,14 +215,15 @@ def vevo_timbre(content_wav, reference_wav):
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
-        # ذخیره موقت با soundfile (برای جلوگیری از ارور TorchCodec)
-        sf.write(temp_content_path, content_tensor.squeeze().cpu().numpy(), content_sr)
-        sf.write(temp_reference_path, ref_tensor.squeeze().cpu().numpy(), ref_sr)
         print(f"[{session_id}] Processing...")
         pipeline = get_pipeline()
         gen_audio = pipeline.inference_fm(
             src_wav_path=temp_content_path,
             timbre_ref_wav_path=temp_reference_path,
@@ -228,18 +231,17 @@ def vevo_timbre(content_wav, reference_wav):
         )
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
-            print("Warning: NaN fixed")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
-        # ذخیره نهایی با soundfile
-        my_save_audio(gen_audio, output_path=output_path)
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
-with gr.Blocks(title="Vevo-Timbre (Secure)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
     with gr.Row():

 from models.vc.vevo.vevo_utils import VevoInferencePipeline
+# --- تابع ذخیره سازی دقیق (16-bit PCM) ---
+# این تابع کلید حل مشکل نویز صداست. فایل را دقیقاً مثل WAV استاندارد ذخیره می‌کند.
+def save_audio_pcm16(waveform, output_path, sample_rate=24000):
     try:
         if isinstance(waveform, torch.Tensor):
             waveform = waveform.detach().cpu()
             if waveform.dim() == 2 and waveform.shape[0] == 1:
                 waveform = waveform.squeeze(0)
             waveform = waveform.numpy()
+        # تبدیل به فرمت 16 بیتی برای جلوگیری از نویز
+        sf.write(output_path, waveform, sample_rate, subtype='PCM_16')
     except Exception as e:
         print(f"Save error: {e}")
         raise e
 @spaces.GPU()
 def vevo_timbre(content_wav, reference_wav):
     session_id = str(uuid.uuid4())[:8]
     temp_content_path = f"wav/c_{session_id}.wav"
     temp_reference_path = f"wav/r_{session_id}.wav"
         content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
+        # *** ذخیره با فرمت PCM_16 (کلید حل مشکل نویز) ***
+        save_audio_pcm16(content_tensor, temp_content_path, content_sr)
+        save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
         print(f"[{session_id}] Processing...")
         pipeline = get_pipeline()
+        # اجرای مدل
         gen_audio = pipeline.inference_fm(
             src_wav_path=temp_content_path,
             timbre_ref_wav_path=temp_reference_path,
         )
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
+        # ذخیره خروجی نهایی
+        save_audio_pcm16(gen_audio, output_path, 24000)
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
+with gr.Blocks(title="Vevo-Timbre (High Quality)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
     with gr.Row():