Spaces:

Opera8
/

Sada

Sleeping

App Files Community

Opera8 committed on Nov 21

Commit

885b401

verified ·

1 Parent(s): 97b11e9

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -12

app.py CHANGED Viewed

@@ -86,9 +86,8 @@ os.makedirs("ckpts/Vevo", exist_ok=True)
 from models.vc.vevo.vevo_utils import VevoInferencePipeline
-# --- تابع ذخیره سازی دقیق (16-bit PCM) ---
-# این تابع کلید حل مشکل نویز صداست. فایل را دقیقاً مثل WAV استاندارد ذخیره می‌کند.
-def save_audio_pcm16(waveform, output_path, sample_rate=24000):
     try:
         if isinstance(waveform, torch.Tensor):
             waveform = waveform.detach().cpu()
@@ -96,7 +95,22 @@ def save_audio_pcm16(waveform, output_path, sample_rate=24000):
                 waveform = waveform.squeeze(0)
             waveform = waveform.numpy()
-        # تبدیل به فرمت 16 بیتی برای جلوگیری از نویز
         sf.write(output_path, waveform, sample_rate, subtype='PCM_16')
     except Exception as e:
@@ -192,12 +206,14 @@ def vevo_timbre(content_wav, reference_wav):
             content_data = np.mean(content_data, axis=1)
         content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
         # --- پردازش صدای رفرنس ---
         if isinstance(reference_wav, tuple):
@@ -215,15 +231,13 @@ def vevo_timbre(content_wav, reference_wav):
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
-        # *** ذخیره با فرمت PCM_16 (کلید حل مشکل نویز) ***
-        save_audio_pcm16(content_tensor, temp_content_path, content_sr)
-        save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
         print(f"[{session_id}] Processing...")
         pipeline = get_pipeline()
-        # اجرای مدل
         gen_audio = pipeline.inference_fm(
             src_wav_path=temp_content_path,
             timbre_ref_wav_path=temp_reference_path,
@@ -233,15 +247,17 @@ def vevo_timbre(content_wav, reference_wav):
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
-        # ذخیره خروجی نهایی
-        save_audio_pcm16(gen_audio, output_path, 24000)
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
-with gr.Blocks(title="Vevo-Timbre (High Quality)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
     with gr.Row():

 from models.vc.vevo.vevo_utils import VevoInferencePipeline
+# --- تابع ذخیره سازی پیشرفته (حذف نویز + فرمت استاندارد) ---
+def save_audio_final(waveform, output_path, sample_rate=24000, target_length=None):
     try:
         if isinstance(waveform, torch.Tensor):
             waveform = waveform.detach().cpu()
                 waveform = waveform.squeeze(0)
             waveform = waveform.numpy()
+        # 1. همگام‌سازی طول (حذف نویز اضافه آخر فایل)
+        if target_length is not None:
+            if len(waveform) > target_length:
+                waveform = waveform[:target_length]
+            elif len(waveform) < target_length:
+                # اگر کوتاه‌تر بود، با سکوت پر کن (معمولاً پیش نمیاد)
+                padding = np.zeros(target_length - len(waveform))
+                waveform = np.concatenate([waveform, padding])
+        # 2. اعمال Fade Out (جلوگیری از صدای کلیک در لحظه قطع شدن)
+        fade_len = int(sample_rate * 0.05)  # 50 میلی ثانیه
+        if len(waveform) > fade_len:
+            fade_curve = np.linspace(1, 0, fade_len)
+            waveform[-fade_len:] *= fade_curve
+        # 3. ذخیره با فرمت 16 بیتی
         sf.write(output_path, waveform, sample_rate, subtype='PCM_16')
     except Exception as e:
             content_data = np.mean(content_data, axis=1)
         content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
+        # ذخیره طول دقیق فایل ورودی برای برش نهایی
+        target_length_samples = content_tensor.shape[-1]
         # --- پردازش صدای رفرنس ---
         if isinstance(reference_wav, tuple):
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
+        save_audio_final(content_tensor, temp_content_path, content_sr)
+        save_audio_final(ref_tensor, temp_reference_path, ref_sr)
         print(f"[{session_id}] Processing...")
         pipeline = get_pipeline()
         gen_audio = pipeline.inference_fm(
             src_wav_path=temp_content_path,
             timbre_ref_wav_path=temp_reference_path,
         if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
+        # اینجا فایل را دقیقاً به اندازه ورودی برش می‌زنیم
+        # این کار باعث می‌شود نویز اضافه‌ای که مدل در پایان تولید کرده حذف شود
+        save_audio_final(gen_audio, output_path, 24000, target_length=target_length_samples)
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
+with gr.Blocks(title="Vevo-Timbre (Clean)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
     with gr.Row():