Spaces:

Opera8
/

Sada

Running on Zero

App Files Files Community

Opera8 committed 21 days ago

Commit

a6cd2a1

verified ·

1 Parent(s): f2ebb51

Update app.py

Browse files

Files changed (1) hide show

app.py +35 -40

app.py CHANGED Viewed

@@ -174,7 +174,24 @@ def vevo_timbre(content_wav, reference_wav):
         raise ValueError("Please upload audio files")
     try:
-        # --- آماده سازی Reference (اول رفرنس را پردازش می‌کنیم تا سطح صدا را بگیریم) ---
         if isinstance(reference_wav, tuple):
             ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
         else:
@@ -188,43 +205,27 @@ def vevo_timbre(content_wav, reference_wav):
             ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
             ref_sr = 24000
-        # محاسبه انرژی رفرنس
-        ref_max_vol = torch.max(torch.abs(ref_tensor)) + 1e-6
-        ref_tensor = ref_tensor / ref_max_vol * 0.95 # نرمال سازی رفرنس
-        # برش رفرنس به 20 ثانیه
         if ref_tensor.shape[1] > 24000 * 20:
              ref_tensor = ref_tensor[:, :24000 * 20]
         save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
-        # --- آماده سازی Content ---
-        if isinstance(content_wav, tuple):
-            content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
-        else:
-            content_sr, content_data = content_wav
-        if len(content_data.shape) > 1 and content_data.shape[1] > 1:
-            content_data = np.mean(content_data, axis=1)
-        content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
-        if content_sr != 24000:
-            content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
-            content_sr = 24000
-        # نرمال سازی هوشمند: صدای ورودی را هم‌سطح صدای رفرنس می‌کنیم
-        content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
-        # --- منطق Chunking ---
         pipeline = get_pipeline()
         SR = 24000
-        CHUNK_LEN = 10 * SR
-        OVERLAP = 1 * SR
         INPUT_SIZE = CHUNK_LEN + OVERLAP
         total_samples = content_tensor.shape[1]
-        print(f"[{session_id}] High Quality Processing (64 Steps)... Duration: {total_samples/SR:.2f}s")
         final_parts = []
         overlap_buffer = None
@@ -239,7 +240,7 @@ def vevo_timbre(content_wav, reference_wav):
                 gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
-                    flow_matching_steps=64,  # <--- کیفیت بالا (قبلاً 32 بود)
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
@@ -257,6 +258,7 @@ def vevo_timbre(content_wav, reference_wav):
                     head_to_mix = gen[:mix_len]
                     body_rest = gen[mix_len:]
                     alpha = np.linspace(0, 1, mix_len)
                     blended_segment = (overlap_buffer * (1 - alpha)) + (head_to_mix * alpha)
@@ -302,22 +304,15 @@ def vevo_timbre(content_wav, reference_wav):
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
-with gr.Blocks(title="Vevo-Timbre (Ultra Quality)") as demo:
-    gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion (Ultra Quality)")
-    gr.Markdown("""
-    **ویژگی‌ها:**
-    - **Steps 64:** کیفیت و دقت بافت صدا دو برابر شده است.
-    - **Auto-Leveling:** سطح صدای شما با مدل تنظیم می‌شود.
-    - **Seamless Stitching:** بدون پرش و بدون اضافه شدن زمان.
-    **نکته مهم:** برای بهترین نتیجه، سعی کنید **لحن، سرعت و احساس** صدای خودتان را شبیه فایل هدف کنید. مدل فقط جنس صدا را تغییر می‌دهد، نه بازیگری شما را!
-    """)
     with gr.Row():
         with gr.Column():
-            timbre_content = gr.Audio(label="Source Audio (صدای شما)", type="numpy")
-            timbre_reference = gr.Audio(label="Target Timbre (صدای هدف)", type="numpy")
-            timbre_button = gr.Button("Generate (Ultra Quality)", variant="primary")
         with gr.Column():
             timbre_output = gr.Audio(label="Result")

         raise ValueError("Please upload audio files")
     try:
+        # --- آماده سازی Content ---
+        if isinstance(content_wav, tuple):
+            content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
+        else:
+            content_sr, content_data = content_wav
+        if len(content_data.shape) > 1 and content_data.shape[1] > 1:
+            content_data = np.mean(content_data, axis=1)
+        content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
+        if content_sr != 24000:
+            content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
+            content_sr = 24000
+        # نرمال سازی
+        content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
+        # --- آماده سازی Reference ---
         if isinstance(reference_wav, tuple):
             ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
         else:
             ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
             ref_sr = 24000
+        # تنظیم لول رفرنس
+        ref_max = torch.max(torch.abs(ref_tensor)) + 1e-6
+        ref_tensor = ref_tensor / ref_max * 0.95
         if ref_tensor.shape[1] > 24000 * 20:
              ref_tensor = ref_tensor[:, :24000 * 20]
         save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
+        # --- منطق Chunking (اصلاح شده: همپوشانی کوتاه) ---
         pipeline = get_pipeline()
         SR = 24000
+        CHUNK_LEN = 10 * SR        # 10 ثانیه اصلی
+        # تغییر مهم: کاهش همپوشانی به 0.1 ثانیه (100 میلی ثانیه)
+        # این باعث می‌شود اکو از بین برود ولی اتصال همچنان نرم باشد
+        OVERLAP = int(0.1 * SR)
         INPUT_SIZE = CHUNK_LEN + OVERLAP
         total_samples = content_tensor.shape[1]
+        print(f"[{session_id}] Processing (High Quality 64 Steps)...")
         final_parts = []
         overlap_buffer = None
                 gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
+                    flow_matching_steps=64, # کیفیت بالا
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
                     head_to_mix = gen[:mix_len]
                     body_rest = gen[mix_len:]
+                    # میکس سریع (Fast Cross-Fade)
                     alpha = np.linspace(0, 1, mix_len)
                     blended_segment = (overlap_buffer * (1 - alpha)) + (head_to_mix * alpha)
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
+with gr.Blocks(title="Vevo-Timbre (No Echo)") as demo:
+    gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
+    gr.Markdown("نسخه اصلاح شده: حذف اکو در نقاط اتصال + کیفیت بالای ۶۴ مرحله‌ای.")
     with gr.Row():
         with gr.Column():
+            timbre_content = gr.Audio(label="Source Audio", type="numpy")
+            timbre_reference = gr.Audio(label="Target Timbre", type="numpy")
+            timbre_button = gr.Button("Generate", variant="primary")
         with gr.Column():
             timbre_output = gr.Audio(label="Result")