Spaces:

Opera8
/

Sada

Running on Zero

App Files Files Community

Opera8 committed 22 days ago

Commit

454cbe7

verified ·

1 Parent(s): de91e11

Update app.py

Browse files

Files changed (1) hide show

app.py +60 -66

app.py CHANGED Viewed

@@ -174,7 +174,7 @@ def vevo_timbre(content_wav, reference_wav):
         raise ValueError("Please upload audio files")
     try:
-        # --- پردازش ورودی ---
         if isinstance(content_wav, tuple):
             content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
         else:
@@ -187,11 +187,9 @@ def vevo_timbre(content_wav, reference_wav):
         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
-        # نرمال‌سازی
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
-        # --- پردازش رفرنس ---
         if isinstance(reference_wav, tuple):
             ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
         else:
@@ -204,105 +202,101 @@ def vevo_timbre(content_wav, reference_wav):
         if ref_sr != 24000:
             ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
             ref_sr = 24000
-        ref_max = torch.max(torch.abs(ref_tensor)) + 1e-6
-        ref_tensor = ref_tensor / ref_max * 0.95
         if ref_tensor.shape[1] > 24000 * 20:
              ref_tensor = ref_tensor[:, :24000 * 20]
         save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
-        # --- منطق Look-back Splicing (حذف قطعی اکو) ---
         pipeline = get_pipeline()
         SR = 24000
-        MAIN_CHUNK_SEC = 10.0
-        CONTEXT_SEC = 1.0  # مقدار نگاه به عقب
-        MAIN_CHUNK = int(MAIN_CHUNK_SEC * SR)
-        CONTEXT = int(CONTEXT_SEC * SR)
         total_samples = content_tensor.shape[1]
-        print(f"[{session_id}] Processing (High Quality 64 Steps)... Zero Echo Mode.")
-        final_output = []
-        # اشاره‌گر جاری روی فایل اصلی
-        cursor = 0
-        while cursor < total_samples:
-            if cursor == 0:
-                # تکه اول: بدون کانتکست
-                input_start = 0
-                input_end = min(MAIN_CHUNK, total_samples)
-                # در تکه اول چیزی را دور نمی‌ریزیم
-                crop_from = 0
-            else:
-                # تکه‌های بعدی: با کانتکست (نگاه به عقب)
-                input_start = cursor - CONTEXT
-                input_end = min(cursor + MAIN_CHUNK, total_samples)
-                # در خروجی، قسمت کانتکست را دور می‌ریزیم (Cut)
-                crop_from = CONTEXT
-            # اگر به انتهای فایل رسیدیم و طول باقی‌مانده خیلی کم است
-            if input_start >= input_end:
-                break
-            current_input_chunk = content_tensor[:, input_start:input_end]
-            save_audio_pcm16(current_input_chunk, temp_content_path, SR)
-            print(f"[{session_id}] Processing chunk: {cursor/SR:.1f}s -> {(input_end-input_start)/SR:.1f}s len")
             try:
                 gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
-                    flow_matching_steps=64, # کیفیت بالا
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
-                if gen.dim() == 1: gen = gen.unsqueeze(0)
-                gen = gen.cpu().squeeze(0).numpy()
-                # *** نکته کلیدی: برش قسمت تکراری ***
-                # فقط قسمت "جدید" را نگه می‌داریم
-                if crop_from > 0:
-                    if len(gen) > crop_from:
-                        valid_audio = gen[crop_from:]
-                    else:
-                        valid_audio = np.array([]) # اگر خروجی خیلی کوتاه بود
-                else:
-                    valid_audio = gen
-                final_output.append(valid_audio)
-                # حرکت مکان‌نما به اندازه دیتای مفیدی که تولید کردیم
-                cursor = input_end
             except Exception as e:
-                print(f"Error in chunk: {e}")
-                # اگر ارور داد، سکوت جایگزین کن که تایمینگ به هم نریزد
-                needed_len = input_end - (cursor if cursor > 0 else 0)
-                final_output.append(np.zeros(needed_len))
-                cursor = input_end
-        # چسباندن تکه‌ها (Concatenate)
-        if len(final_output) > 0:
-            full_audio = np.concatenate(final_output)
-        else:
-            full_audio = np.zeros(24000)
-        save_audio_pcm16(full_audio, output_path, SR)
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
-with gr.Blocks(title="Vevo-Timbre (Zero Echo)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
-    gr.Markdown("نسخه نهایی: استفاده از روش Look-back Splicing برای حذف کامل اکو و حفظ پیوستگی لحن.")
     with gr.Row():
         with gr.Column():

         raise ValueError("Please upload audio files")
     try:
+        # --- آماده سازی Content ---
         if isinstance(content_wav, tuple):
             content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
         else:
         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
+        # --- آماده سازی Reference ---
         if isinstance(reference_wav, tuple):
             ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
         else:
         if ref_sr != 24000:
             ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
             ref_sr = 24000
+        ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
         if ref_tensor.shape[1] > 24000 * 20:
              ref_tensor = ref_tensor[:, :24000 * 20]
         save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
+        # --- تنظیمات میکسینگ عمیق (Deep Cross-Fade) ---
         pipeline = get_pipeline()
         SR = 24000
+        # گام حرکت: ۱۰ ثانیه
+        STEP_SIZE = 10 * SR
+        # طول پردازش: ۱۲ ثانیه (۲ ثانیه همپوشانی)
+        # این ۲ ثانیه اضافه باعث می‌شود هیچوقت لبه تیز نداشته باشیم
+        PROCESS_LEN = 12 * SR
+        OVERLAP = PROCESS_LEN - STEP_SIZE # 2 seconds
         total_samples = content_tensor.shape[1]
+        print(f"[{session_id}] Processing Deep Mix Mode...")
+        # آرایه نهایی را کمی بزرگتر می‌گیریم تا جا کم نیاید
+        final_audio = np.zeros(total_samples + OVERLAP)
+        # آرایه‌ای برای شمارش وزن‌ها (برای نرمال کردن میکس)
+        weight_accumulator = np.zeros(total_samples + OVERLAP)
+        # ایجاد پنجره محو شونده (Trapezoid Window)
+        # 1 ثانیه Fade In -- 10 ثانیه ثابت -- 1 ثانیه Fade Out
+        fade_samples = SR # 1 second fade
+        window = np.ones(PROCESS_LEN)
+        window[:fade_samples] = np.linspace(0, 1, fade_samples)
+        window[-fade_samples:] = np.linspace(1, 0, fade_samples)
+        for start in range(0, total_samples, STEP_SIZE):
+            # انتخاب بازه ورودی (کمی بزرگتر از استپ)
+            end = min(start + PROCESS_LEN, total_samples)
+            current_len = end - start
+            if current_len <= 0: break
+            current_input_chunk = content_tensor[:, start:end]
+            save_audio_pcm16(current_input_chunk, temp_content_path, SR)
             try:
                 gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
+                    flow_matching_steps=64,
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
+                gen = gen.cpu().squeeze().numpy()
+                # اگر طول خروجی کمتر از انتظار بود، با سکوت پر کن
+                if len(gen) < current_len:
+                    gen = np.pad(gen, (0, current_len - len(gen)))
+                elif len(gen) > current_len:
+                    gen = gen[:current_len]
+                # اعمال پنجره روی خروجی (اگر تکه آخر است، پنجره را برش بزن)
+                current_window = window[:current_len]
+                # برای تکه اول، Fade In نیاز نیست (چون شروع فایل است)
+                if start == 0:
+                    current_window[:fade_samples] = 1.0
+                # برای تکه آخر، Fade Out نیاز نیست (چون پایان فایل است)
+                if end == total_samples:
+                    current_window[-fade_samples:] = 1.0
+                weighted_gen = gen * current_window
+                # اضافه کردن به آرایه اصلی (Overlap-Add)
+                final_audio[start:end] += weighted_gen
+                weight_accumulator[start:end] += current_window
             except Exception as e:
+                print(f"Error: {e}")
+        # نرمال‌سازی نهایی (تقسیم بر وزن‌ها برای یکنواخت شدن صدا در نقاط اتصال)
+        # جایی که وزن صفر است را ۱ می‌کنیم تا تقسیم بر صفر نشود
+        weight_accumulator[weight_accumulator == 0] = 1.0
+        final_audio = final_audio / weight_accumulator
+        # برش بخش‌های اضافه انتهای فایل
+        final_audio = final_audio[:total_samples]
+        save_audio_pcm16(final_audio, output_path, SR)
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
+with gr.Blocks(title="Vevo-Timbre (Seamless)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
+    gr.Markdown("نسخه Seamless: استفاده از تکنیک Overlap-Add برای حذف کامل پرش و لرزش صدا.")
     with gr.Row():
         with gr.Column():