Spaces:

Opera8
/

Sada

Running on Zero

App Files Community

Opera8 committed 21 days ago

Commit

3b10964

verified ·

1 Parent(s): 454cbe7

Update app.py

Browse files

Files changed (1) hide show

app.py +94 -59

app.py CHANGED Viewed

@@ -174,7 +174,7 @@ def vevo_timbre(content_wav, reference_wav):
         raise ValueError("Please upload audio files")
     try:
-        # --- آماده سازی Content ---
         if isinstance(content_wav, tuple):
             content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
         else:
@@ -189,7 +189,7 @@ def vevo_timbre(content_wav, reference_wav):
             content_sr = 24000
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
-        # --- آماده سازی Reference ---
         if isinstance(reference_wav, tuple):
             ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
         else:
@@ -209,94 +209,129 @@ def vevo_timbre(content_wav, reference_wav):
         save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
-        # --- تنظیمات میکسینگ عمیق (Deep Cross-Fade) ---
         pipeline = get_pipeline()
-        SR = 24000
-        # گام حرکت: ۱۰ ثانیه
-        STEP_SIZE = 10 * SR
-        # طول پردازش: ۱۲ ثانیه (۲ ثانیه همپوشانی)
-        # این ۲ ثانیه اضافه باعث می‌شود هیچوقت لبه تیز نداشته باشیم
-        PROCESS_LEN = 12 * SR
-        OVERLAP = PROCESS_LEN - STEP_SIZE # 2 seconds
         total_samples = content_tensor.shape[1]
-        print(f"[{session_id}] Processing Deep Mix Mode...")
-        # آرایه نهایی را کمی بزرگتر می‌گیریم تا جا کم نیاید
-        final_audio = np.zeros(total_samples + OVERLAP)
-        # آرایه‌ای برای شمارش وزن‌ها (برای نرمال کردن میکس)
-        weight_accumulator = np.zeros(total_samples + OVERLAP)
-        # ایجاد پنجره محو شونده (Trapezoid Window)
-        # 1 ثانیه Fade In -- 10 ثانیه ثابت -- 1 ثانیه Fade Out
-        fade_samples = SR # 1 second fade
-        window = np.ones(PROCESS_LEN)
-        window[:fade_samples] = np.linspace(0, 1, fade_samples)
-        window[-fade_samples:] = np.linspace(1, 0, fade_samples)
-        for start in range(0, total_samples, STEP_SIZE):
-            # انتخاب بازه ورودی (کمی بزرگتر از استپ)
-            end = min(start + PROCESS_LEN, total_samples)
-            current_len = end - start
-            if current_len <= 0: break
-            current_input_chunk = content_tensor[:, start:end]
-            save_audio_pcm16(current_input_chunk, temp_content_path, SR)
             try:
                 gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
-                    flow_matching_steps=64,
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
-                gen = gen.cpu().squeeze().numpy()
-                # اگر طول خروجی کمتر از انتظار بود، با سکوت پر کن
-                if len(gen) < current_len:
-                    gen = np.pad(gen, (0, current_len - len(gen)))
-                elif len(gen) > current_len:
-                    gen = gen[:current_len]
-                # اعمال پنجره روی خروجی (اگر تکه آخر است، پنجره را برش بزن)
-                current_window = window[:current_len]
-                # برای تکه اول، Fade In نیاز نیست (چون شروع فایل است)
-                if start == 0:
-                    current_window[:fade_samples] = 1.0
-                # برای تکه آخر، Fade Out نیاز نیست (چون پایان فایل است)
-                if end == total_samples:
-                    current_window[-fade_samples:] = 1.0
-                weighted_gen = gen * current_window
-                # اضافه کردن به آرایه اصلی (Overlap-Add)
-                final_audio[start:end] += weighted_gen
-                weight_accumulator[start:end] += current_window
             except Exception as e:
-                print(f"Error: {e}")
-        # نرمال‌سازی نهایی (تقسیم بر وزن‌ها برای یکنواخت شدن صدا در نقاط اتصال)
-        # جایی که وزن صفر است را ۱ می‌کنیم تا تقسیم بر صفر نشود
-        weight_accumulator[weight_accumulator == 0] = 1.0
-        final_audio = final_audio / weight_accumulator
-        # برش بخش‌های اضافه انتهای فایل
-        final_audio = final_audio[:total_samples]
-        save_audio_pcm16(final_audio, output_path, SR)
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
-with gr.Blocks(title="Vevo-Timbre (Seamless)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
-    gr.Markdown("نسخه Seamless: استفاده از تکنیک Overlap-Add برای حذف کامل پرش و لرزش صدا.")
     with gr.Row():
         with gr.Column():

         raise ValueError("Please upload audio files")
     try:
+        # --- پردازش ورودی ---
         if isinstance(content_wav, tuple):
             content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
         else:
             content_sr = 24000
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
+        # --- پردازش رفرنس ---
         if isinstance(reference_wav, tuple):
             ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
         else:
         save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
+        # --- منطق حرفه‌ای Warm-up Context Stitching ---
         pipeline = get_pipeline()
+        SR = 24000
+        STEP_SIZE = 10 * SR        # هر 10 ثانیه جلو می‌رویم
+        WARMUP_SIZE = 3 * SR       # 3 ثانیه کانتکست (نگاه به عقب) برای گرم شدن
+        CROSSFADE_SIZE = 1 * SR    # 1 ثانیه میکس برای نرم کردن اتصال
         total_samples = content_tensor.shape[1]
+        print(f"[{session_id}] Duration: {total_samples/SR:.2f}s. Studio Mode (Warm-up + Crossfade)...")
+        final_audio = []
+        previous_tail = None # نگهداری ۱ ثانیه آخر تکه قبلی برای میکس
+        # حلقه روی تکه‌ها
+        current_pos = 0
+        while current_pos < total_samples:
+            # محاسبه دقیق بازه ورودی
+            # اگر اولین تکه نیستیم، 3 ثانیه عقب‌تر شروع می‌کنیم (Warm-up)
+            if current_pos == 0:
+                start_input = 0
+                warmup_cut = 0
+            else:
+                start_input = max(0, current_pos - WARMUP_SIZE)
+                warmup_cut = current_pos - start_input # مقداری که باید از اول خروجی دور بریزیم
+            # پایان این تکه (10 ثانیه جلوتر + 1 ثانیه اضافه برای میکس بعدی)
+            end_input = min(current_pos + STEP_SIZE + CROSSFADE_SIZE, total_samples)
+            # اگر دیتایی نمانده، تمام
+            if start_input >= end_input:
+                break
+            # استخراج تکه ورودی
+            chunk_tensor = content_tensor[:, start_input:end_input]
+            save_audio_pcm16(chunk_tensor, temp_content_path, SR)
+            print(f"[{session_id}] Processing chunk starting at {current_pos/SR:.1f}s (with context)")
             try:
                 gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
+                    flow_matching_steps=64, # کیفیت 64 پله‌ای
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
+                if gen.dim() == 1: gen = gen.unsqueeze(0)
+                gen = gen.cpu().squeeze(0).numpy()
+                # 1. حذف قسمت Warm-up (که قبلاً ساخته شده بود)
+                if warmup_cut > 0:
+                    # اما صبر کن! ما باید CROSSFADE_SIZE تا قبل از نقطه برش را نگه داریم برای میکس
+                    # پس برش را کمی عقب‌تر می‌زنیم تا همپوشانی داشته باشیم
+                    valid_start = warmup_cut - CROSSFADE_SIZE
+                    if valid_start < 0: valid_start = 0 # نباید پیش بیاد
+                    gen = gen[valid_start:]
+                # الان `gen` شامل: [همپوشانی با قبلی] + [تکه جدید] + [همپوشانی با بعدی] است.
+                # 2. میکس با تکه قبلی (اگر وجود دارد)
+                if previous_tail is not None:
+                    # جدا کردن قسمت همپوشانی از این تکه
+                    overlap_part = gen[:CROSSFADE_SIZE]
+                    new_part = gen[CROSSFADE_SIZE:]
+                    # اگر سایزها یکی بود میکس کن
+                    if len(overlap_part) == len(previous_tail):
+                        alpha = np.linspace(0, 1, len(overlap_part))
+                        blended = (previous_tail * (1 - alpha)) + (overlap_part * alpha)
+                        final_audio.append(blended)
+                    else:
+                        # فال‌بک (نباید پیش بیاد)
+                        final_audio.append(previous_tail)
+                    # حالا قسمت جدید را پردازش می‌کنیم
+                    # باید قسمت انتهایی را برای دور بعد ذخیره کنیم
+                    if len(new_part) > CROSSFADE_SIZE and end_input < total_samples:
+                        # ذخیره دم برای دور بعد
+                        previous_tail = new_part[-CROSSFADE_SIZE:]
+                        # اضافه کردن بدنه اصلی
+                        final_audio.append(new_part[:-CROSSFADE_SIZE])
+                    else:
+                        # تکه آخر است، کلش را اضافه کن
+                        final_audio.append(new_part)
+                        previous_tail = None
+                else:
+                    # تکه اول است
+                    if len(gen) > CROSSFADE_SIZE and end_input < total_samples:
+                        previous_tail = gen[-CROSSFADE_SIZE:]
+                        final_audio.append(gen[:-CROSSFADE_SIZE])
+                    else:
+                        final_audio.append(gen)
+                        previous_tail = None
+                current_pos += STEP_SIZE
             except Exception as e:
+                print(f"Error in chunk: {e}")
+                # در صورت خطا، پرش کن (بهتر از قطع شدن است)
+                current_pos += STEP_SIZE
+                # اضافه کردن سکوت
+                final_audio.append(np.zeros(STEP_SIZE))
+                previous_tail = None
+        # چسباندن نهایی
+        if len(final_audio) > 0:
+            full_audio = np.concatenate(final_audio)
+        else:
+            full_audio = np.zeros(24000)
+        save_audio_pcm16(full_audio, output_path, SR)
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
+with gr.Blocks(title="Vevo-Timbre (Studio)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
+    gr.Markdown("نسخه استودیویی: بدون پرش، بدون تداخل زمانی.")
     with gr.Row():
         with gr.Column():