Spaces:

Opera8
/

Sada

Running on Zero

App Files Community

Opera8 committed 21 days ago

Commit

3a081bb

verified ·

1 Parent(s): 3b10964

Update app.py

Browse files

Files changed (1) hide show

app.py +61 -83

app.py CHANGED Viewed

@@ -187,6 +187,7 @@ def vevo_timbre(content_wav, reference_wav):
         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
         # --- پردازش رفرنس ---
@@ -202,123 +203,100 @@ def vevo_timbre(content_wav, reference_wav):
         if ref_sr != 24000:
             ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
             ref_sr = 24000
-        ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
         if ref_tensor.shape[1] > 24000 * 20:
              ref_tensor = ref_tensor[:, :24000 * 20]
         save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
-        # --- منطق حرفه‌ای Warm-up Context Stitching ---
         pipeline = get_pipeline()
         SR = 24000
-        STEP_SIZE = 10 * SR        # هر 10 ثانیه جلو می‌رویم
-        WARMUP_SIZE = 3 * SR       # 3 ثانیه کانتکست (نگاه به عقب) برای گرم شدن
-        CROSSFADE_SIZE = 1 * SR    # 1 ثانیه میکس برای نرم کردن اتصال
         total_samples = content_tensor.shape[1]
-        print(f"[{session_id}] Duration: {total_samples/SR:.2f}s. Studio Mode (Warm-up + Crossfade)...")
-        final_audio = []
-        previous_tail = None # نگهداری ۱ ثانیه آخر تکه قبلی برای میکس
-        # حلقه روی تکه‌ها
-        current_pos = 0
-        while current_pos < total_samples:
-            # محاسبه دقیق بازه ورودی
-            # اگر اولین تکه نیستیم، 3 ثانیه عقب‌تر شروع می‌کنیم (Warm-up)
-            if current_pos == 0:
-                start_input = 0
-                warmup_cut = 0
-            else:
-                start_input = max(0, current_pos - WARMUP_SIZE)
-                warmup_cut = current_pos - start_input # مقداری که باید از اول خروجی دور بریزیم
-            # پایان این تکه (10 ثانیه جلوتر + 1 ثانیه اضافه برای میکس بعدی)
-            end_input = min(current_pos + STEP_SIZE + CROSSFADE_SIZE, total_samples)
-            # اگر دیتایی نمانده، تمام
             if start_input >= end_input:
                 break
-            # استخراج تکه ورودی
             chunk_tensor = content_tensor[:, start_input:end_input]
             save_audio_pcm16(chunk_tensor, temp_content_path, SR)
-            print(f"[{session_id}] Processing chunk starting at {current_pos/SR:.1f}s (with context)")
             try:
                 gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
-                    flow_matching_steps=64, # کیفیت 64 پله‌ای
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
                 if gen.dim() == 1: gen = gen.unsqueeze(0)
                 gen = gen.cpu().squeeze(0).numpy()
-                # 1. حذف قسمت Warm-up (که قبلاً ساخته شده بود)
-                if warmup_cut > 0:
-                    # اما صبر کن! ما باید CROSSFADE_SIZE تا قبل از نقطه برش را نگه داریم برای میکس
-                    # پس برش را کمی عقب‌تر می‌زنیم تا همپوشانی داشته باشیم
-                    valid_start = warmup_cut - CROSSFADE_SIZE
-                    if valid_start < 0: valid_start = 0 # نباید پیش بیاد
-                    gen = gen[valid_start:]
-                # الان `gen` شامل: [همپوشانی با قبلی] + [تکه جدید] + [همپوشانی با بعدی] است.
-                # 2. میکس با تکه قبلی (اگر وجود دارد)
-                if previous_tail is not None:
-                    # جدا کردن قسمت همپوشانی از این تکه
-                    overlap_part = gen[:CROSSFADE_SIZE]
-                    new_part = gen[CROSSFADE_SIZE:]
-                    # اگر سایزها یکی بود میکس کن
-                    if len(overlap_part) == len(previous_tail):
-                        alpha = np.linspace(0, 1, len(overlap_part))
-                        blended = (previous_tail * (1 - alpha)) + (overlap_part * alpha)
-                        final_audio.append(blended)
-                    else:
-                        # فال‌بک (نباید پیش بیاد)
-                        final_audio.append(previous_tail)
-                    # حالا قسمت جدید را پردازش می‌کنیم
-                    # باید قسمت انتهایی را برای دور بعد ذخیره کنیم
-                    if len(new_part) > CROSSFADE_SIZE and end_input < total_samples:
-                        # ذخیره دم برای دور بعد
-                        previous_tail = new_part[-CROSSFADE_SIZE:]
-                        # اضافه کردن بدنه اصلی
-                        final_audio.append(new_part[:-CROSSFADE_SIZE])
-                    else:
-                        # تکه آخر است، کلش را اضافه کن
-                        final_audio.append(new_part)
-                        previous_tail = None
-                else:
-                    # تکه اول است
-                    if len(gen) > CROSSFADE_SIZE and end_input < total_samples:
-                        previous_tail = gen[-CROSSFADE_SIZE:]
-                        final_audio.append(gen[:-CROSSFADE_SIZE])
-                    else:
-                        final_audio.append(gen)
-                        previous_tail = None
-                current_pos += STEP_SIZE
             except Exception as e:
-                print(f"Error in chunk: {e}")
-                # در صورت خطا، پرش کن (بهتر از قطع شدن است)
-                current_pos += STEP_SIZE
-                # اضافه کردن سکوت
-                final_audio.append(np.zeros(STEP_SIZE))
-                previous_tail = None
-        # چسباندن نهایی
-        if len(final_audio) > 0:
-            full_audio = np.concatenate(final_audio)
         else:
             full_audio = np.zeros(24000)
@@ -329,9 +307,9 @@ def vevo_timbre(content_wav, reference_wav):
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
-with gr.Blocks(title="Vevo-Timbre (Studio)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
-    gr.Markdown("نسخه استودیویی: بدون پرش، بدون تداخل زمانی.")
     with gr.Row():
         with gr.Column():

         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
         # --- پردازش رفرنس ---
         if ref_sr != 24000:
             ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
             ref_sr = 24000
+        ref_max = torch.max(torch.abs(ref_tensor)) + 1e-6
+        ref_tensor = ref_tensor / ref_max * 0.95
         if ref_tensor.shape[1] > 24000 * 20:
              ref_tensor = ref_tensor[:, :24000 * 20]
         save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
+        # --- منطق Pre-roll Chunking (حفظ لحن پیوسته) ---
         pipeline = get_pipeline()
         SR = 24000
+        TARGET_CHUNK_LEN = 10 * SR  # طول هدف برای هر تکه جدید (۱۰ ثانیه)
+        CONTEXT_LEN = int(2.5 * SR) # مقدار پیش‌خوانی (۲.۵ ثانیه)
         total_samples = content_tensor.shape[1]
+        print(f"[{session_id}] Processing with Pre-roll context...")
+        final_output = []
+        # مکان‌نما: نشان می‌دهد تا کجای فایل را "نهایی" کرده‌ایم
+        current_cursor = 0
+        while current_cursor < total_samples:
+            # تعیین بازه ورودی به مدل
+            # شروع: از ۲.۵ ثانیه قبل (اگر وجود داشته باشد)
+            start_input = max(0, current_cursor - CONTEXT_LEN)
+            # پایان: ۱۰ ثانیه جلوتر از مکان‌نما
+            end_input = min(current_cursor + TARGET_CHUNK_LEN, total_samples)
+            # اگر به ته فایل رسیدیم و چیزی نمانده
             if start_input >= end_input:
                 break
+            # استخراج تکه (شامل کانتکست + دیتای جدید)
             chunk_tensor = content_tensor[:, start_input:end_input]
             save_audio_pcm16(chunk_tensor, temp_content_path, SR)
             try:
+                # تولید صدا
                 gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
+                    flow_matching_steps=64,
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
                 if gen.dim() == 1: gen = gen.unsqueeze(0)
                 gen = gen.cpu().squeeze(0).numpy()
+                # --- برش هوشمند ---
+                # ما به مدل (start_input تا end_input) را دادیم.
+                # اما فقط قسمت (current_cursor تا end_input) را می‌خواهیم.
+                # پس باید قسمت اول (که مربوط به کانتکست است) را ببریم.
+                cut_amount = current_cursor - start_input
+                if len(gen) > cut_amount:
+                    valid_part = gen[cut_amount:]
+                    # اعمال یک میکرو-فید خیلی کوتاه (10 میلی ثانیه) فقط برای رفع نویز اتصال PCM
+                    # این Cross-fade نیست، فقط De-click است
+                    if len(final_output) > 0:
+                        fade_samples = int(0.01 * SR) # 10ms
+                        if len(valid_part) > fade_samples and len(final_output[-1]) > fade_samples:
+                            # نرم کردن اتصال
+                            fade_in = np.linspace(0, 1, fade_samples)
+                            fade_out = np.linspace(1, 0, fade_samples)
+                            # میکس ۱۰ میلی ثانیه آخرِ قبلی با ۱۰ میلی ثانیه اولِ جدید
+                            prev_tail = final_output[-1][-fade_samples:]
+                            curr_head = valid_part[:fade_samples]
+                            # جایگزینی دم قبلی (میکس شده)
+                            final_output[-1][-fade_samples:] = (prev_tail * fade_out) + (curr_head * fade_in)
+                            # حذف سرِ فعلی (چون میکس شد)
+                            valid_part = valid_part[fade_samples:]
+                    final_output.append(valid_part)
+                # آپدیت مکان‌نما
+                current_cursor = end_input
             except Exception as e:
+                print(f"Error: {e}")
+                # مدیریت خطا
+                missing = end_input - current_cursor
+                final_output.append(np.zeros(missing))
+                current_cursor = end_input
+        # چسباندن
+        if len(final_output) > 0:
+            full_audio = np.concatenate(final_output)
         else:
             full_audio = np.zeros(24000)
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
+with gr.Blocks(title="Vevo-Timbre (Seamless)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
+    gr.Markdown("نسخه نهایی: اتصال کاملاً پیوسته و بدون شوک (Pre-roll Context).")
     with gr.Row():
         with gr.Column():