Spaces:

Opera8
/

Sada

Sleeping

App Files Community

Opera8 committed 24 days ago

Commit

de91e11

verified ·

1 Parent(s): 9622192

Update app.py

Browse files

Files changed (1) hide show

app.py +58 -43

app.py CHANGED Viewed

@@ -174,7 +174,7 @@ def vevo_timbre(content_wav, reference_wav):
         raise ValueError("Please upload audio files")
     try:
-        # --- آماده سازی Content ---
         if isinstance(content_wav, tuple):
             content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
         else:
@@ -187,9 +187,11 @@ def vevo_timbre(content_wav, reference_wav):
         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
-        # --- آماده سازی Reference ---
         if isinstance(reference_wav, tuple):
             ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
         else:
@@ -203,7 +205,6 @@ def vevo_timbre(content_wav, reference_wav):
             ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
             ref_sr = 24000
-        # تنظیم لول رفرنس
         ref_max = torch.max(torch.abs(ref_tensor)) + 1e-6
         ref_tensor = ref_tensor / ref_max * 0.95
@@ -212,69 +213,83 @@ def vevo_timbre(content_wav, reference_wav):
         save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
-        # --- منطق هوشمند (Smart Context Window) ---
-        # ما هیچ صدایی را با هم میکس نمی‌کنیم (حذف اکو)
-        # فقط از صدای قبلی به عنوان "زمینه" استفاده می‌کنیم و خروجی زمینه را دور می‌ریزیم
         pipeline = get_pipeline()
         SR = 24000
-        CHUNK_LEN = 10 * SR       # 10 ثانیه دیتای مفید
-        CONTEXT_LEN = 3 * SR      # 3 ثانیه نگاه به عقب (برای حفظ لحن)
         total_samples = content_tensor.shape[1]
-        print(f"[{session_id}] Smart Processing (No Echo)...")
-        final_parts = []
-        current_ptr = 0
-        while current_ptr < total_samples:
-            # تعیین بازه ورودی
-            # شروع: از 3 ثانیه قبل (اگر وجود داشته باشد)
-            start_idx = max(0, current_ptr - CONTEXT_LEN)
-            # پایان: 10 ثانیه بعد از نقطه فعلی
-            end_idx = min(total_samples, current_ptr + CHUNK_LEN)
-            # استخراج تکه ورودی (شامل کانتکست + دیتای جدید)
-            current_input_chunk = content_tensor[:, start_idx:end_idx]
             save_audio_pcm16(current_input_chunk, temp_content_path, SR)
-            # مقدار زمانی که باید از اول خروجی حذف کنیم (همان کانتکست)
-            trim_amount = 0
-            if current_ptr > 0:
-                trim_amount = current_ptr - start_idx # معمولاً برابر CONTEXT_LEN است
             try:
                 gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
-                    flow_matching_steps=64,
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
                 if gen.dim() == 1: gen = gen.unsqueeze(0)
                 gen = gen.cpu().squeeze(0).numpy()
-                # *** برش هوشمند ***
-                # قسمت اول (که تکراری است و مربوط به کانتکست بوده) را دور می‌ریزیم
-                useful_part = gen[trim_amount:]
-                final_parts.append(useful_part)
-                # حرکت به جلو
-                current_ptr += CHUNK_LEN
             except Exception as e:
-                print(f"Error: {e}")
-                # در صورت خطا، سکوت اضافه کن (به اندازه دیتای جدیدی که قرار بود ساخته شود)
-                missing = end_idx - current_ptr
-                if missing > 0:
-                    final_parts.append(np.zeros(missing))
-                current_ptr += CHUNK_LEN # تلاش برای تکه بعدی
-        # چسباندن قطعات
-        if len(final_parts) > 0:
-            full_audio = np.concatenate(final_parts)
         else:
             full_audio = np.zeros(24000)
@@ -285,9 +300,9 @@ def vevo_timbre(content_wav, reference_wav):
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
-with gr.Blocks(title="Vevo-Timbre (Clean)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
-    gr.Markdown("نسخه نهایی بدون اکو: استفاده از تکنیک Smart Context Window.")
     with gr.Row():
         with gr.Column():

         raise ValueError("Please upload audio files")
     try:
+        # --- پردازش ورودی ---
         if isinstance(content_wav, tuple):
             content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
         else:
         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
+        # نرمال‌سازی
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
+        # --- پردازش رفرنس ---
         if isinstance(reference_wav, tuple):
             ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
         else:
             ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
             ref_sr = 24000
         ref_max = torch.max(torch.abs(ref_tensor)) + 1e-6
         ref_tensor = ref_tensor / ref_max * 0.95
         save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
+        # --- منطق Look-back Splicing (حذف قطعی اکو) ---
         pipeline = get_pipeline()
         SR = 24000
+        MAIN_CHUNK_SEC = 10.0
+        CONTEXT_SEC = 1.0  # مقدار نگاه به عقب
+        MAIN_CHUNK = int(MAIN_CHUNK_SEC * SR)
+        CONTEXT = int(CONTEXT_SEC * SR)
         total_samples = content_tensor.shape[1]
+        print(f"[{session_id}] Processing (High Quality 64 Steps)... Zero Echo Mode.")
+        final_output = []
+        # اشاره‌گر جاری روی فایل اصلی
+        cursor = 0
+        while cursor < total_samples:
+            if cursor == 0:
+                # تکه اول: بدون کانتکست
+                input_start = 0
+                input_end = min(MAIN_CHUNK, total_samples)
+                # در تکه اول چیزی را دور نمی‌ریزیم
+                crop_from = 0
+            else:
+                # تکه‌های بعدی: با کانتکست (نگاه به عقب)
+                input_start = cursor - CONTEXT
+                input_end = min(cursor + MAIN_CHUNK, total_samples)
+                # در خروجی، قسمت کانتکست را دور می‌ریزیم (Cut)
+                crop_from = CONTEXT
+            # اگر به انتهای فایل رسیدیم و طول باقی‌مانده خیلی کم است
+            if input_start >= input_end:
+                break
+            current_input_chunk = content_tensor[:, input_start:input_end]
             save_audio_pcm16(current_input_chunk, temp_content_path, SR)
+            print(f"[{session_id}] Processing chunk: {cursor/SR:.1f}s -> {(input_end-input_start)/SR:.1f}s len")
             try:
                 gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
+                    flow_matching_steps=64, # کیفیت بالا
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
                 if gen.dim() == 1: gen = gen.unsqueeze(0)
                 gen = gen.cpu().squeeze(0).numpy()
+                # *** نکته کلیدی: برش قسمت تکراری ***
+                # فقط قسمت "جدید" را نگه می‌داریم
+                if crop_from > 0:
+                    if len(gen) > crop_from:
+                        valid_audio = gen[crop_from:]
+                    else:
+                        valid_audio = np.array([]) # اگر خروجی خیلی کوتاه بود
+                else:
+                    valid_audio = gen
+                final_output.append(valid_audio)
+                # حرکت مکان‌نما به اندازه دیتای مفیدی که تولید کردیم
+                cursor = input_end
             except Exception as e:
+                print(f"Error in chunk: {e}")
+                # اگر ارور داد، سکوت جایگزین کن که تایمینگ به هم نریزد
+                needed_len = input_end - (cursor if cursor > 0 else 0)
+                final_output.append(np.zeros(needed_len))
+                cursor = input_end
+        # چسباندن تکه‌ها (Concatenate)
+        if len(final_output) > 0:
+            full_audio = np.concatenate(final_output)
         else:
             full_audio = np.zeros(24000)
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
+with gr.Blocks(title="Vevo-Timbre (Zero Echo)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
+    gr.Markdown("نسخه نهایی: استفاده از روش Look-back Splicing برای حذف کامل اکو و حفظ پیوستگی لحن.")
     with gr.Row():
         with gr.Column():