Spaces:

Opera8
/

Sada

Running on Zero

App Files Community

Opera8 committed 22 days ago

Commit

9622192

verified ·

1 Parent(s): a6cd2a1

Update app.py

Browse files

Files changed (1) hide show

app.py +40 -59

app.py CHANGED Viewed

@@ -187,8 +187,6 @@ def vevo_timbre(content_wav, reference_wav):
         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
-        # نرمال سازی
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
         # --- آماده سازی Reference ---
@@ -214,84 +212,67 @@ def vevo_timbre(content_wav, reference_wav):
         save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
-        # --- منطق Chunking (اصلاح شده: همپوشانی کوتاه) ---
-        pipeline = get_pipeline()
         SR = 24000
-        CHUNK_LEN = 10 * SR        # 10 ثانیه اصلی
-        # تغییر مهم: کاهش همپوشانی به 0.1 ثانیه (100 میلی ثانیه)
-        # این باعث می‌شود اکو از بین برود ولی اتصال همچنان نرم باشد
-        OVERLAP = int(0.1 * SR)
-        INPUT_SIZE = CHUNK_LEN + OVERLAP
         total_samples = content_tensor.shape[1]
-        print(f"[{session_id}] Processing (High Quality 64 Steps)...")
         final_parts = []
-        overlap_buffer = None
-        for start in range(0, total_samples, CHUNK_LEN):
-            end_input = min(start + INPUT_SIZE, total_samples)
-            current_input_chunk = content_tensor[:, start:end_input]
             save_audio_pcm16(current_input_chunk, temp_content_path, SR)
             try:
                 gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
-                    flow_matching_steps=64, # کیفیت بالا
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
                 if gen.dim() == 1: gen = gen.unsqueeze(0)
                 gen = gen.cpu().squeeze(0).numpy()
-                current_len = len(gen)
-                if overlap_buffer is not None:
-                    mix_len = len(overlap_buffer)
-                    if current_len < mix_len:
-                        mix_len = current_len
-                        overlap_buffer = overlap_buffer[:mix_len]
-                    head_to_mix = gen[:mix_len]
-                    body_rest = gen[mix_len:]
-                    # میکس سریع (Fast Cross-Fade)
-                    alpha = np.linspace(0, 1, mix_len)
-                    blended_segment = (overlap_buffer * (1 - alpha)) + (head_to_mix * alpha)
-                    final_parts.append(blended_segment)
-                    if len(body_rest) > OVERLAP:
-                        pure_body = body_rest[:-OVERLAP]
-                        final_parts.append(pure_body)
-                        overlap_buffer = body_rest[-OVERLAP:]
-                    else:
-                        final_parts.append(body_rest)
-                        overlap_buffer = None
-                else:
-                    if current_len > OVERLAP:
-                        final_parts.append(gen[:-OVERLAP])
-                        overlap_buffer = gen[-OVERLAP:]
-                    else:
-                        final_parts.append(gen)
-                        overlap_buffer = None
             except Exception as e:
-                print(f"Error in chunk: {e}")
-                missing_len = end_input - start
-                if overlap_buffer is not None:
-                    missing_len -= len(overlap_buffer)
-                    final_parts.append(overlap_buffer)
-                    overlap_buffer = None
-                final_parts.append(np.zeros(max(0, missing_len)))
-        if overlap_buffer is not None:
-            final_parts.append(overlap_buffer)
         if len(final_parts) > 0:
             full_audio = np.concatenate(final_parts)
         else:
@@ -304,9 +285,9 @@ def vevo_timbre(content_wav, reference_wav):
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
-with gr.Blocks(title="Vevo-Timbre (No Echo)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
-    gr.Markdown("نسخه اصلاح شده: حذف اکو در نقاط اتصال + کیفیت بالای ۶۴ مرحله‌ای.")
     with gr.Row():
         with gr.Column():

         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
         # --- آماده سازی Reference ---
         save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
+        # --- منطق هوشمند (Smart Context Window) ---
+        # ما هیچ صدایی را با هم میکس نمی‌کنیم (حذف اکو)
+        # فقط از صدای قبلی به عنوان "زمینه" استفاده می‌کنیم و خروجی زمینه را دور می‌ریزیم
+        pipeline = get_pipeline()
         SR = 24000
+        CHUNK_LEN = 10 * SR       # 10 ثانیه دیتای مفید
+        CONTEXT_LEN = 3 * SR      # 3 ثانیه نگاه به عقب (برای حفظ لحن)
         total_samples = content_tensor.shape[1]
+        print(f"[{session_id}] Smart Processing (No Echo)...")
         final_parts = []
+        current_ptr = 0
+        while current_ptr < total_samples:
+            # تعیین بازه ورودی
+            # شروع: از 3 ثانیه قبل (اگر وجود داشته باشد)
+            start_idx = max(0, current_ptr - CONTEXT_LEN)
+            # پایان: 10 ثانیه بعد از نقطه فعلی
+            end_idx = min(total_samples, current_ptr + CHUNK_LEN)
+            # استخراج تکه ورودی (شامل کانتکست + دیتای جدید)
+            current_input_chunk = content_tensor[:, start_idx:end_idx]
             save_audio_pcm16(current_input_chunk, temp_content_path, SR)
+            # مقدار زمانی که باید از اول خروجی حذف کنیم (همان کانتکست)
+            trim_amount = 0
+            if current_ptr > 0:
+                trim_amount = current_ptr - start_idx # معمولاً برابر CONTEXT_LEN است
             try:
                 gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
+                    flow_matching_steps=64,
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
                 if gen.dim() == 1: gen = gen.unsqueeze(0)
                 gen = gen.cpu().squeeze(0).numpy()
+                # *** برش هوشمند ***
+                # قسمت اول (که تکراری است و مربوط به کانتکست بوده) را دور می‌ریزیم
+                useful_part = gen[trim_amount:]
+                final_parts.append(useful_part)
+                # حرکت به جلو
+                current_ptr += CHUNK_LEN
             except Exception as e:
+                print(f"Error: {e}")
+                # در صورت خطا، سکوت اضافه کن (به اندازه دیتای جدیدی که قرار بود ساخته شود)
+                missing = end_idx - current_ptr
+                if missing > 0:
+                    final_parts.append(np.zeros(missing))
+                current_ptr += CHUNK_LEN # تلاش برای تکه بعدی
+        # چسباندن قطعات
         if len(final_parts) > 0:
             full_audio = np.concatenate(final_parts)
         else:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
+with gr.Blocks(title="Vevo-Timbre (Clean)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
+    gr.Markdown("نسخه نهایی بدون اکو: استفاده از تکنیک Smart Context Window.")
     with gr.Row():
         with gr.Column():