Spaces:

Opera8
/

Sada

Running on Zero

App Files Files Community

Opera8 committed 21 days ago

Commit

f2ebb51

verified ·

1 Parent(s): 09eb27e

Update app.py

Browse files

Files changed (1) hide show

app.py +45 -65

app.py CHANGED Viewed

@@ -174,22 +174,7 @@ def vevo_timbre(content_wav, reference_wav):
         raise ValueError("Please upload audio files")
     try:
-        # --- آماده سازی Content ---
-        if isinstance(content_wav, tuple):
-            content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
-        else:
-            content_sr, content_data = content_wav
-        if len(content_data.shape) > 1 and content_data.shape[1] > 1:
-            content_data = np.mean(content_data, axis=1)
-        content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
-        if content_sr != 24000:
-            content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
-            content_sr = 24000
-        content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
-        # --- آماده سازی Reference ---
         if isinstance(reference_wav, tuple):
             ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
         else:
@@ -202,126 +187,114 @@ def vevo_timbre(content_wav, reference_wav):
         if ref_sr != 24000:
             ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
             ref_sr = 24000
-        ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
-        # برش رفرنس به 20 ثانیه برای سرعت
         if ref_tensor.shape[1] > 24000 * 20:
              ref_tensor = ref_tensor[:, :24000 * 20]
         save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
-        # --- منطق دقیق Seamless Chunking ---
         pipeline = get_pipeline()
         SR = 24000
-        CHUNK_LEN = 10 * SR      # 10 ثانیه اصلی
-        OVERLAP = 1 * SR          # 1 ثانیه همپوشانی
-        # مقدار ورودی به مدل = 10 ثانیه + 1 ثانیه اورلپ = 11 ثانیه
         INPUT_SIZE = CHUNK_LEN + OVERLAP
         total_samples = content_tensor.shape[1]
-        print(f"[{session_id}] Duration: {total_samples/SR:.2f}s. Seamless Chunking...")
         final_parts = []
         overlap_buffer = None
-        # حرکت با قدم‌های 10 ثانیه‌ای
         for start in range(0, total_samples, CHUNK_LEN):
-            # انتخاب بازه: از شروع تا 11 ثانیه جلوتر (یا تا آخر فایل)
             end_input = min(start + INPUT_SIZE, total_samples)
             current_input_chunk = content_tensor[:, start:end_input]
             save_audio_pcm16(current_input_chunk, temp_content_path, SR)
-            print(f"[{session_id}] Processing input {start/SR:.1f}s to {end_input/SR:.1f}s")
             try:
                 gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
-                    flow_matching_steps=32,
                 )
-                # تمیزکاری داده‌ها
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
                 if gen.dim() == 1: gen = gen.unsqueeze(0)
                 gen = gen.cpu().squeeze(0).numpy()
-                # --- الگوریتم میکس دقیق ---
-                # gen اکنون شامل [بدنه اصلی (10s)] + [دم (1s)] است (مگر اینکه تکه آخر باشد)
                 current_len = len(gen)
-                # اگر بافر از دور قبلی داریم (یعنی تکه اول نیستیم)
                 if overlap_buffer is not None:
-                    # باید بافر قبلی را با ابتدای این تکه میکس کنیم
-                    # طول ناحیه میکس = طول بافر
                     mix_len = len(overlap_buffer)
-                    # اگر تکه جاری کوتاه‌تر از بافر است (خیلی نادر)، برش بزن
                     if current_len < mix_len:
                         mix_len = current_len
                         overlap_buffer = overlap_buffer[:mix_len]
-                    # جدا کردن سرِ تکه جاری برای میکس
                     head_to_mix = gen[:mix_len]
                     body_rest = gen[mix_len:]
-                    # ایجاد منحنی فید (Fade In/Out)
                     alpha = np.linspace(0, 1, mix_len)
-                    # فرمول: (دم قبلی * پایین‌رونده) + (سر فعلی * بالا‌رونده)
                     blended_segment = (overlap_buffer * (1 - alpha)) + (head_to_mix * alpha)
-                    # اضافه کردن بخش میکس شده به خروجی
                     final_parts.append(blended_segment)
-                    # حالا باید بدنه اصلی را مدیریت کنیم
-                    # اگر به اندازه کافی دیتا داریم که 1 ثانیه آخر را برای دور بعد نگه داریم
                     if len(body_rest) > OVERLAP:
-                        # بخش خالص وسط
                         pure_body = body_rest[:-OVERLAP]
                         final_parts.append(pure_body)
-                        # آپدیت بافر برای دور بعد
                         overlap_buffer = body_rest[-OVERLAP:]
                     else:
-                        # تکه آخر است و اورلپ ندارد، کلش را اضافه کن
                         final_parts.append(body_rest)
                         overlap_buffer = None
                 else:
-                    # تکه اول است (هنوز بافری نداریم)
                     if current_len > OVERLAP:
-                        # بخش اصلی را اضافه کن
                         final_parts.append(gen[:-OVERLAP])
-                        # بخش آخر را بفرست توی بافر
                         overlap_buffer = gen[-OVERLAP:]
                     else:
-                        # فایل خیلی کوتاه است، کلش را اضافه کن
                         final_parts.append(gen)
                         overlap_buffer = None
             except Exception as e:
-                print(f"Error: {e}")
-                # در صورت خطا، سکوت جایگزین کن تا تایمینگ به هم نریزد
                 missing_len = end_input - start
-                # اگر تکه اول نبود، اورلپ را کم کن
                 if overlap_buffer is not None:
                     missing_len -= len(overlap_buffer)
-                    final_parts.append(overlap_buffer) # بافر قبلی را خالی کن
                     overlap_buffer = None
                 final_parts.append(np.zeros(max(0, missing_len)))
-        # اگر بافری باقی مانده (از تکه آخر)، اضافه‌اش کن
         if overlap_buffer is not None:
             final_parts.append(overlap_buffer)
-        # چسباندن نهایی
         if len(final_parts) > 0:
             full_audio = np.concatenate(final_parts)
         else:
-            full_audio = np.zeros(24000) # Fallback
-        # ذخیره
         save_audio_pcm16(full_audio, output_path, SR)
         return output_path
@@ -329,15 +302,22 @@ def vevo_timbre(content_wav, reference_wav):
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
-with gr.Blocks(title="Vevo-Timbre (Perfect Stitch)") as demo:
-    gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
-    gr.Markdown("نسخه نهایی: کیفیت ۱۶ بیتی، بدون نویز، زمان‌بندی دقیق، بدون سکوت بین تکه‌ها.")
     with gr.Row():
         with gr.Column():
-            timbre_content = gr.Audio(label="Source Audio", type="numpy")
-            timbre_reference = gr.Audio(label="Target Timbre", type="numpy")
-            timbre_button = gr.Button("Generate", variant="primary")
         with gr.Column():
             timbre_output = gr.Audio(label="Result")

         raise ValueError("Please upload audio files")
     try:
+        # --- آماده سازی Reference (اول رفرنس را پردازش می‌کنیم تا سطح صدا را بگیریم) ---
         if isinstance(reference_wav, tuple):
             ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
         else:
         if ref_sr != 24000:
             ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
             ref_sr = 24000
+        # محاسبه انرژی رفرنس
+        ref_max_vol = torch.max(torch.abs(ref_tensor)) + 1e-6
+        ref_tensor = ref_tensor / ref_max_vol * 0.95 # نرمال سازی رفرنس
+        # برش رفرنس به 20 ثانیه
         if ref_tensor.shape[1] > 24000 * 20:
              ref_tensor = ref_tensor[:, :24000 * 20]
         save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
+        # --- آماده سازی Content ---
+        if isinstance(content_wav, tuple):
+            content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
+        else:
+            content_sr, content_data = content_wav
+        if len(content_data.shape) > 1 and content_data.shape[1] > 1:
+            content_data = np.mean(content_data, axis=1)
+        content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
+        if content_sr != 24000:
+            content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
+            content_sr = 24000
+        # نرمال سازی هوشمند: صدای ورودی را هم‌سطح صدای رفرنس می‌کنیم
+        content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
+        # --- منطق Chunking ---
         pipeline = get_pipeline()
         SR = 24000
+        CHUNK_LEN = 10 * SR
+        OVERLAP = 1 * SR
         INPUT_SIZE = CHUNK_LEN + OVERLAP
         total_samples = content_tensor.shape[1]
+        print(f"[{session_id}] High Quality Processing (64 Steps)... Duration: {total_samples/SR:.2f}s")
         final_parts = []
         overlap_buffer = None
         for start in range(0, total_samples, CHUNK_LEN):
             end_input = min(start + INPUT_SIZE, total_samples)
             current_input_chunk = content_tensor[:, start:end_input]
             save_audio_pcm16(current_input_chunk, temp_content_path, SR)
             try:
                 gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
+                    flow_matching_steps=64,  # <--- کیفیت بالا (قبلاً 32 بود)
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
                 if gen.dim() == 1: gen = gen.unsqueeze(0)
                 gen = gen.cpu().squeeze(0).numpy()
                 current_len = len(gen)
                 if overlap_buffer is not None:
                     mix_len = len(overlap_buffer)
                     if current_len < mix_len:
                         mix_len = current_len
                         overlap_buffer = overlap_buffer[:mix_len]
                     head_to_mix = gen[:mix_len]
                     body_rest = gen[mix_len:]
                     alpha = np.linspace(0, 1, mix_len)
                     blended_segment = (overlap_buffer * (1 - alpha)) + (head_to_mix * alpha)
                     final_parts.append(blended_segment)
                     if len(body_rest) > OVERLAP:
                         pure_body = body_rest[:-OVERLAP]
                         final_parts.append(pure_body)
                         overlap_buffer = body_rest[-OVERLAP:]
                     else:
                         final_parts.append(body_rest)
                         overlap_buffer = None
                 else:
                     if current_len > OVERLAP:
                         final_parts.append(gen[:-OVERLAP])
                         overlap_buffer = gen[-OVERLAP:]
                     else:
                         final_parts.append(gen)
                         overlap_buffer = None
             except Exception as e:
+                print(f"Error in chunk: {e}")
                 missing_len = end_input - start
                 if overlap_buffer is not None:
                     missing_len -= len(overlap_buffer)
+                    final_parts.append(overlap_buffer)
                     overlap_buffer = None
                 final_parts.append(np.zeros(max(0, missing_len)))
         if overlap_buffer is not None:
             final_parts.append(overlap_buffer)
         if len(final_parts) > 0:
             full_audio = np.concatenate(final_parts)
         else:
+            full_audio = np.zeros(24000)
         save_audio_pcm16(full_audio, output_path, SR)
         return output_path
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
+with gr.Blocks(title="Vevo-Timbre (Ultra Quality)") as demo:
+    gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion (Ultra Quality)")
+    gr.Markdown("""
+    **ویژگی‌ها:**
+    - **Steps 64:** کیفیت و دقت بافت صدا دو برابر شده است.
+    - **Auto-Leveling:** سطح صدای شما با مدل تنظیم می‌شود.
+    - **Seamless Stitching:** بدون پرش و بدون اضافه شدن زمان.
+    **نکته مهم:** برای بهترین نتیجه، سعی کنید **لحن، سرعت و احساس** صدای خودتان را شبیه فایل هدف کنید. مدل فقط جنس صدا را تغییر می‌دهد، نه بازیگری شما را!
+    """)
     with gr.Row():
         with gr.Column():
+            timbre_content = gr.Audio(label="Source Audio (صدای شما)", type="numpy")
+            timbre_reference = gr.Audio(label="Target Timbre (صدای هدف)", type="numpy")
+            timbre_button = gr.Button("Generate (Ultra Quality)", variant="primary")
         with gr.Column():
             timbre_output = gr.Audio(label="Result")