Spaces:

Opera8
/

Sada

Running on Zero

App Files Files Community

Opera8 committed 21 days ago

Commit

f375b6c

verified ·

1 Parent(s): 428894f

Update app.py

Browse files

Files changed (1) hide show

app.py +62 -36

app.py CHANGED Viewed

@@ -187,8 +187,6 @@ def vevo_timbre(content_wav, reference_wav):
         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
-        # نرمال سازی
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
         # --- آماده سازی Reference ---
@@ -204,73 +202,101 @@ def vevo_timbre(content_wav, reference_wav):
         if ref_sr != 24000:
             ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
             ref_sr = 24000
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
-        # اگر رفرنس خیلی طولانی باشد، فقط 20 ثانیه اول کافی است (برای استخراج Timbre)
-        # این کار سرعت را بالا می‌برد و تاثیری در کیفیت ندارد
         if ref_tensor.shape[1] > 24000 * 20:
              ref_tensor = ref_tensor[:, :24000 * 20]
         save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
-        # --- منطق Chunking (حل مشکل فایل طولانی) ---
         pipeline = get_pipeline()
-        # هر تکه 15 ثانیه (360000 سمپل)
-        CHUNK_SIZE = 15 * 24000
-        total_samples = content_tensor.shape[1]
-        print(f"[{session_id}] Audio Duration: {total_samples/24000:.2f}s. Starting Chunking...")
-        generated_chunks = []
-        for start in range(0, total_samples, CHUNK_SIZE):
-            end = min(start + CHUNK_SIZE, total_samples)
-            current_chunk = content_tensor[:, start:end]
-            # ذخیره تکه جاری
-            save_audio_pcm16(current_chunk, temp_content_path, 24000)
-            print(f"[{session_id}] Processing chunk {start/24000:.1f}s to {end/24000:.1f}s")
             try:
-                gen_chunk = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
                     flow_matching_steps=32,
                 )
-                # رفع NaN
-                if torch.isnan(gen_chunk).any() or torch.isinf(gen_chunk).any():
-                    gen_chunk = torch.nan_to_num(gen_chunk, nan=0.0, posinf=0.95, neginf=-0.95)
-                # مطمئن شویم تنسور دو بعدی است [1, T]
-                if gen_chunk.dim() == 1:
-                    gen_chunk = gen_chunk.unsqueeze(0)
-                generated_chunks.append(gen_chunk.cpu())
             except Exception as e:
-                print(f"Error in chunk: {e}")
-                # اگر خطایی رخ داد، سکوت اضافه کن تا فایل قطع نشود
-                silence = torch.zeros((1, end - start))
-                generated_chunks.append(silence)
-        # چسباندن تکه‌ها
-        final_audio = torch.cat(generated_chunks, dim=1)
         # ذخیره نهایی
-        save_audio_pcm16(final_audio, output_path, 24000)
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
-with gr.Blocks(title="Vevo-Timbre (Long Audio)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
-    gr.Markdown("پشتیبانی کامل از فایل‌های طولانی (بدون نویز و قطعی)")
     with gr.Row():
         with gr.Column():

         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
         # --- آماده سازی Reference ---
         if ref_sr != 24000:
             ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
             ref_sr = 24000
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
         if ref_tensor.shape[1] > 24000 * 20:
              ref_tensor = ref_tensor[:, :24000 * 20]
         save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
+        # --- منطق Cross-Fade Chunking ---
         pipeline = get_pipeline()
+        SR = 24000
+        MAIN_CHUNK = 10 * SR       # 10 ثانیه اصلی
+        OVERLAP = 1 * SR           # 1 ثانیه هم‌پوشانی (برای میکس)
+        STEP = MAIN_CHUNK          # قدم حرکت (10 ثانیه)
+        total_samples = content_tensor.shape[1]
+        print(f"[{session_id}] Duration: {total_samples/SR:.2f}s. Chunking 10s with Cross-fade...")
+        final_output = []
+        # حلقه روی تکه‌ها با هم‌پوشانی
+        # ما هر بار 'MAIN_CHUNK + OVERLAP' را پردازش می‌کنیم (یعنی 11 ثانیه)
+        # مگر اینکه به آخر فایل رسیده باشیم
+        for start in range(0, total_samples, STEP):
+            end = min(start + MAIN_CHUNK + OVERLAP, total_samples)
+            current_input_chunk = content_tensor[:, start:end]
+            save_audio_pcm16(current_input_chunk, temp_content_path, SR)
+            print(f"[{session_id}] Processing {start/SR:.1f}s to {end/SR:.1f}s")
             try:
+                gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
                     flow_matching_steps=32,
                 )
+                if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
+                if gen.dim() == 1: gen = gen.unsqueeze(0)
+                gen = gen.cpu().squeeze(0).numpy() # تبدیل به numpy
+                # منطق میکس (Cross-fade)
+                if start == 0:
+                    # تکه اول: فعلاً نگه می‌داریم (هنوز چیزی برای میکس نیست)
+                    # اگر فایل کوتاه باشد و به هم‌پوشانی نرسد، کلش را اضافه می‌کنیم
+                    if len(gen) <= MAIN_CHUNK:
+                        final_output.append(gen)
+                    else:
+                        # قسمت اصلی را اضافه کن، قسمت اورلپ را برای میکس با بعدی نگه دار
+                        final_output.append(gen[:-OVERLAP])
+                        overlap_buffer = gen[-OVERLAP:]
+                else:
+                    # تکه‌های بعدی:
+                    # 1. قسمت اورلپ قبلی را با شروع این تکه میکس کن
+                    current_overlap = gen[:OVERLAP]
+                    if len(current_overlap) == len(overlap_buffer):
+                        # ایجاد منحنی فید (Fade Curves)
+                        alpha = np.linspace(0, 1, len(overlap_buffer))
+                        # فرمول: (قبلی * نزولی) + (جدید * صعودی)
+                        blended = (overlap_buffer * (1 - alpha)) + (current_overlap * alpha)
+                        final_output.append(blended)
+                    else:
+                        # اگر سایزها نخواند (خیلی نادر)، فقط قبلی را بچسبان
+                        final_output.append(overlap_buffer)
+                    # 2. بقیه فایل را مدیریت کن
+                    if len(gen) <= OVERLAP + MAIN_CHUNK: # اگر تکه آخر است
+                        final_output.append(gen[OVERLAP:])
+                        overlap_buffer = None # تمام شد
+                    else:
+                        # قسمت وسط را اضافه کن
+                        final_output.append(gen[OVERLAP:-OVERLAP])
+                        # اورلپ جدید را ذخیره کن
+                        overlap_buffer = gen[-OVERLAP:]
             except Exception as e:
+                print(f"Error: {e}")
+                silence_len = end - start
+                final_output.append(np.zeros(silence_len))
+                overlap_buffer = np.zeros(OVERLAP)
+        # چسباندن همه آرایه‌ها
+        full_audio = np.concatenate(final_output)
         # ذخیره نهایی
+        sf.write(output_path, full_audio, SR, subtype='PCM_16')
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
+with gr.Blocks(title="Vevo-Timbre (Professional)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
+    gr.Markdown("پشتیبانی از فایل‌های نامحدود با کیفیت بالا (10s Chunking + Smooth Cross-Fade)")
     with gr.Row():
         with gr.Column():