Spaces:

Opera8
/

Sada

Running on Zero

App Files Files Community

Opera8 commited on 22 days ago

Commit

428894f

verified ·

1 Parent(s): 885b401

Update app.py

Browse files

Files changed (1) hide show

app.py +56 -44

app.py CHANGED Viewed

@@ -86,33 +86,14 @@ os.makedirs("ckpts/Vevo", exist_ok=True)
 from models.vc.vevo.vevo_utils import VevoInferencePipeline
-# --- تابع ذخیره سازی پیشرفته (حذف نویز + فرمت استاندارد) ---
-def save_audio_final(waveform, output_path, sample_rate=24000, target_length=None):
     try:
         if isinstance(waveform, torch.Tensor):
             waveform = waveform.detach().cpu()
             if waveform.dim() == 2 and waveform.shape[0] == 1:
                 waveform = waveform.squeeze(0)
             waveform = waveform.numpy()
-        # 1. همگام‌سازی طول (حذف نویز اضافه آخر فایل)
-        if target_length is not None:
-            if len(waveform) > target_length:
-                waveform = waveform[:target_length]
-            elif len(waveform) < target_length:
-                # اگر کوتاه‌تر بود، با سکوت پر کن (معمولاً پیش نمیاد)
-                padding = np.zeros(target_length - len(waveform))
-                waveform = np.concatenate([waveform, padding])
-        # 2. اعمال Fade Out (جلوگیری از صدای کلیک در لحظه قطع شدن)
-        fade_len = int(sample_rate * 0.05)  # 50 میلی ثانیه
-        if len(waveform) > fade_len:
-            fade_curve = np.linspace(1, 0, fade_len)
-            waveform[-fade_len:] *= fade_curve
-        # 3. ذخیره با فرمت 16 بیتی
         sf.write(output_path, waveform, sample_rate, subtype='PCM_16')
     except Exception as e:
         print(f"Save error: {e}")
         raise e
@@ -142,10 +123,7 @@ inference_pipelines = {}
 def preload_all_resources():
     print("Preloading resources...")
     setup_configs()
-    global downloaded_content_style_tokenizer_path
-    global downloaded_fmt_path
-    global downloaded_vocoder_path
     if not downloaded_resources["tokenizer_vq8192"]:
         local_dir = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["tokenizer/vq8192/*"])
@@ -196,7 +174,7 @@ def vevo_timbre(content_wav, reference_wav):
         raise ValueError("Please upload audio files")
     try:
-        # --- پردازش صدای اصلی ---
         if isinstance(content_wav, tuple):
             content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
         else:
@@ -210,12 +188,10 @@ def vevo_timbre(content_wav, reference_wav):
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
-        # ذخیره طول دقیق فایل ورودی برای برش نهایی
-        target_length_samples = content_tensor.shape[-1]
-        # --- پردازش صدای رفرنس ---
         if isinstance(reference_wav, tuple):
             ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
         else:
@@ -231,34 +207,70 @@ def vevo_timbre(content_wav, reference_wav):
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
-        save_audio_final(content_tensor, temp_content_path, content_sr)
-        save_audio_final(ref_tensor, temp_reference_path, ref_sr)
-        print(f"[{session_id}] Processing...")
         pipeline = get_pipeline()
-        gen_audio = pipeline.inference_fm(
-            src_wav_path=temp_content_path,
-            timbre_ref_wav_path=temp_reference_path,
-            flow_matching_steps=32,
-        )
-        if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
-            gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
-        # اینجا فایل را دقیقاً به اندازه ورودی برش می‌ز��یم
-        # این کار باعث می‌شود نویز اضافه‌ای که مدل در پایان تولید کرده حذف شود
-        save_audio_final(gen_audio, output_path, 24000, target_length=target_length_samples)
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
-with gr.Blocks(title="Vevo-Timbre (Clean)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
     with gr.Row():
         with gr.Column():

 from models.vc.vevo.vevo_utils import VevoInferencePipeline
+def save_audio_pcm16(waveform, output_path, sample_rate=24000):
     try:
         if isinstance(waveform, torch.Tensor):
             waveform = waveform.detach().cpu()
             if waveform.dim() == 2 and waveform.shape[0] == 1:
                 waveform = waveform.squeeze(0)
             waveform = waveform.numpy()
         sf.write(output_path, waveform, sample_rate, subtype='PCM_16')
     except Exception as e:
         print(f"Save error: {e}")
         raise e
 def preload_all_resources():
     print("Preloading resources...")
     setup_configs()
+    global downloaded_content_style_tokenizer_path, downloaded_fmt_path, downloaded_vocoder_path
     if not downloaded_resources["tokenizer_vq8192"]:
         local_dir = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["tokenizer/vq8192/*"])
         raise ValueError("Please upload audio files")
     try:
+        # --- آماده سازی Content ---
         if isinstance(content_wav, tuple):
             content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
         else:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
+        # نرمال سازی
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
+        # --- آماده سازی Reference ---
         if isinstance(reference_wav, tuple):
             ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
         else:
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
+        # اگر رفرنس خیلی طولانی باشد، فقط 20 ثانیه اول کافی است (برای استخراج Timbre)
+        # این کار سرعت را بالا می‌برد و تاثیری در کیفیت ندارد
+        if ref_tensor.shape[1] > 24000 * 20:
+             ref_tensor = ref_tensor[:, :24000 * 20]
+        save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
+        # --- منطق Chunking (حل مشکل فایل طولانی) ---
         pipeline = get_pipeline()
+        # هر تکه 15 ثانیه (360000 سمپل)
+        CHUNK_SIZE = 15 * 24000
+        total_samples = content_tensor.shape[1]
+        print(f"[{session_id}] Audio Duration: {total_samples/24000:.2f}s. Starting Chunking...")
+        generated_chunks = []
+        for start in range(0, total_samples, CHUNK_SIZE):
+            end = min(start + CHUNK_SIZE, total_samples)
+            current_chunk = content_tensor[:, start:end]
+            # ذخیره تکه جاری
+            save_audio_pcm16(current_chunk, temp_content_path, 24000)
+            print(f"[{session_id}] Processing chunk {start/24000:.1f}s to {end/24000:.1f}s")
+            try:
+                gen_chunk = pipeline.inference_fm(
+                    src_wav_path=temp_content_path,
+                    timbre_ref_wav_path=temp_reference_path,
+                    flow_matching_steps=32,
+                )
+                # رفع NaN
+                if torch.isnan(gen_chunk).any() or torch.isinf(gen_chunk).any():
+                    gen_chunk = torch.nan_to_num(gen_chunk, nan=0.0, posinf=0.95, neginf=-0.95)
+                # مطمئن شویم تنسور دو بعدی است [1, T]
+                if gen_chunk.dim() == 1:
+                    gen_chunk = gen_chunk.unsqueeze(0)
+                generated_chunks.append(gen_chunk.cpu())
+            except Exception as e:
+                print(f"Error in chunk: {e}")
+                # اگر خطایی رخ داد، سکوت اضافه کن تا فایل قطع نشود
+                silence = torch.zeros((1, end - start))
+                generated_chunks.append(silence)
+        # چسباندن تکه‌ها
+        final_audio = torch.cat(generated_chunks, dim=1)
+        # ذخیره نهایی
+        save_audio_pcm16(final_audio, output_path, 24000)
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
+with gr.Blocks(title="Vevo-Timbre (Long Audio)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
+    gr.Markdown("پشتیبانی کامل از فایل‌های طولانی (بدون نویز و قطعی)")
     with gr.Row():
         with gr.Column():