Spaces:

Opera8
/

Sada

Running on Zero

App Files Files Community

Opera8 committed 21 days ago

Commit

2775a80

verified ·

1 Parent(s): 3fbd4a0

Update app.py

Browse files

Files changed (1) hide show

app.py +69 -35

app.py CHANGED Viewed

@@ -90,7 +90,7 @@ os.makedirs("ckpts/Vevo", exist_ok=True)
 from models.vc.vevo.vevo_utils import VevoInferencePipeline
-# تابع ذخیره سازی اختصاصی برای جلوگیری از ارور TorchCodec
 def my_save_audio(waveform, output_path, sample_rate=24000):
     try:
         if isinstance(waveform, torch.Tensor):
@@ -109,7 +109,7 @@ def setup_configs():
     if downloaded_resources["configs"]: return
     config_path = "models/vc/vevo/config"
     os.makedirs(config_path, exist_ok=True)
-    config_files = ["Vq8192ToMels.json", "Vocoder.json"] # فقط کانفیگ‌های تیمبر
     for file in config_files:
         file_path = f"{config_path}/{file}"
@@ -127,7 +127,6 @@ print(f"Using device: {device}")
 inference_pipelines = {}
-# دانلود منابع (فقط بخش‌های مورد نیاز Timbre)
 def preload_all_resources():
     print("Preloading Timbre resources...")
     setup_configs()
@@ -163,14 +162,12 @@ def get_pipeline():
     if "timbre" in inference_pipelines:
         return inference_pipelines["timbre"]
-    # مسیرها
     content_style_tokenizer_ckpt_path = os.path.join(downloaded_content_style_tokenizer_path, "tokenizer/vq8192")
     fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
     fmt_ckpt_path = os.path.join(downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels")
     vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
     vocoder_ckpt_path = os.path.join(downloaded_vocoder_path, "acoustic_modeling/Vocoder")
-    # ساخت پایپ‌لاین فقط برای Timbre
     pipeline = VevoInferencePipeline(
         content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
         fmt_cfg_path=fmt_cfg_path,
@@ -192,7 +189,7 @@ def vevo_timbre(content_wav, reference_wav):
     if content_wav is None or reference_wav is None:
         raise ValueError("Please upload audio files")
-    # پردازش صدای اصلی
     if isinstance(content_wav, tuple):
         content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
     else:
@@ -201,7 +198,6 @@ def vevo_timbre(content_wav, reference_wav):
     if len(content_data.shape) > 1 and content_data.shape[1] > 1:
         content_data = np.mean(content_data, axis=1)
-    # ریسمپل به 24k
     content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
     if content_sr != 24000:
         content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
@@ -209,7 +205,7 @@ def vevo_timbre(content_wav, reference_wav):
     content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
-    # پردازش صدای رفرنس (Timbre)
     if isinstance(reference_wav, tuple):
         ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
     else:
@@ -225,45 +221,83 @@ def vevo_timbre(content_wav, reference_wav):
     ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
-    print(f"Processing Timbre Swap... Content Length: {content_tensor.shape[-1]/24000:.2f}s")
-    # ذخیره موقت فایل‌ها
-    sf.write(temp_content_path, content_tensor.squeeze().cpu().numpy(), content_sr)
     sf.write(temp_reference_path, ref_tensor.squeeze().cpu().numpy(), ref_sr)
-    try:
-        pipeline = get_pipeline()
-        gen_audio = pipeline.inference_fm(
-            src_wav_path=temp_content_path,
-            timbre_ref_wav_path=temp_reference_path,
-            flow_matching_steps=32,
-        )
-        if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
-            print("Warning: NaN detected, fixing...")
-            gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
-        # ذخیره خروجی
-        my_save_audio(gen_audio, output_path=output_path)
-        return output_path
-    except Exception as e:
-        print(f"Error: {e}")
-        raise e
-# رابط کاربری ساده فقط برای Vevo-Timbre
-with gr.Blocks(title="Vevo-Timbre Only") as demo:
-    gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
-    gr.Markdown("**نکته:** برای بهترین کیفیت، از فایل‌های صوتی زیر ۲۰ ثانیه استفاده کنید. فایل‌های طولانی ممکن است دچار افت کیفیت شوند.")
     with gr.Row():
         with gr.Column():
-            timbre_content = gr.Audio(label="Source Audio (صدای اصلی)", type="numpy")
-            timbre_reference = gr.Audio(label="Target Timbre (صدای هدف)", type="numpy")
             timbre_button = gr.Button("Generate (ساخت صدا)", variant="primary")
         with gr.Column():
-            timbre_output = gr.Audio(label="Result (خروجی)")
     timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)

 from models.vc.vevo.vevo_utils import VevoInferencePipeline
+# تابع ذخیره سازی اختصاصی
 def my_save_audio(waveform, output_path, sample_rate=24000):
     try:
         if isinstance(waveform, torch.Tensor):
     if downloaded_resources["configs"]: return
     config_path = "models/vc/vevo/config"
     os.makedirs(config_path, exist_ok=True)
+    config_files = ["Vq8192ToMels.json", "Vocoder.json"]
     for file in config_files:
         file_path = f"{config_path}/{file}"
 inference_pipelines = {}
 def preload_all_resources():
     print("Preloading Timbre resources...")
     setup_configs()
     if "timbre" in inference_pipelines:
         return inference_pipelines["timbre"]
     content_style_tokenizer_ckpt_path = os.path.join(downloaded_content_style_tokenizer_path, "tokenizer/vq8192")
     fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
     fmt_ckpt_path = os.path.join(downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels")
     vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
     vocoder_ckpt_path = os.path.join(downloaded_vocoder_path, "acoustic_modeling/Vocoder")
     pipeline = VevoInferencePipeline(
         content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
         fmt_cfg_path=fmt_cfg_path,
     if content_wav is None or reference_wav is None:
         raise ValueError("Please upload audio files")
+    # --- بارگذاری و پردازش صدای اصلی (Content) ---
     if isinstance(content_wav, tuple):
         content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
     else:
     if len(content_data.shape) > 1 and content_data.shape[1] > 1:
         content_data = np.mean(content_data, axis=1)
     content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
     if content_sr != 24000:
         content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
     content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
+    # --- بارگذاری و پردازش صدای رفرنس (Reference) ---
     if isinstance(reference_wav, tuple):
         ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
     else:
     ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
+    # برش زدن صدای رفرنس به 20 ثانیه اول (برای جلوگیری از گیج شدن مدل)
+    # صدای رفرنس فقط برای برداشتن "رنگ صدا" استفاده میشه و 20 ثانیه کافیه
+    if ref_tensor.shape[1] > 24000 * 20:
+        ref_tensor = ref_tensor[:, :24000 * 20]
+    # ذخیره موقت صدای رفرنس
     sf.write(temp_reference_path, ref_tensor.squeeze().cpu().numpy(), ref_sr)
+    print(f"Total Duration: {content_tensor.shape[1]/24000:.2f}s")
+    # --- تکه تکه کردن صدای اصلی (Chunking Logic) ---
+    pipeline = get_pipeline()
+    CHUNK_DURATION = 15 # ثانیه (اندازه هر تکه)
+    CHUNK_SAMPLES = CHUNK_DURATION * 24000
+    total_samples = content_tensor.shape[1]
+    generated_chunks = []
+    # حلقه برای پردازش تکه تکه
+    for i in range(0, total_samples, CHUNK_SAMPLES):
+        end = min(i + CHUNK_SAMPLES, total_samples)
+        chunk = content_tensor[:, i:end]
+        print(f"Processing Chunk: {i/24000:.1f}s to {end/24000:.1f}s")
+        # ذخیره تکه جاری
+        sf.write(temp_content_path, chunk.squeeze().cpu().numpy(), 24000)
+        try:
+            # پردازش تکه
+            gen_chunk = pipeline.inference_fm(
+                src_wav_path=temp_content_path,
+                timbre_ref_wav_path=temp_reference_path,
+                flow_matching_steps=32,
+            )
+            # بررسی خرابی احتمالی
+            if torch.isnan(gen_chunk).any() or torch.isinf(gen_chunk).any():
+                print("Warning: NaN in chunk, fixing...")
+                gen_chunk = torch.nan_to_num(gen_chunk, nan=0.0, posinf=0.95, neginf=-0.95)
+            # اضافه کردن به لیست خروجی‌ها (مطمئن میشیم دوبعدی باشه [1, T])
+            if gen_chunk.dim() == 1:
+                gen_chunk = gen_chunk.unsqueeze(0)
+            generated_chunks.append(gen_chunk.cpu())
+        except Exception as e:
+            print(f"Error processing chunk starting at {i}: {e}")
+            # در صورت خطا در یک تکه، سکوت جایگزین میکنیم تا کل فایل خراب نشه
+            silence = torch.zeros_like(chunk)
+            generated_chunks.append(silence)
+    # --- چسباندن تکه‌ها به هم ---
+    if not generated_chunks:
+        raise ValueError("No audio generated")
+    final_audio = torch.cat(generated_chunks, dim=1)
+    print(f"Final Audio Duration: {final_audio.shape[1]/24000:.2f}s")
+    # ذخیره خروجی نهایی
+    my_save_audio(final_audio, output_path=output_path)
+    return output_path
+# رابط کاربری
+with gr.Blocks(title="Vevo-Timbre (Long Audio Fix)") as demo:
+    gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion (Unlimited Length)")
+    gr.Markdown("این نسخه فایل‌های طولانی را به صورت اتوماتیک به تکه‌های ۱۵ ثانیه‌ای تقسیم کرده و پردازش می‌کند تا صدا خراب نشود.")
     with gr.Row():
         with gr.Column():
+            timbre_content = gr.Audio(label="Source Audio (صدای اصلی - هر چقدر طولانی باشد مشکلی نیست)", type="numpy")
+            timbre_reference = gr.Audio(label="Target Timbre (صدای هدف - ۲۰ ثانیه اول استفاده میشود)", type="numpy")
             timbre_button = gr.Button("Generate (ساخت صدا)", variant="primary")
         with gr.Column():
+            timbre_output = gr.Audio(label="Result (خروجی نهایی)")
     timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)