Spaces:

Opera8
/

Sada

Running on Zero

App Files Files Community

Opera8 committed 21 days ago

Commit

d7672e1

verified ·

1 Parent(s): ec6e509

Update app.py

Browse files

Files changed (1) hide show

app.py +75 -136

app.py CHANGED Viewed

@@ -11,9 +11,9 @@ from huggingface_hub import snapshot_download, hf_hub_download
 import subprocess
 import re
 import spaces
-import soundfile as sf  # Importing soundfile directly
-# فقط منابع مورد نیاز برای Timbre را دانلود میکنیم
 downloaded_resources = {
     "configs": False,
     "tokenizer_vq8192": False,
@@ -22,7 +22,6 @@ downloaded_resources = {
 }
 def install_espeak():
-    """Detect and install espeak-ng dependency"""
     try:
         result = subprocess.run(["which", "espeak-ng"], capture_output=True, text=True)
         if result.returncode != 0:
@@ -30,7 +29,7 @@ def install_espeak():
             subprocess.run(["apt-get", "update"], check=True)
             subprocess.run(["apt-get", "install", "-y", "espeak-ng", "espeak-ng-data"], check=True)
         else:
-            print("espeak-ng is already installed.")
     except Exception as e:
         print(f"Error installing espeak-ng: {e}")
@@ -69,9 +68,7 @@ def patch_langsegment_init():
                 import LangSegment
                 importlib.reload(LangSegment)
             except: pass
-    except Exception as e:
-        print(f"Error patching LangSegment: {e}")
 patch_langsegment_init()
@@ -88,22 +85,8 @@ if os.path.dirname(os.path.abspath("Amphion")) not in sys.path:
 os.makedirs("wav", exist_ok=True)
 os.makedirs("ckpts/Vevo", exist_ok=True)
-from models.vc.vevo.vevo_utils import VevoInferencePipeline
-# تابع ذخیره سازی اختصاصی
-def my_save_audio(waveform, output_path, sample_rate=24000):
-    try:
-        if isinstance(waveform, torch.Tensor):
-            waveform = waveform.detach().cpu()
-            if waveform.dim() == 2 and waveform.shape[0] == 1:
-                waveform = waveform.squeeze(0)
-            waveform = waveform.numpy()
-        sf.write(output_path, waveform, sample_rate)
-        print(f"Audio saved successfully to {output_path}")
-    except Exception as e:
-        print(f"Failed to save audio with soundfile: {e}")
-        raise e
 def setup_configs():
     if downloaded_resources["configs"]: return
@@ -128,7 +111,7 @@ print(f"Using device: {device}")
 inference_pipelines = {}
 def preload_all_resources():
-    print("Preloading Timbre resources...")
     setup_configs()
     global downloaded_content_style_tokenizer_path
@@ -149,8 +132,7 @@ def preload_all_resources():
         local_dir = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["acoustic_modeling/Vocoder/*"])
         downloaded_vocoder_path = local_dir
         downloaded_resources["vocoder"] = True
-    print("Timbre resources ready!")
 downloaded_content_style_tokenizer_path = None
 downloaded_fmt_path = None
@@ -162,18 +144,12 @@ def get_pipeline():
     if "timbre" in inference_pipelines:
         return inference_pipelines["timbre"]
-    content_style_tokenizer_ckpt_path = os.path.join(downloaded_content_style_tokenizer_path, "tokenizer/vq8192")
-    fmt_cfg_path = "./models/vc/vevo/config/Vq8192ToMels.json"
-    fmt_ckpt_path = os.path.join(downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels")
-    vocoder_cfg_path = "./models/vc/vevo/config/Vocoder.json"
-    vocoder_ckpt_path = os.path.join(downloaded_vocoder_path, "acoustic_modeling/Vocoder")
     pipeline = VevoInferencePipeline(
-        content_style_tokenizer_ckpt_path=content_style_tokenizer_ckpt_path,
-        fmt_cfg_path=fmt_cfg_path,
-        fmt_ckpt_path=fmt_ckpt_path,
-        vocoder_cfg_path=vocoder_cfg_path,
-        vocoder_ckpt_path=vocoder_ckpt_path,
         device=device,
     )
@@ -182,122 +158,85 @@ def get_pipeline():
 @spaces.GPU()
 def vevo_timbre(content_wav, reference_wav):
-    temp_content_path = "wav/temp_content.wav"
-    temp_reference_path = "wav/temp_reference.wav"
-    output_path = "wav/output_vevotimbre.wav"
     if content_wav is None or reference_wav is None:
         raise ValueError("Please upload audio files")
-    # --- بارگذاری و پردازش صدای اصلی (Content) ---
-    if isinstance(content_wav, tuple):
-        content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
-    else:
-        content_sr, content_data = content_wav
-    if len(content_data.shape) > 1 and content_data.shape[1] > 1:
-        content_data = np.mean(content_data, axis=1)
-    content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
-    if content_sr != 24000:
-        content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
-        content_sr = 24000
-    content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
-    # --- بارگذاری و پردازش صدای رفرنس (Reference) ---
-    if isinstance(reference_wav, tuple):
-        ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
-    else:
-        ref_sr, ref_data = reference_wav
-    if len(ref_data.shape) > 1 and ref_data.shape[1] > 1:
-        ref_data = np.mean(ref_data, axis=1)
-    ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
-    if ref_sr != 24000:
-        ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
-        ref_sr = 24000
-    ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
-    # برش زدن صدای رفرنس به 20 ثانیه اول (برای جلوگیری از گیج شدن مدل)
-    # صدای رفرنس فقط برای برداشتن "رنگ صدا" استفاده میشه و 20 ثانیه کافیه
-    if ref_tensor.shape[1] > 24000 * 20:
-        ref_tensor = ref_tensor[:, :24000 * 20]
-    # ذخیره موقت صدای رفرنس
-    sf.write(temp_reference_path, ref_tensor.squeeze().cpu().numpy(), ref_sr)
-    print(f"Total Duration: {content_tensor.shape[1]/24000:.2f}s")
-    # --- تکه تکه کردن صدای اصلی (Chunking Logic) ---
-    pipeline = get_pipeline()
-    CHUNK_DURATION = 15 # ثانیه (اندازه هر تکه)
-    CHUNK_SAMPLES = CHUNK_DURATION * 24000
-    total_samples = content_tensor.shape[1]
-    generated_chunks = []
-    # حلقه برای پردازش تکه تکه
-    for i in range(0, total_samples, CHUNK_SAMPLES):
-        end = min(i + CHUNK_SAMPLES, total_samples)
-        chunk = content_tensor[:, i:end]
-        print(f"Processing Chunk: {i/24000:.1f}s to {end/24000:.1f}s")
-        # ذخیره تکه جاری
-        sf.write(temp_content_path, chunk.squeeze().cpu().numpy(), 24000)
-        try:
-            # پردازش تکه
-            gen_chunk = pipeline.inference_fm(
-                src_wav_path=temp_content_path,
-                timbre_ref_wav_path=temp_reference_path,
-                flow_matching_steps=32,
-            )
-            # بررسی خرابی احتمالی
-            if torch.isnan(gen_chunk).any() or torch.isinf(gen_chunk).any():
-                print("Warning: NaN in chunk, fixing...")
-                gen_chunk = torch.nan_to_num(gen_chunk, nan=0.0, posinf=0.95, neginf=-0.95)
-            # اضافه کردن به لیست خروجی‌ها (مطمئن میشیم دوبعدی باشه [1, T])
-            if gen_chunk.dim() == 1:
-                gen_chunk = gen_chunk.unsqueeze(0)
-            generated_chunks.append(gen_chunk.cpu())
-        except Exception as e:
-            print(f"Error processing chunk starting at {i}: {e}")
-            # در صورت خطا در یک تکه، سکوت جایگزین میکنیم تا کل فایل خراب نشه
-            silence = torch.zeros_like(chunk)
-            generated_chunks.append(silence)
-    # --- چسباندن تکه‌ها به هم ---
-    if not generated_chunks:
-        raise ValueError("No audio generated")
-    final_audio = torch.cat(generated_chunks, dim=1)
-    print(f"Final Audio Duration: {final_audio.shape[1]/24000:.2f}s")
-    # ذخیره خروجی نهایی
-    my_save_audio(final_audio, output_path=output_path)
-    return output_path
-# رابط کاربری
-with gr.Blocks(title="Vevo-Timbre (Long Audio Fix)") as demo:
-    gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion (Unlimited Length)")
-    gr.Markdown("این نسخه فایل‌های طولانی را به صورت اتوماتیک به تکه‌های ۱۵ ثانیه‌ای تقسیم کرده و پردازش می‌کند تا صدا خراب نشود.")
     with gr.Row():
         with gr.Column():
-            timbre_content = gr.Audio(label="Source Audio (صدای اصلی - هر چقدر طولانی باشد مشکلی نیست)", type="numpy")
-            timbre_reference = gr.Audio(label="Target Timbre (صدای هدف - ۲۰ ثانیه اول استفاده میشود)", type="numpy")
-            timbre_button = gr.Button("Generate (ساخت صدا)", variant="primary")
         with gr.Column():
-            timbre_output = gr.Audio(label="Result (خروجی نهایی)")
     timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)

 import subprocess
 import re
 import spaces
+import uuid
+# دانلود فقط منابع ضروری
 downloaded_resources = {
     "configs": False,
     "tokenizer_vq8192": False,
 }
 def install_espeak():
     try:
         result = subprocess.run(["which", "espeak-ng"], capture_output=True, text=True)
         if result.returncode != 0:
             subprocess.run(["apt-get", "update"], check=True)
             subprocess.run(["apt-get", "install", "-y", "espeak-ng", "espeak-ng-data"], check=True)
         else:
+            print("espeak-ng is installed.")
     except Exception as e:
         print(f"Error installing espeak-ng: {e}")
                 import LangSegment
                 importlib.reload(LangSegment)
             except: pass
+    except: pass
 patch_langsegment_init()
 os.makedirs("wav", exist_ok=True)
 os.makedirs("ckpts/Vevo", exist_ok=True)
+# اینجا دیگر مشکلی ندارد چون نسخه torchaudio را درست کردیم
+from models.vc.vevo.vevo_utils import VevoInferencePipeline, save_audio
 def setup_configs():
     if downloaded_resources["configs"]: return
 inference_pipelines = {}
 def preload_all_resources():
+    print("Preloading resources...")
     setup_configs()
     global downloaded_content_style_tokenizer_path
         local_dir = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["acoustic_modeling/Vocoder/*"])
         downloaded_vocoder_path = local_dir
         downloaded_resources["vocoder"] = True
+    print("Resources ready.")
 downloaded_content_style_tokenizer_path = None
 downloaded_fmt_path = None
     if "timbre" in inference_pipelines:
         return inference_pipelines["timbre"]
     pipeline = VevoInferencePipeline(
+        content_style_tokenizer_ckpt_path=os.path.join(downloaded_content_style_tokenizer_path, "tokenizer/vq8192"),
+        fmt_cfg_path="./models/vc/vevo/config/Vq8192ToMels.json",
+        fmt_ckpt_path=os.path.join(downloaded_fmt_path, "acoustic_modeling/Vq8192ToMels"),
+        vocoder_cfg_path="./models/vc/vevo/config/Vocoder.json",
+        vocoder_ckpt_path=os.path.join(downloaded_vocoder_path, "acoustic_modeling/Vocoder"),
         device=device,
     )
 @spaces.GPU()
 def vevo_timbre(content_wav, reference_wav):
+    # ایجاد نام فایل منحصر به فرد برای جلوگیری از تداخل کاربران
+    session_id = str(uuid.uuid4())[:8]
+    temp_content_path = f"wav/c_{session_id}.wav"
+    temp_reference_path = f"wav/r_{session_id}.wav"
+    output_path = f"wav/out_{session_id}.wav"
     if content_wav is None or reference_wav is None:
         raise ValueError("Please upload audio files")
+    try:
+        # --- پردازش صدای اصلی ---
+        if isinstance(content_wav, tuple):
+            content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
+        else:
+            content_sr, content_data = content_wav
+        if len(content_data.shape) > 1 and content_data.shape[1] > 1:
+            content_data = np.mean(content_data, axis=1)
+        content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
+        if content_sr != 24000:
+            content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
+            content_sr = 24000
+        content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
+        # --- پردازش صدای رفرنس ---
+        if isinstance(reference_wav, tuple):
+            ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
+        else:
+            ref_sr, ref_data = reference_wav
+        if len(ref_data.shape) > 1 and ref_data.shape[1] > 1:
+            ref_data = np.mean(ref_data, axis=1)
+        ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
+        if ref_sr != 24000:
+            ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
+            ref_sr = 24000
+        ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
+        # ذخیره فایل‌ها با torchaudio (چون نسخه قدیمی است، بدون ارور کار می‌کند و فرمت دقیق را حفظ می‌کند)
+        torchaudio.save(temp_content_path, content_tensor, content_sr)
+        torchaudio.save(temp_reference_path, ref_tensor, ref_sr)
+        print(f"[{session_id}] Processing Audio...")
+        pipeline = get_pipeline()
+        # اجرای مدل روی کل فایل (بدون تکه تکه کردن - چون قبلاً اینطوری کار می‌کرد)
+        gen_audio = pipeline.inference_fm(
+            src_wav_path=temp_content_path,
+            timbre_ref_wav_path=temp_reference_path,
+            flow_matching_steps=32,
+        )
+        if torch.isnan(gen_audio).any() or torch.isinf(gen_audio).any():
+            print("Warning: NaN fixed")
+            gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
+        save_audio(gen_audio, output_path=output_path)
+        return output_path
+    finally:
+        # پاکسازی فایل‌های موقت
+        if os.path.exists(temp_content_path): os.remove(temp_content_path)
+        if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
+with gr.Blocks(title="Vevo-Timbre (Stable)") as demo:
+    gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
     with gr.Row():
         with gr.Column():
+            timbre_content = gr.Audio(label="Source Audio", type="numpy")
+            timbre_reference = gr.Audio(label="Target Timbre", type="numpy")
+            timbre_button = gr.Button("Generate", variant="primary")
         with gr.Column():
+            timbre_output = gr.Audio(label="Result")
     timbre_button.click(vevo_timbre, inputs=[timbre_content, timbre_reference], outputs=timbre_output)