Spaces:

Opera8
/

Sada

Running on Zero

App Files Files Community

Opera8 committed 21 days ago

Commit

a4df331

verified ·

1 Parent(s): 4863b79

Update app.py

Browse files

Files changed (1) hide show

app.py +28 -15

app.py CHANGED Viewed

@@ -12,8 +12,9 @@ import subprocess
 import re
 import spaces
 import uuid
-# Download only the essential resources
 downloaded_resources = {
     "configs": False,
     "tokenizer_vq8192": False,
@@ -28,8 +29,6 @@ def install_espeak():
             print("Installing espeak-ng...")
             subprocess.run(["apt-get", "update"], check=True)
             subprocess.run(["apt-get", "install", "-y", "espeak-ng", "espeak-ng-data"], check=True)
-        else:
-            print("espeak-ng is installed.")
     except Exception as e:
         print(f"Error installing espeak-ng: {e}")
@@ -85,8 +84,20 @@ if os.path.dirname(os.path.abspath("Amphion")) not in sys.path:
 os.makedirs("wav", exist_ok=True)
 os.makedirs("ckpts/Vevo", exist_ok=True)
-# No longer a problem here because we fixed the torchaudio version
-from models.vc.vevo.vevo_utils import VevoInferencePipeline, save_audio
 def setup_configs():
     if downloaded_resources["configs"]: return
@@ -158,7 +169,7 @@ def get_pipeline():
 @spaces.GPU()
 def vevo_timbre(content_wav, reference_wav):
-    # ایجاد نام فایل منحصر به فرد برای جلوگیری از تداخل کاربران
     session_id = str(uuid.uuid4())[:8]
     temp_content_path = f"wav/c_{session_id}.wav"
     temp_reference_path = f"wav/r_{session_id}.wav"
@@ -168,7 +179,7 @@ def vevo_timbre(content_wav, reference_wav):
         raise ValueError("Please upload audio files")
     try:
-        # --- پردازش صدای اصلی ---
         if isinstance(content_wav, tuple):
             content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
         else:
@@ -178,13 +189,15 @@ def vevo_timbre(content_wav, reference_wav):
             content_data = np.mean(content_data, axis=1)
         content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
-        # --- پردازش صدای رفرنس ---
         if isinstance(reference_wav, tuple):
             ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
         else:
@@ -200,15 +213,15 @@ def vevo_timbre(content_wav, reference_wav):
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
-        # ذخیره فایل‌ها با torchaudio (چون نسخه قدیمی است، بدون ارور کار می‌کند و فرمت دقیق را حفظ می‌کند)
-        torchaudio.save(temp_content_path, content_tensor, content_sr)
-        torchaudio.save(temp_reference_path, ref_tensor, ref_sr)
-        print(f"[{session_id}] Processing Audio...")
         pipeline = get_pipeline()
-        # اجرای مدل روی کل فایل (بدون تکه تکه کردن - چون قبلاً اینطوری کار می‌کرد)
         gen_audio = pipeline.inference_fm(
             src_wav_path=temp_content_path,
             timbre_ref_wav_path=temp_reference_path,
@@ -219,7 +232,7 @@ def vevo_timbre(content_wav, reference_wav):
             print("Warning: NaN fixed")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
-        save_audio(gen_audio, output_path=output_path)
         return output_path
     finally:
@@ -227,7 +240,7 @@ def vevo_timbre(content_wav, reference_wav):
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
-with gr.Blocks(title="Vevo-Timbre (Stable)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
     with gr.Row():

 import re
 import spaces
 import uuid
+import soundfile as sf  # استفاده مستقیم برای حل مشکل ذخیره‌سازی
+# Only the essential resources
 downloaded_resources = {
     "configs": False,
     "tokenizer_vq8192": False,
             print("Installing espeak-ng...")
             subprocess.run(["apt-get", "update"], check=True)
             subprocess.run(["apt-get", "install", "-y", "espeak-ng", "espeak-ng-data"], check=True)
     except Exception as e:
         print(f"Error installing espeak-ng: {e}")
 os.makedirs("wav", exist_ok=True)
 os.makedirs("ckpts/Vevo", exist_ok=True)
+from models.vc.vevo.vevo_utils import VevoInferencePipeline
+# تابع ذخیره سازی امن (جایگزین torchaudio.save)
+def my_save_audio(waveform, output_path, sample_rate=24000):
+    try:
+        if isinstance(waveform, torch.Tensor):
+            waveform = waveform.detach().cpu()
+            if waveform.dim() == 2 and waveform.shape[0] == 1:
+                waveform = waveform.squeeze(0)
+            waveform = waveform.numpy()
+        sf.write(output_path, waveform, sample_rate)
+    except Exception as e:
+        print(f"Save error: {e}")
+        raise e
 def setup_configs():
     if downloaded_resources["configs"]: return
 @spaces.GPU()
 def vevo_timbre(content_wav, reference_wav):
+    # 1. ایجاد نام یکتا برای هر کاربر (جلوگیری از قاطی شدن فایل‌ها)
     session_id = str(uuid.uuid4())[:8]
     temp_content_path = f"wav/c_{session_id}.wav"
     temp_reference_path = f"wav/r_{session_id}.wav"
         raise ValueError("Please upload audio files")
     try:
+        # --- پردازش و نرمال‌سازی صداها ---
         if isinstance(content_wav, tuple):
             content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
         else:
             content_data = np.mean(content_data, axis=1)
         content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
+        # مهم: استفاده از torchaudio برای ریسمپل دقیق (جلوگیری از نویز)
         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
             content_sr = 24000
+        # نرمال‌سازی صدا (خیلی مهم برای کیفیت)
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
+        # --- پردازش رفرنس ---
         if isinstance(reference_wav, tuple):
             ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
         else:
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
+        # استفاده از soundfile برای ذخیره (چون torchaudio در نسخه جدید ارور می‌دهد)
+        sf.write(temp_content_path, content_tensor.squeeze().cpu().numpy(), content_sr)
+        sf.write(temp_reference_path, ref_tensor.squeeze().cpu().numpy(), ref_sr)
+        print(f"[{session_id}] Processing Audio ({content_tensor.shape[1]/24000:.2f}s)...")
         pipeline = get_pipeline()
+        # اجرای مدل روی کل فایل (بدون تکه تکه کردن)
         gen_audio = pipeline.inference_fm(
             src_wav_path=temp_content_path,
             timbre_ref_wav_path=temp_reference_path,
             print("Warning: NaN fixed")
             gen_audio = torch.nan_to_num(gen_audio, nan=0.0, posinf=0.95, neginf=-0.95)
+        my_save_audio(gen_audio, output_path=output_path)
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
+with gr.Blocks(title="Vevo-Timbre (Secure)") as demo:
     gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
     with gr.Row():