Spaces:

Opera8
/

Sada

Running on Zero

App Files Files Community

Opera8 committed 21 days ago

Commit

2457c1e

verified ·

1 Parent(s): 339799c

Update app.py

Browse files

Files changed (1) hide show

app.py +66 -125

app.py CHANGED Viewed

@@ -13,8 +13,11 @@ import re
 import spaces
 import uuid
 import soundfile as sf
-# --- تنظیمات اولیه و نصب پکیج‌ها ---
 downloaded_resources = {
     "configs": False,
     "tokenizer_vq8192": False,
@@ -34,25 +37,15 @@ def install_espeak():
 install_espeak()
-# پچ کردن مشکل LangSegment
 def patch_langsegment_init():
     try:
         spec = importlib.util.find_spec("LangSegment")
         if spec is None or spec.origin is None: return
         init_path = os.path.join(os.path.dirname(spec.origin), '__init__.py')
-        if not os.path.exists(init_path):
-            for site_pkg_path in site.getsitepackages():
-                potential_path = os.path.join(site_pkg_path, 'LangSegment', '__init__.py')
-                if os.path.exists(potential_path):
-                    init_path = potential_path
-                    break
-            else: return
         with open(init_path, 'r') as f: lines = f.readlines()
         modified = False
         new_lines = []
         target_line_prefix = "from .LangSegment import"
         for line in lines:
             if line.strip().startswith(target_line_prefix) and ('setLangfilters' in line or 'getLangfilters' in line):
                 mod_line = line.replace(',setLangfilters', '').replace(',getLangfilters', '')
@@ -61,7 +54,6 @@ def patch_langsegment_init():
                 modified = True
             else:
                 new_lines.append(line)
         if modified:
             with open(init_path, 'w') as f: f.writelines(new_lines)
             try:
@@ -72,14 +64,9 @@ def patch_langsegment_init():
 patch_langsegment_init()
-# دریافت ریپازیتوری Amphion
 if not os.path.exists("Amphion"):
     subprocess.run(["git", "clone", "https://github.com/open-mmlab/Amphion.git"])
     os.chdir("Amphion")
-else:
-    if not os.getcwd().endswith("Amphion"):
-        os.chdir("Amphion")
 if os.path.dirname(os.path.abspath("Amphion")) not in sys.path:
     sys.path.append(os.path.dirname(os.path.abspath("Amphion")))
@@ -88,24 +75,31 @@ os.makedirs("ckpts/Vevo", exist_ok=True)
 from models.vc.vevo.vevo_utils import VevoInferencePipeline
 def save_audio_pcm16(waveform, output_path, sample_rate=24000):
     try:
         if isinstance(waveform, torch.Tensor):
-            waveform = waveform.detach().cpu()
-            if waveform.dim() == 2 and waveform.shape[0] == 1:
-                waveform = waveform.squeeze(0)
-            waveform = waveform.numpy()
         sf.write(output_path, waveform, sample_rate, subtype='PCM_16')
     except Exception as e:
         print(f"Save error: {e}")
-        raise e
 def setup_configs():
     if downloaded_resources["configs"]: return
     config_path = "models/vc/vevo/config"
     os.makedirs(config_path, exist_ok=True)
     config_files = ["Vq8192ToMels.json", "Vocoder.json"]
     for file in config_files:
         file_path = f"{config_path}/{file}"
         if not os.path.exists(file_path):
@@ -116,9 +110,7 @@ def setup_configs():
     downloaded_resources["configs"] = True
 setup_configs()
 device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
-print(f"Using device: {device}")
 inference_pipelines = {}
@@ -128,31 +120,23 @@ def preload_all_resources():
     global downloaded_content_style_tokenizer_path, downloaded_fmt_path, downloaded_vocoder_path
     if not downloaded_resources["tokenizer_vq8192"]:
-        local_dir = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["tokenizer/vq8192/*"])
-        downloaded_content_style_tokenizer_path = local_dir
         downloaded_resources["tokenizer_vq8192"] = True
     if not downloaded_resources["fmt_Vq8192ToMels"]:
-        local_dir = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["acoustic_modeling/Vq8192ToMels/*"])
-        downloaded_fmt_path = local_dir
         downloaded_resources["fmt_Vq8192ToMels"] = True
     if not downloaded_resources["vocoder"]:
-        local_dir = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["acoustic_modeling/Vocoder/*"])
-        downloaded_vocoder_path = local_dir
         downloaded_resources["vocoder"] = True
     print("Resources ready.")
 downloaded_content_style_tokenizer_path = None
 downloaded_fmt_path = None
 downloaded_vocoder_path = None
 preload_all_resources()
 def get_pipeline():
-    if "timbre" in inference_pipelines:
-        return inference_pipelines["timbre"]
     pipeline = VevoInferencePipeline(
         content_style_tokenizer_ckpt_path=os.path.join(downloaded_content_style_tokenizer_path, "tokenizer/vq8192"),
         fmt_cfg_path="./models/vc/vevo/config/Vq8192ToMels.json",
@@ -161,7 +145,6 @@ def get_pipeline():
         vocoder_ckpt_path=os.path.join(downloaded_vocoder_path, "acoustic_modeling/Vocoder"),
         device=device,
     )
     inference_pipelines["timbre"] = pipeline
     return pipeline
@@ -176,88 +159,60 @@ def vevo_timbre(content_wav, reference_wav):
         raise ValueError("Please upload audio files")
     try:
-        # --- 1. آماده‌سازی فایل ورودی (Content) ---
         if isinstance(content_wav, tuple):
             content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
         else:
             content_sr, content_data = content_wav
-        if len(content_data.shape) > 1 and content_data.shape[1] > 1:
-            content_data = np.mean(content_data, axis=1)
         content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
-            content_sr = 24000
-        # نرمال‌سازی
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
         content_full_np = content_tensor.squeeze().numpy()
-        # --- 2. آماده‌سازی فایل رفرنس (Reference) ---
         if isinstance(reference_wav, tuple):
             ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
         else:
             ref_sr, ref_data = reference_wav
-        if len(ref_data.shape) > 1 and ref_data.shape[1] > 1:
-            ref_data = np.mean(ref_data, axis=1)
         ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
         if ref_sr != 24000:
             ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
-            ref_sr = 24000
-        ref_max = torch.max(torch.abs(ref_tensor)) + 1e-6
-        ref_tensor = ref_tensor / ref_max * 0.95
-        if ref_tensor.shape[1] > 24000 * 20:
-             ref_tensor = ref_tensor[:, :24000 * 20]
-        save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
-        # --- 3. منطق جدید: Pre-roll Context (بدون هم‌پوشانی طولانی) ---
         pipeline = get_pipeline()
         SR = 24000
-        # استراتژی:
-        # ما می‌خواهیم ۱۰ ثانیه جدید بسازیم.
-        # اما برای اینکه لحن نپرد، ۳ ثانیه از صدای قبلی را هم به ورودی اضافه می‌کنیم.
-        # بعد از تولید، آن ۳ ثانیه اول را دور می‌ریزیم.
-        NEW_CHUNK_SEC = 10.0  # مقدار صدای جدید در هر مرحله
-        CONTEXT_SEC = 3.0     # مقدار صدای قدیمی برای حفظ لحن
         new_chunk_samples = int(NEW_CHUNK_SEC * SR)
         context_samples = int(CONTEXT_SEC * SR)
         total_samples = len(content_full_np)
-        final_output = []
-        # نشانگر: تا کجای فایل را "نهایی" و تولید کرده‌ایم
         current_cursor = 0
-        print(f"[{session_id}] Starting processing with Context-Discard strategy...")
         while current_cursor < total_samples:
-            # تعیین محدوده برش از فایل اصلی
-            # شروع: کمی عقب‌تر از جایی که هستیم (برای کانتکست)
             start_slice = max(0, current_cursor - context_samples)
-            # پایان: ۱۰ ثانیه جلوتر از جایی که هستیم
             end_slice = min(total_samples, current_cursor + new_chunk_samples)
-            # اگر چیزی برای پردازش نمانده
-            if start_slice >= end_slice:
-                break
-            # استخراج تکه از فایل اصلی
             chunk_np = content_full_np[start_slice:end_slice]
-            # ذخیره موقت برای مدل
             save_audio_pcm16(torch.FloatTensor(chunk_np).unsqueeze(0), temp_content_path, SR)
             try:
-                # تولید صدا توسط مدل
                 gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
@@ -265,65 +220,51 @@ def vevo_timbre(content_wav, reference_wav):
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
-                if gen.dim() == 1: gen = gen.unsqueeze(0)
-                gen_np = gen.cpu().squeeze(0).numpy()
-                # --- برش هوشمند (Trimming) ---
-                # ما (current_cursor - start_slice) مقدار سمپل را قبلا تولید کرده بودیم (کانتکست)
-                # پس باید این مقدار را از ابتدای خروجی جدید حذف کنیم تا صدای دو نفره نشود.
-                trim_amount = current_cursor - start_slice
-                if len(gen_np) > trim_amount:
-                    valid_audio = gen_np[trim_amount:]
-                    # --- اتصال نرم (Micro Cross-fade) ---
-                    # فقط ۱۰ میلی ثانیه فید می‌کنیم تا صدای "تیک" ندهد.
-                    # این مقدار آنقدر کم است که گوش انسان صدای دو نفره نمی‌شنود.
-                    if len(final_output) > 0:
-                        # 10ms fade
-                        fade_len = int(0.01 * SR)
-                        if len(final_output[-1]) > fade_len and len(valid_audio) > fade_len:
-                            fade_out_curve = np.linspace(1, 0, fade_len)
-                            fade_in_curve = np.linspace(0, 1, fade_len)
-                            # میکس فقط روی ۱۰ میلی ثانیه مرز
-                            prev_tail = final_output[-1][-fade_len:]
-                            curr_head = valid_audio[:fade_len]
-                            blended = (prev_tail * fade_out_curve) + (curr_head * fade_in_curve)
-                            # جایگزینی
-                            final_output[-1][-fade_len:] = blended
-                            valid_audio = valid_audio[fade_len:]
-                    final_output.append(valid_audio)
-                # مکان‌نما را جلو می‌بریم
                 current_cursor = end_slice
             except Exception as e:
-                print(f"Error in chunk: {e}")
-                # در صورت خطا، سکوت اضافه می‌کنیم تا زمان‌بندی به هم نریزد
-                missing_len = end_slice - current_cursor
-                final_output.append(np.zeros(missing_len))
-                current_cursor = end_slice
-        # چسباندن نهایی
-        if len(final_output) > 0:
-            full_audio = np.concatenate(final_output)
-        else:
-            full_audio = np.zeros(SR)
-        save_audio_pcm16(full_audio, output_path, SR)
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
-with gr.Blocks(title="Vevo-Timbre (Clean Stitch)") as demo:
-    gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
-    gr.Markdown("No Ghosting Version: Uses Context-Discard buffering to ensure single voice playback.")
     with gr.Row():
         with gr.Column():

 import spaces
 import uuid
 import soundfile as sf
+# اضافه شدن کتابخانه PyDub
+from pydub import AudioSegment
+import io
+# --- نصب و پچ کردن ---
 downloaded_resources = {
     "configs": False,
     "tokenizer_vq8192": False,
 install_espeak()
 def patch_langsegment_init():
     try:
         spec = importlib.util.find_spec("LangSegment")
         if spec is None or spec.origin is None: return
         init_path = os.path.join(os.path.dirname(spec.origin), '__init__.py')
         with open(init_path, 'r') as f: lines = f.readlines()
         modified = False
         new_lines = []
         target_line_prefix = "from .LangSegment import"
         for line in lines:
             if line.strip().startswith(target_line_prefix) and ('setLangfilters' in line or 'getLangfilters' in line):
                 mod_line = line.replace(',setLangfilters', '').replace(',getLangfilters', '')
                 modified = True
             else:
                 new_lines.append(line)
         if modified:
             with open(init_path, 'w') as f: f.writelines(new_lines)
             try:
 patch_langsegment_init()
 if not os.path.exists("Amphion"):
     subprocess.run(["git", "clone", "https://github.com/open-mmlab/Amphion.git"])
     os.chdir("Amphion")
 if os.path.dirname(os.path.abspath("Amphion")) not in sys.path:
     sys.path.append(os.path.dirname(os.path.abspath("Amphion")))
 from models.vc.vevo.vevo_utils import VevoInferencePipeline
+# --- توابع کمکی جدید برای PyDub ---
+def numpy_to_audiosegment(audio_arr, sample_rate=24000):
+    """تبدیل آرایه نامپای (Float32) به آبجکت AudioSegment"""
+    # تبدیل به PCM 16-bit
+    audio_int16 = (audio_arr * 32767).astype(np.int16)
+    # ایجاد فایل در حافظه
+    byte_io = io.BytesIO()
+    sf.write(byte_io, audio_int16, sample_rate, format='WAV', subtype='PCM_16')
+    byte_io.seek(0)
+    return AudioSegment.from_wav(byte_io)
 def save_audio_pcm16(waveform, output_path, sample_rate=24000):
+    # این تابع فقط برای ذخیره فایل‌های موقت ورودی مدل است
     try:
         if isinstance(waveform, torch.Tensor):
+            waveform = waveform.detach().cpu().squeeze().numpy()
         sf.write(output_path, waveform, sample_rate, subtype='PCM_16')
     except Exception as e:
         print(f"Save error: {e}")
 def setup_configs():
     if downloaded_resources["configs"]: return
     config_path = "models/vc/vevo/config"
     os.makedirs(config_path, exist_ok=True)
     config_files = ["Vq8192ToMels.json", "Vocoder.json"]
     for file in config_files:
         file_path = f"{config_path}/{file}"
         if not os.path.exists(file_path):
     downloaded_resources["configs"] = True
 setup_configs()
 device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cpu")
 inference_pipelines = {}
     global downloaded_content_style_tokenizer_path, downloaded_fmt_path, downloaded_vocoder_path
     if not downloaded_resources["tokenizer_vq8192"]:
+        downloaded_content_style_tokenizer_path = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["tokenizer/vq8192/*"])
         downloaded_resources["tokenizer_vq8192"] = True
     if not downloaded_resources["fmt_Vq8192ToMels"]:
+        downloaded_fmt_path = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["acoustic_modeling/Vq8192ToMels/*"])
         downloaded_resources["fmt_Vq8192ToMels"] = True
     if not downloaded_resources["vocoder"]:
+        downloaded_vocoder_path = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["acoustic_modeling/Vocoder/*"])
         downloaded_resources["vocoder"] = True
     print("Resources ready.")
 downloaded_content_style_tokenizer_path = None
 downloaded_fmt_path = None
 downloaded_vocoder_path = None
 preload_all_resources()
 def get_pipeline():
+    if "timbre" in inference_pipelines: return inference_pipelines["timbre"]
     pipeline = VevoInferencePipeline(
         content_style_tokenizer_ckpt_path=os.path.join(downloaded_content_style_tokenizer_path, "tokenizer/vq8192"),
         fmt_cfg_path="./models/vc/vevo/config/Vq8192ToMels.json",
         vocoder_ckpt_path=os.path.join(downloaded_vocoder_path, "acoustic_modeling/Vocoder"),
         device=device,
     )
     inference_pipelines["timbre"] = pipeline
     return pipeline
         raise ValueError("Please upload audio files")
     try:
+        # --- 1. پردازش ورودی ---
         if isinstance(content_wav, tuple):
             content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
         else:
             content_sr, content_data = content_wav
+        if len(content_data.shape) > 1: content_data = np.mean(content_data, axis=1)
         content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
         if content_sr != 24000:
             content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
         content_full_np = content_tensor.squeeze().numpy()
+        # --- 2. پردازش رفرنس ---
         if isinstance(reference_wav, tuple):
             ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
         else:
             ref_sr, ref_data = reference_wav
+        if len(ref_data.shape) > 1: ref_data = np.mean(ref_data, axis=1)
         ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
         if ref_sr != 24000:
             ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
+        ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
+        if ref_tensor.shape[1] > 24000 * 20: ref_tensor = ref_tensor[:, :24000 * 20]
+        save_audio_pcm16(ref_tensor, temp_reference_path, 24000)
+        # --- 3. منطق پردازش با استفاده از PyDub ---
         pipeline = get_pipeline()
         SR = 24000
+        NEW_CHUNK_SEC = 10.0
+        CONTEXT_SEC = 3.0
         new_chunk_samples = int(NEW_CHUNK_SEC * SR)
         context_samples = int(CONTEXT_SEC * SR)
         total_samples = len(content_full_np)
+        # ایجاد یک AudioSegment خالی برای جمع‌آوری خروجی نهایی
+        final_audio_segment = AudioSegment.empty()
         current_cursor = 0
+        print(f"[{session_id}] Processing with PyDub stitching...")
         while current_cursor < total_samples:
             start_slice = max(0, current_cursor - context_samples)
             end_slice = min(total_samples, current_cursor + new_chunk_samples)
+            if start_slice >= end_slice: break
             chunk_np = content_full_np[start_slice:end_slice]
             save_audio_pcm16(torch.FloatTensor(chunk_np).unsqueeze(0), temp_content_path, SR)
             try:
                 gen = pipeline.inference_fm(
                     src_wav_path=temp_content_path,
                     timbre_ref_wav_path=temp_reference_path,
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
+                gen_np = gen.detach().cpu().squeeze().numpy()
+                # محاسبه مقدار برشی (حذف کانتکست تکراری)
+                trim_samples = current_cursor - start_slice
+                if len(gen_np) > trim_samples:
+                    valid_part_np = gen_np[trim_samples:]
+                    # تبدیل به فرمت PyDub
+                    new_segment = numpy_to_audiosegment(valid_part_np, SR)
+                    # اتصال:
+                    # اگر اولین تکه نیست، یک فید (Crossfade) بسیار کوتاه (5 میلی ثانیه)
+                    # اعمال می‌کنیم تا صدای "تیک" حذف شود.
+                    if len(final_audio_segment) > 0:
+                        # تکنیک: یک فید بسیار ریز (Crossfade 5ms)
+                        # نکته: PyDub برای کراس‌فید نیاز به همپوشانی دارد، اما چون ما کانتکست را دقیق بریدیم،
+                        # اینجا از append ساده استفاده می‌کنیم و فقط لبه‌ها را نرم می‌کنیم.
+                        # نرم کردن ابتدای تکه جدید (Fade In 5ms)
+                        new_segment = new_segment.fade_in(5)
+                        # نرم کردن انتهای تکه قبلی (Fade Out 5ms) - (قبلاً انجام شده یا الان انجام میدیم)
+                        # در اینجا فقط چسباندن (Append) با فید این کافیست.
+                        final_audio_segment += new_segment
+                    else:
+                        final_audio_segment += new_segment
                 current_cursor = end_slice
             except Exception as e:
+                print(f"Error: {e}")
+                current_cursor = end_slice # Skip on error
+        # ذخیره خروجی نهایی با PyDub
+        final_audio_segment.export(output_path, format="wav")
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
+with gr.Blocks(title="Vevo-Timbre (PyDub)") as demo:
+    gr.Markdown("## Vevo-Timbre: Voice Conversion")
+    gr.Markdown("Seamless stitching powered by PyDub library.")
     with gr.Row():
         with gr.Column():