Spaces:

Opera8
/

Sada

Running on Zero

App Files Community

Opera8 committed 21 days ago

Commit

501163c

verified ·

1 Parent(s): 84bdd36

Update app.py

Browse files

Files changed (1) hide show

app.py +91 -75

app.py CHANGED Viewed

@@ -13,11 +13,8 @@ import re
 import spaces
 import uuid
 import soundfile as sf
-# اضافه شدن کتابخانه PyDub
-from pydub import AudioSegment
-import io
-# --- نصب و پچ کردن ---
 downloaded_resources = {
     "configs": False,
     "tokenizer_vq8192": False,
@@ -75,22 +72,13 @@ os.makedirs("ckpts/Vevo", exist_ok=True)
 from models.vc.vevo.vevo_utils import VevoInferencePipeline
-# --- توابع کمکی جدید برای PyDub ---
-def numpy_to_audiosegment(audio_arr, sample_rate=24000):
-    """تبدیل آرایه نامپای (Float32) به آبجکت AudioSegment"""
-    # تبدیل به PCM 16-bit
-    audio_int16 = (audio_arr * 32767).astype(np.int16)
-    # ایجاد فایل در حافظه
-    byte_io = io.BytesIO()
-    sf.write(byte_io, audio_int16, sample_rate, format='WAV', subtype='PCM_16')
-    byte_io.seek(0)
-    return AudioSegment.from_wav(byte_io)
 def save_audio_pcm16(waveform, output_path, sample_rate=24000):
-    # این تابع فقط برای ذخیره فایل‌های موقت ورودی مدل است
     try:
         if isinstance(waveform, torch.Tensor):
-            waveform = waveform.detach().cpu().squeeze().numpy()
         sf.write(output_path, waveform, sample_rate, subtype='PCM_16')
     except Exception as e:
         print(f"Save error: {e}")
@@ -115,10 +103,8 @@ device = torch.device("cuda") if torch.cuda.is_available() else torch.device("cp
 inference_pipelines = {}
 def preload_all_resources():
-    print("Preloading resources...")
     setup_configs()
     global downloaded_content_style_tokenizer_path, downloaded_fmt_path, downloaded_vocoder_path
     if not downloaded_resources["tokenizer_vq8192"]:
         downloaded_content_style_tokenizer_path = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["tokenizer/vq8192/*"])
         downloaded_resources["tokenizer_vq8192"] = True
@@ -128,7 +114,6 @@ def preload_all_resources():
     if not downloaded_resources["vocoder"]:
         downloaded_vocoder_path = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["acoustic_modeling/Vocoder/*"])
         downloaded_resources["vocoder"] = True
-    print("Resources ready.")
 downloaded_content_style_tokenizer_path = None
 downloaded_fmt_path = None
@@ -159,6 +144,8 @@ def vevo_timbre(content_wav, reference_wav):
         raise ValueError("Please upload audio files")
     try:
         # --- 1. پردازش ورودی ---
         if isinstance(content_wav, tuple):
             content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
@@ -167,8 +154,8 @@ def vevo_timbre(content_wav, reference_wav):
         if len(content_data.shape) > 1: content_data = np.mean(content_data, axis=1)
         content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
-        if content_sr != 24000:
-            content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
         content_full_np = content_tensor.squeeze().numpy()
@@ -180,37 +167,56 @@ def vevo_timbre(content_wav, reference_wav):
         if len(ref_data.shape) > 1: ref_data = np.mean(ref_data, axis=1)
         ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
-        if ref_sr != 24000:
-            ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
-        if ref_tensor.shape[1] > 24000 * 20: ref_tensor = ref_tensor[:, :24000 * 20]
-        save_audio_pcm16(ref_tensor, temp_reference_path, 24000)
-        # --- 3. منطق پردازش با استفاده از PyDub ---
         pipeline = get_pipeline()
-        SR = 24000
-        NEW_CHUNK_SEC = 10.0
-        CONTEXT_SEC = 3.0
-        new_chunk_samples = int(NEW_CHUNK_SEC * SR)
-        context_samples = int(CONTEXT_SEC * SR)
         total_samples = len(content_full_np)
-        # ایجاد یک AudioSegment خالی برای جمع‌آوری خروجی نهایی
-        final_audio_segment = AudioSegment.empty()
-        current_cursor = 0
-        print(f"[{session_id}] Processing with PyDub stitching...")
-        while current_cursor < total_samples:
-            start_slice = max(0, current_cursor - context_samples)
-            end_slice = min(total_samples, current_cursor + new_chunk_samples)
-            if start_slice >= end_slice: break
-            chunk_np = content_full_np[start_slice:end_slice]
-            save_audio_pcm16(torch.FloatTensor(chunk_np).unsqueeze(0), temp_content_path, SR)
             try:
                 gen = pipeline.inference_fm(
@@ -218,53 +224,63 @@ def vevo_timbre(content_wav, reference_wav):
                     timbre_ref_wav_path=temp_reference_path,
                     flow_matching_steps=64,
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
                 gen_np = gen.detach().cpu().squeeze().numpy()
-                # محاسبه مقدار برشی (حذف کانتکست تکراری)
-                trim_samples = current_cursor - start_slice
-                if len(gen_np) > trim_samples:
-                    valid_part_np = gen_np[trim_samples:]
-                    # تبدیل به فرمت PyDub
-                    new_segment = numpy_to_audiosegment(valid_part_np, SR)
-                    # اتصال:
-                    # اگر اولین تکه نیست، یک فید (Crossfade) بسیار کوتاه (5 میلی ثانیه)
-                    # اعمال می‌کنیم تا صدای "تیک" حذف شود.
-                    if len(final_audio_segment) > 0:
-                        # تکنیک: یک فید بسیار ریز (Crossfade 5ms)
-                        # نکته: PyDub برای کراس‌فید نیاز به همپوشانی دارد، اما چون ما کانتکست را دقیق بریدیم،
-                        # اینجا از append ساده استفاده می‌کنیم و فقط لبه‌ها را نرم می‌کنیم.
-                        # نرم کردن ابتدای تکه جدید (Fade In 5ms)
-                        new_segment = new_segment.fade_in(5)
-                        # نرم کردن انتهای تکه قبلی (Fade Out 5ms) - (قبلاً انجام شده یا الان انجام میدیم)
-                        # در اینجا فقط چسباندن (Append) با فید این کافیست.
-                        final_audio_segment += new_segment
                     else:
-                        final_audio_segment += new_segment
-                current_cursor = end_slice
             except Exception as e:
-                print(f"Error: {e}")
-                current_cursor = end_slice # Skip on error
-        # ذخیره خروجی نهایی با PyDub
-        final_audio_segment.export(output_path, format="wav")
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
-with gr.Blocks(title="Vevo-Timbre (PyDub)") as demo:
-    gr.Markdown("## Vevo-Timbre: Voice Conversion")
-    gr.Markdown("Seamless stitching powered by PyDub library.")
     with gr.Row():
         with gr.Column():

 import spaces
 import uuid
 import soundfile as sf
+# --- تنظیمات و نصب ---
 downloaded_resources = {
     "configs": False,
     "tokenizer_vq8192": False,
 from models.vc.vevo.vevo_utils import VevoInferencePipeline
 def save_audio_pcm16(waveform, output_path, sample_rate=24000):
     try:
         if isinstance(waveform, torch.Tensor):
+            waveform = waveform.detach().cpu()
+            if waveform.dim() == 2 and waveform.shape[0] == 1:
+                waveform = waveform.squeeze(0)
+            waveform = waveform.numpy()
         sf.write(output_path, waveform, sample_rate, subtype='PCM_16')
     except Exception as e:
         print(f"Save error: {e}")
 inference_pipelines = {}
 def preload_all_resources():
     setup_configs()
     global downloaded_content_style_tokenizer_path, downloaded_fmt_path, downloaded_vocoder_path
     if not downloaded_resources["tokenizer_vq8192"]:
         downloaded_content_style_tokenizer_path = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["tokenizer/vq8192/*"])
         downloaded_resources["tokenizer_vq8192"] = True
     if not downloaded_resources["vocoder"]:
         downloaded_vocoder_path = snapshot_download(repo_id="amphion/Vevo", repo_type="model", cache_dir="./ckpts/Vevo", allow_patterns=["acoustic_modeling/Vocoder/*"])
         downloaded_resources["vocoder"] = True
 downloaded_content_style_tokenizer_path = None
 downloaded_fmt_path = None
         raise ValueError("Please upload audio files")
     try:
+        SR = 24000
         # --- 1. پردازش ورودی ---
         if isinstance(content_wav, tuple):
             content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
         if len(content_data.shape) > 1: content_data = np.mean(content_data, axis=1)
         content_tensor = torch.FloatTensor(content_data).unsqueeze(0)
+        if content_sr != SR:
+            content_tensor = torchaudio.functional.resample(content_tensor, content_sr, SR)
         content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
         content_full_np = content_tensor.squeeze().numpy()
         if len(ref_data.shape) > 1: ref_data = np.mean(ref_data, axis=1)
         ref_tensor = torch.FloatTensor(ref_data).unsqueeze(0)
+        if ref_sr != SR:
+            ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, SR)
         ref_tensor = ref_tensor / (torch.max(torch.abs(ref_tensor)) + 1e-6) * 0.95
+        if ref_tensor.shape[1] > SR * 20: ref_tensor = ref_tensor[:, :SR * 20]
+        save_audio_pcm16(ref_tensor, temp_reference_path, SR)
+        # --- 3. استراتژی جوش دادن Equal Power (500ms) ---
         pipeline = get_pipeline()
+        # تنظیمات حیاتی
+        CHUNK_DURATION = 10.0   # طول خالص هر تکه
+        CROSSFADE_SEC = 0.5     # طول هم‌پوشانی (نیم ثانیه برای حذف لرزش)
+        chunk_samples = int(CHUNK_DURATION * SR)
+        crossfade_samples = int(CROSSFADE_SEC * SR)
         total_samples = len(content_full_np)
+        final_output = np.array([], dtype=np.float32)
+        # ایجاد منحنی فید Equal Power (سینوسی)
+        # این منحنی باعث می‌شود حجم صدا در محل اتصال ثابت بماند
+        fade_out_curve = np.cos(np.linspace(0, np.pi/2, crossfade_samples))
+        fade_in_curve = np.sin(np.linspace(0, np.pi/2, crossfade_samples))
+        # شروع حلقه پردازش
+        # ما در هر مرحله به اندازه chunk_samples جلو می‌رویم
+        # اما برای ورودی مدل، crossfade_samples را از قبل هم برمی‌داریم
+        cursor = 0
+        print(f"[{session_id}] Processing with 500ms Equal-Power Crossfade...")
+        while cursor < total_samples:
+            # تعیین بازه ورودی برای مدل
+            # اگر اولین تکه نیست، باید کمی از عقب‌تر شروع کنیم (برای هم‌پوشانی)
+            is_first_chunk = (cursor == 0)
+            start_idx = cursor
+            if not is_first_chunk:
+                start_idx -= crossfade_samples  # عقب‌گرد برای هم‌پوشانی
+            end_idx = min(total_samples, cursor + chunk_samples)
+            # اگر به انتهای فایل رسیدیم و تکه خیلی کوچک است
+            if start_idx >= end_idx:
+                break
+            current_chunk_input = content_full_np[start_idx:end_idx]
+            # ذخیره و اجرا
+            save_audio_pcm16(torch.FloatTensor(current_chunk_input).unsqueeze(0), temp_content_path, SR)
             try:
                 gen = pipeline.inference_fm(
                     timbre_ref_wav_path=temp_reference_path,
                     flow_matching_steps=64,
                 )
                 if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
                 gen_np = gen.detach().cpu().squeeze().numpy()
+                # --- عملیات میکس هوشمند ---
+                if is_first_chunk:
+                    # تکه اول: مستقیماً اضافه کن
+                    final_output = np.concatenate([final_output, gen_np])
+                else:
+                    # تکه‌های بعدی:
+                    # 1. بخش هم‌پوشانی (Crossfade Area)
+                    # 2. بخش جدید (New Area)
+                    if len(gen_np) < crossfade_samples:
+                        # اگر خروجی خیلی کوتاه بود (نادر)، فقط بچسبان
+                        final_output = np.concatenate([final_output, gen_np])
                     else:
+                        # جدا کردن بخش میکس و بخش جدید از خروجی فعلی
+                        overlap_part_new = gen_np[:crossfade_samples]
+                        rest_part_new = gen_np[crossfade_samples:]
+                        # جدا کردن بخش میکس از انتهای خروجی قبلی
+                        if len(final_output) >= crossfade_samples:
+                            overlap_part_old = final_output[-crossfade_samples:]
+                            # فرمول Equal Power Crossfade
+                            # Old * Cos + New * Sin
+                            blended = (overlap_part_old * fade_out_curve) + (overlap_part_new * fade_in_curve)
+                            # جایگزینی انتهای آرایه اصلی با بخش میکس شده
+                            final_output[-crossfade_samples:] = blended
+                            # اضافه کردن باقی‌مانده
+                            final_output = np.concatenate([final_output, rest_part_new])
+                        else:
+                            # اگر بافر قبلی خیلی کوتاه بود (نباید پیش بیاید)
+                            final_output = np.concatenate([final_output, gen_np])
             except Exception as e:
+                print(f"Error at {cursor}: {e}")
+                # در صورت خطا سکوت اضافه کن
+                missing = end_idx - start_idx
+                final_output = np.concatenate([final_output, np.zeros(missing)])
+            # حرکت به جلو
+            cursor += chunk_samples
+        save_audio_pcm16(final_output, output_path, SR)
         return output_path
     finally:
         if os.path.exists(temp_content_path): os.remove(temp_content_path)
         if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
+with gr.Blocks(title="Vevo-Timbre (Pro Stitch)") as demo:
+    gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
+    gr.Markdown("Professional Stitching: 500ms Equal-Power Crossfade (No Jitter, No Ghosting).")
     with gr.Row():
         with gr.Column():