Update app.py
Browse files
app.py
CHANGED
|
@@ -187,8 +187,6 @@ def vevo_timbre(content_wav, reference_wav):
|
|
| 187 |
if content_sr != 24000:
|
| 188 |
content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
|
| 189 |
content_sr = 24000
|
| 190 |
-
|
| 191 |
-
# نرمال سازی
|
| 192 |
content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
|
| 193 |
|
| 194 |
# --- آماده سازی Reference ---
|
|
@@ -214,84 +212,67 @@ def vevo_timbre(content_wav, reference_wav):
|
|
| 214 |
|
| 215 |
save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
|
| 216 |
|
| 217 |
-
# --- منطق
|
| 218 |
-
|
|
|
|
| 219 |
|
|
|
|
| 220 |
SR = 24000
|
| 221 |
-
|
| 222 |
-
|
| 223 |
-
|
| 224 |
-
OVERLAP = int(0.1 * SR)
|
| 225 |
-
INPUT_SIZE = CHUNK_LEN + OVERLAP
|
| 226 |
|
| 227 |
total_samples = content_tensor.shape[1]
|
| 228 |
-
print(f"[{session_id}] Processing (
|
| 229 |
|
| 230 |
final_parts = []
|
| 231 |
-
|
| 232 |
|
| 233 |
-
|
| 234 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 235 |
|
| 236 |
-
|
|
|
|
| 237 |
save_audio_pcm16(current_input_chunk, temp_content_path, SR)
|
| 238 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 239 |
try:
|
| 240 |
gen = pipeline.inference_fm(
|
| 241 |
src_wav_path=temp_content_path,
|
| 242 |
timbre_ref_wav_path=temp_reference_path,
|
| 243 |
-
flow_matching_steps=64,
|
| 244 |
)
|
| 245 |
|
| 246 |
if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
|
| 247 |
if gen.dim() == 1: gen = gen.unsqueeze(0)
|
| 248 |
gen = gen.cpu().squeeze(0).numpy()
|
| 249 |
|
| 250 |
-
|
|
|
|
|
|
|
| 251 |
|
| 252 |
-
|
| 253 |
-
|
| 254 |
-
|
| 255 |
-
|
| 256 |
-
overlap_buffer = overlap_buffer[:mix_len]
|
| 257 |
-
|
| 258 |
-
head_to_mix = gen[:mix_len]
|
| 259 |
-
body_rest = gen[mix_len:]
|
| 260 |
-
|
| 261 |
-
# میکس سریع (Fast Cross-Fade)
|
| 262 |
-
alpha = np.linspace(0, 1, mix_len)
|
| 263 |
-
blended_segment = (overlap_buffer * (1 - alpha)) + (head_to_mix * alpha)
|
| 264 |
-
|
| 265 |
-
final_parts.append(blended_segment)
|
| 266 |
-
|
| 267 |
-
if len(body_rest) > OVERLAP:
|
| 268 |
-
pure_body = body_rest[:-OVERLAP]
|
| 269 |
-
final_parts.append(pure_body)
|
| 270 |
-
overlap_buffer = body_rest[-OVERLAP:]
|
| 271 |
-
else:
|
| 272 |
-
final_parts.append(body_rest)
|
| 273 |
-
overlap_buffer = None
|
| 274 |
|
| 275 |
-
else:
|
| 276 |
-
if current_len > OVERLAP:
|
| 277 |
-
final_parts.append(gen[:-OVERLAP])
|
| 278 |
-
overlap_buffer = gen[-OVERLAP:]
|
| 279 |
-
else:
|
| 280 |
-
final_parts.append(gen)
|
| 281 |
-
overlap_buffer = None
|
| 282 |
-
|
| 283 |
except Exception as e:
|
| 284 |
-
print(f"Error
|
| 285 |
-
|
| 286 |
-
|
| 287 |
-
|
| 288 |
-
final_parts.append(
|
| 289 |
-
|
| 290 |
-
|
| 291 |
-
|
| 292 |
-
if overlap_buffer is not None:
|
| 293 |
-
final_parts.append(overlap_buffer)
|
| 294 |
-
|
| 295 |
if len(final_parts) > 0:
|
| 296 |
full_audio = np.concatenate(final_parts)
|
| 297 |
else:
|
|
@@ -304,9 +285,9 @@ def vevo_timbre(content_wav, reference_wav):
|
|
| 304 |
if os.path.exists(temp_content_path): os.remove(temp_content_path)
|
| 305 |
if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
|
| 306 |
|
| 307 |
-
with gr.Blocks(title="Vevo-Timbre (
|
| 308 |
gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
|
| 309 |
-
gr.Markdown("نسخه
|
| 310 |
|
| 311 |
with gr.Row():
|
| 312 |
with gr.Column():
|
|
|
|
| 187 |
if content_sr != 24000:
|
| 188 |
content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
|
| 189 |
content_sr = 24000
|
|
|
|
|
|
|
| 190 |
content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
|
| 191 |
|
| 192 |
# --- آماده سازی Reference ---
|
|
|
|
| 212 |
|
| 213 |
save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
|
| 214 |
|
| 215 |
+
# --- منطق هوشمند (Smart Context Window) ---
|
| 216 |
+
# ما هیچ صدایی را با هم میکس نمیکنیم (حذف اکو)
|
| 217 |
+
# فقط از صدای قبلی به عنوان "زمینه" استفاده میکنیم و خروجی زمینه را دور میریزیم
|
| 218 |
|
| 219 |
+
pipeline = get_pipeline()
|
| 220 |
SR = 24000
|
| 221 |
+
|
| 222 |
+
CHUNK_LEN = 10 * SR # 10 ثانیه دیتای مفید
|
| 223 |
+
CONTEXT_LEN = 3 * SR # 3 ثانیه نگاه به عقب (برای حفظ لحن)
|
|
|
|
|
|
|
| 224 |
|
| 225 |
total_samples = content_tensor.shape[1]
|
| 226 |
+
print(f"[{session_id}] Smart Processing (No Echo)...")
|
| 227 |
|
| 228 |
final_parts = []
|
| 229 |
+
current_ptr = 0
|
| 230 |
|
| 231 |
+
while current_ptr < total_samples:
|
| 232 |
+
# تعیین بازه ورودی
|
| 233 |
+
# شروع: از 3 ثانیه قبل (اگر وجود داشته باشد)
|
| 234 |
+
start_idx = max(0, current_ptr - CONTEXT_LEN)
|
| 235 |
+
# پایان: 10 ثانیه بعد از نقطه فعلی
|
| 236 |
+
end_idx = min(total_samples, current_ptr + CHUNK_LEN)
|
| 237 |
|
| 238 |
+
# استخراج تکه ورودی (شامل کانتکست + دیتای جدید)
|
| 239 |
+
current_input_chunk = content_tensor[:, start_idx:end_idx]
|
| 240 |
save_audio_pcm16(current_input_chunk, temp_content_path, SR)
|
| 241 |
|
| 242 |
+
# مقدار زمانی که باید از اول خروجی حذف کنیم (همان کانتکست)
|
| 243 |
+
trim_amount = 0
|
| 244 |
+
if current_ptr > 0:
|
| 245 |
+
trim_amount = current_ptr - start_idx # معمولاً برابر CONTEXT_LEN است
|
| 246 |
+
|
| 247 |
try:
|
| 248 |
gen = pipeline.inference_fm(
|
| 249 |
src_wav_path=temp_content_path,
|
| 250 |
timbre_ref_wav_path=temp_reference_path,
|
| 251 |
+
flow_matching_steps=64,
|
| 252 |
)
|
| 253 |
|
| 254 |
if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
|
| 255 |
if gen.dim() == 1: gen = gen.unsqueeze(0)
|
| 256 |
gen = gen.cpu().squeeze(0).numpy()
|
| 257 |
|
| 258 |
+
# *** برش هوشمند ***
|
| 259 |
+
# قسمت اول (که تکراری است و مربوط به کانتکست بوده) را دور میریزیم
|
| 260 |
+
useful_part = gen[trim_amount:]
|
| 261 |
|
| 262 |
+
final_parts.append(useful_part)
|
| 263 |
+
|
| 264 |
+
# حرکت به جلو
|
| 265 |
+
current_ptr += CHUNK_LEN
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 266 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 267 |
except Exception as e:
|
| 268 |
+
print(f"Error: {e}")
|
| 269 |
+
# در صورت خطا، سکوت اضافه کن (به اندازه دیتای جدیدی که قرار بود ساخته شود)
|
| 270 |
+
missing = end_idx - current_ptr
|
| 271 |
+
if missing > 0:
|
| 272 |
+
final_parts.append(np.zeros(missing))
|
| 273 |
+
current_ptr += CHUNK_LEN # تلاش برای تکه بعدی
|
| 274 |
+
|
| 275 |
+
# چسباندن قطعات
|
|
|
|
|
|
|
|
|
|
| 276 |
if len(final_parts) > 0:
|
| 277 |
full_audio = np.concatenate(final_parts)
|
| 278 |
else:
|
|
|
|
| 285 |
if os.path.exists(temp_content_path): os.remove(temp_content_path)
|
| 286 |
if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
|
| 287 |
|
| 288 |
+
with gr.Blocks(title="Vevo-Timbre (Clean)") as demo:
|
| 289 |
gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
|
| 290 |
+
gr.Markdown("نسخه نهایی بدون اکو: استفاده از تکنیک Smart Context Window.")
|
| 291 |
|
| 292 |
with gr.Row():
|
| 293 |
with gr.Column():
|