Update app.py
Browse files
app.py
CHANGED
|
@@ -174,7 +174,7 @@ def vevo_timbre(content_wav, reference_wav):
|
|
| 174 |
raise ValueError("Please upload audio files")
|
| 175 |
|
| 176 |
try:
|
| 177 |
-
# ---
|
| 178 |
if isinstance(content_wav, tuple):
|
| 179 |
content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
|
| 180 |
else:
|
|
@@ -187,9 +187,11 @@ def vevo_timbre(content_wav, reference_wav):
|
|
| 187 |
if content_sr != 24000:
|
| 188 |
content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
|
| 189 |
content_sr = 24000
|
|
|
|
|
|
|
| 190 |
content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
|
| 191 |
|
| 192 |
-
# ---
|
| 193 |
if isinstance(reference_wav, tuple):
|
| 194 |
ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
|
| 195 |
else:
|
|
@@ -203,7 +205,6 @@ def vevo_timbre(content_wav, reference_wav):
|
|
| 203 |
ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
|
| 204 |
ref_sr = 24000
|
| 205 |
|
| 206 |
-
# تنظیم لول رفرنس
|
| 207 |
ref_max = torch.max(torch.abs(ref_tensor)) + 1e-6
|
| 208 |
ref_tensor = ref_tensor / ref_max * 0.95
|
| 209 |
|
|
@@ -212,69 +213,83 @@ def vevo_timbre(content_wav, reference_wav):
|
|
| 212 |
|
| 213 |
save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
|
| 214 |
|
| 215 |
-
# --- منطق
|
| 216 |
-
# ما هیچ صدایی را با هم میکس نمیکنیم (حذف اکو)
|
| 217 |
-
# فقط از صدای قبلی به عنوان "زمینه" استفاده میکنیم و خروجی زمینه را دور میریزیم
|
| 218 |
-
|
| 219 |
pipeline = get_pipeline()
|
|
|
|
| 220 |
SR = 24000
|
|
|
|
|
|
|
| 221 |
|
| 222 |
-
|
| 223 |
-
|
| 224 |
|
| 225 |
total_samples = content_tensor.shape[1]
|
| 226 |
-
print(f"[{session_id}]
|
| 227 |
|
| 228 |
-
|
| 229 |
-
current_ptr = 0
|
| 230 |
|
| 231 |
-
|
| 232 |
-
|
| 233 |
-
|
| 234 |
-
|
| 235 |
-
|
| 236 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 237 |
|
| 238 |
-
#
|
| 239 |
-
|
|
|
|
|
|
|
|
|
|
| 240 |
save_audio_pcm16(current_input_chunk, temp_content_path, SR)
|
| 241 |
|
| 242 |
-
|
| 243 |
-
trim_amount = 0
|
| 244 |
-
if current_ptr > 0:
|
| 245 |
-
trim_amount = current_ptr - start_idx # معمولاً برابر CONTEXT_LEN است
|
| 246 |
|
| 247 |
try:
|
| 248 |
gen = pipeline.inference_fm(
|
| 249 |
src_wav_path=temp_content_path,
|
| 250 |
timbre_ref_wav_path=temp_reference_path,
|
| 251 |
-
flow_matching_steps=64,
|
| 252 |
)
|
| 253 |
|
| 254 |
if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
|
| 255 |
if gen.dim() == 1: gen = gen.unsqueeze(0)
|
| 256 |
gen = gen.cpu().squeeze(0).numpy()
|
| 257 |
|
| 258 |
-
# *** برش
|
| 259 |
-
# قسمت
|
| 260 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 261 |
|
| 262 |
-
|
| 263 |
|
| 264 |
-
# حرکت به
|
| 265 |
-
|
| 266 |
|
| 267 |
except Exception as e:
|
| 268 |
-
print(f"Error: {e}")
|
| 269 |
-
#
|
| 270 |
-
|
| 271 |
-
|
| 272 |
-
|
| 273 |
-
|
| 274 |
-
|
| 275 |
-
|
| 276 |
-
|
| 277 |
-
full_audio = np.concatenate(final_parts)
|
| 278 |
else:
|
| 279 |
full_audio = np.zeros(24000)
|
| 280 |
|
|
@@ -285,9 +300,9 @@ def vevo_timbre(content_wav, reference_wav):
|
|
| 285 |
if os.path.exists(temp_content_path): os.remove(temp_content_path)
|
| 286 |
if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
|
| 287 |
|
| 288 |
-
with gr.Blocks(title="Vevo-Timbre (
|
| 289 |
gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
|
| 290 |
-
gr.Markdown("نسخه
|
| 291 |
|
| 292 |
with gr.Row():
|
| 293 |
with gr.Column():
|
|
|
|
| 174 |
raise ValueError("Please upload audio files")
|
| 175 |
|
| 176 |
try:
|
| 177 |
+
# --- پردازش ورودی ---
|
| 178 |
if isinstance(content_wav, tuple):
|
| 179 |
content_sr, content_data = content_wav if isinstance(content_wav[0], int) else (content_wav[1], content_wav[0])
|
| 180 |
else:
|
|
|
|
| 187 |
if content_sr != 24000:
|
| 188 |
content_tensor = torchaudio.functional.resample(content_tensor, content_sr, 24000)
|
| 189 |
content_sr = 24000
|
| 190 |
+
|
| 191 |
+
# نرمالسازی
|
| 192 |
content_tensor = content_tensor / (torch.max(torch.abs(content_tensor)) + 1e-6) * 0.95
|
| 193 |
|
| 194 |
+
# --- پردازش رفرنس ---
|
| 195 |
if isinstance(reference_wav, tuple):
|
| 196 |
ref_sr, ref_data = reference_wav if isinstance(reference_wav[0], int) else (reference_wav[1], reference_wav[0])
|
| 197 |
else:
|
|
|
|
| 205 |
ref_tensor = torchaudio.functional.resample(ref_tensor, ref_sr, 24000)
|
| 206 |
ref_sr = 24000
|
| 207 |
|
|
|
|
| 208 |
ref_max = torch.max(torch.abs(ref_tensor)) + 1e-6
|
| 209 |
ref_tensor = ref_tensor / ref_max * 0.95
|
| 210 |
|
|
|
|
| 213 |
|
| 214 |
save_audio_pcm16(ref_tensor, temp_reference_path, ref_sr)
|
| 215 |
|
| 216 |
+
# --- منطق Look-back Splicing (حذف قطعی اکو) ---
|
|
|
|
|
|
|
|
|
|
| 217 |
pipeline = get_pipeline()
|
| 218 |
+
|
| 219 |
SR = 24000
|
| 220 |
+
MAIN_CHUNK_SEC = 10.0
|
| 221 |
+
CONTEXT_SEC = 1.0 # مقدار نگاه به عقب
|
| 222 |
|
| 223 |
+
MAIN_CHUNK = int(MAIN_CHUNK_SEC * SR)
|
| 224 |
+
CONTEXT = int(CONTEXT_SEC * SR)
|
| 225 |
|
| 226 |
total_samples = content_tensor.shape[1]
|
| 227 |
+
print(f"[{session_id}] Processing (High Quality 64 Steps)... Zero Echo Mode.")
|
| 228 |
|
| 229 |
+
final_output = []
|
|
|
|
| 230 |
|
| 231 |
+
# اشارهگر جاری روی فایل اصلی
|
| 232 |
+
cursor = 0
|
| 233 |
+
|
| 234 |
+
while cursor < total_samples:
|
| 235 |
+
if cursor == 0:
|
| 236 |
+
# تکه اول: بدون کانتکست
|
| 237 |
+
input_start = 0
|
| 238 |
+
input_end = min(MAIN_CHUNK, total_samples)
|
| 239 |
+
# در تکه اول چیزی را دور نمیریزیم
|
| 240 |
+
crop_from = 0
|
| 241 |
+
else:
|
| 242 |
+
# تکههای بعدی: با کانتکست (نگاه به عقب)
|
| 243 |
+
input_start = cursor - CONTEXT
|
| 244 |
+
input_end = min(cursor + MAIN_CHUNK, total_samples)
|
| 245 |
+
# در خروجی، قسمت کانتکست را دور میریزیم (Cut)
|
| 246 |
+
crop_from = CONTEXT
|
| 247 |
|
| 248 |
+
# اگر به انتهای فایل رسیدیم و طول باقیمانده خیلی کم است
|
| 249 |
+
if input_start >= input_end:
|
| 250 |
+
break
|
| 251 |
+
|
| 252 |
+
current_input_chunk = content_tensor[:, input_start:input_end]
|
| 253 |
save_audio_pcm16(current_input_chunk, temp_content_path, SR)
|
| 254 |
|
| 255 |
+
print(f"[{session_id}] Processing chunk: {cursor/SR:.1f}s -> {(input_end-input_start)/SR:.1f}s len")
|
|
|
|
|
|
|
|
|
|
| 256 |
|
| 257 |
try:
|
| 258 |
gen = pipeline.inference_fm(
|
| 259 |
src_wav_path=temp_content_path,
|
| 260 |
timbre_ref_wav_path=temp_reference_path,
|
| 261 |
+
flow_matching_steps=64, # کیفیت بالا
|
| 262 |
)
|
| 263 |
|
| 264 |
if torch.isnan(gen).any(): gen = torch.nan_to_num(gen, nan=0.0)
|
| 265 |
if gen.dim() == 1: gen = gen.unsqueeze(0)
|
| 266 |
gen = gen.cpu().squeeze(0).numpy()
|
| 267 |
|
| 268 |
+
# *** نکته کلیدی: برش قسمت تکراری ***
|
| 269 |
+
# فقط قسمت "جدید" را نگه میداریم
|
| 270 |
+
if crop_from > 0:
|
| 271 |
+
if len(gen) > crop_from:
|
| 272 |
+
valid_audio = gen[crop_from:]
|
| 273 |
+
else:
|
| 274 |
+
valid_audio = np.array([]) # اگر خروجی خیلی کوتاه بود
|
| 275 |
+
else:
|
| 276 |
+
valid_audio = gen
|
| 277 |
|
| 278 |
+
final_output.append(valid_audio)
|
| 279 |
|
| 280 |
+
# حرکت مکاننما به اندازه دیتای مفیدی که تولید کردیم
|
| 281 |
+
cursor = input_end
|
| 282 |
|
| 283 |
except Exception as e:
|
| 284 |
+
print(f"Error in chunk: {e}")
|
| 285 |
+
# اگر ارور داد، سکوت جایگزین کن که تایمینگ به هم نریزد
|
| 286 |
+
needed_len = input_end - (cursor if cursor > 0 else 0)
|
| 287 |
+
final_output.append(np.zeros(needed_len))
|
| 288 |
+
cursor = input_end
|
| 289 |
+
|
| 290 |
+
# چسباندن تکهها (Concatenate)
|
| 291 |
+
if len(final_output) > 0:
|
| 292 |
+
full_audio = np.concatenate(final_output)
|
|
|
|
| 293 |
else:
|
| 294 |
full_audio = np.zeros(24000)
|
| 295 |
|
|
|
|
| 300 |
if os.path.exists(temp_content_path): os.remove(temp_content_path)
|
| 301 |
if os.path.exists(temp_reference_path): os.remove(temp_reference_path)
|
| 302 |
|
| 303 |
+
with gr.Blocks(title="Vevo-Timbre (Zero Echo)") as demo:
|
| 304 |
gr.Markdown("## Vevo-Timbre: Zero-Shot Voice Conversion")
|
| 305 |
+
gr.Markdown("نسخه نهایی: استفاده از روش Look-back Splicing برای حذف کامل اکو و حفظ پیوستگی لحن.")
|
| 306 |
|
| 307 |
with gr.Row():
|
| 308 |
with gr.Column():
|