wav2lip_api / app.py
mich123geb's picture
Update app.py
a30d89d verified
raw
history blame
3.24 kB
import os
import uuid
import subprocess
from pathlib import Path
import gradio as gr
from PIL import Image
from pydub import AudioSegment
# ──────────────────────────────────────────────
# 1. Download model checkpoint once
# ──────────────────────────────────────────────
MODEL_PATH = Path("wav2lip_gan.pth")
MODEL_URL = (
    "https://huggingface.co/spaces/fffiloni/wav2lip/resolve/main/wav2lip_gan.pth"
)  # public mirror
if not MODEL_PATH.exists():
    # Argument-list subprocess call instead of os.system(f"wget …"):
    # no shell string interpolation, and check=True makes a failed
    # download raise instead of silently leaving a missing/empty file.
    subprocess.run(["wget", "-q", MODEL_URL, "-O", str(MODEL_PATH)], check=True)
# ──────────────────────────────────────────────
# 2. Helper: resize image + convert audio → 16 kHz mono WAV
# ──────────────────────────────────────────────
def preprocess(image, audio_file):
    """Normalise the inputs for Wav2Lip and pick unique output paths.

    Parameters
    ----------
    image : PIL.Image.Image
        Face image in any mode/size.
    audio_file : str
        Path to an audio file in any format pydub/ffmpeg understands.

    Returns
    -------
    tuple[str, str, str]
        (resized JPEG path, 16 kHz mono WAV path, output MP4 path —
        the MP4 is not created here, only named).

    Raises
    ------
    ValueError
        If either input is missing.
    """
    if image is None or audio_file is None:
        raise ValueError("Both an image and an audio file are required.")
    uid = uuid.uuid4().hex
    img_path = f"{uid}.jpg"
    wav_path = f"{uid}.wav"
    out_path = f"{uid}_result.mp4"
    # JPEG cannot store an alpha channel: PNG uploads often arrive as
    # RGBA/P mode, which would make Image.save(*.jpg) raise. Force RGB.
    if image.mode != "RGB":
        image = image.convert("RGB")
    # Resize to 256 px height, keeping aspect ratio; max(1, …) guards
    # against a zero width for pathologically wide inputs.
    new_width = max(1, int(image.width * 256 / image.height))
    image = image.resize((new_width, 256), Image.Resampling.LANCZOS)
    image.save(img_path)
    # Wav2Lip expects 16 kHz mono audio.
    seg = AudioSegment.from_file(audio_file)
    seg = seg.set_frame_rate(16_000).set_channels(1)
    seg.export(wav_path, format="wav")
    return img_path, wav_path, out_path
# ──────────────────────────────────────────────
# 3. Main inference wrapper
# ──────────────────────────────────────────────
def generate(image, audio):
    """Run Wav2Lip inference and return the output video path.

    Returns the generated MP4 path on success, or a human-readable
    "❌ …" error string on failure (matching the style used for
    preprocessing errors).
    """
    try:
        img, wav, out_vid = preprocess(image, audio)
    except Exception as e:
        return f"❌ {e}"
    try:
        subprocess.run(
            [
                "python", "inference.py",
                "--checkpoint_path", str(MODEL_PATH),
                "--face", img,
                "--audio", wav,
                "--outfile", out_vid,
            ],
            check=True,
        )
    except subprocess.CalledProcessError as e:
        # check=True previously let a non-zero exit propagate and crash
        # the request; report it the same way as preprocessing errors.
        return f"❌ Inference failed (exit code {e.returncode})."
    finally:
        # The intermediate image/WAV are no longer needed either way;
        # without this they accumulated in the working directory.
        for tmp in (img, wav):
            Path(tmp).unlink(missing_ok=True)
    return out_vid if Path(out_vid).exists() else "❌ Generation failed."
# ──────────────────────────────────────────────
# 4. Gradio UI
# ──────────────────────────────────────────────
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Image(type="pil", label="Image"),
        gr.Audio(type="filepath", label="Audio (any format)"),
    ],
    outputs=gr.Video(label="Talking-head MP4"),
    title="🗣️ Wav2Lip CPU Demo",
    description=(
        "Upload a single face image and an audio clip to create a "
        "lip-synced video (runs on free CPU tier)."
    ),
    allow_flagging="never",
    # NOTE: live=True was removed — it re-ran the (minutes-long on CPU)
    # inference on every input change; generation now runs only when the
    # user presses Submit.
)
if __name__ == "__main__":
    demo.launch()