import uuid
import subprocess
from pathlib import Path

import gradio as gr
from PIL import Image
from pydub import AudioSegment

# ──────────────────────────────────────────────
# 1.  Download model checkpoint once
# ──────────────────────────────────────────────
MODEL_PATH = Path("wav2lip_gan.pth")
MODEL_URL  = (
    "https://huggingface.co/spaces/fffiloni/wav2lip/resolve/main/wav2lip_gan.pth"
)  # public mirror

if not MODEL_PATH.exists():
    # subprocess.run with check=True surfaces a failed download immediately
    subprocess.run(["wget", "-q", MODEL_URL, "-O", str(MODEL_PATH)], check=True)
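
# Guard against a silently truncated download (assumption: the real checkpoint
# is several hundred MB, so a tiny file is almost certainly an HTML error page
# from the mirror rather than the weights).
if MODEL_PATH.exists() and MODEL_PATH.stat().st_size < 1_000_000:
    raise RuntimeError(f"{MODEL_PATH} looks incomplete; delete it and re-run.")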

# ──────────────────────────────────────────────
# 2.  Helper: resize image + convert audio β†’ 16 kHz mono WAV
# ──────────────────────────────────────────────
def preprocess(image, audio_file):
    if image is None or audio_file is None:
        raise ValueError("Both an image and an audio file are required.")

    uid = uuid.uuid4().hex
    img_path   = f"{uid}.jpg"
    wav_path   = f"{uid}.wav"
    out_path   = f"{uid}_result.mp4"

    # resize image to 256 px height (keeps aspect ratio)
    image = image.resize((int(image.width * 256 / image.height), 256), Image.Resampling.LANCZOS)
    image.save(img_path)

    # convert audio to 16 kHz mono WAV
    seg = AudioSegment.from_file(audio_file)
    seg = seg.set_frame_rate(16_000).set_channels(1)
    seg.export(wav_path, format="wav")

    return img_path, wav_path, out_path
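
# Illustrative call (hypothetical filenames):
#   preprocess(Image.open("face.jpg"), "speech.mp3")
#   -> ("<uid>.jpg", "<uid>.wav", "<uid>_result.mp4")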

# ──────────────────────────────────────────────
# 3.  Main inference wrapper
# ──────────────────────────────────────────────
def generate(image, audio):
    try:
        img, wav, out_vid = preprocess(image, audio)
    except Exception as e:
        # gr.Error renders as an error toast in the UI; returning a plain
        # string to a gr.Video output would break the component.
        raise gr.Error(str(e))

    # Shell out to the Wav2Lip inference script; check=True raises on failure.
    try:
        subprocess.run(
            [
                "python", "inference.py",
                "--checkpoint_path", str(MODEL_PATH),
                "--face", img,
                "--audio", wav,
                "--outfile", out_vid,
            ],
            check=True,
        )
    except subprocess.CalledProcessError as e:
        raise gr.Error(f"Wav2Lip inference failed (exit code {e.returncode}).")

    if not Path(out_vid).exists():
        raise gr.Error("Inference finished but produced no output video.")
    return out_vid
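
# Housekeeping sketch: the per-request temp files (<uid>.jpg / <uid>.wav)
# accumulate on disk. A minimal cleanup helper, assuming Python 3.8+ for
# unlink(missing_ok=True); call it from generate() after a successful run if
# disk usage matters. The output video must be kept so Gradio can serve it.
def cleanup_temp_files(*paths: str) -> None:
    for p in paths:
        Path(p).unlink(missing_ok=True)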

# ──────────────────────────────────────────────
# 4.  Gradio UI
# ──────────────────────────────────────────────
demo = gr.Interface(
    fn=generate,
    inputs=[gr.Image(type="pil", label="Image"),
            gr.Audio(type="filepath", label="Audio (any format)")],
    outputs=gr.Video(label="Talking-head MP4"),
    title="🗣️ Wav2Lip CPU Demo",
    description="Upload a single face image and an audio clip to create a lip-synced video (runs on the free CPU tier).",
    allow_flagging="never",
)
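
# Lip-syncing on CPU can take minutes per request; enabling the request queue
# keeps long jobs from hitting HTTP timeouts (assumption: a Gradio release
# where Blocks.queue() is available, i.e. 3.x or newer).
demo.queue()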

if __name__ == "__main__":
    demo.launch()