import os
import subprocess
import urllib.request
import uuid
from pathlib import Path

import gradio as gr
from PIL import Image
from pydub import AudioSegment

# ──────────────────────────────────────────────
# 1. Download model checkpoint once
# ──────────────────────────────────────────────
MODEL_PATH = Path("wav2lip_gan.pth")
MODEL_URL = (
    "https://huggingface.co/spaces/fffiloni/wav2lip/resolve/main/wav2lip_gan.pth"
)  # public mirror

if not MODEL_PATH.exists():
    # Stdlib download instead of `os.system("wget ...")`: portable (wget may
    # be absent), no shell involved, and a failed download raises instead of
    # silently leaving a missing/empty checkpoint behind.
    urllib.request.urlretrieve(MODEL_URL, MODEL_PATH)


# ──────────────────────────────────────────────
# 2. Helper: resize image + convert audio → 16 kHz mono WAV
# ──────────────────────────────────────────────
def preprocess(image, audio_file):
    """Prepare inputs for Wav2Lip inference.

    Resizes *image* (a PIL ``Image``) to 256 px height keeping the aspect
    ratio, re-encodes *audio_file* as a 16 kHz mono WAV, and returns the
    triple ``(img_path, wav_path, out_path)`` of unique working-file names.

    Raises:
        ValueError: if either input is missing.
    """
    if image is None or audio_file is None:
        raise ValueError("Both an image and an audio file are required.")

    uid = uuid.uuid4().hex
    img_path = f"{uid}.jpg"
    wav_path = f"{uid}.wav"
    out_path = f"{uid}_result.mp4"

    # Resize to 256 px height (keeps aspect ratio).  max(1, ...) guards
    # against a zero-width result for extremely wide, short images.
    new_width = max(1, round(image.width * 256 / image.height))
    image = image.resize((new_width, 256), Image.Resampling.LANCZOS)
    # JPEG has no alpha channel: RGBA / palette uploads (common for PNGs)
    # would make .save() raise, so normalize to RGB first.
    if image.mode != "RGB":
        image = image.convert("RGB")
    image.save(img_path)

    # Wav2Lip expects 16 kHz mono audio.
    seg = AudioSegment.from_file(audio_file)
    seg = seg.set_frame_rate(16_000).set_channels(1)
    seg.export(wav_path, format="wav")

    return img_path, wav_path, out_path


# ──────────────────────────────────────────────
# 3. Main inference wrapper
# ──────────────────────────────────────────────
def generate(image, audio):
    """Run Wav2Lip on one image + one audio clip.

    Returns the path of the generated MP4 on success, or an error string
    on failure (keeping the original's string-error convention for the UI).
    """
    try:
        img, wav, out_vid = preprocess(image, audio)
    except Exception as e:
        return f"❌ {e}"

    # NOTE: the original passed check=True, which raises CalledProcessError
    # and crashes the request whenever inference fails — inconsistent with
    # the error-string handling above.  Inspect the return code instead.
    proc = subprocess.run(
        [
            "python", "inference.py",
            "--checkpoint_path", str(MODEL_PATH),
            "--face", img,
            "--audio", wav,
            "--outfile", out_vid,
        ]
    )
    if proc.returncode != 0 or not Path(out_vid).exists():
        return "❌ Generation failed."
    return out_vid


# ──────────────────────────────────────────────
# 4. Gradio UI
# ──────────────────────────────────────────────
demo = gr.Interface(
    fn=generate,
    inputs=[
        gr.Image(type="pil", label="Image"),
        gr.Audio(type="filepath", label="Audio (any format)"),
    ],
    outputs=gr.Video(label="Talking-head MP4"),
    title="🗣️ Wav2Lip CPU Demo",
    description=(
        "Upload a single face image and an audio clip to create a "
        "lip-synced video (runs on free CPU tier)."
    ),
    allow_flagging="never",
    # live=True was removed: it re-ran the (minutes-long on CPU) inference on
    # every input change; with the submit button the user triggers it once,
    # deliberately.
)

if __name__ == "__main__":
    demo.launch()