"""Gradio demo: Wav2Lip lip-sync video generation (CPU-friendly).

Takes an uploaded face image and an audio clip, resamples the audio to
16 kHz mono with librosa, resizes the image to 256 px height, then runs
the Wav2Lip ``inference.py`` script as a subprocess to produce a talking
video.
"""

import os
import subprocess
import sys
import uuid

import requests
from PIL import Image

import gradio as gr

# Best-effort install of audio deps at startup (Hugging Face Spaces may
# lack them). Uses the current interpreter's pip rather than os.system so
# the install targets the right environment.
try:
    import librosa
except ImportError:
    subprocess.run([sys.executable, "-m", "pip", "install", "librosa"])
    import librosa

try:
    import soundfile as sf
except ImportError:
    subprocess.run([sys.executable, "-m", "pip", "install", "soundfile"])
    import soundfile as sf

# Download the Wav2Lip GAN checkpoint once if it is not already cached.
MODEL_URL = "https://huggingface.co/spaces/justest/wav2lip-v2/resolve/main/wav2lip_gan.pth"
CHECKPOINT_PATH = "wav2lip_gan.pth"

if not os.path.exists(CHECKPOINT_PATH):
    # Stream the download in chunks: the checkpoint is large, and
    # raise_for_status() prevents silently saving an HTML error page
    # as the model file.
    with requests.get(MODEL_URL, stream=True) as r:
        r.raise_for_status()
        with open(CHECKPOINT_PATH, "wb") as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)


def preprocess(image, audio_file):
    """Prepare the inputs for Wav2Lip inference.

    Args:
        image: PIL.Image of the face to animate.
        audio_file: path to the uploaded audio file.

    Returns:
        Tuple of (image_path, audio_path, output_path) — unique on-disk
        paths for the resized image, the 16 kHz mono WAV, and the video
        that inference will write.
    """
    uid = str(uuid.uuid4())
    image_path = f"{uid}_image.jpg"
    audio_out_path = f"{uid}_audio.wav"
    output_path = f"{uid}_output.mp4"

    # Resize to 256 px height, preserving aspect ratio (smaller frames
    # keep CPU inference tractable).
    new_width = int(image.width * 256 / image.height)
    image = image.resize((new_width, 256), Image.Resampling.LANCZOS)
    image.save(image_path)

    # Wav2Lip expects 16 kHz mono audio.
    y, _sr = librosa.load(audio_file, sr=16000, mono=True)
    sf.write(audio_out_path, y, 16000)

    return image_path, audio_out_path, output_path


def generate(image, audio_file):
    """Run Wav2Lip inference and return the path to the generated video.

    Raises:
        subprocess.CalledProcessError: if inference.py exits non-zero,
            instead of silently returning a path to a nonexistent file.
    """
    image_path, audio_path, output_path = preprocess(image, audio_file)
    command = [
        "python3", "inference.py",
        "--checkpoint_path", CHECKPOINT_PATH,
        "--face", image_path,
        "--audio", audio_path,
        "--outfile", output_path,
    ]
    try:
        # check=True surfaces inference failures to the Gradio UI.
        subprocess.run(command, check=True)
    finally:
        # Remove intermediate inputs; the output video is kept for Gradio.
        for path in (image_path, audio_path):
            if os.path.exists(path):
                os.remove(path)
    return output_path


gr.Interface(
    fn=generate,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Audio(type="filepath", label="Upload Audio"),
    ],
    outputs=gr.Video(label="Generated Talking Video"),
    title="⚡ Wav2Lip (Optimized for Hugging Face CPU)",
    description="Upload an image and audio. This version uses librosa for resampling and is CPU-friendly.",
    # live=True removed: it re-ran the expensive subprocess inference on
    # every input change; click-to-submit is the correct mode here.
).launch()