"""Gradio demo: Wav2Lip lip-sync video generation (CPU-friendly).

Takes an uploaded face image and an audio clip, resamples the audio to
16 kHz mono with librosa, resizes the image to 256 px height, then runs
the Wav2Lip ``inference.py`` script as a subprocess to produce a talking
video.
"""

import os
import subprocess
import sys
import uuid

import requests
from PIL import Image

import gradio as gr

# Best-effort install of audio deps at startup (Hugging Face Spaces may
# lack them). Uses the current interpreter's pip rather than os.system so
# the install targets the right environment.
try:
    import librosa
except ImportError:
    subprocess.run([sys.executable, "-m", "pip", "install", "librosa"])
    import librosa

try:
    import soundfile as sf
except ImportError:
    subprocess.run([sys.executable, "-m", "pip", "install", "soundfile"])
    import soundfile as sf

# Download the Wav2Lip GAN checkpoint once if it is not already cached.
MODEL_URL = "https://huggingface.co/spaces/justest/wav2lip-v2/resolve/main/wav2lip_gan.pth"
CHECKPOINT_PATH = "wav2lip_gan.pth"

if not os.path.exists(CHECKPOINT_PATH):
    # Stream the download in chunks: the checkpoint is large, and
    # raise_for_status() prevents silently saving an HTML error page
    # as the model file.
    with requests.get(MODEL_URL, stream=True) as r:
        r.raise_for_status()
        with open(CHECKPOINT_PATH, "wb") as f:
            for chunk in r.iter_content(chunk_size=1 << 20):
                f.write(chunk)


def preprocess(image, audio_file):
    """Prepare the inputs for Wav2Lip inference.

    Args:
        image: PIL.Image of the face to animate.
        audio_file: path to the uploaded audio file.

    Returns:
        Tuple of (image_path, audio_path, output_path) — unique on-disk
        paths for the resized image, the 16 kHz mono WAV, and the video
        that inference will write.
    """
    uid = str(uuid.uuid4())
    image_path = f"{uid}_image.jpg"
    audio_out_path = f"{uid}_audio.wav"
    output_path = f"{uid}_output.mp4"

    # Resize to 256 px height, preserving aspect ratio (smaller frames
    # keep CPU inference tractable).
    new_width = int(image.width * 256 / image.height)
    image = image.resize((new_width, 256), Image.Resampling.LANCZOS)
    image.save(image_path)

    # Wav2Lip expects 16 kHz mono audio.
    y, _sr = librosa.load(audio_file, sr=16000, mono=True)
    sf.write(audio_out_path, y, 16000)

    return image_path, audio_out_path, output_path


def generate(image, audio_file):
    """Run Wav2Lip inference and return the path to the generated video.

    Raises:
        subprocess.CalledProcessError: if inference.py exits non-zero,
            instead of silently returning a path to a nonexistent file.
    """
    image_path, audio_path, output_path = preprocess(image, audio_file)
    command = [
        "python3", "inference.py",
        "--checkpoint_path", CHECKPOINT_PATH,
        "--face", image_path,
        "--audio", audio_path,
        "--outfile", output_path,
    ]
    try:
        # check=True surfaces inference failures to the Gradio UI.
        subprocess.run(command, check=True)
    finally:
        # Remove intermediate inputs; the output video is kept for Gradio.
        for path in (image_path, audio_path):
            if os.path.exists(path):
                os.remove(path)
    return output_path


gr.Interface(
    fn=generate,
    inputs=[
        gr.Image(type="pil", label="Upload Image"),
        gr.Audio(type="filepath", label="Upload Audio"),
    ],
    outputs=gr.Video(label="Generated Talking Video"),
    title="⚡ Wav2Lip (Optimized for Hugging Face CPU)",
    description="Upload an image and audio. This version uses librosa for resampling and is CPU-friendly.",
    # live=True removed: it re-ran the expensive subprocess inference on
    # every input change; click-to-submit is the correct mode here.
).launch()