import gradio as gr
import torch
from diffusers import DiffusionPipeline

# ---------------------- MODEL INITIALIZATION ----------------------
# GPU: float16 + accelerate's "balanced" multi-device placement.
# CPU fallback: float32 and NO device_map — "cpu" is not a valid
# diffusers device_map value; passing None loads the pipeline on CPU.
_HAS_CUDA = torch.cuda.is_available()
_DTYPE = torch.float16 if _HAS_CUDA else torch.float32
_DEVICE_MAP = "balanced" if _HAS_CUDA else None

# NOTE(review): "black-forest-labs/FLUX.1-dev" is a text-to-image
# checkpoint; image-conditioned editing ("Kontext") is a separate model —
# confirm the intended checkpoint before shipping.
flux_model = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=_DTYPE,
    device_map=_DEVICE_MAP,
)

omni_model = DiffusionPipeline.from_pretrained(
    "tencent/OmniAvatar",
    torch_dtype=_DTYPE,
    device_map=_DEVICE_MAP,
)


# ---------------------- MAIN GENERATION FUNCTION ----------------------
def generate_video(image, audio, prompt, style="claymation"):
    """Stylize *image* with FLUX, then animate it to *audio* with OmniAvatar.

    Args:
        image: Filepath of the uploaded character image (``gr.Image``,
            ``type="filepath"``).
        audio: Filepath of the driving voice audio (``gr.Audio``,
            ``type="filepath"``).
        prompt: Text prompt guiding the stylization pass.
        style: Style keyword forwarded to OmniAvatar
            (``"claymation"``/``"toon"``/``"realistic"``).

    Returns:
        The generated video in whatever container the OmniAvatar pipeline
        emits (dict ``"video"`` entry or ``.videos[0]``).

    Raises:
        gr.Error: On any pipeline failure or unrecognized output format,
            so the message is shown in the UI instead of feeding a plain
            string into the ``gr.Video`` component (which cannot render it).
    """
    # Do NOT call .to(device) here: pipelines loaded with an accelerate
    # device_map are already dispatched across devices, and moving them
    # post-hoc raises / corrupts the placement. CPU-loaded pipelines are
    # already where they belong.
    try:
        # Step 1: stylize the input image using the FLUX pipeline.
        stylized_image = flux_model(
            prompt=prompt,
            image=image,
            guidance_scale=7.5,
            num_inference_steps=30,
        ).images[0]

        # Step 2: lip-sync / animate the stylized image to the audio track.
        result = omni_model(
            image=stylized_image,
            audio=audio,
            style=style,
        )
    except Exception as e:  # surface pipeline failures in the Gradio UI
        raise gr.Error(f"⚠️ Error during generation: {e}") from e

    # Unwrap the video from whichever output container the pipeline used.
    if isinstance(result, dict) and "video" in result:
        return result["video"]
    if hasattr(result, "videos"):
        return result.videos[0]
    raise gr.Error(f"⚠️ Unexpected output format: {type(result)}")


# ---------------------- GRADIO UI ----------------------
with gr.Blocks(title="🎭 Claymation Talking Avatar Generator") as demo:
    gr.Markdown(
        """
        # 🎬 Claymation Talking Avatar Generator
        Generate claymation-style speaking avatars using **FLUX-Kontext**
        for stylization and **OmniAvatar** for lip-synced animation.
        """
    )

    with gr.Row():
        image_input = gr.Image(label="🧑 Upload Character Image", type="filepath")
        audio_input = gr.Audio(label="🎤 Upload Voice Audio", type="filepath")

    prompt = gr.Textbox(
        label="📝 Prompt (Optional)",
        value="A claymation character speaking realistically",
        placeholder="Describe the style or mood...",
    )

    with gr.Row():
        style_dropdown = gr.Dropdown(
            choices=["claymation", "toon", "realistic"],
            value="claymation",
            label="🎨 Style",
        )

    generate_button = gr.Button("🚀 Generate Video")
    video_output = gr.Video(label="🎥 Generated Output")

    generate_button.click(
        fn=generate_video,
        inputs=[image_input, audio_input, prompt, style_dropdown],
        outputs=video_output,
    )

# ---------------------- LAUNCH ----------------------
demo.queue().launch(debug=True, share=False)