import gradio as gr
import torch
from diffusers import DiffusionPipeline

# ---------------------- MODEL INITIALIZATION ----------------------
# GPU: float16 + accelerate's "balanced" multi-device placement.
# CPU fallback: float32 and NO device_map — "cpu" is not a valid
# diffusers device_map value; passing None loads the pipeline on CPU.
_HAS_CUDA = torch.cuda.is_available()
_DTYPE = torch.float16 if _HAS_CUDA else torch.float32
_DEVICE_MAP = "balanced" if _HAS_CUDA else None

# NOTE(review): "black-forest-labs/FLUX.1-dev" is a text-to-image
# checkpoint; image-conditioned editing ("Kontext") is a separate model —
# confirm the intended checkpoint before shipping.
flux_model = DiffusionPipeline.from_pretrained(
    "black-forest-labs/FLUX.1-dev",
    torch_dtype=_DTYPE,
    device_map=_DEVICE_MAP,
)

omni_model = DiffusionPipeline.from_pretrained(
    "tencent/OmniAvatar",
    torch_dtype=_DTYPE,
    device_map=_DEVICE_MAP,
)


# ---------------------- MAIN GENERATION FUNCTION ----------------------
def generate_video(image, audio, prompt, style="claymation"):
    """Stylize *image* with FLUX, then animate it to *audio* with OmniAvatar.

    Args:
        image: Filepath of the uploaded character image (``gr.Image``,
            ``type="filepath"``).
        audio: Filepath of the driving voice audio (``gr.Audio``,
            ``type="filepath"``).
        prompt: Text prompt guiding the stylization pass.
        style: Style keyword forwarded to OmniAvatar
            (``"claymation"``/``"toon"``/``"realistic"``).

    Returns:
        The generated video in whatever container the OmniAvatar pipeline
        emits (dict ``"video"`` entry or ``.videos[0]``).

    Raises:
        gr.Error: On any pipeline failure or unrecognized output format,
            so the message is shown in the UI instead of feeding a plain
            string into the ``gr.Video`` component (which cannot render it).
    """
    # Do NOT call .to(device) here: pipelines loaded with an accelerate
    # device_map are already dispatched across devices, and moving them
    # post-hoc raises / corrupts the placement. CPU-loaded pipelines are
    # already where they belong.
    try:
        # Step 1: stylize the input image using the FLUX pipeline.
        stylized_image = flux_model(
            prompt=prompt,
            image=image,
            guidance_scale=7.5,
            num_inference_steps=30,
        ).images[0]

        # Step 2: lip-sync / animate the stylized image to the audio track.
        result = omni_model(
            image=stylized_image,
            audio=audio,
            style=style,
        )
    except Exception as e:  # surface pipeline failures in the Gradio UI
        raise gr.Error(f"⚠️ Error during generation: {e}") from e

    # Unwrap the video from whichever output container the pipeline used.
    if isinstance(result, dict) and "video" in result:
        return result["video"]
    if hasattr(result, "videos"):
        return result.videos[0]
    raise gr.Error(f"⚠️ Unexpected output format: {type(result)}")


# ---------------------- GRADIO UI ----------------------
with gr.Blocks(title="🎭 Claymation Talking Avatar Generator") as demo:
    gr.Markdown(
        """
        # 🎬 Claymation Talking Avatar Generator
        Generate claymation-style speaking avatars using **FLUX-Kontext**
        for stylization and **OmniAvatar** for lip-synced animation.
        """
    )

    with gr.Row():
        image_input = gr.Image(label="🧑 Upload Character Image", type="filepath")
        audio_input = gr.Audio(label="🎤 Upload Voice Audio", type="filepath")

    prompt = gr.Textbox(
        label="📝 Prompt (Optional)",
        value="A claymation character speaking realistically",
        placeholder="Describe the style or mood...",
    )

    with gr.Row():
        style_dropdown = gr.Dropdown(
            choices=["claymation", "toon", "realistic"],
            value="claymation",
            label="🎨 Style",
        )

    generate_button = gr.Button("🚀 Generate Video")
    video_output = gr.Video(label="🎥 Generated Output")

    generate_button.click(
        fn=generate_video,
        inputs=[image_input, audio_input, prompt, style_dropdown],
        outputs=video_output,
    )

# ---------------------- LAUNCH ----------------------
demo.queue().launch(debug=True, share=False)