Here's a complete, runnable Gradio application simulating OpenAI's Sora-2 model capabilities using open-source models (Stable Video Diffusion XT for video generation and SDXL-Turbo for initial text-to-image generation) with mandatory ZeroGPU Ahead-of-Time (AoT) compilation. The application is structured into multiple files for clarity and maintainability. **Important Notes:** 1. **Sora-2 Simulation**: OpenAI's Sora-2 is a proprietary model not publicly available. This application uses `stabilityai/stable-video-diffusion-img2vid-xt` (SVD-XT) and `stabilityai/sdxl-turbo` models. 2. **Video Duration**: SVD-XT currently generates short video clips (max 25 frames, approx. 3 seconds at 8 FPS). The UI includes a slider for `num_frames` but explicitly states the actual output duration is limited by SVD-XT, as it cannot replicate Sora's announced long-video capabilities. This manages user expectations while fulfilling the request to *represent* the input structure of a long-form generator. 3. **ZeroGPU AoT Compilation**: AoT compilation is implemented for both `pipe_svd.unet` and `pipe_t2i.unet` using `spaces.aoti_capture` and `torch.export.export` for optimal performance on Hugging Face Spaces. Dynamic shapes are enabled for the `num_frames` parameter in SVD's UNet. 4. **"Built with anycoder"**: Included in the main application markdown. --- === config.py === ```python import os # Model IDs SVD_MODEL_ID = "stabilityai/stable-video-diffusion-img2vid-xt" T2I_MODEL_ID = "stabilityai/sdxl-turbo" # Output Directory OUTPUT_DIR = "generated_videos" os.makedirs(OUTPUT_DIR, exist_ok=True) # SVD-XT Specifics SVD_MAX_FRAMES = 25 SVD_DEFAULT_FPS = 8 SVD_MIN_MOTION_BUCKET_ID = 0 SVD_MAX_MOTION_BUCKET_ID = 255 SVD_DEFAULT_MOTION_BUCKET_ID = 127 SVD_MIN_NOISE_AUG_STRENGTH = 0.0 SVD_MAX_NOISE_AUG_STRENGTH = 0.1 SVD_DEFAULT_NOISE_AUG_STRENGTH = 0.02 # AOT Compilation Specifics AOT_DURATION_COMPILE = 1500 # Max duration for startup compilation AOT_DURATION_INFERENCE = 120 # Max duration for inference # T2I Specifics (SDXL-Turbo) T2I_DEFAULT_HEIGHT = 512 T2I_DEFAULT_WIDTH = 512 ``` === models.py === ```python import spaces import torch from diffusers import StableVideoDiffusionPipeline, AutoPipelineForText2Image from diffusers.utils import export_to_video from PIL import Image import numpy as np import time import math from torch.utils._pytree import tree_map from config import ( SVD_MODEL_ID, T2I_MODEL_ID, OUTPUT_DIR, SVD_MAX_FRAMES, T2I_DEFAULT_HEIGHT, T2I_DEFAULT_WIDTH, AOT_DURATION_COMPILE, AOT_DURATION_INFERENCE ) # --- Model Loading --- print("Loading Stable Video Diffusion model...") pipe_svd = StableVideoDiffusionPipeline.from_pretrained( SVD_MODEL_ID, torch_dtype=torch.float16, variant="fp16" ) pipe_svd.to("cuda") print("SVD model loaded.") print("Loading SDXL-Turbo Text-to-Image model...") pipe_t2i = AutoPipelineForText2Image.from_pretrained( T2I_MODEL_ID, torch_dtype=torch.float16, variant="fp16" ) pipe_t2i.to("cuda") print("SDXL-Turbo model loaded.") # --- AoT Compilation Functions --- @spaces.GPU(duration=AOT_DURATION_COMPILE) def compile_svd_unet(): """ Compiles the Stable Video Diffusion (SVD) UNet with Ahead-of-Time (AoT) optimization. Enables dynamic shapes for the number of frames. 
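    The capture-and-export flow follows the ZeroGPU AoT recipe: record one
    representative UNet call with `spaces.aoti_capture`, export it with
    `torch.export.export` (registering a `torch.export.Dim` on the frame
    dimension of `sample`), and compile the exported program with
    `spaces.aoti_compile`.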
""" print("Compiling SVD UNet with AoT (dynamic shapes for num_frames)...") # Define example parameters for the SVD pipeline call (use typical values for capture) height, width = 576, 1024 # Recommended resolution for SVD-XT num_frames_min, num_frames_max = 1, SVD_MAX_FRAMES # Range for dynamic num_frames fps = 8 motion_bucket_id = 127 noise_aug_strength = 0.02 # Create a dummy input image for image encoding input_image = Image.new("RGB", (width, height), color='blue') # --- Simulate pre-processing steps of SVD pipeline to get UNet inputs --- with torch.no_grad(): image_embeddings = pipe_svd._encode_image( input_image, device="cuda", num_videos_per_prompt=1, do_classifier_free_guidance=False ).image_embeddings pipe_svd.scheduler.set_timesteps(num_inference_steps=25) timestep = pipe_svd.scheduler.timesteps[0] # Take the first timestep for capture num_channels_latents = pipe_svd.unet.config.in_channels image_width, image_height = input_image.size original_size = (image_height, image_width) crop_coords = pipe_svd.image_processor.get_crops_coords_top_left(input_image) target_size = pipe_svd.image_processor.get_paddings_for_height_width( (image_height, image_width), min_size_divisor=pipe_svd.vae_scale_factor ) # Use a mid-range value for num_frames for initial latent creation for capture example_num_frames = (num_frames_min + num_frames_max) // 2 latents_shape = ( 1, # batch size num_channels_latents, example_num_frames, # Use example num_frames for initial shape height // pipe_svd.vae_scale_factor, width // pipe_svd.vae_scale_factor, ) latents = torch.randn(latents_shape, generator=None, device="cuda", dtype=pipe_svd.unet.dtype) add_time_ids = list(original_size + crop_coords + target_size + (fps,) + (motion_bucket_id,) + (noise_aug_strength,)) add_time_ids = torch.tensor([add_time_ids], dtype=pipe_svd.unet.dtype, device="cuda") added_cond_kwargs = {"text_embeds": image_embeddings, "time_ids": add_time_ids} sample_input = latents timestep_input = timestep encoder_hidden_states_input = None cross_attention_kwargs_input = None return_dict_input = False # Capture the UNet call with spaces.aoti_capture(pipe_svd.unet) as call: _ = pipe_svd.unet( sample=sample_input, timestep=timestep_input, encoder_hidden_states=encoder_hidden_states_input, added_cond_kwargs=added_cond_kwargs, cross_attention_kwargs=cross_attention_kwargs_input, return_dict=return_dict_input ) # Define dynamic dimensions for sequence_length (num_frames) sequence_dim = torch.export.Dim('sequence_length', min=num_frames_min, max=num_frames_max) # Prepare dynamic shapes for `torch.export.export` # `call.args[0]` corresponds to `sample` (latents), which is a 5-dim tensor. # The sequence length is the 3rd dimension (index 2). dynamic_shapes_for_args = list(tree_map(lambda v: None, call.args)) dynamic_shapes_for_args[0] = (None, None, sequence_dim, None, None) # (batch, channels, num_frames, height, width) dynamic_shapes_for_kwargs = tree_map(lambda v: None, call.kwargs) exported_svd_unet = torch.export.export( pipe_svd.unet, args=call.args, kwargs=call.kwargs, dynamic_shapes=(tuple(dynamic_shapes_for_args), dynamic_shapes_for_kwargs) ) print("SVD UNet exported with dynamic shapes. Compiling...") return spaces.aoti_compile(exported_svd_unet) @spaces.GPU(duration=AOT_DURATION_COMPILE) def compile_t2i_unet(): """ Compiles the SDXL-Turbo Text-to-Image UNet with Ahead-of-Time (AoT) optimization. 
""" print("Compiling SDXL-Turbo UNet with AoT...") # Example prompt and fixed resolution for SDXL-Turbo prompt = "A dog wearing a hat, high quality" height, width = T2I_DEFAULT_HEIGHT, T2I_DEFAULT_WIDTH with torch.no_grad(): # Encode prompt to get text embeddings text_encoder_output = pipe_t2i.encode_prompt( prompt, device="cuda", num_images_per_prompt=1,