Here's a complete, runnable Gradio application that simulates OpenAI's Sora-2 capabilities using open-source models (Stable Video Diffusion XT for image-to-video generation and SDXL-Turbo for the initial text-to-image step), with mandatory ZeroGPU Ahead-of-Time (AoT) compilation.
The application is structured into multiple files for clarity and maintainability.
**Important Notes:**
1. **Sora-2 Simulation**: OpenAI's Sora-2 is a proprietary model whose weights are not publicly available. This application uses the `stabilityai/stable-video-diffusion-img2vid-xt` (SVD-XT) and `stabilityai/sdxl-turbo` models instead.
2. **Video Duration**: SVD-XT generates only short clips (at most 25 frames, roughly 3 seconds at 8 FPS). The UI still exposes a `num_frames` slider, but it explicitly states that the actual output duration is capped by SVD-XT and cannot match Sora's announced long-video capabilities. This manages user expectations while still representing the input structure of a long-form generator.
3. **ZeroGPU AoT Compilation**: AoT compilation is implemented for both `pipe_svd.unet` and `pipe_t2i.unet` using `spaces.aoti_capture` and `torch.export.export` for optimal performance on Hugging Face Spaces; dynamic shapes are enabled for the `num_frames` dimension of the SVD UNet's input. The general pattern is sketched right after this list.
4. **"Built with anycoder"**: Included in the main application markdown.
---
=== config.py ===
```python
import os
# Model IDs
SVD_MODEL_ID = "stabilityai/stable-video-diffusion-img2vid-xt"
T2I_MODEL_ID = "stabilityai/sdxl-turbo"
# Output Directory
OUTPUT_DIR = "generated_videos"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# SVD-XT Specifics
SVD_MAX_FRAMES = 25
SVD_DEFAULT_FPS = 8
SVD_MIN_MOTION_BUCKET_ID = 0
SVD_MAX_MOTION_BUCKET_ID = 255
SVD_DEFAULT_MOTION_BUCKET_ID = 127
SVD_MIN_NOISE_AUG_STRENGTH = 0.0
SVD_MAX_NOISE_AUG_STRENGTH = 0.1
SVD_DEFAULT_NOISE_AUG_STRENGTH = 0.02
# AOT Compilation Specifics
AOT_DURATION_COMPILE = 1500 # Max GPU duration (seconds) for startup compilation
AOT_DURATION_INFERENCE = 120 # Max GPU duration (seconds) for inference calls
# T2I Specifics (SDXL-Turbo)
T2I_DEFAULT_HEIGHT = 512
T2I_DEFAULT_WIDTH = 512
```
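For orientation, here is a minimal sketch of how these SVD constants map onto a plain `StableVideoDiffusionPipeline` call using the standard diffusers API; the conditioning image path is a placeholder and this snippet is not part of the app itself.

```python
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import export_to_video, load_image
from config import (
    SVD_MODEL_ID, SVD_MAX_FRAMES, SVD_DEFAULT_FPS,
    SVD_DEFAULT_MOTION_BUCKET_ID, SVD_DEFAULT_NOISE_AUG_STRENGTH,
)

pipe = StableVideoDiffusionPipeline.from_pretrained(
    SVD_MODEL_ID, torch_dtype=torch.float16, variant="fp16"
).to("cuda")

image = load_image("conditioning_frame.png").resize((1024, 576))  # placeholder image
frames = pipe(
    image,
    num_frames=SVD_MAX_FRAMES,                      # hard upper bound for SVD-XT
    fps=SVD_DEFAULT_FPS,
    motion_bucket_id=SVD_DEFAULT_MOTION_BUCKET_ID,  # 0-255; higher means more motion
    noise_aug_strength=SVD_DEFAULT_NOISE_AUG_STRENGTH,
    decode_chunk_size=8,                            # decode latents in chunks to save VRAM
).frames[0]
export_to_video(frames, "preview.mp4", fps=SVD_DEFAULT_FPS)
```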
=== models.py ===
```python
import spaces
import torch
from diffusers import StableVideoDiffusionPipeline, AutoPipelineForText2Image
from diffusers.utils import export_to_video
from PIL import Image
import numpy as np
import time
import math
from torch.utils._pytree import tree_map
from config import (
SVD_MODEL_ID, T2I_MODEL_ID, OUTPUT_DIR, SVD_MAX_FRAMES,
T2I_DEFAULT_HEIGHT, T2I_DEFAULT_WIDTH,
AOT_DURATION_COMPILE, AOT_DURATION_INFERENCE
)
# --- Model Loading ---
print("Loading Stable Video Diffusion model...")
pipe_svd = StableVideoDiffusionPipeline.from_pretrained(
SVD_MODEL_ID, torch_dtype=torch.float16, variant="fp16"
)
pipe_svd.to("cuda")
print("SVD model loaded.")
print("Loading SDXL-Turbo Text-to-Image model...")
pipe_t2i = AutoPipelineForText2Image.from_pretrained(
T2I_MODEL_ID, torch_dtype=torch.float16, variant="fp16"
)
pipe_t2i.to("cuda")
print("SDXL-Turbo model loaded.")
# --- AoT Compilation Functions ---
@spaces.GPU(duration=AOT_DURATION_COMPILE)
def compile_svd_unet():
"""
Compiles the Stable Video Diffusion (SVD) UNet with Ahead-of-Time (AoT) optimization.
Enables dynamic shapes for the number of frames.
"""
print("Compiling SVD UNet with AoT (dynamic shapes for num_frames)...")
# Define example parameters for the SVD pipeline call (use typical values for capture)
height, width = 576, 1024 # Recommended resolution for SVD-XT
num_frames_min, num_frames_max = 1, SVD_MAX_FRAMES # Range for dynamic num_frames
fps = 8
motion_bucket_id = 127
noise_aug_strength = 0.02
# Create a dummy input image for image encoding
input_image = Image.new("RGB", (width, height), color='blue')
    # --- Simulate the SVD pipeline's pre-processing to build realistic UNet inputs ---
    with torch.no_grad():
        # CLIP image embeddings; the SVD UNet consumes them as `encoder_hidden_states`.
        # `_encode_image` returns the embeddings tensor directly.
        image_embeddings = pipe_svd._encode_image(
            input_image, device="cuda", num_videos_per_prompt=1, do_classifier_free_guidance=False
        )
        pipe_svd.scheduler.set_timesteps(num_inference_steps=25, device="cuda")
        timestep = pipe_svd.scheduler.timesteps[0]  # a single timestep is enough for capture
        # `in_channels` already includes the image latents concatenated along the channel dim.
        num_channels_latents = pipe_svd.unet.config.in_channels
        # Use a mid-range num_frames for the example latents used during capture.
        example_num_frames = (num_frames_min + num_frames_max) // 2
        # The SVD UNet expects latents shaped (batch, num_frames, channels, H/8, W/8).
        latents_shape = (
            1,  # batch size
            example_num_frames,
            num_channels_latents,
            height // pipe_svd.vae_scale_factor,
            width // pipe_svd.vae_scale_factor,
        )
        latents = torch.randn(latents_shape, device="cuda", dtype=pipe_svd.unet.dtype)
        # SVD conditions on (fps, motion_bucket_id, noise_aug_strength) via `added_time_ids`.
        add_time_ids = pipe_svd._get_add_time_ids(
            fps,
            motion_bucket_id,
            noise_aug_strength,
            dtype=pipe_svd.unet.dtype,
            batch_size=1,
            num_videos_per_prompt=1,
            do_classifier_free_guidance=False,
        ).to("cuda")
# Capture the UNet call
    with spaces.aoti_capture(pipe_svd.unet) as call:
        _ = pipe_svd.unet(
            sample=latents,
            timestep=timestep,
            encoder_hidden_states=image_embeddings,
            added_time_ids=add_time_ids,
            return_dict=False,
        )
    # Mark the frames dimension of `sample` (dim 1 of a (batch, num_frames, channels,
    # height, width) tensor) as dynamic so one compiled artifact covers all frame counts.
    frames_dim = torch.export.Dim("num_frames", min=num_frames_min, max=num_frames_max)
    dynamic_shapes = tree_map(lambda v: None, call.kwargs)
    dynamic_shapes["sample"] = {1: frames_dim}
    exported_svd_unet = torch.export.export(
        pipe_svd.unet,
        args=call.args,
        kwargs=call.kwargs,
        dynamic_shapes=dynamic_shapes,
    )
print("SVD UNet exported with dynamic shapes. Compiling...")
return spaces.aoti_compile(exported_svd_unet)
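# Note (sketch, not invoked here): the compiled artifact returned by compile_svd_unet()
# is meant to be attached back onto the live pipeline at startup, e.g. with the ZeroGPU
# AoT helper: spaces.aoti_apply(compile_svd_unet(), pipe_svd.unet)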
@spaces.GPU(duration=AOT_DURATION_COMPILE)
def compile_t2i_unet():
"""
Compiles the SDXL-Turbo Text-to-Image UNet with Ahead-of-Time (AoT) optimization.
"""
print("Compiling SDXL-Turbo UNet with AoT...")
# Example prompt and fixed resolution for SDXL-Turbo
prompt = "A dog wearing a hat, high quality"
height, width = T2I_DEFAULT_HEIGHT, T2I_DEFAULT_WIDTH
with torch.no_grad():
# Encode prompt to get text embeddings
text_encoder_output = pipe_t2i.encode_prompt(
prompt,
device="cuda",
num_images_per_prompt=1,