Here's a complete, runnable Gradio application that simulates OpenAI's Sora-2 capabilities using open-source models (Stable Video Diffusion XT for image-to-video generation and SDXL-Turbo for the initial text-to-image step), with mandatory ZeroGPU Ahead-of-Time (AoT) compilation.
The application is structured into multiple files for clarity and maintainability.
**Important Notes:**
1. **Sora-2 Simulation**: OpenAI's Sora-2 is a proprietary model not publicly available. This application instead uses the openly available `stabilityai/stable-video-diffusion-img2vid-xt` (SVD-XT) and `stabilityai/sdxl-turbo` models as stand-ins.
2. **Video Duration**: SVD-XT generates short clips (at most 25 frames, roughly 3 seconds at 8 FPS). The UI includes a `num_frames` slider but explicitly states that the actual output duration is limited by SVD-XT; it cannot replicate Sora's announced long-form video capabilities. This manages user expectations while still *representing* the input structure of a long-form generator.
3. **ZeroGPU AoT Compilation**: AoT compilation is implemented for both `pipe_svd.unet` and `pipe_t2i.unet` using `spaces.aoti_capture`, `torch.export.export`, and `spaces.aoti_compile` for optimal performance on Hugging Face Spaces. Dynamic shapes are enabled for the `num_frames` dimension of SVD's UNet input; a minimal sketch of the overall pattern appears after the separator below.
4. **"Built with anycoder"**: Included in the main application markdown.
---
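To orient the reader, here is a minimal, self-contained sketch of the ZeroGPU AoT pattern that both compile functions in `models.py` follow: capture one real call, export it with `torch.export.export`, compile it with `spaces.aoti_compile`, and bind the result back with `spaces.aoti_apply`. The pipeline loader here is a placeholder, not part of the application:

```python
import spaces
import torch

# Placeholder: stands in for the diffusers pipelines loaded in models.py.
pipe = load_some_diffusers_pipeline()  # hypothetical helper, not a real API

@spaces.GPU(duration=1500)  # generous budget for the one-time startup compilation
def compile_unet():
    # 1. Capture: intercept one real UNet call to record its exact args/kwargs.
    with spaces.aoti_capture(pipe.unet) as call:
        pipe(prompt="warm-up prompt")
    # 2. Export: trace the UNet into an ExportedProgram using the captured inputs.
    exported = torch.export.export(pipe.unet, args=call.args, kwargs=call.kwargs)
    # 3. Compile: produce the AoT-compiled artifact.
    return spaces.aoti_compile(exported)

# 4. Apply: swap the compiled forward into the live pipeline once at startup.
spaces.aoti_apply(compile_unet(), pipe.unet)
```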
=== config.py ===
```python
import os
# Model IDs
SVD_MODEL_ID = "stabilityai/stable-video-diffusion-img2vid-xt"
T2I_MODEL_ID = "stabilityai/sdxl-turbo"
# Output Directory
OUTPUT_DIR = "generated_videos"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# SVD-XT Specifics
SVD_MAX_FRAMES = 25
SVD_DEFAULT_FPS = 8
SVD_MIN_MOTION_BUCKET_ID = 0
SVD_MAX_MOTION_BUCKET_ID = 255
SVD_DEFAULT_MOTION_BUCKET_ID = 127
SVD_MIN_NOISE_AUG_STRENGTH = 0.0
SVD_MAX_NOISE_AUG_STRENGTH = 0.1
SVD_DEFAULT_NOISE_AUG_STRENGTH = 0.02
# AOT Compilation Specifics
AOT_DURATION_COMPILE = 1500  # GPU allocation budget (seconds) for one-time startup compilation
AOT_DURATION_INFERENCE = 120  # GPU allocation budget (seconds) per inference call
# T2I Specifics (SDXL-Turbo)
T2I_DEFAULT_HEIGHT = 512
T2I_DEFAULT_WIDTH = 512
```
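As a hypothetical illustration (the real UI lives in the app file of this listing), these constants would typically drive the Gradio controls described in note 2; the slider variable names below are illustrative only:

```python
import gradio as gr
from config import (
    SVD_MAX_FRAMES, SVD_DEFAULT_FPS,
    SVD_MIN_MOTION_BUCKET_ID, SVD_MAX_MOTION_BUCKET_ID, SVD_DEFAULT_MOTION_BUCKET_ID,
)

# Illustrative sliders only; the application's actual layout may differ.
num_frames_slider = gr.Slider(
    minimum=2, maximum=SVD_MAX_FRAMES, value=SVD_MAX_FRAMES, step=1,
    label=f"Frames (SVD-XT caps output at {SVD_MAX_FRAMES} frames, about 3 s at {SVD_DEFAULT_FPS} FPS)",
)
motion_slider = gr.Slider(
    minimum=SVD_MIN_MOTION_BUCKET_ID, maximum=SVD_MAX_MOTION_BUCKET_ID,
    value=SVD_DEFAULT_MOTION_BUCKET_ID, step=1,
    label="Motion intensity (motion_bucket_id)",
)
```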
=== models.py ===
```python
import spaces
import torch
from diffusers import StableVideoDiffusionPipeline, AutoPipelineForText2Image
from diffusers.utils import export_to_video
from PIL import Image
import numpy as np
import time
import math
from torch.utils._pytree import tree_map
from config import (
    SVD_MODEL_ID, T2I_MODEL_ID, OUTPUT_DIR, SVD_MAX_FRAMES,
    T2I_DEFAULT_HEIGHT, T2I_DEFAULT_WIDTH,
    AOT_DURATION_COMPILE, AOT_DURATION_INFERENCE
)

# --- Model Loading ---
print("Loading Stable Video Diffusion model...")
pipe_svd = StableVideoDiffusionPipeline.from_pretrained(
    SVD_MODEL_ID, torch_dtype=torch.float16, variant="fp16"
)
pipe_svd.to("cuda")
print("SVD model loaded.")

print("Loading SDXL-Turbo Text-to-Image model...")
pipe_t2i = AutoPipelineForText2Image.from_pretrained(
    T2I_MODEL_ID, torch_dtype=torch.float16, variant="fp16"
)
pipe_t2i.to("cuda")
print("SDXL-Turbo model loaded.")
# --- AoT Compilation Functions ---
@spaces.GPU(duration=AOT_DURATION_COMPILE)
def compile_svd_unet():
    """
    Compiles the Stable Video Diffusion (SVD) UNet with Ahead-of-Time (AoT) optimization.
    Enables dynamic shapes for the number of frames.
    """
    print("Compiling SVD UNet with AoT (dynamic shapes for num_frames)...")
    # Example parameters for the SVD pipeline call (typical values for capture)
    height, width = 576, 1024  # Recommended resolution for SVD-XT
    num_frames_min, num_frames_max = 2, SVD_MAX_FRAMES  # Dynamic range (min 2: torch.export specializes sizes 0/1)
    fps = 8
    motion_bucket_id = 127
    noise_aug_strength = 0.02
    # Create a dummy input image for image encoding
    input_image = Image.new("RGB", (width, height), color="blue")
    # --- Reproduce the SVD pipeline's pre-processing to build realistic UNet inputs ---
    with torch.no_grad():
        # _encode_image returns the CLIP image-embedding tensor directly; the
        # pipeline passes it to the UNet as encoder_hidden_states.
        image_embeddings = pipe_svd._encode_image(
            input_image, device="cuda", num_videos_per_prompt=1, do_classifier_free_guidance=False
        )
        pipe_svd.scheduler.set_timesteps(num_inference_steps=25, device="cuda")
        timestep = pipe_svd.scheduler.timesteps[0]  # the first timestep suffices for capture
        # SVD's UNet consumes the noisy latents concatenated with the conditioning
        # latents along the channel axis, so unet.config.in_channels (8) is correct here.
        num_channels_latents = pipe_svd.unet.config.in_channels
        # Use a mid-range num_frames value for the capture-time latent shape.
        example_num_frames = (num_frames_min + num_frames_max) // 2
        # SVD's UNet expects 5-D input shaped (batch, num_frames, channels, height, width);
        # frames sit at dimension 1, not 2.
        latents_shape = (
            1,  # batch size
            example_num_frames,
            num_channels_latents,
            height // pipe_svd.vae_scale_factor,
            width // pipe_svd.vae_scale_factor,
        )
        latents = torch.randn(latents_shape, device="cuda", dtype=pipe_svd.unet.dtype)
        # The pipeline's own helper assembles the (fps, motion_bucket_id,
        # noise_aug_strength) micro-conditioning that the UNet takes as added_time_ids.
        add_time_ids = pipe_svd._get_add_time_ids(
            fps, motion_bucket_id, noise_aug_strength,
            dtype=pipe_svd.unet.dtype, batch_size=1,
            num_videos_per_prompt=1, do_classifier_free_guidance=False,
        ).to("cuda")
    # Capture a single UNet call; spaces.aoti_capture records the exact args/kwargs
    # without running a full generation.
    with spaces.aoti_capture(pipe_svd.unet) as call:
        _ = pipe_svd.unet(
            sample=latents,
            timestep=timestep,
            encoder_hidden_states=image_embeddings,
            added_time_ids=add_time_ids,
            return_dict=False,
        )
    # Mark num_frames (dimension 1 of `sample`) as dynamic so a single compiled
    # artifact serves every frame count in [num_frames_min, num_frames_max].
    frames_dim = torch.export.Dim("num_frames", min=num_frames_min, max=num_frames_max)
    # All inputs were passed as kwargs, so dynamic_shapes is a dict keyed by
    # argument name; every other dimension stays static (None).
    dynamic_shapes = tree_map(lambda v: None, call.kwargs)
    dynamic_shapes["sample"] = {1: frames_dim}
    exported_svd_unet = torch.export.export(
        pipe_svd.unet,
        args=call.args,
        kwargs=call.kwargs,
        dynamic_shapes=dynamic_shapes,
    )
print("SVD UNet exported with dynamic shapes. Compiling...")
return spaces.aoti_compile(exported_svd_unet)
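# NOTE (illustrative addition, not part of the original listing): the compiled
# artifact returned above must be bound back onto the live pipeline before
# inference, typically once at startup:
#
#     compiled_svd_unet = compile_svd_unet()
#     spaces.aoti_apply(compiled_svd_unet, pipe_svd.unet)
#
# spaces.aoti_apply replaces the UNet's forward with the AoT-compiled one.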
@spaces.GPU(duration=AOT_DURATION_COMPILE)
def compile_t2i_unet():
    """
    Compiles the SDXL-Turbo Text-to-Image UNet with Ahead-of-Time (AoT) optimization.
    """
    print("Compiling SDXL-Turbo UNet with AoT...")
    # Example prompt and fixed resolution for SDXL-Turbo
    prompt = "A dog wearing a hat, high quality"
    height, width = T2I_DEFAULT_HEIGHT, T2I_DEFAULT_WIDTH
    with torch.no_grad():
        # SDXL's encode_prompt returns a 4-tuple: (prompt_embeds,
        # negative_prompt_embeds, pooled_prompt_embeds, negative_pooled_prompt_embeds)
        text_encoder_output = pipe_t2i.encode_prompt(
            prompt,
            device="cuda",
            num_images_per_prompt=1,