Here's a complete, runnable Gradio application that simulates OpenAI's Sora-2 capabilities using open-source models (Stable Video Diffusion XT for image-to-video generation and SDXL-Turbo for the initial text-to-image step), with mandatory ZeroGPU Ahead-of-Time (AoT) compilation.
The application is structured into multiple files for clarity and maintainability.
**Important Notes:**
1. **Sora-2 Simulation**: OpenAI's Sora-2 is a proprietary model whose weights are not publicly available. This application uses the `stabilityai/stable-video-diffusion-img2vid-xt` (SVD-XT) and `stabilityai/sdxl-turbo` models instead.
2. **Video Duration**: SVD-XT generates only short clips (at most 25 frames, roughly 3 seconds at 8 FPS). The UI still exposes a `num_frames` slider, but it explicitly states that the actual output duration is capped by SVD-XT and cannot match Sora's announced long-video capabilities. This manages user expectations while still representing the input structure of a long-form generator.
3. **ZeroGPU AoT Compilation**: AoT compilation is implemented for both `pipe_svd.unet` and `pipe_t2i.unet` using `spaces.aoti_capture` and `torch.export.export` for optimal performance on Hugging Face Spaces; dynamic shapes are enabled for the `num_frames` dimension of the SVD UNet's input. The general pattern is sketched right after this list.
4. **"Built with anycoder"**: Included in the main application markdown.
---
=== config.py ===
```python
import os
# Model IDs
SVD_MODEL_ID = "stabilityai/stable-video-diffusion-img2vid-xt"
T2I_MODEL_ID = "stabilityai/sdxl-turbo"
# Output Directory
OUTPUT_DIR = "generated_videos"
os.makedirs(OUTPUT_DIR, exist_ok=True)
# SVD-XT Specifics
SVD_MAX_FRAMES = 25
SVD_DEFAULT_FPS = 8
SVD_MIN_MOTION_BUCKET_ID = 0
SVD_MAX_MOTION_BUCKET_ID = 255
SVD_DEFAULT_MOTION_BUCKET_ID = 127
SVD_MIN_NOISE_AUG_STRENGTH = 0.0
SVD_MAX_NOISE_AUG_STRENGTH = 0.1
SVD_DEFAULT_NOISE_AUG_STRENGTH = 0.02
# AOT Compilation Specifics
AOT_DURATION_COMPILE = 1500 # Max GPU duration (seconds) for startup compilation
AOT_DURATION_INFERENCE = 120 # Max GPU duration (seconds) for inference calls
# T2I Specifics (SDXL-Turbo)
T2I_DEFAULT_HEIGHT = 512
T2I_DEFAULT_WIDTH = 512
```
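For orientation, here is a minimal sketch of how these SVD constants map onto a plain `StableVideoDiffusionPipeline` call using the standard diffusers API; the conditioning image path is a placeholder and this snippet is not part of the app itself.

```python
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import export_to_video, load_image
from config import (
    SVD_MODEL_ID, SVD_MAX_FRAMES, SVD_DEFAULT_FPS,
    SVD_DEFAULT_MOTION_BUCKET_ID, SVD_DEFAULT_NOISE_AUG_STRENGTH,
)

pipe = StableVideoDiffusionPipeline.from_pretrained(
    SVD_MODEL_ID, torch_dtype=torch.float16, variant="fp16"
).to("cuda")

image = load_image("conditioning_frame.png").resize((1024, 576))  # placeholder image
frames = pipe(
    image,
    num_frames=SVD_MAX_FRAMES,                      # hard upper bound for SVD-XT
    fps=SVD_DEFAULT_FPS,
    motion_bucket_id=SVD_DEFAULT_MOTION_BUCKET_ID,  # 0-255; higher means more motion
    noise_aug_strength=SVD_DEFAULT_NOISE_AUG_STRENGTH,
    decode_chunk_size=8,                            # decode latents in chunks to save VRAM
).frames[0]
export_to_video(frames, "preview.mp4", fps=SVD_DEFAULT_FPS)
```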
=== models.py ===
```python
import spaces
import torch
from diffusers import StableVideoDiffusionPipeline, AutoPipelineForText2Image
from diffusers.utils import export_to_video
from PIL import Image
import numpy as np
import time
import math
from torch.utils._pytree import tree_map
from config import (
SVD_MODEL_ID, T2I_MODEL_ID, OUTPUT_DIR, SVD_MAX_FRAMES,
T2I_DEFAULT_HEIGHT, T2I_DEFAULT_WIDTH,
AOT_DURATION_COMPILE, AOT_DURATION_INFERENCE
)
# --- Model Loading ---
print("Loading Stable Video Diffusion model...")
pipe_svd = StableVideoDiffusionPipeline.from_pretrained(
SVD_MODEL_ID, torch_dtype=torch.float16, variant="fp16"
)
pipe_svd.to("cuda")
print("SVD model loaded.")
print("Loading SDXL-Turbo Text-to-Image model...")
pipe_t2i = AutoPipelineForText2Image.from_pretrained(
T2I_MODEL_ID, torch_dtype=torch.float16, variant="fp16"
)
pipe_t2i.to("cuda")
print("SDXL-Turbo model loaded.")
# --- AoT Compilation Functions ---
@spaces.GPU(duration=AOT_DURATION_COMPILE)
def compile_svd_unet():
"""
Compiles the Stable Video Diffusion (SVD) UNet with Ahead-of-Time (AoT) optimization.
Enables dynamic shapes for the number of frames.
"""
print("Compiling SVD UNet with AoT (dynamic shapes for num_frames)...")
# Define example parameters for the SVD pipeline call (use typical values for capture)
height, width = 576, 1024 # Recommended resolution for SVD-XT
num_frames_min, num_frames_max = 1, SVD_MAX_FRAMES # Range for dynamic num_frames
fps = 8
motion_bucket_id = 127
noise_aug_strength = 0.02
# Create a dummy input image for image encoding
input_image = Image.new("RGB", (width, height), color='blue')
    # --- Simulate the SVD pipeline's pre-processing to build realistic UNet inputs ---
    with torch.no_grad():
        # CLIP image embeddings; the SVD UNet consumes them as `encoder_hidden_states`.
        # `_encode_image` returns the embeddings tensor directly.
        image_embeddings = pipe_svd._encode_image(
            input_image, device="cuda", num_videos_per_prompt=1, do_classifier_free_guidance=False
        )
        pipe_svd.scheduler.set_timesteps(num_inference_steps=25, device="cuda")
        timestep = pipe_svd.scheduler.timesteps[0]  # a single timestep is enough for capture
        # `in_channels` already includes the image latents concatenated along the channel dim.
        num_channels_latents = pipe_svd.unet.config.in_channels
        # Use a mid-range num_frames for the example latents used during capture.
        example_num_frames = (num_frames_min + num_frames_max) // 2
        # The SVD UNet expects latents shaped (batch, num_frames, channels, H/8, W/8).
        latents_shape = (
            1,  # batch size
            example_num_frames,
            num_channels_latents,
            height // pipe_svd.vae_scale_factor,
            width // pipe_svd.vae_scale_factor,
        )
        latents = torch.randn(latents_shape, device="cuda", dtype=pipe_svd.unet.dtype)
        # SVD conditions on (fps, motion_bucket_id, noise_aug_strength) via `added_time_ids`.
        add_time_ids = pipe_svd._get_add_time_ids(
            fps,
            motion_bucket_id,
            noise_aug_strength,
            dtype=pipe_svd.unet.dtype,
            batch_size=1,
            num_videos_per_prompt=1,
            do_classifier_free_guidance=False,
        ).to("cuda")
# Capture the UNet call
    with spaces.aoti_capture(pipe_svd.unet) as call:
        _ = pipe_svd.unet(
            sample=latents,
            timestep=timestep,
            encoder_hidden_states=image_embeddings,
            added_time_ids=add_time_ids,
            return_dict=False,
        )
    # Mark the frames dimension of `sample` (dim 1 of a (batch, num_frames, channels,
    # height, width) tensor) as dynamic so one compiled artifact covers all frame counts.
    frames_dim = torch.export.Dim("num_frames", min=num_frames_min, max=num_frames_max)
    dynamic_shapes = tree_map(lambda v: None, call.kwargs)
    dynamic_shapes["sample"] = {1: frames_dim}
    exported_svd_unet = torch.export.export(
        pipe_svd.unet,
        args=call.args,
        kwargs=call.kwargs,
        dynamic_shapes=dynamic_shapes,
    )
print("SVD UNet exported with dynamic shapes. Compiling...")
return spaces.aoti_compile(exported_svd_unet)
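# Note (sketch, not invoked here): the compiled artifact returned by compile_svd_unet()
# is meant to be attached back onto the live pipeline at startup, e.g. with the ZeroGPU
# AoT helper: spaces.aoti_apply(compile_svd_unet(), pipe_svd.unet)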
@spaces.GPU(duration=AOT_DURATION_COMPILE)
def compile_t2i_unet():
"""
Compiles the SDXL-Turbo Text-to-Image UNet with Ahead-of-Time (AoT) optimization.
"""
print("Compiling SDXL-Turbo UNet with AoT...")
# Example prompt and fixed resolution for SDXL-Turbo
prompt = "A dog wearing a hat, high quality"
height, width = T2I_DEFAULT_HEIGHT, T2I_DEFAULT_WIDTH
with torch.no_grad():
# Encode prompt to get text embeddings
text_encoder_output = pipe_t2i.encode_prompt(
prompt,
device="cuda",
num_images_per_prompt=1,