import json
import os
import tempfile
from pathlib import Path

import cv2
import gradio as gr
import numpy as np
import safetensors.torch
import torch
from huggingface_hub import snapshot_download
from PIL import Image
from transformers import T5EncoderModel, T5Tokenizer

from xora.models.autoencoders.causal_video_autoencoder import CausalVideoAutoencoder
from xora.models.transformers.symmetric_patchifier import SymmetricPatchifier
from xora.models.transformers.transformer3d import Transformer3DModel
from xora.pipelines.pipeline_xora_video import XoraVideoPipeline
from xora.schedulers.rf import RectifiedFlowScheduler
from xora.utils.conditioning_method import ConditioningMethod
# Hugging Face token, needed only if the model repo is gated
hf_token = os.getenv("HF_TOKEN")

# Download the model weights into the Space's local storage (skipped if already present)
model_path = "asset"
if not os.path.exists(model_path):
    snapshot_download(
        "Lightricks/LTX-Video", local_dir=model_path, repo_type="model", token=hf_token
    )

# Component directories inside the downloaded snapshot
vae_dir = Path(model_path) / "vae"
unet_dir = Path(model_path) / "unet"
scheduler_dir = Path(model_path) / "scheduler"

device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
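# Note: everything below assumes a CUDA GPU in practice. The CPU fallback keeps
# the script importable on CPU-only machines, but video generation at these
# resolutions is not realistically feasible without a GPU.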
def load_vae(vae_dir):
    vae_ckpt_path = vae_dir / "vae_diffusion_pytorch_model.safetensors"
    vae_config_path = vae_dir / "config.json"
    with open(vae_config_path, "r") as f:
        vae_config = json.load(f)
    vae = CausalVideoAutoencoder.from_config(vae_config)
    vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
    vae.load_state_dict(vae_state_dict)
    # Use the shared `device` rather than hard-coding .cuda(), so the CPU
    # fallback above is respected; the VAE runs in bfloat16 to save memory.
    return vae.to(device).to(torch.bfloat16)
def load_unet(unet_dir):
    unet_ckpt_path = unet_dir / "unet_diffusion_pytorch_model.safetensors"
    unet_config_path = unet_dir / "config.json"
    transformer_config = Transformer3DModel.load_config(unet_config_path)
    transformer = Transformer3DModel.from_config(transformer_config)
    unet_state_dict = safetensors.torch.load_file(unet_ckpt_path)
    transformer.load_state_dict(unet_state_dict, strict=True)
    return transformer.to(device)


def load_scheduler(scheduler_dir):
    scheduler_config_path = scheduler_dir / "scheduler_config.json"
    scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
    return RectifiedFlowScheduler.from_config(scheduler_config)
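# All three loaders follow the same config-plus-safetensors layout fetched by
# snapshot_download above (the vae/, unet/ and scheduler/ subfolders).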
# Helper: center-crop a frame to the target aspect ratio, then resize it
def center_crop_and_resize(frame, target_height, target_width):
    h, w, _ = frame.shape
    aspect_ratio_target = target_width / target_height
    aspect_ratio_frame = w / h
    if aspect_ratio_frame > aspect_ratio_target:
        # Frame is too wide: crop the width
        new_width = int(h * aspect_ratio_target)
        x_start = (w - new_width) // 2
        frame_cropped = frame[:, x_start : x_start + new_width]
    else:
        # Frame is too tall: crop the height
        new_height = int(w / aspect_ratio_target)
        y_start = (h - new_height) // 2
        frame_cropped = frame[y_start : y_start + new_height, :]
    frame_resized = cv2.resize(frame_cropped, (target_width, target_height))
    return frame_resized
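# For example, a 1920x1080 frame with a 768x512 target (aspect ratio 1.5) is
# first center-cropped to 1620x1080 (1080 * 1.5 = 1620), then resized to 768x512.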
def load_image_to_tensor_with_resize(image_path, target_height=512, target_width=768):
    image = Image.open(image_path).convert("RGB")
    image_np = np.array(image)
    frame_resized = center_crop_and_resize(image_np, target_height, target_width)
    frame_tensor = torch.tensor(frame_resized).permute(2, 0, 1).float()
    frame_tensor = (frame_tensor / 127.5) - 1.0
    return frame_tensor.unsqueeze(0).unsqueeze(2)
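# The two unsqueezes produce a (batch, channels, frames, height, width) tensor
# of shape (1, 3, 1, H, W) -- a single conditioning frame -- with pixel values
# rescaled from [0, 255] to [-1, 1].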
# Preset options for resolution and frame configuration
preset_options = [
    {"label": "1216x704, 41 frames", "width": 1216, "height": 704, "num_frames": 41},
    {"label": "1088x704, 49 frames", "width": 1088, "height": 704, "num_frames": 49},
    {"label": "1056x640, 57 frames", "width": 1056, "height": 640, "num_frames": 57},
    {"label": "992x608, 65 frames", "width": 992, "height": 608, "num_frames": 65},
    {"label": "896x608, 73 frames", "width": 896, "height": 608, "num_frames": 73},
    {"label": "896x544, 81 frames", "width": 896, "height": 544, "num_frames": 81},
    {"label": "832x544, 89 frames", "width": 832, "height": 544, "num_frames": 89},
    {"label": "800x512, 97 frames", "width": 800, "height": 512, "num_frames": 97},
    {"label": "768x512, 97 frames", "width": 768, "height": 512, "num_frames": 97},
    {"label": "800x480, 105 frames", "width": 800, "height": 480, "num_frames": 105},
    {"label": "736x480, 113 frames", "width": 736, "height": 480, "num_frames": 113},
    {"label": "704x480, 121 frames", "width": 704, "height": 480, "num_frames": 121},
    {"label": "704x448, 129 frames", "width": 704, "height": 448, "num_frames": 129},
    {"label": "672x448, 137 frames", "width": 672, "height": 448, "num_frames": 137},
    {"label": "640x416, 153 frames", "width": 640, "height": 416, "num_frames": 153},
    {"label": "672x384, 161 frames", "width": 672, "height": 384, "num_frames": 161},
    {"label": "640x384, 169 frames", "width": 640, "height": 384, "num_frames": 169},
    {"label": "608x384, 177 frames", "width": 608, "height": 384, "num_frames": 177},
    {"label": "576x384, 185 frames", "width": 576, "height": 384, "num_frames": 185},
    {"label": "608x352, 193 frames", "width": 608, "height": 352, "num_frames": 193},
    {"label": "576x352, 201 frames", "width": 576, "height": 352, "num_frames": 201},
    {"label": "544x352, 209 frames", "width": 544, "height": 352, "num_frames": 209},
    {"label": "512x352, 225 frames", "width": 512, "height": 352, "num_frames": 225},
    {"label": "512x352, 233 frames", "width": 512, "height": 352, "num_frames": 233},
    {"label": "544x320, 241 frames", "width": 544, "height": 320, "num_frames": 241},
    {"label": "512x320, 249 frames", "width": 512, "height": 320, "num_frames": 249},
    {"label": "512x320, 257 frames", "width": 512, "height": 320, "num_frames": 257},
    {"label": "Custom", "height": None, "width": None, "num_frames": None},
]
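# Every preset uses a frame count of the form 8k + 1 (41, 49, ..., 257), and
# larger resolutions are paired with fewer frames, keeping the total pixel
# budget per clip roughly constant.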
# Toggle visibility of the resolution sliders based on the preset selection
def preset_changed(preset):
    if preset != "Custom":
        selected = next(item for item in preset_options if item["label"] == preset)
        # Fill in the preset's values and hide the sliders
        return (
            gr.update(value=selected["height"], visible=False),
            gr.update(value=selected["width"], visible=False),
            gr.update(value=selected["num_frames"], visible=False),
        )
    else:
        # "Custom": keep the current values and show the sliders for editing
        return (
            gr.update(visible=True),
            gr.update(visible=True),
            gr.update(visible=True),
        )
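# The three gr.update objects map, in order, onto the height, width and
# num_frames sliders wired up in the .change() handlers below.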
# Load models
vae = load_vae(vae_dir)
unet = load_unet(unet_dir)
scheduler = load_scheduler(scheduler_dir)
patchifier = SymmetricPatchifier(patch_size=1)
text_encoder = T5EncoderModel.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder"
).to(device)
tokenizer = T5Tokenizer.from_pretrained(
    "PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer"
)
pipeline = XoraVideoPipeline(
    transformer=unet,
    patchifier=patchifier,
    text_encoder=text_encoder,
    tokenizer=tokenizer,
    scheduler=scheduler,
    vae=vae,
).to(device)
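# The T5 text encoder and tokenizer are reused from the PixArt-alpha checkpoint
# rather than shipped with the LTX-Video weights. The VAE runs in bfloat16
# while the transformer stays in full precision; the mixed_precision=True flag
# passed to the pipeline below is assumed to reconcile the dtypes internally.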
def generate_video_from_text(
    prompt="",
    negative_prompt="",
    seed=171198,
    num_inference_steps=40,
    guidance_scale=3,
    height=512,
    width=768,
    num_frames=121,
    frame_rate=25,
    progress=gr.Progress(),
):
    if len(prompt.strip()) < 50:
        raise gr.Error(
            "Prompt must be at least 50 characters long. Please provide more details for the best results.",
            duration=5,
        )
    sample = {
        "prompt": prompt,
        "prompt_attention_mask": None,
        "negative_prompt": negative_prompt,
        "negative_prompt_attention_mask": None,
        "media_items": None,
    }

    # A CPU generator keeps seeded results reproducible across GPU types
    generator = torch.Generator(device="cpu").manual_seed(seed)

    def gradio_progress_callback(self, step, timestep, kwargs):
        progress((step + 1) / num_inference_steps)

    images = pipeline(
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=1,
        guidance_scale=guidance_scale,
        generator=generator,
        output_type="pt",
        height=height,
        width=width,
        num_frames=num_frames,
        frame_rate=frame_rate,
        **sample,
        is_video=True,
        vae_per_channel_normalize=True,
        conditioning_method=ConditioningMethod.FIRST_FRAME,
        mixed_precision=True,
        callback_on_step_end=gradio_progress_callback,
    ).images

    # Write the (batch, channels, frames, height, width) tensor out as an MP4;
    # delete=False so Gradio can still read the file after it is closed
    output_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
    video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
    video_np = (video_np * 255).astype(np.uint8)
    height, width = video_np.shape[1:3]
    out = cv2.VideoWriter(
        output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height)
    )
    for frame in video_np[..., ::-1]:  # RGB -> BGR for OpenCV
        out.write(frame)
    out.release()
    return output_path
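# Hypothetical direct call (outside the UI); the prompt must clear the
# 50-character check above:
#   clip_path = generate_video_from_text(
#       prompt=(
#           "A red vintage convertible drives along a coastal road at sunset, "
#           "waves crashing against the cliffs below, camera tracking from behind."
#       ),
#       height=704, width=1216, num_frames=41,
#   )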
def generate_video_from_image(
    image_path,
    prompt="",
    negative_prompt="",
    seed=171198,
    num_inference_steps=40,
    guidance_scale=3,
    height=512,
    width=768,
    num_frames=121,
    frame_rate=25,
    progress=gr.Progress(),
):
    if len(prompt.strip()) < 50:
        raise gr.Error(
            "Prompt must be at least 50 characters long. Please provide more details for the best results.",
            duration=5,
        )
    if not image_path:
        raise gr.Error("Please provide an input image.", duration=5)

    # The uploaded image becomes the conditioning first frame of the clip
    media_items = load_image_to_tensor_with_resize(image_path, height, width).to(device)
    sample = {
        "prompt": prompt,
        "prompt_attention_mask": None,
        "negative_prompt": negative_prompt,
        "negative_prompt_attention_mask": None,
        "media_items": media_items,
    }

    generator = torch.Generator(device="cpu").manual_seed(seed)

    def gradio_progress_callback(self, step, timestep, kwargs):
        progress((step + 1) / num_inference_steps)

    images = pipeline(
        num_inference_steps=num_inference_steps,
        num_images_per_prompt=1,
        guidance_scale=guidance_scale,
        generator=generator,
        output_type="pt",
        height=height,
        width=width,
        num_frames=num_frames,
        frame_rate=frame_rate,
        **sample,
        is_video=True,
        vae_per_channel_normalize=True,
        conditioning_method=ConditioningMethod.FIRST_FRAME,
        mixed_precision=True,
        callback_on_step_end=gradio_progress_callback,
    ).images

    output_path = tempfile.NamedTemporaryFile(suffix=".mp4", delete=False).name
    video_np = images.squeeze(0).permute(1, 2, 3, 0).cpu().float().numpy()
    video_np = (video_np * 255).astype(np.uint8)
    height, width = video_np.shape[1:3]
    out = cv2.VideoWriter(
        output_path, cv2.VideoWriter_fourcc(*"mp4v"), frame_rate, (width, height)
    )
    for frame in video_np[..., ::-1]:  # RGB -> BGR for OpenCV
        out.write(frame)
    out.release()
    return output_path
def create_advanced_options():
    with gr.Accordion("Step 4: Advanced Options (Optional)", open=False):
        seed = gr.Slider(
            label="4.1 Seed", minimum=0, maximum=1000000, step=1, value=171198
        )
        inference_steps = gr.Slider(
            label="4.2 Inference Steps", minimum=1, maximum=100, step=1, value=40
        )
        guidance_scale = gr.Slider(
            label="4.3 Guidance Scale", minimum=1.0, maximum=20.0, step=0.1, value=3.0
        )
        # Hidden by default; preset_changed() reveals them for "Custom"
        height_slider = gr.Slider(
            label="4.4 Height",
            minimum=256,
            maximum=1024,
            step=64,
            value=704,
            visible=False,
        )
        width_slider = gr.Slider(
            label="4.5 Width",
            minimum=256,
            maximum=1024,
            step=64,
            value=1216,
            visible=False,
        )
        num_frames_slider = gr.Slider(
            label="4.6 Number of Frames",
            minimum=1,
            maximum=257,  # matches the largest preset
            step=1,
            value=41,
            visible=False,
        )
        frame_rate = gr.Slider(
            label="4.7 Frame Rate",
            minimum=1,
            maximum=60,
            step=1,
            value=25,
            visible=False,
        )
        return [
            seed,
            inference_steps,
            guidance_scale,
            height_slider,
            width_slider,
            num_frames_slider,
            frame_rate,
        ]
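# The ordering of this list matters: the .click() handlers below unpack it
# positionally into the generate_* keyword arguments (seed, steps, guidance,
# height, width, num_frames, frame_rate), and preset_changed() targets
# elements 3:6 (the height/width/num_frames sliders).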
# Define the Gradio interface with tabs
with gr.Blocks(theme=gr.themes.Soft()) as iface:
    with gr.Row(elem_id="title-row"):
        gr.Markdown(
            """
            <div style="text-align: center; margin-bottom: 1em">
            <h1 style="font-size: 2.5em; font-weight: 600; margin: 0.5em 0;">Video Generation with LTX Video</h1>
            </div>
            """
        )
    with gr.Accordion(
        "💡 Tips for Best Results", open=False, elem_id="instructions-accordion"
    ):
        gr.Markdown(
            """
            📝 Prompt Engineering

            When writing prompts, focus on detailed, chronological descriptions of actions and scenes. Include specific movements, appearances, camera angles, and environmental details, all in a single flowing paragraph. Start directly with the action, and keep descriptions literal and precise. Think like a cinematographer describing a shot list. Keep within 200 words.

            For best results, build your prompts using this structure:
            - Start with the main action in a single sentence
            - Add specific details about movements and gestures
            - Describe character/object appearances precisely
            - Include background and environment details
            - Specify camera angles and movements
            - Describe lighting and colors
            - Note any changes or sudden events

            See the examples for more inspiration.

            🎮 Parameter Guide
            - Resolution Preset: higher resolutions for detailed scenes, lower for faster generation and simpler scenes
            - Seed: save seed values to recreate specific styles or compositions you like
            - Guidance Scale: higher values (5-7) for accurate prompt following, lower values (3-5) for more creative freedom
            - Inference Steps: more steps (40+) for quality, fewer steps (20-30) for speed
            """
        )
    with gr.Tabs():
        # Text to Video Tab
        with gr.TabItem("Text to Video"):
            with gr.Row():
                with gr.Column():
                    txt2vid_prompt = gr.Textbox(
                        label="Step 1: Enter Your Prompt",
                        placeholder="Describe the video you want to generate (minimum 50 characters)...",
                        value="A man riding a motorcycle down a winding road, surrounded by lush, green scenery and distant mountains. The sky is clear with a few wispy clouds, and the sunlight glistens on the motorcycle as it speeds along.",
                        lines=5,
                    )
                    txt2vid_negative_prompt = gr.Textbox(
                        label="Step 2: Enter Negative Prompt (Optional)",
                        placeholder="Describe what you don't want in the video...",
                        value="worst quality, inconsistent motion...",
                        lines=2,
                    )
                    txt2vid_preset = gr.Dropdown(
                        choices=[p["label"] for p in preset_options],
                        value="1216x704, 41 frames",
                        label="Step 3: Choose Resolution Preset",
                    )
                    txt2vid_advanced = create_advanced_options()
                    txt2vid_generate = gr.Button(
                        "Step 5: Generate Video", variant="primary", size="lg"
                    )
                with gr.Column():
                    txt2vid_output = gr.Video(label="Step 6: Generated Output")

        # Image to Video Tab
        with gr.TabItem("Image to Video"):
            with gr.Row():
                with gr.Column():
                    img2vid_image = gr.Image(
                        type="filepath",
                        label="Step 1: Upload Input Image",
                        elem_id="image_upload",
                    )
                    img2vid_prompt = gr.Textbox(
                        label="Step 2: Enter Your Prompt",
                        placeholder="Describe how you want to animate the image (minimum 50 characters)...",
                        value="A man riding a motorcycle down a winding road, surrounded by lush, green scenery...",
                        lines=5,
                    )
                    img2vid_negative_prompt = gr.Textbox(
                        label="Step 3: Enter Negative Prompt (Optional)",
                        placeholder="Describe what you don't want in the video...",
                        value="worst quality, inconsistent motion...",
                        lines=2,
                    )
                    img2vid_preset = gr.Dropdown(
                        choices=[p["label"] for p in preset_options],
                        value="1216x704, 41 frames",
                        label="Step 4: Choose Resolution Preset",
                    )
                    img2vid_advanced = create_advanced_options()
                    img2vid_generate = gr.Button(
                        "Step 6: Generate Video", variant="primary", size="lg"
                    )
                with gr.Column():
                    img2vid_output = gr.Video(label="Step 7: Generated Output")
    # Event handlers
    txt2vid_preset.change(
        fn=preset_changed, inputs=[txt2vid_preset], outputs=txt2vid_advanced[3:6]
    )
    txt2vid_generate.click(
        fn=generate_video_from_text,
        inputs=[txt2vid_prompt, txt2vid_negative_prompt, *txt2vid_advanced],
        outputs=txt2vid_output,
    )
    img2vid_preset.change(
        fn=preset_changed, inputs=[img2vid_preset], outputs=img2vid_advanced[3:6]
    )
    img2vid_generate.click(
        fn=generate_video_from_image,
        inputs=[
            img2vid_image,
            img2vid_prompt,
            img2vid_negative_prompt,
            *img2vid_advanced,
        ],
        outputs=img2vid_output,
    )

iface.launch(share=True)