Spaces:

AashishNKumar
/

proj11

Paused

App Files Files Community

Sapir committed on Oct 8, 2024

Commit

e46ff5e

1 Parent(s): d699d2b

Examples: update and fix scripts.

Browse files

Files changed (3) hide show

scripts/to_safetensors.py +1 -2
xora/examples/image_to_video.py +101 -88
xora/examples/text_to_video.py +90 -79

scripts/to_safetensors.py CHANGED Viewed

@@ -60,7 +60,7 @@ def load_vae_config(vae_path: Path) -> str:
     return str(config_path)
-def main(unet_path: str, vae_path: str, t5_path: str, out_path: str, mode: str,
          unet_config_path: str = None, scheduler_config_path: str = None) -> None:
     unet = convert_unet(torch.load(unet_path, weights_only=True), add_prefix=(mode == 'single'))
@@ -98,7 +98,6 @@ if __name__ == '__main__':
     parser = argparse.ArgumentParser()
     parser.add_argument('--unet_path', '-u', type=str, default='unet/ema-002.pt')
     parser.add_argument('--vae_path', '-v', type=str, default='vae/')
-    parser.add_argument('--t5_path', '-t', type=str, default='t5/PixArt-XL-2-1024-MS/')
     parser.add_argument('--out_path', '-o', type=str, default='xora.safetensors')
     parser.add_argument('--mode', '-m', type=str, choices=['single', 'separate'], default='single',
                         help="Choose 'single' for the original behavior, 'separate' to save unet and vae separately.")

     return str(config_path)
+def main(unet_path: str, vae_path: str, out_path: str, mode: str,
          unet_config_path: str = None, scheduler_config_path: str = None) -> None:
     unet = convert_unet(torch.load(unet_path, weights_only=True), add_prefix=(mode == 'single'))
     parser = argparse.ArgumentParser()
     parser.add_argument('--unet_path', '-u', type=str, default='unet/ema-002.pt')
     parser.add_argument('--vae_path', '-v', type=str, default='vae/')
     parser.add_argument('--out_path', '-o', type=str, default='xora.safetensors')
     parser.add_argument('--mode', '-m', type=str, choices=['single', 'separate'], default='single',
                         help="Choose 'single' for the original behavior, 'separate' to save unet and vae separately.")

xora/examples/image_to_video.py CHANGED Viewed

@@ -5,94 +5,107 @@ from xora.models.transformers.symmetric_patchifier import SymmetricPatchifier
 from xora.schedulers.rf import RectifiedFlowScheduler
 from xora.pipelines.pipeline_video_pixart_alpha import VideoPixArtAlphaPipeline
 from pathlib import Path
 import safetensors.torch
 import json
-# Paths for the separate mode directories
-separate_dir = Path("/opt/models/xora-img2video")
-unet_dir = separate_dir / 'unet'
-vae_dir = separate_dir / 'vae'
-scheduler_dir = separate_dir / 'scheduler'
-# Load VAE from separate mode
-vae_ckpt_path = vae_dir / "diffusion_pytorch_model.safetensors"
-vae_config_path = vae_dir / "config.json"
-with open(vae_config_path, 'r') as f:
-    vae_config = json.load(f)
-vae = CausalVideoAutoencoder.from_config(vae_config)
-vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
-vae.load_state_dict(
-    state_dict=vae_state_dict,
-)
-vae = vae.cuda().to(torch.bfloat16)
-# Load UNet (Transformer) from separate mode
-unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"
-unet_config_path = unet_dir / "config.json"
-transformer_config = Transformer3DModel.load_config(unet_config_path)
-transformer = Transformer3DModel.from_config(transformer_config)
-unet_state_dict = safetensors.torch.load_file(unet_ckpt_path)
-transformer.load_state_dict(unet_state_dict, strict=True)
-transformer = transformer.cuda()
-unet = transformer
-# Load Scheduler from separate mode
-scheduler_config_path = scheduler_dir / "scheduler_config.json"
-scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
-scheduler = RectifiedFlowScheduler.from_config(scheduler_config)
-# Patchifier (remains the same)
-patchifier = SymmetricPatchifier(patch_size=1)
-# Use submodels for the pipeline
-submodel_dict = {
-    "unet": unet,
-    "transformer": transformer,
-    "patchifier": patchifier,
-    "text_encoder": None,
-    "scheduler": scheduler,
-    "vae": vae,
-}
-model_name_or_path = "PixArt-alpha/PixArt-XL-2-1024-MS"
-pipeline = VideoPixArtAlphaPipeline.from_pretrained(model_name_or_path,
-                                                    safety_checker=None,
-                                                    revision=None,
-                                                    torch_dtype=torch.float32,  # dtype adjusted
-                                                    **submodel_dict,
-                                                    ).to("cuda")
-num_inference_steps = 20
-num_images_per_prompt = 2
-guidance_scale = 3
-height = 512
-width = 768
-num_frames = 57
-frame_rate = 25
-# Assuming sample is a dict loaded from a .pt file
-sample = torch.load("/opt/sample.pt")
-for key, item in sample.items():
-    if item is not None:
-        sample[key] = item.cuda()
-media_items = torch.load("/opt/sample_media.pt")
-# Generate images (video frames)
-images = pipeline(
-    num_inference_steps=num_inference_steps,
-    num_images_per_prompt=num_images_per_prompt,
-    guidance_scale=guidance_scale,
-    generator=None,
-    output_type="pt",
-    callback_on_step_end=None,
-    height=height,
-    width=width,
-    num_frames=num_frames,
-    frame_rate=frame_rate,
-    **sample,
-    is_video=True,
-    vae_per_channel_normalize=True,
-).images
-print("Generated video frames.")

 from xora.schedulers.rf import RectifiedFlowScheduler
 from xora.pipelines.pipeline_video_pixart_alpha import VideoPixArtAlphaPipeline
 from pathlib import Path
+from transformers import T5EncoderModel, T5Tokenizer
 import safetensors.torch
 import json
+import argparse
+def load_vae(vae_dir):
+    vae_ckpt_path = vae_dir / "diffusion_pytorch_model.safetensors"
+    vae_config_path = vae_dir / "config.json"
+    with open(vae_config_path, 'r') as f:
+        vae_config = json.load(f)
+    vae = CausalVideoAutoencoder.from_config(vae_config)
+    vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
+    vae.load_state_dict(vae_state_dict)
+    return vae.cuda().to(torch.bfloat16)
+def load_unet(unet_dir):
+    unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"
+    unet_config_path = unet_dir / "config.json"
+    transformer_config = Transformer3DModel.load_config(unet_config_path)
+    transformer = Transformer3DModel.from_config(transformer_config)
+    unet_state_dict = safetensors.torch.load_file(unet_ckpt_path)
+    transformer.load_state_dict(unet_state_dict, strict=True)
+    return transformer.cuda()
+def load_scheduler(scheduler_dir):
+    scheduler_config_path = scheduler_dir / "scheduler_config.json"
+    scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
+    return RectifiedFlowScheduler.from_config(scheduler_config)
+def main():
+    # Parse command line arguments
+    parser = argparse.ArgumentParser(description='Load models from separate directories')
+    parser.add_argument('--separate_dir', type=str, required=True, help='Path to the directory containing unet, vae, and scheduler subdirectories')
+    args = parser.parse_args()
+    # Paths for the separate mode directories
+    separate_dir = Path(args.separate_dir)
+    unet_dir = separate_dir / 'unet'
+    vae_dir = separate_dir / 'vae'
+    scheduler_dir = separate_dir / 'scheduler'
+    # Load models
+    vae = load_vae(vae_dir)
+    unet = load_unet(unet_dir)
+    scheduler = load_scheduler(scheduler_dir)
+    # Patchifier (remains the same)
+    patchifier = SymmetricPatchifier(patch_size=1)
+    # text_encoder = T5EncoderModel.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder").to("cuda")
+    # tokenizer = T5Tokenizer.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer")
+    # Use submodels for the pipeline
+    submodel_dict = {
+        "transformer": unet,  # using unet for transformer
+        "patchifier": patchifier,
+        "text_encoder": None,
+        "tokenizer": None,
+        "scheduler": scheduler,
+        "vae": vae,
+    }
+    model_name_or_path = "PixArt-alpha/PixArt-XL-2-1024-MS"
+    pipeline = VideoPixArtAlphaPipeline(
+                                                        **submodel_dict
+                                                        ).to("cuda")
+    num_inference_steps = 20
+    num_images_per_prompt = 1
+    guidance_scale = 3
+    height = 512
+    width = 768
+    num_frames = 57
+    frame_rate = 25
+    # Sample input stays the same
+    sample = torch.load("/opt/sample_media.pt")
+    for key, item in sample.items():
+        if item is not None:
+            sample[key] = item.cuda()
+    # media_items = torch.load("/opt/sample_media.pt")
+    # Generate images (video frames)
+    images = pipeline(
+        num_inference_steps=num_inference_steps,
+        num_images_per_prompt=num_images_per_prompt,
+        guidance_scale=guidance_scale,
+        generator=None,
+        output_type="pt",
+        callback_on_step_end=None,
+        height=height,
+        width=width,
+        num_frames=num_frames,
+        frame_rate=frame_rate,
+        **sample,
+        is_video=True,
+        vae_per_channel_normalize=True,
+    ).images
+    print("Generated video frames.")
+if __name__ == "__main__":
+    main()

xora/examples/text_to_video.py CHANGED Viewed

@@ -5,93 +5,104 @@ from xora.models.transformers.symmetric_patchifier import SymmetricPatchifier
 from xora.schedulers.rf import RectifiedFlowScheduler
 from xora.pipelines.pipeline_video_pixart_alpha import VideoPixArtAlphaPipeline
 from pathlib import Path
-from transformers import T5EncoderModel
 import safetensors.torch
 import json
-# Paths for the separate mode directories
-separate_dir = Path("/opt/models/xora-img2video")
-unet_dir = separate_dir / 'unet'
-vae_dir = separate_dir / 'vae'
-scheduler_dir = separate_dir / 'scheduler'
-# Load VAE from separate mode
-vae_ckpt_path = vae_dir / "diffusion_pytorch_model.safetensors"
-vae_config_path = vae_dir / "config.json"
-with open(vae_config_path, 'r') as f:
-    vae_config = json.load(f)
-vae = CausalVideoAutoencoder.from_config(vae_config)
-vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
-vae.load_state_dict(
-    state_dict=vae_state_dict,
-)
-vae = vae.cuda().to(torch.bfloat16)
-# Load UNet (Transformer) from separate mode
-unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"
-unet_config_path = unet_dir / "config.json"
-transformer_config = Transformer3DModel.load_config(unet_config_path)
-transformer = Transformer3DModel.from_config(transformer_config)
-unet_state_dict = safetensors.torch.load_file(unet_ckpt_path)
-transformer.load_state_dict(unet_state_dict, strict=True)
-transformer = transformer.cuda()
-unet = transformer
-# Load Scheduler from separate mode
-scheduler_config_path = scheduler_dir / "scheduler_config.json"
-scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
-scheduler = RectifiedFlowScheduler.from_config(scheduler_config)
-# Patchifier (remains the same)
-patchifier = SymmetricPatchifier(patch_size=1)
-# Use submodels for the pipeline
-submodel_dict = {
-    "unet": unet,
-    "transformer": transformer,
-    "patchifier": patchifier,
-    "scheduler": scheduler,
-    "vae": vae,
-}
-model_name_or_path = "PixArt-alpha/PixArt-XL-2-1024-MS"
-pipeline = VideoPixArtAlphaPipeline.from_pretrained(model_name_or_path,
-                                                    safety_checker=None,
-            revision=None,
-            torch_dtype=torch.float32,
-            **submodel_dict,
-        ).to("cuda")
-# Sample input
-num_inference_steps = 20
-num_images_per_prompt = 2
-guidance_scale = 3
-height = 512
-width = 768
-num_frames = 57
-frame_rate = 25
-sample = {
-    "prompt": "A middle-aged man with glasses and a salt-and-pepper beard is driving a car and talking, gesturing with his right hand. "
-              "The man is wearing a dark blue zip-up jacket and a light blue collared shirt. He is sitting in the driver's seat of a car with a black interior. The car is moving on a road with trees and bushes on either side. The man has a serious expression on his face and is looking straight ahead.",
-    'prompt_attention_mask': None,  # Adjust attention masks as needed
-    'negative_prompt': "Ugly deformed",
-    'negative_prompt_attention_mask': None
-}
-# Generate images (video frames)
-images = pipeline(
-    num_inference_steps=num_inference_steps,
-    num_images_per_prompt=num_images_per_prompt,
-    guidance_scale=guidance_scale,
-    generator=None,
-    output_type="pt",
-    callback_on_step_end=None,
-    height=height,
-    width=width,
-    num_frames=num_frames,
-    frame_rate=frame_rate,
-    **sample,
-    is_video=True,
-    vae_per_channel_normalize=True,
-).images
-print("Generated images (video frames).")

 from xora.schedulers.rf import RectifiedFlowScheduler
 from xora.pipelines.pipeline_video_pixart_alpha import VideoPixArtAlphaPipeline
 from pathlib import Path
+from transformers import T5EncoderModel, T5Tokenizer
 import safetensors.torch
 import json
+import argparse
+def load_vae(vae_dir):
+    vae_ckpt_path = vae_dir / "diffusion_pytorch_model.safetensors"
+    vae_config_path = vae_dir / "config.json"
+    with open(vae_config_path, 'r') as f:
+        vae_config = json.load(f)
+    vae = CausalVideoAutoencoder.from_config(vae_config)
+    vae_state_dict = safetensors.torch.load_file(vae_ckpt_path)
+    vae.load_state_dict(vae_state_dict)
+    return vae.cuda().to(torch.bfloat16)
+def load_unet(unet_dir):
+    unet_ckpt_path = unet_dir / "diffusion_pytorch_model.safetensors"
+    unet_config_path = unet_dir / "config.json"
+    transformer_config = Transformer3DModel.load_config(unet_config_path)
+    transformer = Transformer3DModel.from_config(transformer_config)
+    unet_state_dict = safetensors.torch.load_file(unet_ckpt_path)
+    transformer.load_state_dict(unet_state_dict, strict=True)
+    return transformer.cuda()
+def load_scheduler(scheduler_dir):
+    scheduler_config_path = scheduler_dir / "scheduler_config.json"
+    scheduler_config = RectifiedFlowScheduler.load_config(scheduler_config_path)
+    return RectifiedFlowScheduler.from_config(scheduler_config)
+def main():
+    # Parse command line arguments
+    parser = argparse.ArgumentParser(description='Load models from separate directories')
+    parser.add_argument('--separate_dir', type=str, required=True, help='Path to the directory containing unet, vae, and scheduler subdirectories')
+    args = parser.parse_args()
+    # Paths for the separate mode directories
+    separate_dir = Path(args.separate_dir)
+    unet_dir = separate_dir / 'unet'
+    vae_dir = separate_dir / 'vae'
+    scheduler_dir = separate_dir / 'scheduler'
+    # Load models
+    vae = load_vae(vae_dir)
+    unet = load_unet(unet_dir)
+    scheduler = load_scheduler(scheduler_dir)
+    # Patchifier (remains the same)
+    patchifier = SymmetricPatchifier(patch_size=1)
+    text_encoder = T5EncoderModel.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="text_encoder").to("cuda")
+    tokenizer = T5Tokenizer.from_pretrained("PixArt-alpha/PixArt-XL-2-1024-MS", subfolder="tokenizer")
+    # Use submodels for the pipeline
+    submodel_dict = {
+        "transformer": unet,  # using unet for transformer
+        "patchifier": patchifier,
+        "scheduler": scheduler,
+        "text_encoder": text_encoder,
+        "tokenizer": tokenizer,
+        "vae": vae,
+    }
+    pipeline = VideoPixArtAlphaPipeline(**submodel_dict).to("cuda")
+    # Sample input
+    num_inference_steps = 20
+    num_images_per_prompt = 2
+    guidance_scale = 3
+    height = 512
+    width = 768
+    num_frames = 57
+    frame_rate = 25
+    sample = {
+        "prompt": "A middle-aged man with glasses and a salt-and-pepper beard is driving a car and talking, gesturing with his right hand. "
+                  "The man is wearing a dark blue zip-up jacket and a light blue collared shirt. He is sitting in the driver's seat of a car with a black interior. The car is moving on a road with trees and bushes on either side. The man has a serious expression on his face and is looking straight ahead.",
+        'prompt_attention_mask': None,  # Adjust attention masks as needed
+        'negative_prompt': "Ugly deformed",
+        'negative_prompt_attention_mask': None
+    }
+    # Generate images (video frames)
+    images = pipeline(
+        num_inference_steps=num_inference_steps,
+        num_images_per_prompt=num_images_per_prompt,
+        guidance_scale=guidance_scale,
+        generator=None,
+        output_type="pt",
+        callback_on_step_end=None,
+        height=height,
+        width=width,
+        num_frames=num_frames,
+        frame_rate=frame_rate,
+        **sample,
+        is_video=True,
+        vae_per_channel_normalize=True,
+    ).images
+    print("Generated images (video frames).")
+if __name__ == "__main__":
+    main()