Spaces:

jbilcke-hf
/

ReCamMaster

Paused

App Files Files Community

jbilcke-hf committed on Jul 16

Commit

f0cc3b2

verified ·

1 Parent(s): 85e1111

Upload 3 files

Browse files

Files changed (3) hide show

LICENSE +21 -0
app.py +436 -0
requirements.txt +13 -0

LICENSE ADDED Viewed

	@@ -0,0 +1,21 @@

+MIT License
+Copyright (c) 2025 Kuaishou Visual Generation and Interaction Center
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.

app.py ADDED Viewed

	@@ -0,0 +1,436 @@

+import gradio as gr
+import torch
+import torch.nn as nn
+import os
+import tempfile
+import shutil
+import imageio
+import pandas as pd
+import numpy as np
+from diffsynth import ModelManager, WanVideoReCamMasterPipeline, save_video
+import json
+from torchvision.transforms import v2
+from einops import rearrange
+import torchvision
+from PIL import Image
+import logging
+from pathlib import Path
+from huggingface_hub import hf_hub_download
+# Route INFO-level (and above) records through the root logger.
+logging.basicConfig(level=logging.INFO)
+logger = logging.getLogger(__name__)
+# Display labels of the selectable camera transformations, listed in id order.
+_CAMERA_LABELS = (
+    "Pan Right",
+    "Pan Left",
+    "Tilt Up",
+    "Tilt Down",
+    "Zoom In",
+    "Zoom Out",
+    "Translate Up (with rotation)",
+    "Translate Down (with rotation)",
+    "Arc Left (with rotation)",
+    "Arc Right (with rotation)",
+)
+# Maps the string ids "1".."10" to their display label.
+CAMERA_TRANSFORMATIONS = {
+    str(number): label for number, label in enumerate(_CAMERA_LABELS, start=1)
+}
+# Module-level model state; all three are rebound by load_models().
+model_manager, pipe = None, None
+is_model_loaded = False
+def download_recammaster_checkpoint():
+    """Ensure the ReCamMaster checkpoint is available locally and return its path.
+    The file ``step20000.ckpt`` from the ``KwaiVGI/ReCamMaster-Wan2.1`` repository is
+    stored under ``models/ReCamMaster/checkpoints``. If it is already there, no
+    download is attempted.
+    Returns:
+        pathlib.Path: Location of the checkpoint file (a ``Path`` in both the
+        cached and the freshly-downloaded case).
+    Raises:
+        Exception: Whatever ``hf_hub_download`` raised; it is logged and re-raised.
+    """
+    # Define paths
+    repo_id = "KwaiVGI/ReCamMaster-Wan2.1"
+    filename = "step20000.ckpt"
+    checkpoint_dir = Path("models/ReCamMaster/checkpoints")
+    checkpoint_path = checkpoint_dir / filename
+    # Check if already exists
+    if checkpoint_path.exists():
+        logger.info(f"✓ ReCamMaster checkpoint already exists at {checkpoint_path}")
+        return checkpoint_path
+    # Create directory if it doesn't exist
+    checkpoint_dir.mkdir(parents=True, exist_ok=True)
+    # Download the checkpoint
+    logger.info("Downloading ReCamMaster checkpoint from HuggingFace...")
+    logger.info(f"Repository: {repo_id}")
+    logger.info(f"File: {filename}")
+    logger.info(f"Destination: {checkpoint_path}")
+    try:
+        # Download using huggingface_hub
+        downloaded_path = hf_hub_download(
+            repo_id=repo_id,
+            filename=filename,
+            local_dir=checkpoint_dir,
+            local_dir_use_symlinks=False
+        )
+    except Exception as e:
+        logger.error(f"✗ Error downloading checkpoint: {e}")
+        raise
+    logger.info(f"✓ Successfully downloaded ReCamMaster checkpoint to {downloaded_path}!")
+    # hf_hub_download hands back a string; normalise so both branches return a Path.
+    return Path(downloaded_path)
+class Camera:
+    """Holds a 4x4 camera-to-world matrix together with its inverse.
+    Attributes are ``c2w_mat`` (the input reshaped to 4x4) and ``w2c_mat``
+    (its matrix inverse).
+    """
+    def __init__(self, c2w):
+        # np.array copies the input, so the stored pose does not alias the caller's data.
+        self.c2w_mat = np.array(c2w).reshape(4, 4)
+        self.w2c_mat = np.linalg.inv(self.c2w_mat)
+def parse_matrix(matrix_str):
+    """Turn a bracketed matrix string such as "[1 0] [0 1]" into a numpy array.
+    Rows are separated by "] [" and the numbers inside a row by whitespace.
+    """
+    row_chunks = matrix_str.strip().split('] [')
+    # Drop the brackets left on the first and last chunk before tokenising.
+    cleaned = (chunk.replace('[', '').replace(']', '') for chunk in row_chunks)
+    return np.array([[float(token) for token in chunk.split()] for chunk in cleaned])
+def get_relative_pose(cam_params):
+    """Express camera poses relative to the first camera in ``cam_params``.
+    Each element must expose ``c2w_mat`` and ``w2c_mat``. The result is a float32
+    array with one 4x4 matrix per camera; the first entry is the fixed target
+    pose and every later entry is that camera's pose seen from the first camera.
+    """
+    cam_to_origin = 0
+    target_cam_c2w = np.array([
+        [1, 0, 0, 0],
+        [0, 1, 0, -cam_to_origin],
+        [0, 0, 1, 0],
+        [0, 0, 0, 1]
+    ])
+    # Takes absolute (world) coordinates into the frame of the first camera.
+    abs2rel = target_cam_c2w @ cam_params[0].w2c_mat
+    ret_poses = [target_cam_c2w]
+    for cam_param in cam_params[1:]:
+        ret_poses.append(abs2rel @ cam_param.c2w_mat)
+    return np.array(ret_poses, dtype=np.float32)
+def load_models(progress_callback=None):
+    """Load the Wan2.1 base models and the ReCamMaster checkpoint into the global pipeline.
+    The pipeline is assembled in local variables and only published to the
+    module globals (``model_manager``, ``pipe``, ``is_model_loaded``) once every
+    step succeeded, so a failed attempt never leaves a half-initialised pipeline
+    behind.
+    Args:
+        progress_callback: Optional callable invoked as
+            ``progress_callback(fraction, desc=...)`` at each loading stage.
+    Returns:
+        str: A status message. Failures are reported as a message beginning
+        with "Error" instead of being raised.
+    """
+    global model_manager, pipe, is_model_loaded
+    if is_model_loaded:
+        return "Models already loaded!"
+    def report(fraction, desc):
+        # Progress reporting is optional; do nothing when no callback was given.
+        if progress_callback:
+            progress_callback(fraction, desc=desc)
+    try:
+        logger.info("Starting model loading...")
+        # First ensure the checkpoint is downloaded
+        report(0.05, "Checking for ReCamMaster checkpoint...")
+        try:
+            ckpt_path = download_recammaster_checkpoint()
+            logger.info(f"Using checkpoint at {ckpt_path}")
+        except Exception as e:
+            error_msg = f"Error downloading ReCamMaster checkpoint: {str(e)}"
+            # logger.exception keeps the traceback in the log.
+            logger.exception(error_msg)
+            return error_msg
+        report(0.1, "Loading model manager...")
+        # Load Wan2.1 pre-trained models
+        manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")
+        report(0.3, "Loading Wan2.1 models...")
+        manager.load_models([
+            "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
+            "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
+            "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
+        ])
+        report(0.5, "Creating pipeline...")
+        pipeline = WanVideoReCamMasterPipeline.from_model_manager(manager, device="cuda")
+        report(0.7, "Initializing ReCamMaster modules...")
+        # Initialize additional modules introduced in ReCamMaster.
+        # They must exist before load_state_dict(strict=True) is called below.
+        dim = pipeline.dit.blocks[0].self_attn.q.weight.shape[0]
+        for block in pipeline.dit.blocks:
+            block.cam_encoder = nn.Linear(12, dim)
+            block.projector = nn.Linear(dim, dim)
+            block.cam_encoder.weight.data.zero_()
+            block.cam_encoder.bias.data.zero_()
+            block.projector.weight = nn.Parameter(torch.eye(dim))
+            block.projector.bias = nn.Parameter(torch.zeros(dim))
+        report(0.9, "Loading ReCamMaster checkpoint...")
+        # Load ReCamMaster checkpoint
+        if not os.path.exists(ckpt_path):
+            error_msg = f"Error: ReCamMaster checkpoint not found at {ckpt_path} even after download attempt."
+            logger.error(error_msg)
+            return error_msg
+        # NOTE(review): torch.load unpickles the file; only use checkpoints from a trusted source.
+        state_dict = torch.load(ckpt_path, map_location="cpu")
+        pipeline.dit.load_state_dict(state_dict, strict=True)
+        pipeline.to("cuda")
+        pipeline.to(dtype=torch.bfloat16)
+        # Publish the fully initialised objects only now.
+        model_manager = manager
+        pipe = pipeline
+        is_model_loaded = True
+        report(1.0, "Models loaded successfully!")
+        logger.info("Models loaded successfully!")
+        return "Models loaded successfully!"
+    except Exception as e:
+        logger.exception(f"Error loading models: {str(e)}")
+        return f"Error loading models: {str(e)}"
+def extract_frames_from_video(video_path, output_dir, max_frames=81):
+    """Write exactly ``max_frames`` frames of a video to ``output_dir`` as PNG files.
+    Only the first ``max_frames`` frames are decoded. When the video is shorter,
+    its last frame is repeated until ``max_frames`` frames are available. Files
+    are named ``frame_0000.png``, ``frame_0001.png``, ...
+    Args:
+        video_path: Path of the video to read.
+        output_dir: Directory receiving the PNG files (created if missing).
+        max_frames: Number of frames to write; expected to be positive.
+    Returns:
+        tuple: ``(number_of_frames_written, fps)`` where ``fps`` comes from the
+        video metadata.
+    Raises:
+        ValueError: If frames are requested but none could be read from the video.
+    """
+    os.makedirs(output_dir, exist_ok=True)
+    reader = imageio.get_reader(video_path)
+    frames = []
+    try:
+        fps = reader.get_meta_data()['fps']
+        # Stop decoding as soon as enough frames are collected instead of
+        # holding the entire video in memory.
+        for frame in reader:
+            if len(frames) >= max_frames:
+                break
+            frames.append(frame)
+    finally:
+        # Release the reader even if decoding fails.
+        reader.close()
+    # If we have fewer than required frames, repeat the last frame
+    if len(frames) < max_frames:
+        if not frames:
+            raise ValueError(f"No frames could be read from {video_path}")
+        logger.info(f"Video has {len(frames)} frames, padding to {max_frames} frames")
+        frames.extend([frames[-1]] * (max_frames - len(frames)))
+    # Save frames
+    for i, frame in enumerate(frames):
+        frame_path = os.path.join(output_dir, f"frame_{i:04d}.png")
+        imageio.imwrite(frame_path, frame)
+    return len(frames), fps
+def process_video_for_recammaster(video_path, text_prompt, cam_type, height=480, width=832):
+    """Run a video through the global ReCamMaster pipeline with a chosen camera trajectory.
+    The first 81 frames are read (a shorter clip is padded by repeating its last
+    frame), rescaled so the frame covers ``height`` x ``width``, centre-cropped
+    and normalised with mean 0.5 / std 0.5 per channel. The target trajectory is
+    read from ``./example_test_data/cameras/camera_extrinsics.json``, sampled
+    every 4th frame, and converted to poses relative to its first camera.
+    Args:
+        video_path: Path of the source video.
+        text_prompt: Prompt passed to the pipeline.
+        cam_type: Trajectory id; ``int(cam_type)`` selects the ``camNN`` entry.
+        height: Target frame height.
+        width: Target frame width.
+    Returns:
+        Whatever the pipeline call returns for the generated video.
+    Raises:
+        ValueError: If not even the first frame of the video can be read.
+    """
+    global pipe
+    num_frames = 81  # ReCamMaster needs exactly 81 frames
+    # Create frame processor
+    frame_process = v2.Compose([
+        v2.CenterCrop(size=(height, width)),
+        v2.Resize(size=(height, width), antialias=True),
+        v2.ToTensor(),
+        v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
+    ])
+    def crop_and_resize(image):
+        # Scale so the image covers the target size; the centre crop happens in frame_process.
+        width_img, height_img = image.size
+        scale = max(width / width_img, height / height_img)
+        image = torchvision.transforms.functional.resize(
+            image,
+            (round(height_img*scale), round(width_img*scale)),
+            interpolation=torchvision.transforms.InterpolationMode.BILINEAR
+        )
+        return image
+    # Load video frames
+    reader = imageio.get_reader(video_path)
+    frames = []
+    try:
+        for i in range(num_frames):
+            # Only a failed *read* means the clip ran out of frames. Preprocessing
+            # errors below are real bugs and must not be masked as padding.
+            try:
+                raw_frame = reader.get_data(i)
+            except Exception as read_error:
+                if not frames:
+                    raise ValueError("Video is too short!") from read_error
+                # If we run out of frames, repeat the last one
+                frames.append(frames[-1])
+                continue
+            frame = Image.fromarray(raw_frame)
+            frame = crop_and_resize(frame)
+            frame = frame_process(frame)
+            frames.append(frame)
+    finally:
+        # Release the reader even when reading or preprocessing fails.
+        reader.close()
+    frames = torch.stack(frames, dim=0)
+    frames = rearrange(frames, "T C H W -> C T H W")
+    video_tensor = frames.unsqueeze(0)  # Add batch dimension
+    # Load camera trajectory
+    tgt_camera_path = "./example_test_data/cameras/camera_extrinsics.json"
+    with open(tgt_camera_path, 'r') as file:
+        cam_data = json.load(file)
+    # Get camera trajectory for selected type
+    cam_idx = list(range(num_frames))[::4]  # Sample every 4 frames
+    cam_key = f"cam{int(cam_type):02d}"
+    traj = [parse_matrix(cam_data[f"frame{idx}"][cam_key]) for idx in cam_idx]
+    traj = np.stack(traj).transpose(0, 2, 1)
+    c2ws = []
+    for c2w in traj:
+        # Fancy indexing returns a copy, so the in-place edits below leave traj untouched.
+        c2w = c2w[:, [1, 2, 0, 3]]
+        c2w[:3, 1] *= -1.
+        c2w[:3, 3] /= 100
+        c2ws.append(c2w)
+    tgt_cam_params = [Camera(cam_param) for cam_param in c2ws]
+    relative_poses = []
+    for cam_param in tgt_cam_params:
+        # Pose of this camera relative to the first one; keep the top 3x4 part.
+        relative_pose = get_relative_pose([tgt_cam_params[0], cam_param])
+        relative_poses.append(torch.as_tensor(relative_pose)[:,:3,:][1])
+    pose_embedding = torch.stack(relative_poses, dim=0)  # 21x3x4
+    pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)')
+    camera_tensor = pose_embedding.to(torch.bfloat16).unsqueeze(0)  # Add batch dimension
+    # Generate video with ReCamMaster
+    video = pipe(
+        prompt=[text_prompt],
+        negative_prompt=["worst quality, low quality, blurry, jittery, distorted"],
+        source_video=video_tensor,
+        target_camera=camera_tensor,
+        cfg_scale=5.0,
+        num_inference_steps=50,
+        seed=0,
+        tiled=True
+    )
+    return video
+def generate_recammaster_video(
+    video_file,
+    text_prompt,
+    camera_type,
+    progress=gr.Progress()
+):
+    """Gradio handler: re-render the uploaded video with the selected camera movement.
+    Args:
+        video_file: The uploaded video, either a filesystem path or an object
+            exposing the path as ``.name``.
+        text_prompt: Prompt describing the video.
+        camera_type: Key into ``CAMERA_TRANSFORMATIONS``.
+        progress: Gradio progress tracker.
+    Returns:
+        tuple: ``(output_video_path, status_message)``. On any failure the path
+        is ``None`` and the message describes the error; nothing is raised.
+    """
+    global pipe, is_model_loaded
+    if not is_model_loaded:
+        return None, "Error: Models not loaded! Please load models first."
+    if video_file is None:
+        return None, "Please upload a video file."
+    try:
+        # The Video component may hand over a plain path rather than a file-like
+        # object, so only fall back to ``.name`` when it is not already a path.
+        if isinstance(video_file, (str, os.PathLike)):
+            source_path = video_file
+        else:
+            source_path = video_file.name
+        # Create temporary directory for processing
+        with tempfile.TemporaryDirectory() as temp_dir:
+            progress(0.1, desc="Processing input video...")
+            # Copy uploaded video to temp directory
+            input_video_path = os.path.join(temp_dir, "input.mp4")
+            shutil.copy(source_path, input_video_path)
+            # Extract frames
+            progress(0.2, desc="Extracting video frames...")
+            num_frames, fps = extract_frames_from_video(input_video_path, os.path.join(temp_dir, "frames"))
+            logger.info(f"Extracted {num_frames} frames at {fps} fps")
+            # Process with ReCamMaster
+            progress(0.3, desc="Processing with ReCamMaster...")
+            output_video = process_video_for_recammaster(
+                input_video_path,
+                text_prompt,
+                camera_type
+            )
+            # Save output video
+            progress(0.9, desc="Saving output video...")
+            output_path = os.path.join(temp_dir, "output.mp4")
+            save_video(output_video, output_path, fps=30, quality=5)
+            # Copy to persistent location (temp_dir is deleted when the with-block ends).
+            # Close the handle right away so no file descriptor is leaked.
+            with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as persistent_file:
+                final_output_path = persistent_file.name
+            shutil.copy(output_path, final_output_path)
+            progress(1.0, desc="Done!")
+            transformation_name = CAMERA_TRANSFORMATIONS.get(str(camera_type), "Unknown")
+            status_msg = f"Successfully generated video with '{transformation_name}' camera movement!"
+            return final_output_path, status_msg
+    except Exception as e:
+        # Top-level UI boundary: report the error to the user and keep the traceback in the log.
+        logger.exception(f"Error generating video: {str(e)}")
+        return None, f"Error: {str(e)}"
+# Create Gradio interface
+# Create Gradio interface
+# Components are created in layout order inside the nested context managers,
+# so statement order in this block determines the page layout.
+with gr.Blocks(title="ReCamMaster Demo") as demo:
+    # Show loading status
+    # Read-only textbox; its value and visibility are replaced by on_load() below.
+    loading_status = gr.Textbox(
+        label="Model Loading Status",
+        value="Loading models, please wait...",
+        interactive=False,
+        visible=True
+    )
+    gr.Markdown("""
+    # 🎥 ReCamMaster Demo
+    ReCamMaster allows you to re-capture videos with novel camera trajectories.
+    Upload a video and select a camera transformation to see the magic!
+    **Note:** The ReCamMaster checkpoint will be automatically downloaded from HuggingFace when you start the app.
+    You still need to download Wan2.1 models using `python download_wan2.1.py` before running this demo.
+    """)
+    with gr.Row():
+        # Left column: inputs and the generate button.
+        with gr.Column():
+            # Video input section
+            with gr.Group():
+                gr.Markdown("### Step 1: Upload Video")
+                video_input = gr.Video(label="Input Video")
+                text_prompt = gr.Textbox(
+                    label="Text Prompt (describe your video)",
+                    placeholder="A person walking in the street",
+                    value="A dynamic scene"
+                )
+            # Camera selection
+            with gr.Group():
+                gr.Markdown("### Step 2: Select Camera Movement")
+                # Choices are (label, value) pairs: the label is shown and the
+                # CAMERA_TRANSFORMATIONS key ("1".."10") is the submitted value.
+                camera_type = gr.Radio(
+                    choices=[(v, k) for k, v in CAMERA_TRANSFORMATIONS.items()],
+                    label="Camera Transformation",
+                    value="1"
+                )
+            # Generate button
+            generate_btn = gr.Button("Generate Video", variant="primary")
+        # Right column: generated video and status text.
+        with gr.Column():
+            # Output section
+            output_video = gr.Video(label="Output Video")
+            status_output = gr.Textbox(label="Generation Status", interactive=False)
+    # Example videos
+    # Each example row supplies values for (video_input, text_prompt, camera_type).
+    gr.Markdown("### Example Videos")
+    gr.Examples(
+        examples=[
+            ["example_test_data/videos/case0.mp4", "A person dancing", "1"],
+            ["example_test_data/videos/case1.mp4", "A scenic view", "5"],
+        ],
+        inputs=[video_input, text_prompt, camera_type],
+    )
+    # Load models automatically when the interface loads
+    def on_load():
+        # load_models() returns a status string; the status box stays visible
+        # only when that string contains "Error".
+        status = load_models()
+        return gr.update(value=status, visible=True if "Error" in status else False)
+    demo.load(on_load, outputs=[loading_status])
+    # Event handlers
+    # Clicking the button runs the generation handler and fills the output video and status box.
+    generate_btn.click(
+        fn=generate_recammaster_video,
+        inputs=[video_input, text_prompt, camera_type],
+        outputs=[output_video, status_output]
+    )
+# Start the app only when executed as a script; share=True is passed to launch().
+if __name__ == "__main__":
+    demo.launch(share=True)

requirements.txt ADDED Viewed

	@@ -0,0 +1,13 @@

+torch>=2.0.0
+torchvision
+cupy-cuda12x
+transformers==4.46.2
+controlnet-aux==0.0.7
+imageio
+imageio[ffmpeg]
+safetensors
+einops
+sentencepiece
+protobuf
+modelscope
+ftfy