Spaces:

jbilcke-hf
/

ReCamMaster

Paused

App Files Files Community

ReCamMaster / app.py

jbilcke-hf

Upload 3 files

f0cc3b2 verified 5 months ago

raw

history blame

15.4 kB

	import gradio as gr
	import torch
	import torch.nn as nn
	import os
	import tempfile
	import shutil
	import imageio
	import pandas as pd
	import numpy as np
	from diffsynth import ModelManager, WanVideoReCamMasterPipeline, save_video
	import json
	from torchvision.transforms import v2
	from einops import rearrange
	import torchvision
	from PIL import Image
	import logging
	from pathlib import Path
	from huggingface_hub import hf_hub_download

	# Root logging is configured once here; this module logs under its own name.
	logging.basicConfig(level=logging.INFO)
	logger = logging.getLogger(__name__)

	# Camera transformation types.
	# Keys are the 1-based ids "1".."10" (as strings); values are the labels
	# shown to the user.  Insertion order follows the numeric id.
	CAMERA_TRANSFORMATIONS = {
	    str(number): label
	    for number, label in enumerate(
	        (
	            "Pan Right",
	            "Pan Left",
	            "Tilt Up",
	            "Tilt Down",
	            "Zoom In",
	            "Zoom Out",
	            "Translate Up (with rotation)",
	            "Translate Down (with rotation)",
	            "Arc Left (with rotation)",
	            "Arc Right (with rotation)",
	        ),
	        start=1,
	    )
	}

	# Global model state; all three are rebound by load_models().
	model_manager = None
	pipe = None
	is_model_loaded = False

	def download_recammaster_checkpoint():
	    """Make sure the ReCamMaster checkpoint exists locally and return its path.

	    The file ``step20000.ckpt`` from the ``KwaiVGI/ReCamMaster-Wan2.1`` Hub
	    repository is kept under ``models/ReCamMaster/checkpoints``.  When the
	    file is already there, nothing is downloaded.

	    Returns:
	        pathlib.Path: Location of the checkpoint.  Both the cached and the
	        freshly-downloaded branch return a ``Path`` (the download helper
	        itself hands back a ``str``, which is normalised here).

	    Raises:
	        Exception: Any error raised by ``hf_hub_download`` is logged and
	        re-raised unchanged.
	    """
	    # Define paths
	    repo_id = "KwaiVGI/ReCamMaster-Wan2.1"
	    filename = "step20000.ckpt"
	    checkpoint_dir = Path("models/ReCamMaster/checkpoints")
	    checkpoint_path = checkpoint_dir / filename

	    # Fast path: nothing to do when the file is already on disk.
	    if checkpoint_path.exists():
	        logger.info(f"✓ ReCamMaster checkpoint already exists at {checkpoint_path}")
	        return checkpoint_path

	    # Create directory if it doesn't exist
	    checkpoint_dir.mkdir(parents=True, exist_ok=True)

	    # Download the checkpoint
	    logger.info("Downloading ReCamMaster checkpoint from HuggingFace...")
	    logger.info(f"Repository: {repo_id}")
	    logger.info(f"File: {filename}")
	    logger.info(f"Destination: {checkpoint_path}")

	    try:
	        # Download using huggingface_hub
	        downloaded_path = hf_hub_download(
	            repo_id=repo_id,
	            filename=filename,
	            local_dir=checkpoint_dir,
	            local_dir_use_symlinks=False
	        )
	    except Exception as e:
	        logger.error(f"✗ Error downloading checkpoint: {e}")
	        raise

	    logger.info(f"✓ Successfully downloaded ReCamMaster checkpoint to {downloaded_path}!")
	    # Normalise to Path so callers get the same type from both branches.
	    return Path(downloaded_path)

	class Camera:
	    """Pose container holding a 4x4 camera-to-world matrix and its inverse.

	    Attributes are ``c2w_mat`` (camera-to-world) and ``w2c_mat``
	    (world-to-camera, computed with ``np.linalg.inv``).
	    """

	    def __init__(self, c2w):
	        # np.array copies the input; reshape accepts any 16-element layout.
	        self.c2w_mat = np.array(c2w).reshape(4, 4)
	        self.w2c_mat = np.linalg.inv(self.c2w_mat)

	def parse_matrix(matrix_str):
	    """Convert a bracketed matrix string such as ``"[1 0] [0 1]"`` to an array.

	    Rows are delimited by ``"] ["``; the numbers inside a row are separated
	    by whitespace.  Any remaining ``[`` / ``]`` characters are discarded
	    before the numbers are parsed as floats.
	    """
	    drop_brackets = str.maketrans("", "", "[]")
	    return np.array([
	        [float(token) for token in chunk.translate(drop_brackets).split()]
	        for chunk in matrix_str.strip().split('] [')
	    ])

	def get_relative_pose(cam_params):
	    """Re-express camera poses relative to the first camera in ``cam_params``.

	    Each element must expose ``w2c_mat`` and ``c2w_mat`` 4x4 matrices.  The
	    result is a float32 array with one 4x4 pose per input camera: entry 0 is
	    the fixed target pose (the identity, since ``cam_to_origin`` is 0) and
	    every later entry is ``target @ w2c(first) @ c2w(camera)``.
	    """
	    cam_to_origin = 0
	    target_cam_c2w = np.array([
	        [1, 0, 0, 0],
	        [0, 1, 0, -cam_to_origin],
	        [0, 0, 1, 0],
	        [0, 0, 0, 1]
	    ])

	    # Transform that maps absolute poses into the first camera's frame.
	    abs2rel = target_cam_c2w @ cam_params[0].w2c_mat

	    poses = [target_cam_c2w]
	    for camera in cam_params[1:]:
	        poses.append(abs2rel @ camera.c2w_mat)
	    return np.array(poses, dtype=np.float32)

	def load_models(progress_callback=None):
	    """Load the Wan2.1 base models plus ReCamMaster weights into a CUDA pipeline.

	    On success the module-level ``model_manager``, ``pipe`` and
	    ``is_model_loaded`` are updated.  They are only rebound after every step
	    has succeeded, so a failed attempt never leaves a half-initialised
	    pipeline published in the globals.

	    Args:
	        progress_callback: Optional callable invoked as
	            ``progress_callback(fraction, desc=message)`` at each stage.

	    Returns:
	        str: A human-readable status message.  Failures are reported through
	        the return value (every failure message contains ``"Error"``); this
	        function does not raise.
	    """
	    global model_manager, pipe, is_model_loaded

	    if is_model_loaded:
	        return "Models already loaded!"

	    def report(fraction, desc):
	        # Progress reporting is optional; skip silently without a callback.
	        if progress_callback:
	            progress_callback(fraction, desc=desc)

	    try:
	        logger.info("Starting model loading...")

	        # First ensure the checkpoint is downloaded
	        report(0.05, "Checking for ReCamMaster checkpoint...")

	        try:
	            ckpt_path = download_recammaster_checkpoint()
	            logger.info(f"Using checkpoint at {ckpt_path}")
	        except Exception as e:
	            error_msg = f"Error downloading ReCamMaster checkpoint: {str(e)}"
	            logger.error(error_msg)
	            return error_msg

	        # Fail fast: verify the checkpoint before loading the large base models.
	        if not os.path.exists(ckpt_path):
	            error_msg = f"Error: ReCamMaster checkpoint not found at {ckpt_path} even after download attempt."
	            logger.error(error_msg)
	            return error_msg

	        report(0.1, "Loading model manager...")

	        # Load Wan2.1 pre-trained models (kept on CPU until the pipeline is built)
	        manager = ModelManager(torch_dtype=torch.bfloat16, device="cpu")

	        report(0.3, "Loading Wan2.1 models...")

	        manager.load_models([
	            "models/Wan-AI/Wan2.1-T2V-1.3B/diffusion_pytorch_model.safetensors",
	            "models/Wan-AI/Wan2.1-T2V-1.3B/models_t5_umt5-xxl-enc-bf16.pth",
	            "models/Wan-AI/Wan2.1-T2V-1.3B/Wan2.1_VAE.pth",
	        ])

	        report(0.5, "Creating pipeline...")

	        pipeline = WanVideoReCamMasterPipeline.from_model_manager(manager, device="cuda")

	        report(0.7, "Initializing ReCamMaster modules...")

	        # Add the extra per-block modules introduced by ReCamMaster.  They must
	        # exist before load_state_dict(strict=True) below.  The camera encoder
	        # starts at zero and the projector as the identity; presumably the
	        # checkpoint then overwrites both (TODO confirm against the checkpoint keys).
	        dim = pipeline.dit.blocks[0].self_attn.q.weight.shape[0]
	        for block in pipeline.dit.blocks:
	            block.cam_encoder = nn.Linear(12, dim)
	            block.projector = nn.Linear(dim, dim)
	            block.cam_encoder.weight.data.zero_()
	            block.cam_encoder.bias.data.zero_()
	            block.projector.weight = nn.Parameter(torch.eye(dim))
	            block.projector.bias = nn.Parameter(torch.zeros(dim))

	        report(0.9, "Loading ReCamMaster checkpoint...")

	        # NOTE(review): torch.load unpickles the file.  The checkpoint comes
	        # from a fixed Hub repository; consider weights_only=True if it is a
	        # plain tensor state dict (confirm before changing).
	        state_dict = torch.load(ckpt_path, map_location="cpu")
	        pipeline.dit.load_state_dict(state_dict, strict=True)
	        pipeline.to("cuda")
	        pipeline.to(dtype=torch.bfloat16)

	        # Publish the fully-initialised objects only now that nothing can fail.
	        model_manager = manager
	        pipe = pipeline
	        is_model_loaded = True

	        report(1.0, "Models loaded successfully!")

	        logger.info("Models loaded successfully!")
	        return "Models loaded successfully!"

	    except Exception as e:
	        # logger.exception keeps the traceback, which the plain message loses.
	        logger.exception(f"Error loading models: {str(e)}")
	        return f"Error loading models: {str(e)}"

	def extract_frames_from_video(video_path, output_dir, max_frames=81):
	    """Write the first ``max_frames`` frames of a video to PNG files.

	    Frames are saved as ``frame_0000.png``, ``frame_0001.png``, ... inside
	    ``output_dir`` (created if missing).  A clip shorter than ``max_frames``
	    is padded by repeating its last frame, so exactly ``max_frames`` files
	    are written.

	    Args:
	        video_path: Path of the video to read with imageio.
	        output_dir: Directory that receives the PNG files.
	        max_frames: Number of frames to write.

	    Returns:
	        tuple: ``(number_of_frames_written, fps)`` where ``fps`` comes from
	        the reader's metadata.

	    Raises:
	        ValueError: If frames are requested but the video contains none.
	    """
	    os.makedirs(output_dir, exist_ok=True)

	    frames = []
	    reader = imageio.get_reader(video_path)
	    try:
	        fps = reader.get_meta_data()['fps']
	        # Stop as soon as enough frames are collected instead of decoding
	        # (and holding in memory) the whole video.
	        for frame in reader:
	            if len(frames) >= max_frames:
	                break
	            frames.append(frame)
	    finally:
	        # Release the reader even when decoding fails part-way through.
	        reader.close()

	    # If we have fewer than required frames, repeat the last frame
	    if len(frames) < max_frames:
	        if not frames:
	            raise ValueError(f"No frames could be read from {video_path}")
	        logger.info(f"Video has {len(frames)} frames, padding to {max_frames} frames")
	        frames.extend([frames[-1]] * (max_frames - len(frames)))

	    # Save frames
	    for i, frame in enumerate(frames):
	        frame_path = os.path.join(output_dir, f"frame_{i:04d}.png")
	        imageio.imwrite(frame_path, frame)

	    return len(frames), fps

	def _load_source_video_tensor(video_path, height, width, num_frames=81):
	    """Read, resize, crop and normalise the first ``num_frames`` video frames.

	    Returns a tensor laid out as (1, C, T, H, W).  Clips shorter than
	    ``num_frames`` are padded by repeating the last decoded frame.

	    Raises:
	        ValueError: If not a single frame could be read.
	    """
	    frame_process = v2.Compose([
	        v2.CenterCrop(size=(height, width)),
	        v2.Resize(size=(height, width), antialias=True),
	        v2.ToTensor(),
	        v2.Normalize(mean=[0.5, 0.5, 0.5], std=[0.5, 0.5, 0.5]),
	    ])

	    def crop_and_resize(image):
	        # Scale so the image covers the target size in both dimensions; the
	        # CenterCrop in frame_process then trims the overflow.
	        width_img, height_img = image.size
	        scale = max(width / width_img, height / height_img)
	        return torchvision.transforms.functional.resize(
	            image,
	            (round(height_img * scale), round(width_img * scale)),
	            interpolation=torchvision.transforms.InterpolationMode.BILINEAR
	        )

	    frames = []
	    reader = imageio.get_reader(video_path)
	    try:
	        for i in range(num_frames):
	            try:
	                raw_frame = reader.get_data(i)
	            except Exception:
	                # Any read failure is treated as "no more frames" (the
	                # previous best-effort behaviour), since imageio backends may
	                # signal end-of-stream differently.  Only the read is guarded,
	                # so errors in frame processing are no longer swallowed.
	                break
	            frame = crop_and_resize(Image.fromarray(raw_frame))
	            frames.append(frame_process(frame))
	    finally:
	        reader.close()

	    if not frames:
	        raise ValueError("Video is too short!")

	    # If we ran out of frames, repeat the last one
	    if len(frames) < num_frames:
	        logger.info(f"Video has {len(frames)} frames, padding to {num_frames} frames")
	        frames.extend([frames[-1]] * (num_frames - len(frames)))

	    frames = torch.stack(frames, dim=0)
	    frames = rearrange(frames, "T C H W -> C T H W")
	    return frames.unsqueeze(0)  # Add batch dimension


	def _load_target_camera_tensor(
	    cam_type,
	    num_frames=81,
	    camera_path="./example_test_data/cameras/camera_extrinsics.json",
	):
	    """Build the bfloat16 camera-pose embedding for trajectory ``cam_type``.

	    Every 4th frame index in ``range(num_frames)`` is sampled (21 poses for
	    81 frames).  Each pose is expressed relative to the first sampled pose,
	    reduced to its top 3x4 part and flattened to 12 values; a leading batch
	    dimension is added.
	    """
	    with open(camera_path, 'r', encoding="utf-8") as file:
	        cam_data = json.load(file)

	    # Get camera trajectory for selected type
	    cam_key = f"cam{int(cam_type):02d}"
	    traj = [
	        parse_matrix(cam_data[f"frame{idx}"][cam_key])
	        for idx in range(0, num_frames, 4)  # Sample every 4 frames
	    ]
	    traj = np.stack(traj).transpose(0, 2, 1)

	    c2ws = []
	    for c2w in traj:
	        # Reorder columns to (1, 2, 0, 3), negate the second column and scale
	        # the translation by 1/100.  NOTE(review): presumably an axis and
	        # unit convention change for the stored extrinsics; confirm
	        # against the reference inference script.
	        c2w = c2w[:, [1, 2, 0, 3]]
	        c2w[:3, 1] *= -1.
	        c2w[:3, 3] /= 100
	        c2ws.append(c2w)

	    tgt_cam_params = [Camera(cam_param) for cam_param in c2ws]
	    reference_cam = tgt_cam_params[0]
	    relative_poses = [
	        torch.as_tensor(get_relative_pose([reference_cam, cam]))[:, :3, :][1]
	        for cam in tgt_cam_params
	    ]

	    pose_embedding = torch.stack(relative_poses, dim=0)  # 21x3x4
	    pose_embedding = rearrange(pose_embedding, 'b c d -> b (c d)')
	    return pose_embedding.to(torch.bfloat16).unsqueeze(0)  # Add batch dimension


	def process_video_for_recammaster(video_path, text_prompt, cam_type, height=480, width=832):
	    """Run a source video through the loaded ReCamMaster pipeline.

	    Args:
	        video_path: Path of the input video.
	        text_prompt: Prompt describing the video.
	        cam_type: Camera trajectory id; converted with ``int()`` and used to
	            select the ``camNN`` entry of the camera extrinsics file.
	        height: Target frame height in pixels.
	        width: Target frame width in pixels.

	    Returns:
	        Whatever the module-level ``pipe`` returns for the generated video.

	    Raises:
	        ValueError: If no frame can be read from ``video_path``.
	    """
	    # ReCamMaster needs exactly 81 frames (the helpers' default).
	    video_tensor = _load_source_video_tensor(video_path, height, width)
	    camera_tensor = _load_target_camera_tensor(cam_type)

	    # Generate video with ReCamMaster
	    video = pipe(
	        prompt=[text_prompt],
	        negative_prompt=["worst quality, low quality, blurry, jittery, distorted"],
	        source_video=video_tensor,
	        target_camera=camera_tensor,
	        cfg_scale=5.0,
	        num_inference_steps=50,
	        seed=0,
	        tiled=True
	    )

	    return video

	def generate_recammaster_video(
	    video_file,
	    text_prompt,
	    camera_type,
	    progress=gr.Progress()
	):
	    """Gradio handler: re-render the uploaded video with a new camera move.

	    Args:
	        video_file: The uploaded video, either a filepath string or an
	            object exposing the path as ``.name``.
	        text_prompt: Prompt describing the video.
	        camera_type: Key of ``CAMERA_TRANSFORMATIONS`` selecting the move.
	        progress: Gradio progress tracker, called as ``progress(fraction, desc=...)``.

	    Returns:
	        tuple: ``(output_video_path, status_message)``.  On any failure the
	        path is ``None`` and the message explains the problem; exceptions
	        are caught and reported, not raised.
	    """
	    if not is_model_loaded:
	        return None, "Error: Models not loaded! Please load models first."

	    if video_file is None:
	        return None, "Please upload a video file."

	    try:
	        # A video input may arrive as a plain path string (no ``.name``
	        # attribute) or as a file-like wrapper; accept both.
	        source_path = getattr(video_file, "name", video_file)

	        # Create temporary directory for processing
	        with tempfile.TemporaryDirectory() as temp_dir:
	            progress(0.1, desc="Processing input video...")

	            # Copy uploaded video to temp directory
	            input_video_path = os.path.join(temp_dir, "input.mp4")
	            shutil.copy(source_path, input_video_path)

	            # Extract frames
	            progress(0.2, desc="Extracting video frames...")
	            num_frames, fps = extract_frames_from_video(input_video_path, os.path.join(temp_dir, "frames"))
	            logger.info(f"Extracted {num_frames} frames at {fps} fps")

	            # Process with ReCamMaster
	            progress(0.3, desc="Processing with ReCamMaster...")
	            output_video = process_video_for_recammaster(
	                input_video_path,
	                text_prompt,
	                camera_type
	            )

	            # Save output video.  NOTE: the output is always written at
	            # 30 fps; the source fps above is only logged.
	            progress(0.9, desc="Saving output video...")
	            output_path = os.path.join(temp_dir, "output.mp4")
	            save_video(output_video, output_path, fps=30, quality=5)

	            # Reserve a path that outlives temp_dir.  The handle is closed
	            # right away so it is not leaked and the copy can overwrite it.
	            with tempfile.NamedTemporaryFile(suffix='.mp4', delete=False) as persistent:
	                final_output_path = persistent.name
	            shutil.copy(output_path, final_output_path)

	            progress(1.0, desc="Done!")

	            transformation_name = CAMERA_TRANSFORMATIONS.get(str(camera_type), "Unknown")
	            status_msg = f"Successfully generated video with '{transformation_name}' camera movement!"

	            return final_output_path, status_msg

	    except Exception as e:
	        # Top-level UI boundary: keep the traceback in the log and show the
	        # message to the user instead of crashing the handler.
	        logger.exception(f"Error generating video: {str(e)}")
	        return None, f"Error: {str(e)}"

	# Build the Gradio UI.  Components are created in display order.
	with gr.Blocks(title="ReCamMaster Demo") as demo:
	    # Status line that is visible while the models are loading
	    loading_status = gr.Textbox(
	        value="Loading models, please wait...",
	        label="Model Loading Status",
	        visible=True,
	        interactive=False,
	    )

	    gr.Markdown("""
	# 🎥 ReCamMaster Demo

	ReCamMaster allows you to re-capture videos with novel camera trajectories.
	Upload a video and select a camera transformation to see the magic!

	Note: The ReCamMaster checkpoint will be automatically downloaded from HuggingFace when you start the app.
	You still need to download Wan2.1 models using `python download_wan2.1.py` before running this demo.
	""")

	    with gr.Row():
	        # Left column: inputs
	        with gr.Column():
	            with gr.Group():
	                gr.Markdown("### Step 1: Upload Video")
	                video_input = gr.Video(label="Input Video")
	                text_prompt = gr.Textbox(
	                    value="A dynamic scene",
	                    label="Text Prompt (describe your video)",
	                    placeholder="A person walking in the street",
	                )

	            with gr.Group():
	                gr.Markdown("### Step 2: Select Camera Movement")
	                # Radio entries are (label, value) pairs; the value is the
	                # CAMERA_TRANSFORMATIONS key.
	                camera_type = gr.Radio(
	                    choices=[
	                        (label, key)
	                        for key, label in CAMERA_TRANSFORMATIONS.items()
	                    ],
	                    value="1",
	                    label="Camera Transformation",
	                )

	            generate_btn = gr.Button("Generate Video", variant="primary")

	        # Right column: outputs
	        with gr.Column():
	            output_video = gr.Video(label="Output Video")
	            status_output = gr.Textbox(label="Generation Status", interactive=False)

	    # Bundled example inputs
	    gr.Markdown("### Example Videos")
	    gr.Examples(
	        inputs=[video_input, text_prompt, camera_type],
	        examples=[
	            ["example_test_data/videos/case0.mp4", "A person dancing", "1"],
	            ["example_test_data/videos/case1.mp4", "A scenic view", "5"],
	        ],
	    )

	    def on_load():
	        """Load the models on page load; keep the status box only on error."""
	        status = load_models()
	        return gr.update(value=status, visible="Error" in status)

	    demo.load(on_load, outputs=[loading_status])

	    # Wire the button to the generation handler
	    generate_btn.click(
	        fn=generate_recammaster_video,
	        inputs=[video_input, text_prompt, camera_type],
	        outputs=[output_video, status_output],
	    )

	if __name__ == "__main__":
	    demo.launch(share=True)