import spaces
import torch
from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
from diffusers.models.transformers.transformer_wan import WanTransformer3DModel
from diffusers.utils.export_utils import export_to_video
import gradio as gr
import tempfile
import numpy as np
from PIL import Image
import random
import gc
from torchao.quantization import quantize_
from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
from torchao.quantization import Int8WeightOnlyConfig
import aoti
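# Model and sampling constants: output dimensions are clamped to multiples of 16
# in the 480-832 px range, and clips are rendered at a fixed 16 fps.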
MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
MAX_DIM = 832
MIN_DIM = 480
SQUARE_DIM = 640
MULTIPLE_OF = 16
MAX_SEED = np.iinfo(np.int32).max
FIXED_FPS = 16
MIN_FRAMES_MODEL = 8
MAX_FRAMES_MODEL = 640
MIN_DURATION = round(MIN_FRAMES_MODEL/FIXED_FPS,1)
MAX_DURATION = round(MAX_FRAMES_MODEL/FIXED_FPS,1)
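# Load the Wan 2.2 image-to-video pipeline with both transformer stages in bf16 on the GPU.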
pipe = WanImageToVideoPipeline.from_pretrained(
    MODEL_ID,
    transformer=WanTransformer3DModel.from_pretrained(
        'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
        subfolder='transformer',
        torch_dtype=torch.bfloat16,
        device_map='cuda',
    ),
    transformer_2=WanTransformer3DModel.from_pretrained(
        'cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
        subfolder='transformer_2',
        torch_dtype=torch.bfloat16,
        device_map='cuda',
    ),
    torch_dtype=torch.bfloat16,
).to('cuda')
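# Load the Lightx2v CFG-step-distillation LoRA into both transformer stages, fuse it
# (scale 3.0 on `transformer`, 1.0 on `transformer_2`), then release the LoRA weights.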
pipe.load_lora_weights(
    "Kijai/WanVideo_comfy",
    weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
    adapter_name="lightx2v",
)
kwargs_lora = {"load_into_transformer_2": True}
pipe.load_lora_weights(
    "Kijai/WanVideo_comfy",
    weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
    adapter_name="lightx2v_2",
    **kwargs_lora,
)
pipe.set_adapters(["lightx2v", "lightx2v_2"], adapter_weights=[1., 1.])
pipe.fuse_lora(adapter_names=["lightx2v"], lora_scale=3., components=["transformer"])
pipe.fuse_lora(adapter_names=["lightx2v_2"], lora_scale=1., components=["transformer_2"])
pipe.unload_lora_weights()
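# Quantize with torchao: int8 weight-only for the text encoder, fp8 dynamic
# activation / fp8 weight for both transformer stages.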
quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
quantize_(pipe.transformer, Float8DynamicActivationFloat8WeightConfig())
quantize_(pipe.transformer_2, Float8DynamicActivationFloat8WeightConfig())
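# Load ahead-of-time compiled transformer blocks (fp8 dynamic-activation variant)
# via the `aoti` helper.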
aoti.aoti_blocks_load(pipe.transformer, 'zerogpu-aoti/Wan2', variant='fp8da')
aoti.aoti_blocks_load(pipe.transformer_2, 'zerogpu-aoti/Wan2', variant='fp8da')
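# Default prompt and negative prompt pre-filled in the UI, tuned toward identity
# stability and photorealistic, temporally coherent output.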
default_prompt_i2v = "ultra realistic cinematic footage, perfectly preserved facial identity and body structure across all frames, stable anatomy and consistent body proportions, realistic muscle definition, natural motion flow and breathing dynamics, seamless motion continuity, photorealistic clothing preservation with accurate fabric movement and lighting response, consistent outfit color and texture, high-fidelity skin tone and texture stability, lifelike eye reflections and natural gaze consistency, cinematic lighting with soft volumetric shadows, professional film-grade color grading, HDR tone mapping with true-to-life contrast and depth, realistic ambient and subsurface light behavior, physically accurate reflections and highlights, detailed cinematic background with depth of field and natural bokeh, smooth camera movement with film-level motion fluidity, 35mm film aesthetic, ultra-detailed textures, consistent and coherent composition, perfect balance between depth, light, and motion for a truly photorealistic cinematic atmosphere, temporal coherence, identity consistency, no facial drift, no texture flickering, no color shifting."
default_negative_prompt = "low quality, low resolution, low contrast, poor lighting, underexposed, overexposed, bad composition, bad framing, bad perspective, flat lighting, washed out colors, jpeg artifacts, noise, static, grain, compression artifacts, flickering, stutter, shaky camera, inconsistent motion, poor transition, broken motion, unnatural interpolation, out of focus, blurry, motion blur, ghosting, double exposure, distorted face, consistent face, changing face, warped face, face drift, identity shift, face inconsistency, natural facial expression, mutated body, deformed limbs, extra fingers, fused fingers, missing fingers, bad anatomy, unrealistic proportions, twisted pose, asymmetrical body, unappealing, uncanny, artificial face, waxy skin, plastic look, text, watermark, logo, signature, frame border, cropped edges, tiling, duplicate, repeated pattern, cartoon, anime, illustration, 3d render, painting, drawing, oversharpened, low detail, artificial texture, poor skin texture, over-smoothed, fake skin, flat skin, color banding, saturation, chromatic aberration, unrealistic shadows, inconsistent lighting, frozen frame, poor depth, lack of realism, fake reflection, artifacted highlights, bloom artifacts, bad transition, broken frame, visual glitch, bad synchronization, oversaturated colors, contrast issues, unbalanced composition, lack of cinematic tone, flat motion, jitter, warped geometry, background distortion, identity mismatch, morphing, inconsistent hair, inconsistent body shape"
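# Resize (and center-crop when needed) the input image so both sides are multiples
# of 16 within the supported 480-832 px range, preserving aspect ratio when possible.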
def resize_image(image: Image.Image) -> Image.Image:
    width, height = image.size

    if width == height:
        return image.resize((SQUARE_DIM, SQUARE_DIM), Image.LANCZOS)

    aspect_ratio = width / height
    MAX_ASPECT_RATIO = MAX_DIM / MIN_DIM
    MIN_ASPECT_RATIO = MIN_DIM / MAX_DIM

    image_to_resize = image
    if aspect_ratio > MAX_ASPECT_RATIO:
        target_w, target_h = MAX_DIM, MIN_DIM
        crop_width = int(round(height * MAX_ASPECT_RATIO))
        left = (width - crop_width) // 2
        image_to_resize = image.crop((left, 0, left + crop_width, height))
    elif aspect_ratio < MIN_ASPECT_RATIO:
        target_w, target_h = MIN_DIM, MAX_DIM
        crop_height = int(round(width / MIN_ASPECT_RATIO))
        top = (height - crop_height) // 2
        image_to_resize = image.crop((0, top, width, top + crop_height))
    else:
        if width > height:
            target_w = MAX_DIM
            target_h = int(round(target_w / aspect_ratio))
        else:
            target_h = MAX_DIM
            target_w = int(round(target_h * aspect_ratio))

    final_w = round(target_w / MULTIPLE_OF) * MULTIPLE_OF
    final_h = round(target_h / MULTIPLE_OF) * MULTIPLE_OF
    final_w = max(MIN_DIM, min(MAX_DIM, final_w))
    final_h = max(MIN_DIM, min(MAX_DIM, final_h))
    return image_to_resize.resize((final_w, final_h), Image.LANCZOS)
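# Convert a duration in seconds to a frame count clamped to the model's supported range.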
def get_num_frames(duration_seconds: float):
    return 1 + int(np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL))
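# Estimate the GPU time budget (seconds) from resolution, frame count, and step count;
# passed as the dynamic `duration` of the @spaces.GPU decorator below.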
def get_duration(input_image, prompt, steps, negative_prompt, duration_seconds,
                 guidance_scale, guidance_scale_2, seed, randomize_seed, progress):
    BASE_FRAMES_HEIGHT_WIDTH = 81 * 832 * 624
    BASE_STEP_DURATION = 15
    width, height = resize_image(input_image).size
    frames = get_num_frames(duration_seconds)
    factor = frames * width * height / BASE_FRAMES_HEIGHT_WIDTH
    step_duration = BASE_STEP_DURATION * factor ** 1.5
    return 10 + int(steps) * step_duration
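# Main generation entry point: validates the input, resizes the image, runs the
# two-stage pipeline, and writes the resulting frames to a temporary .mp4 file.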
@spaces.GPU(duration=get_duration)
def generate_video(input_image, prompt, steps=4, negative_prompt=default_negative_prompt,
                   duration_seconds=MAX_DURATION, guidance_scale=1, guidance_scale_2=1,
                   seed=42, randomize_seed=False, progress=gr.Progress(track_tqdm=True)):
    if input_image is None:
        raise gr.Error("Please upload an input image.")

    num_frames = get_num_frames(duration_seconds)
    current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
    resized_image = resize_image(input_image)

    output_frames_list = pipe(
        image=resized_image,
        prompt=prompt,
        negative_prompt=negative_prompt,
        height=resized_image.height,
        width=resized_image.width,
        num_frames=num_frames,
        guidance_scale=float(guidance_scale),
        guidance_scale_2=float(guidance_scale_2),
        num_inference_steps=int(steps),
        generator=torch.Generator(device="cuda").manual_seed(current_seed),
    ).frames[0]

    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
        video_path = tmpfile.name
    export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
    return video_path, current_seed
# ================================
# 💎 UI layout with the marketing message
# ================================
with gr.Blocks() as demo:
    gr.Markdown("🚀 **Over 1,000 runs in the first 48 hours — thank you for your amazing feedback!** \nTry the latest version below 👇")
    gr.Markdown("# Fast 4 steps Wan 2.2 I2V (14B) with Lightning LoRA")
    gr.Markdown("Run Wan 2.2 in just 4-8 steps, with [Lightning LoRA](https://huggingface.co/Kijai/WanVideo_comfy/tree/main/Wan22-Lightning), fp8 quantization & AoT compilation — compatible with 🧨 diffusers and ZeroGPU⚡️")
    with gr.Row():
        with gr.Column():
            input_image_component = gr.Image(type="pil", label="Input Image")
            prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
            duration_seconds_input = gr.Slider(minimum=MIN_DURATION, maximum=MAX_DURATION, step=0.1, value=3.5,
                                               label="Duration (seconds)",
                                               info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.")
            with gr.Accordion("Advanced Settings", open=False):
                negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
                seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
                randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
                steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=6, label="Inference Steps")
                guidance_scale_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="Guidance Scale - high noise stage")
                guidance_scale_2_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="Guidance Scale 2 - low noise stage")
            generate_button = gr.Button("Generate Video", variant="primary")
        with gr.Column():
            video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)

    ui_inputs = [input_image_component, prompt_input, steps_slider,
                 negative_prompt_input, duration_seconds_input,
                 guidance_scale_input, guidance_scale_2_input,
                 seed_input, randomize_seed_checkbox]
    generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])
if __name__ == "__main__":
    demo.queue().launch(mcp_server=True)