dream2589632147 committed on
Commit 0cfee79 · verified · 1 Parent(s): 447cc0e

Update app.py

Files changed (1)
  1. app.py +48 -202
app.py CHANGED
@@ -1,208 +1,54 @@
- import spaces
- import torch
- from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
- from diffusers.models.transformers.transformer_wan import WanTransformer3DModel
- from diffusers.utils.export_utils import export_to_video
  import gradio as gr
  import tempfile
  import numpy as np
- from PIL import Image
- import random
- import gc
-
- from torchao.quantization import quantize_
- from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
- from torchao.quantization import Int8WeightOnlyConfig
-
- import aoti
-
-
- MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
-
- MAX_DIM = 832
- MIN_DIM = 480
- SQUARE_DIM = 640
- MULTIPLE_OF = 16
-
- MAX_SEED = np.iinfo(np.int32).max
-
- FIXED_FPS = 16
- MIN_FRAMES_MODEL = 8
- MAX_FRAMES_MODEL = 720
-
- MIN_DURATION = round(MIN_FRAMES_MODEL/FIXED_FPS,1)
- MAX_DURATION = round(MAX_FRAMES_MODEL/FIXED_FPS,1)
-
-
- pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID,
-     transformer=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
-         subfolder='transformer',
-         torch_dtype=torch.bfloat16,
-         device_map='cuda',
-     ),
-     transformer_2=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
-         subfolder='transformer_2',
-         torch_dtype=torch.bfloat16,
-         device_map='cuda',
-     ),
-     torch_dtype=torch.bfloat16,
- ).to('cuda')
-
- pipe.load_lora_weights(
-     "Kijai/WanVideo_comfy",
-     weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
-     adapter_name="lightx2v"
  )
- kwargs_lora = {}
- kwargs_lora["load_into_transformer_2"] = True
- pipe.load_lora_weights(
-     "Kijai/WanVideo_comfy",
-     weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
-     adapter_name="lightx2v_2", **kwargs_lora
- )
- pipe.set_adapters(["lightx2v", "lightx2v_2"], adapter_weights=[1., 1.])
- pipe.fuse_lora(adapter_names=["lightx2v"], lora_scale=3., components=["transformer"])
- pipe.fuse_lora(adapter_names=["lightx2v_2"], lora_scale=1., components=["transformer_2"])
- pipe.unload_lora_weights()
-
- quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
- quantize_(pipe.transformer, Float8DynamicActivationFloat8WeightConfig())
- quantize_(pipe.transformer_2, Float8DynamicActivationFloat8WeightConfig())
-
- aoti.aoti_blocks_load(pipe.transformer, 'zerogpu-aoti/Wan2', variant='fp8da')
- aoti.aoti_blocks_load(pipe.transformer_2, 'zerogpu-aoti/Wan2', variant='fp8da')
-
-
- default_prompt_i2v = "ultra realistic cinematic footage, perfectly preserved facial identity and body structure across all frames, stable anatomy and consistent body proportions, realistic muscle definition, natural motion flow and breathing dynamics, seamless motion continuity, photorealistic clothing preservation with accurate fabric movement and lighting response, consistent outfit color and texture, high-fidelity skin tone and texture stability, lifelike eye reflections and natural gaze consistency, cinematic lighting with soft volumetric shadows, professional film-grade color grading, HDR tone mapping with true-to-life contrast and depth, realistic ambient and subsurface light behavior, physically accurate reflections and highlights, detailed cinematic background with depth of field and natural bokeh, smooth camera movement with film-level motion fluidity, 35mm film aesthetic, ultra-detailed textures, consistent and coherent composition, perfect balance between depth, light, and motion for a truly photorealistic cinematic atmosphere, temporal coherence, identity consistency, no facial drift, no texture flickering, no color shifting."
- default_negative_prompt = "low quality, low resolution, low contrast, poor lighting, underexposed, overexposed, bad composition, bad framing, bad perspective, flat lighting, washed out colors, jpeg artifacts, noise, static, grain, compression artifacts, flickering, stutter, shaky camera, inconsistent motion, poor transition, broken motion, unnatural interpolation, out of focus, blurry, motion blur, ghosting, double exposure, distorted face, consistent face, changing face, warped face, face drift, identity shift, face inconsistency, natural facial expression, mutated body, deformed limbs, extra fingers, fused fingers, missing fingers, bad anatomy, unrealistic proportions, twisted pose, asymmetrical body, unappealing, uncanny, artificial face, waxy skin, plastic look, text, watermark, logo, signature, frame border, cropped edges, tiling, duplicate, repeated pattern, cartoon, anime, illustration, 3d render, painting, drawing, oversharpened, low detail, artificial texture, poor skin texture, over-smoothed, fake skin, flat skin, color banding, saturation, chromatic aberration, unrealistic shadows, inconsistent lighting, frozen frame, poor depth, lack of realism, fake reflection, artifacted highlights, bloom artifacts, bad transition, broken frame, visual glitch, bad synchronization, oversaturated colors, contrast issues, unbalanced composition, lack of cinematic tone, flat motion, jitter, warped geometry, background distortion, identity mismatch, morphing, inconsistent hair, inconsistent body shape"
-
-
- def resize_image(image: Image.Image) -> Image.Image:
-     width, height = image.size
-     if width == height:
-         return image.resize((SQUARE_DIM, SQUARE_DIM), Image.LANCZOS)
-     aspect_ratio = width / height
-     MAX_ASPECT_RATIO = MAX_DIM / MIN_DIM
-     MIN_ASPECT_RATIO = MIN_DIM / MAX_DIM
-     image_to_resize = image
-     if aspect_ratio > MAX_ASPECT_RATIO:
-         target_w, target_h = MAX_DIM, MIN_DIM
-         crop_width = int(round(height * MAX_ASPECT_RATIO))
-         left = (width - crop_width) // 2
-         image_to_resize = image.crop((left, 0, left + crop_width, height))
-     elif aspect_ratio < MIN_ASPECT_RATIO:
-         target_w, target_h = MIN_DIM, MAX_DIM
-         crop_height = int(round(width / MIN_ASPECT_RATIO))
-         top = (height - crop_height) // 2
-         image_to_resize = image.crop((0, top, width, top + crop_height))
-     else:
-         if width > height:
-             target_w = MAX_DIM
-             target_h = int(round(target_w / aspect_ratio))
-         else:
-             target_h = MAX_DIM
-             target_w = int(round(target_h * aspect_ratio))
-     final_w = round(target_w / MULTIPLE_OF) * MULTIPLE_OF
-     final_h = round(target_h / MULTIPLE_OF) * MULTIPLE_OF
-     final_w = max(MIN_DIM, min(MAX_DIM, final_w))
-     final_h = max(MIN_DIM, min(MAX_DIM, final_h))
-     return image_to_resize.resize((final_w, final_h), Image.LANCZOS)
-
-
- def get_num_frames(duration_seconds: float):
-     return 1 + int(np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL))
-
-
- def get_duration(input_image, prompt, steps, negative_prompt, duration_seconds, guidance_scale, guidance_scale_2, seed, randomize_seed, progress):
-     BASE_FRAMES_HEIGHT_WIDTH = 81 * 832 * 624
-     BASE_STEP_DURATION = 15
-     width, height = resize_image(input_image).size
-     frames = get_num_frames(duration_seconds)
-     factor = frames * width * height / BASE_FRAMES_HEIGHT_WIDTH
-     step_duration = BASE_STEP_DURATION * factor ** 1.5
-     return 10 + int(steps) * step_duration
-
-
- @spaces.GPU(duration=get_duration)
- def generate_video(input_image, prompt, steps=4, negative_prompt=default_negative_prompt,
-                    duration_seconds=MAX_DURATION, guidance_scale=1, guidance_scale_2=1,
-                    seed=42, randomize_seed=False, progress=gr.Progress(track_tqdm=True)):
-
-     if input_image is None:
-         raise gr.Error("Please upload an input image.")
-
-     num_frames = get_num_frames(duration_seconds)
-     current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
-     resized_image = resize_image(input_image)
-
-     output_frames_list = pipe(
-         image=resized_image,
-         prompt=prompt,
-         negative_prompt=negative_prompt,
-         height=resized_image.height,
-         width=resized_image.width,
-         num_frames=num_frames,
-         guidance_scale=float(guidance_scale),
-         guidance_scale_2=float(guidance_scale_2),
-         num_inference_steps=int(steps),
-         generator=torch.Generator(device="cuda").manual_seed(current_seed),
-     ).frames[0]
-
-     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
-         video_path = tmpfile.name
-
-     export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
-     return video_path, current_seed
-
-
- # ================================
- # 💎 UI changes with the marketing message
- # ================================
-
- with gr.Blocks(theme="gradio/soft") as demo:
-     gr.Markdown("""
- # 🎬 **Dream-wan2-2-faster-Pro**
- ### ⚡ Ultra-Fast, Realistic Image-to-Video Generator
- ---
- 🚀 **Over 32,000 visits and growing fast — ranked Top 3 in video generation!**
- 🌐 Powered by `dream2589632147/Dream-wan2-2-faster-Pro`
-
- **What's new:**
- - ✅ Optimized memory & faster generation (up to 70% improvement)
- - 🎥 Max video length: 45s
- - 💡 Works with CPU or GPU seamlessly
- - 🧠 Enhanced detail consistency between frames
-
- 🔗 *Try it below and share your creations on Reddit or Hugging Face!*
- """)
-     gr.Markdown("# Fast 4 steps Wan 2.2 I2V (14B) with Lightning LoRA")
-     gr.Markdown("Run Wan 2.2 in just 4-8 steps, with [Lightning LoRA](https://huggingface.co/Kijai/WanVideo_comfy/tree/main/Wan22-Lightning), fp8 quantization & AoT compilation — compatible with 🧨 diffusers and ZeroGPU⚡️")
-
-     with gr.Row():
-         with gr.Column():
-             input_image_component = gr.Image(type="pil", label="Input Image")
-             prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
-             duration_seconds_input = gr.Slider(minimum=MIN_DURATION, maximum=MAX_DURATION, step=0.1, value=3.5,
-                                                label="Duration (seconds)",
-                                                info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.")
-             with gr.Accordion("Advanced Settings", open=False):
-                 negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
-                 seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
-                 randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
-                 steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=6, label="Inference Steps")
-                 guidance_scale_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="Guidance Scale - high noise stage")
-                 guidance_scale_2_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="Guidance Scale 2 - low noise stage")
-             generate_button = gr.Button("Generate Video", variant="primary")
-         with gr.Column():
-             video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
-
-     ui_inputs = [input_image_component, prompt_input, steps_slider,
-                  negative_prompt_input, duration_seconds_input,
-                  guidance_scale_input, guidance_scale_2_input,
-                  seed_input, randomize_seed_checkbox]
-
-     generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])
-

  if __name__ == "__main__":
-     demo.queue().launch(mcp_server=True)
 
 
 
 
 
 
  import gradio as gr
+ import torch
  import tempfile
+ import os
  import numpy as np
+ import moviepy.editor as mp
+ from diffusers import DiffusionPipeline
+ from audiocraft.models import AudioGen
+ from audiocraft.data.audio import audio_write
+
+ # Force CPU for ZeroGPU
+ device = "cpu"
+
+ # Load models
+ video_model = DiffusionPipeline.from_pretrained(
+     "dream2589632147/Dream-wan2-2-faster", torch_dtype=torch.float32
+ ).to(device)
+
+ audio_model = AudioGen.get_pretrained("facebook/audiogen-medium", device=device)
+
+ def generate_video_with_audio(image, prompt):
+     # Step 1: Generate video frames
+     with tempfile.TemporaryDirectory() as tmpdir:
+         # .frames is batched; take the first clip and convert its frames to arrays for moviepy
+         video_frames = video_model(image=image, prompt=prompt, num_frames=16).frames[0]
+         video_frames = [np.array(frame) for frame in video_frames]
+         video_path = os.path.join(tmpdir, "output.mp4")
+         mp.ImageSequenceClip(video_frames, fps=16).write_videofile(video_path, codec="libx264", audio=False, verbose=False, logger=None)
+
+         # Step 2: Generate sound from prompt (AudioGen) and write it to disk
+         wav = audio_model.generate([prompt])[0].cpu()
+         audio_write(os.path.join(tmpdir, "sound"), wav, audio_model.sample_rate, strategy="loudness")
+         wav_path = os.path.join(tmpdir, "sound.wav")
+
+         # Step 3: Merge video + audio
+         video_clip = mp.VideoFileClip(video_path)
+         audio_clip = mp.AudioFileClip(wav_path)
+         final = video_clip.set_audio(audio_clip)
+         # Write the result outside tmpdir so it still exists after the directory is cleaned up
+         with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
+             output_path = tmpfile.name
+         final.write_videofile(output_path, codec="libx264", audio_codec="aac", verbose=False, logger=None)
+
+     return output_path
+
+ # Gradio UI
+ demo = gr.Interface(
+     fn=generate_video_with_audio,
+     inputs=[
+         gr.Image(type="pil", label="Upload Image"),
+         gr.Textbox(label="Prompt (e.g. ocean waves hitting rocks at sunset)")
+     ],
+     outputs=gr.Video(label="Generated Video with Sound"),
+     title="Wan2.2 Video Generator with Audio",
+     description="Generates a short video from an image and text prompt, with natural sound using AudioGen."
  )

  if __name__ == "__main__":
+     demo.launch()
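
For quick local testing outside the Space UI, the committed function can also be driven directly. The sketch below is illustrative only: it assumes the new app.py is importable from the working directory, that both model repos download successfully, and that "example.jpg" (a hypothetical path) exists; note that importing app loads both models.

# Minimal local driver (sketch): exercises generate_video_with_audio without Gradio.
from PIL import Image
import app  # the file committed above; importing it loads the video and audio models

if __name__ == "__main__":
    frame_source = Image.open("example.jpg")  # hypothetical input image
    result_path = app.generate_video_with_audio(frame_source, "ocean waves hitting rocks at sunset")
    print("Video with audio written to:", result_path)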