matthewkram committed
Commit c6011cd · verified · 1 Parent(s): e93faf1

Update app.py

Files changed (1)
  1. app.py +174 -83
app.py CHANGED
@@ -1,104 +1,195 @@
  import os
  import sys
  import time
  import torch
  import numpy as np
  import tempfile
- from PIL import Image
- from datetime import datetime
- import gradio as gr
- from torch import autocast
- from pytorch_lightning import seed_everything
- import torchvision.transforms as T
- from diffusers import StableVideoDiffusionPipeline
- from diffusers.utils import load_image, export_to_video

- class WorldAnimate:
      def __init__(self):
-         model_id = "stabilityai/stable-video-diffusion-img2vid-xt"
          self.pipe = StableVideoDiffusionPipeline.from_pretrained(
-             model_id, torch_dtype=torch.float16, variant="fp16"
          )
-         self.pipe.enable_model_cpu_offload()
-         self.pipe.enable_vae_slicing()
-         self.pipe.unet.enable_forward_chunking(chunk_size=1, dim=1)
-         self.pipe.to("cuda" if torch.cuda.is_available() else "cpu")
-         torch.backends.cuda.matmul.allow_tf32 = True
-
-     def process_input(self, image, seed, num_frames, fps, decode_chunk_size, motion_bucket_id, noise_aug_strength):
-         if seed == -1:
-             seed = int.from_bytes(os.urandom(2), "big")
-         seed_everything(seed)
-
-         if isinstance(image, str):
-             image = load_image(image)
-         image = image.resize((1024, 576))
-
-         generator = torch.manual_seed(seed)
-         frames = self.pipe(
-             image,
-             num_frames=num_frames,
-             fps=fps,
-             decode_chunk_size=decode_chunk_size,
-             motion_bucket_id=motion_bucket_id,
-             noise_aug_strength=noise_aug_strength,
-             generator=generator,
-         ).frames[0]
-
-         return frames
-
- def app():
-     with gr.Blocks(title="World 2.2 Animate (Local No API)") as demo:
          gr.HTML("""
-             <h1 style="text-align: center; font-family: Arial; color: white;">World 2.2 Animate</h1>
-             <p style="text-align: center; font-family: Arial; color: white;">
-             This is a local processing app for image-to-video conversion using Stable Video Diffusion.<br>
-             Upload an image, adjust parameters, and generate a video with smooth motion.<br>
-             Parameters:<br>
-             - Seed: Random seed for reproducibility (-1 for random).<br>
-             - Num Frames: Number of frames in the video (default 25).<br>
-             - FPS: Frames per second (default 7).<br>
-             - Decode Chunk Size: For memory optimization (default 8).<br>
-             - Motion Bucket ID: Controls motion amount (1-255, default 127).<br>
-             - Noise Aug Strength: Adds noise for variation (0-1, default 0.02).
-             </p>
          """)

          with gr.Row():
-             with gr.Column():
-                 input_image = gr.Image(label="Upload Image", type="pil")
-                 seed = gr.Number(label="Seed", value=-1)
-                 num_frames = gr.Slider(label="Num Frames", minimum=1, maximum=25, value=25, step=1)
-                 fps = gr.Slider(label="FPS", minimum=1, maximum=30, value=7, step=1)
-                 decode_chunk_size = gr.Slider(label="Decode Chunk Size", minimum=1, maximum=16, value=8, step=1)
-                 motion_bucket_id = gr.Slider(label="Motion Bucket ID", minimum=1, maximum=255, value=127, step=1)
-                 noise_aug_strength = gr.Slider(label="Noise Aug Strength", minimum=0.0, maximum=1.0, value=0.02, step=0.01)
-                 generate_btn = gr.Button(value="Generate Video")

-             with gr.Column():
-                 output_video = gr.Video(label="Generated Video")
-                 status = gr.Textbox(label="Status")
-
-         generate_btn.click(
-             fn=process,
-             inputs=[input_image, seed, num_frames, fps, decode_chunk_size, motion_bucket_id, noise_aug_strength],
-             outputs=[output_video, status]
-         )

-     return demo  # Important: return demo!

- def process(image, seed, num_frames, fps, decode_chunk_size, motion_bucket_id, noise_aug_strength):
-     try:
-         animator = WorldAnimate()
-         frames = animator.process_input(image, seed, num_frames, fps, decode_chunk_size, motion_bucket_id, noise_aug_strength)
-         with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as temp_video:
-             export_to_video(frames, temp_video.name, fps=fps)
-         return temp_video.name, "Success!"
-     except Exception as e:
-         return None, f"Failed: {str(e)}"

- def start_app():
-     app().launch()

  if __name__ == "__main__":
      start_app()
 
  import os
  import sys
+ import uuid
+ import shutil
  import time
+ import gradio as gr
  import torch
+ from diffusers import StableVideoDiffusionPipeline
+ from PIL import Image
  import numpy as np
+ import cv2
+ import subprocess
  import tempfile

+ class WanAnimateApp:
      def __init__(self):
+         model_name = "stabilityai/stable-video-diffusion-img2vid-xt"
          self.pipe = StableVideoDiffusionPipeline.from_pretrained(
+             model_name,
+             torch_dtype=torch.float16 if torch.cuda.is_available() else torch.float32,
+             variant="fp16",
+             device_map="cpu"
          )
+
+     def predict(
+         self,
+         ref_img,
+         video,
+         model_id,
+         model,
+     ):
+         if ref_img is None or video is None:
+             return None, "Upload both image and video."
+
+         try:
+             # Local processing — PIL for image (no open for type="pil")
+             if isinstance(ref_img, Image.Image):
+                 ref_image = ref_img.convert("RGB").resize((576, 320))
+             else:
+                 ref_image = Image.open(ref_img).convert("RGB").resize((576, 320))
+
+             cap = cv2.VideoCapture(video)
+             frame_count = int(cap.get(cv2.CAP_PROP_FRAME_COUNT))
+             cap.release()
+             motion_hint = f" with dynamic motion from {frame_count} frames"
+
+             # Prompt based on mode
+             if model_id == "wan2.2-animate-move":
+                 prompt = f"Animate the character in the reference image{motion_hint}, high quality, smooth movements."
+             else:
+                 prompt = f"Replace the character in the video with the reference image{motion_hint}, seamless, detailed."
+
+             # Parameters
+             num_frames = 25 if model == "wan-pro" else 14
+             num_steps = 25 if model == "wan-pro" else 15
+
+             # Local generation
+             generator = torch.Generator(device="cpu").manual_seed(42)
+             output = self.pipe(
+                 ref_image,
+                 num_inference_steps=num_steps,
+                 num_frames=num_frames,
+                 generator=generator,
+                 decode_chunk_size=2
+             ).frames[0]
+
+             # Save MP4 with ffmpeg
+             temp_dir = tempfile.mkdtemp()
+             for i, frame in enumerate(output):
+                 frame.save(f"{temp_dir}/frame_{i:04d}.png")
+             temp_video = f"/tmp/output_{uuid.uuid4()}.mp4"
+             subprocess.run([
+                 'ffmpeg', '-y', '-framerate', '7', '-i', f"{temp_dir}/frame_%04d.png",
+                 '-c:v', 'libx264', '-pix_fmt', 'yuv420p', temp_video
+             ], check=True)
+             shutil.rmtree(temp_dir)
+
+             return temp_video, "SUCCEEDED"
+
+         except Exception as e:
+             return None, f"Failed: {str(e)}"
+
+ def start_app():
+     app = WanAnimateApp()
+
+     with gr.Blocks(title="Wan2.2-Animate (Local No API)") as demo:
          gr.HTML("""
+             <div style="padding: 2rem; text-align: center; max-width: 1200px; margin: 0 auto; font-family: Arial, sans-serif;">
+                 <h1 style="font-size: 2.5rem; font-weight: bold; margin-bottom: 0.5rem; color: #333;">
+                     Wan2.2-Animate: Unified Character Animation and Replacement with Holistic Replication
+                 </h1>
+                 <h3 style="font-size: 1.5rem; font-weight: bold; margin-bottom: 0.5rem; color: #333;">
+                     Local version without API (SVD Proxy)
+                 </h3>
+                 <div style="font-size: 1.25rem; margin-bottom: 1.5rem; color: #555;">
+                     Tongyi Lab, Alibaba
+                 </div>
+                 <div style="display: flex; flex-wrap: wrap; justify-content: center; gap: 1rem; margin-bottom: 1.5rem;">
+                     <a href="https://arxiv.org/abs/2509.14055" target="_blank" style="display: inline-flex; align-items: center; padding: 0.5rem 1rem; background-color: #f0f0f0; color: #333; text-decoration: none; border-radius: 9999px; font-weight: 500;">
+                         <span style="margin-right: 0.5rem;">📄</span>Paper
+                     </a>
+                     <a href="https://github.com/Wan-Video/Wan2.2" target="_blank" style="display: inline-flex; align-items: center; padding: 0.5rem 1rem; background-color: #f0f0f0; color: #333; text-decoration: none; border-radius: 9999px; font-weight: 500;">
+                         <span style="margin-right: 0.5rem;">💻</span>GitHub
+                     </a>
+                     <a href="https://huggingface.co/Wan-AI/Wan2.2-Animate-14B" target="_blank" style="display: inline-flex; align-items: center; padding: 0.5rem 1rem; background-color: #f0f0f0; color: #333; text-decoration: none; border-radius: 9999px; font-weight: 500;">
+                         <span style="margin-right: 0.5rem;">🤗</span>HF Model
+                     </a>
+                 </div>
+             </div>
+         """)
+
+         gr.HTML("""
+             <details>
+             <summary>‼️Usage</summary>
+             Wan-Animate supports two modes:
+             <ul>
+                 <li>Move Mode: use the movements extracted from the input video to drive the character in the input image</li>
+                 <li>Mix Mode: use the character in the input image to replace the character in the input video</li>
+             </ul>
+             Currently, the following restrictions apply to inputs:
+             <ul>
+                 <li>Video file size: less than 200 MB</li>
+                 <li>Video resolution: the shorter side must be greater than 200, and the longer side must be less than 2048</li>
+                 <li>Video duration: 2s to 30s</li>
+                 <li>Video aspect ratio: 1:3 to 3:1</li>
+                 <li>Video formats: mp4, avi, mov</li>
+                 <li>Image file size: less than 5 MB</li>
+                 <li>Image resolution: the shorter side must be greater than 200, and the longer side must be less than 4096</li>
+                 <li>Image formats: jpg, png, jpeg, webp, bmp</li>
+             </ul>
+             Currently, inference quality has two variants. You can use our open-source code for more flexible configuration.
+             <ul>
+                 <li>wan-pro: 25 fps, 720p</li>
+                 <li>wan-std: 15 fps, 720p</li>
+             </ul>
+             </details>
          """)

          with gr.Row():
+             with gr.Column():
+                 ref_img = gr.Image(
+                     label="Reference Image (изображение)",
+                     type="pil",
+                     sources=["upload"],
+                 )
+
+                 video = gr.Video(
+                     label="Template Video (шаблонное видео)",
+                     sources=["upload"],
+                 )
+
+                 with gr.Row():
+                     model_id = gr.Dropdown(
+                         label="Mode (режим)",
+                         choices=["wan2.2-animate-move", "wan2.2-animate-mix"],
+                         value="wan2.2-animate-move",
+                         info=""
+                     )

+                     model = gr.Dropdown(
+                         label="Inference Quality (качество)",
+                         choices=["wan-pro", "wan-std"],
+                         value="wan-pro",
+                     )

+                 run_button = gr.Button("Generate Video (генерировать)")

+             with gr.Column():
+                 output_video = gr.Video(label="Output Video (результат)")
+                 output_status = gr.Textbox(label="Status (статус)")
+
+         run_button.click(
+             fn=app.predict,
+             inputs=[
+                 ref_img,
+                 video,
+                 model_id,
+                 model,
+             ],
+             outputs=[output_video, output_status],
+         )

+     demo.queue(default_concurrency_limit=1)
+     demo.launch(
+         server_name="0.0.0.0",
+         server_port=7860
+     )

  if __name__ == "__main__":
      start_app()
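
Note on the pipeline call: StableVideoDiffusionPipeline is image-conditioned only, so the prompt string built in the new predict() is never passed to the model; motion is steered through parameters such as motion_bucket_id and noise_aug_strength, which the removed UI exposed as sliders. A minimal sketch (not part of this commit; the input path and concrete values are illustrative) of the same call outside Gradio:

# Sketch only: the SVD parameters the old UI exposed, used directly with diffusers.
import torch
from diffusers import StableVideoDiffusionPipeline
from diffusers.utils import load_image, export_to_video

pipe = StableVideoDiffusionPipeline.from_pretrained(
    "stabilityai/stable-video-diffusion-img2vid-xt",
    torch_dtype=torch.float16,
    variant="fp16",
)
pipe.enable_model_cpu_offload()  # offload submodules to CPU between steps to save VRAM

# SVD-XT was trained around 1024x576 (the removed code's resize); the new code resizes
# to 576x320, presumably to reduce memory use.
image = load_image("input.png").resize((1024, 576))

generator = torch.manual_seed(42)  # Seed
frames = pipe(
    image,
    num_frames=25,            # Num Frames
    fps=7,                    # FPS conditioning signal
    decode_chunk_size=8,      # Decode Chunk Size: lower uses less VRAM during decoding
    motion_bucket_id=127,     # Motion Bucket ID: 1-255, higher means more motion
    noise_aug_strength=0.02,  # Noise Aug Strength: conditioning noise for variation
    generator=generator,
).frames[0]

export_to_video(frames, "output.mp4", fps=7)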
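The new predict() saves every frame as a PNG and shells out to ffmpeg. diffusers ships export_to_video, which the removed code already imported and used; the sketch below (the helper name frames_to_mp4 is hypothetical, and frames is assumed to be the list of PIL images returned by the pipeline) avoids the subprocess call and the manual temp-directory cleanup:

# Sketch only: write the generated frames to an MP4 with diffusers' helper.
import tempfile
from diffusers.utils import export_to_video

def frames_to_mp4(frames, fps=7):
    # Write the PIL frames to a temporary MP4 file and return its path.
    with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmp:
        export_to_video(frames, tmp.name, fps=fps)
    return tmp.name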