dream2589632147 committed on
Commit 0cfee79 · verified · 1 Parent(s): 447cc0e

Update app.py

Files changed (1)
  1. app.py +48 -202
app.py CHANGED
@@ -1,208 +1,54 @@
- import spaces
- import torch
- from diffusers.pipelines.wan.pipeline_wan_i2v import WanImageToVideoPipeline
- from diffusers.models.transformers.transformer_wan import WanTransformer3DModel
- from diffusers.utils.export_utils import export_to_video
  import gradio as gr
  import tempfile
  import numpy as np
- from PIL import Image
- import random
- import gc
-
- from torchao.quantization import quantize_
- from torchao.quantization import Float8DynamicActivationFloat8WeightConfig
- from torchao.quantization import Int8WeightOnlyConfig
-
- import aoti
-
-
- MODEL_ID = "Wan-AI/Wan2.2-I2V-A14B-Diffusers"
-
- MAX_DIM = 832
- MIN_DIM = 480
- SQUARE_DIM = 640
- MULTIPLE_OF = 16
-
- MAX_SEED = np.iinfo(np.int32).max
-
- FIXED_FPS = 16
- MIN_FRAMES_MODEL = 8
- MAX_FRAMES_MODEL = 720
-
- MIN_DURATION = round(MIN_FRAMES_MODEL/FIXED_FPS,1)
- MAX_DURATION = round(MAX_FRAMES_MODEL/FIXED_FPS,1)
-
-
- pipe = WanImageToVideoPipeline.from_pretrained(MODEL_ID,
-     transformer=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
-         subfolder='transformer',
-         torch_dtype=torch.bfloat16,
-         device_map='cuda',
-     ),
-     transformer_2=WanTransformer3DModel.from_pretrained('cbensimon/Wan2.2-I2V-A14B-bf16-Diffusers',
-         subfolder='transformer_2',
-         torch_dtype=torch.bfloat16,
-         device_map='cuda',
-     ),
-     torch_dtype=torch.bfloat16,
- ).to('cuda')
-
- pipe.load_lora_weights(
-     "Kijai/WanVideo_comfy",
-     weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
-     adapter_name="lightx2v"
  )
- kwargs_lora = {}
- kwargs_lora["load_into_transformer_2"] = True
- pipe.load_lora_weights(
-     "Kijai/WanVideo_comfy",
-     weight_name="Lightx2v/lightx2v_I2V_14B_480p_cfg_step_distill_rank128_bf16.safetensors",
-     adapter_name="lightx2v_2", **kwargs_lora
- )
- pipe.set_adapters(["lightx2v", "lightx2v_2"], adapter_weights=[1., 1.])
- pipe.fuse_lora(adapter_names=["lightx2v"], lora_scale=3., components=["transformer"])
- pipe.fuse_lora(adapter_names=["lightx2v_2"], lora_scale=1., components=["transformer_2"])
- pipe.unload_lora_weights()
-
- quantize_(pipe.text_encoder, Int8WeightOnlyConfig())
- quantize_(pipe.transformer, Float8DynamicActivationFloat8WeightConfig())
- quantize_(pipe.transformer_2, Float8DynamicActivationFloat8WeightConfig())
-
- aoti.aoti_blocks_load(pipe.transformer, 'zerogpu-aoti/Wan2', variant='fp8da')
- aoti.aoti_blocks_load(pipe.transformer_2, 'zerogpu-aoti/Wan2', variant='fp8da')
-
-
- default_prompt_i2v = "ultra realistic cinematic footage, perfectly preserved facial identity and body structure across all frames, stable anatomy and consistent body proportions, realistic muscle definition, natural motion flow and breathing dynamics, seamless motion continuity, photorealistic clothing preservation with accurate fabric movement and lighting response, consistent outfit color and texture, high-fidelity skin tone and texture stability, lifelike eye reflections and natural gaze consistency, cinematic lighting with soft volumetric shadows, professional film-grade color grading, HDR tone mapping with true-to-life contrast and depth, realistic ambient and subsurface light behavior, physically accurate reflections and highlights, detailed cinematic background with depth of field and natural bokeh, smooth camera movement with film-level motion fluidity, 35mm film aesthetic, ultra-detailed textures, consistent and coherent composition, perfect balance between depth, light, and motion for a truly photorealistic cinematic atmosphere, temporal coherence, identity consistency, no facial drift, no texture flickering, no color shifting."
- default_negative_prompt = "low quality, low resolution, low contrast, poor lighting, underexposed, overexposed, bad composition, bad framing, bad perspective, flat lighting, washed out colors, jpeg artifacts, noise, static, grain, compression artifacts, flickering, stutter, shaky camera, inconsistent motion, poor transition, broken motion, unnatural interpolation, out of focus, blurry, motion blur, ghosting, double exposure, distorted face, consistent face, changing face, warped face, face drift, identity shift, face inconsistency, natural facial expression, mutated body, deformed limbs, extra fingers, fused fingers, missing fingers, bad anatomy, unrealistic proportions, twisted pose, asymmetrical body, unappealing, uncanny, artificial face, waxy skin, plastic look, text, watermark, logo, signature, frame border, cropped edges, tiling, duplicate, repeated pattern, cartoon, anime, illustration, 3d render, painting, drawing, oversharpened, low detail, artificial texture, poor skin texture, over-smoothed, fake skin, flat skin, color banding, saturation, chromatic aberration, unrealistic shadows, inconsistent lighting, frozen frame, poor depth, lack of realism, fake reflection, artifacted highlights, bloom artifacts, bad transition, broken frame, visual glitch, bad synchronization, oversaturated colors, contrast issues, unbalanced composition, lack of cinematic tone, flat motion, jitter, warped geometry, background distortion, identity mismatch, morphing, inconsistent hair, inconsistent body shape"
-
-
- def resize_image(image: Image.Image) -> Image.Image:
-     width, height = image.size
-     if width == height:
-         return image.resize((SQUARE_DIM, SQUARE_DIM), Image.LANCZOS)
-     aspect_ratio = width / height
-     MAX_ASPECT_RATIO = MAX_DIM / MIN_DIM
-     MIN_ASPECT_RATIO = MIN_DIM / MAX_DIM
-     image_to_resize = image
-     if aspect_ratio > MAX_ASPECT_RATIO:
-         target_w, target_h = MAX_DIM, MIN_DIM
-         crop_width = int(round(height * MAX_ASPECT_RATIO))
-         left = (width - crop_width) // 2
-         image_to_resize = image.crop((left, 0, left + crop_width, height))
-     elif aspect_ratio < MIN_ASPECT_RATIO:
-         target_w, target_h = MIN_DIM, MAX_DIM
-         crop_height = int(round(width / MIN_ASPECT_RATIO))
-         top = (height - crop_height) // 2
-         image_to_resize = image.crop((0, top, width, top + crop_height))
-     else:
-         if width > height:
-             target_w = MAX_DIM
-             target_h = int(round(target_w / aspect_ratio))
-         else:
-             target_h = MAX_DIM
-             target_w = int(round(target_h * aspect_ratio))
-     final_w = round(target_w / MULTIPLE_OF) * MULTIPLE_OF
-     final_h = round(target_h / MULTIPLE_OF) * MULTIPLE_OF
-     final_w = max(MIN_DIM, min(MAX_DIM, final_w))
-     final_h = max(MIN_DIM, min(MAX_DIM, final_h))
-     return image_to_resize.resize((final_w, final_h), Image.LANCZOS)
-
-
- def get_num_frames(duration_seconds: float):
-     return 1 + int(np.clip(int(round(duration_seconds * FIXED_FPS)), MIN_FRAMES_MODEL, MAX_FRAMES_MODEL))
-
-
- def get_duration(input_image, prompt, steps, negative_prompt, duration_seconds, guidance_scale, guidance_scale_2, seed, randomize_seed, progress):
-     BASE_FRAMES_HEIGHT_WIDTH = 81 * 832 * 624
-     BASE_STEP_DURATION = 15
-     width, height = resize_image(input_image).size
-     frames = get_num_frames(duration_seconds)
-     factor = frames * width * height / BASE_FRAMES_HEIGHT_WIDTH
-     step_duration = BASE_STEP_DURATION * factor ** 1.5
-     return 10 + int(steps) * step_duration
-
-
- @spaces.GPU(duration=get_duration)
- def generate_video(input_image, prompt, steps=4, negative_prompt=default_negative_prompt,
-                    duration_seconds=MAX_DURATION, guidance_scale=1, guidance_scale_2=1,
-                    seed=42, randomize_seed=False, progress=gr.Progress(track_tqdm=True)):
-
-     if input_image is None:
-         raise gr.Error("Please upload an input image.")
-
-     num_frames = get_num_frames(duration_seconds)
-     current_seed = random.randint(0, MAX_SEED) if randomize_seed else int(seed)
-     resized_image = resize_image(input_image)
-
-     output_frames_list = pipe(
-         image=resized_image,
-         prompt=prompt,
-         negative_prompt=negative_prompt,
-         height=resized_image.height,
-         width=resized_image.width,
-         num_frames=num_frames,
-         guidance_scale=float(guidance_scale),
-         guidance_scale_2=float(guidance_scale_2),
-         num_inference_steps=int(steps),
-         generator=torch.Generator(device="cuda").manual_seed(current_seed),
-     ).frames[0]
-
-     with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
-         video_path = tmpfile.name
-
-     export_to_video(output_frames_list, video_path, fps=FIXED_FPS)
-     return video_path, current_seed
-
-
- # ================================
- # 💎 UI changes with the marketing message
- # ================================
-
- with gr.Blocks(theme="gradio/soft") as demo:
-     gr.Markdown("""
- # 🎬 **Dream-wan2-2-faster-Pro**
- ### ⚡ Ultra-Fast, Realistic Image-to-Video Generator
- ---
- 🚀 **Over 32,000 visits and growing fast — ranked Top 3 in video generation!**
- 🌐 Powered by `dream2589632147/Dream-wan2-2-faster-Pro`
-
- **What's new:**
- - ✅ Optimized memory & faster generation (up to 70% improvement)
- - 🎥 Max video length: 45s
- - 💡 Works with CPU or GPU seamlessly
- - 🧠 Enhanced detail consistency between frames
-
- 🔗 *Try it below and share your creations on Reddit or Hugging Face!*
- """)
-     gr.Markdown("# Fast 4 steps Wan 2.2 I2V (14B) with Lightning LoRA")
-     gr.Markdown("Run Wan 2.2 in just 4-8 steps, with [Lightning LoRA](https://huggingface.co/Kijai/WanVideo_comfy/tree/main/Wan22-Lightning), fp8 quantization & AoT compilation — compatible with 🧨 diffusers and ZeroGPU⚡️")
-
-     with gr.Row():
-         with gr.Column():
-             input_image_component = gr.Image(type="pil", label="Input Image")
-             prompt_input = gr.Textbox(label="Prompt", value=default_prompt_i2v)
-             duration_seconds_input = gr.Slider(minimum=MIN_DURATION, maximum=MAX_DURATION, step=0.1, value=3.5,
-                                                label="Duration (seconds)",
-                                                info=f"Clamped to model's {MIN_FRAMES_MODEL}-{MAX_FRAMES_MODEL} frames at {FIXED_FPS}fps.")
-             with gr.Accordion("Advanced Settings", open=False):
-                 negative_prompt_input = gr.Textbox(label="Negative Prompt", value=default_negative_prompt, lines=3)
-                 seed_input = gr.Slider(label="Seed", minimum=0, maximum=MAX_SEED, step=1, value=42, interactive=True)
-                 randomize_seed_checkbox = gr.Checkbox(label="Randomize seed", value=True, interactive=True)
-                 steps_slider = gr.Slider(minimum=1, maximum=30, step=1, value=6, label="Inference Steps")
-                 guidance_scale_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="Guidance Scale - high noise stage")
-                 guidance_scale_2_input = gr.Slider(minimum=0.0, maximum=10.0, step=0.5, value=1, label="Guidance Scale 2 - low noise stage")
-             generate_button = gr.Button("Generate Video", variant="primary")
-         with gr.Column():
-             video_output = gr.Video(label="Generated Video", autoplay=True, interactive=False)
-
-     ui_inputs = [input_image_component, prompt_input, steps_slider,
-                  negative_prompt_input, duration_seconds_input,
-                  guidance_scale_input, guidance_scale_2_input,
-                  seed_input, randomize_seed_checkbox]
-
-     generate_button.click(fn=generate_video, inputs=ui_inputs, outputs=[video_output, seed_input])
-

  if __name__ == "__main__":
-     demo.queue().launch(mcp_server=True)
 
 
 
 
 
 
  import gradio as gr
+ import torch
  import tempfile
+ import os
  import numpy as np
+ import moviepy.editor as mp
+ from diffusers import DiffusionPipeline
+ from audiocraft.models import AudioGen
+ from audiocraft.data.audio import audio_write
+
+ # Force CPU for ZeroGPU
+ device = "cpu"
+
+ # Load models
+ video_model = DiffusionPipeline.from_pretrained(
+     "dream2589632147/Dream-wan2-2-faster", torch_dtype=torch.float32
+ ).to(device)
+
+ audio_model = AudioGen.get_pretrained("facebook/audiogen-medium", device=device)
+
+ def generate_video_with_audio(image, prompt):
+     # Step 1: Generate video frames
+     with tempfile.TemporaryDirectory() as tmpdir:
+         # .frames is batched; take the first clip and convert its frames to arrays for moviepy
+         video_frames = video_model(image=image, prompt=prompt, num_frames=16).frames[0]
+         video_frames = [np.array(frame) for frame in video_frames]
+         video_path = os.path.join(tmpdir, "output.mp4")
+         mp.ImageSequenceClip(video_frames, fps=16).write_videofile(video_path, codec="libx264", audio=False, verbose=False, logger=None)
+
+         # Step 2: Generate sound from prompt (AudioGen) and write it to disk
+         wav = audio_model.generate([prompt])[0].cpu()
+         audio_write(os.path.join(tmpdir, "sound"), wav, audio_model.sample_rate, strategy="loudness")
+         wav_path = os.path.join(tmpdir, "sound.wav")
+
+         # Step 3: Merge video + audio
+         video_clip = mp.VideoFileClip(video_path)
+         audio_clip = mp.AudioFileClip(wav_path)
+         final = video_clip.set_audio(audio_clip)
+         # Write the result outside tmpdir so it still exists after the directory is cleaned up
+         with tempfile.NamedTemporaryFile(suffix=".mp4", delete=False) as tmpfile:
+             output_path = tmpfile.name
+         final.write_videofile(output_path, codec="libx264", audio_codec="aac", verbose=False, logger=None)
+
+     return output_path
+
+ # Gradio UI
+ demo = gr.Interface(
+     fn=generate_video_with_audio,
+     inputs=[
+         gr.Image(type="pil", label="Upload Image"),
+         gr.Textbox(label="Prompt (e.g. ocean waves hitting rocks at sunset)")
+     ],
+     outputs=gr.Video(label="Generated Video with Sound"),
+     title="Wan2.2 Video Generator with Audio",
+     description="Generates a short video from an image and text prompt, with natural sound using AudioGen."
  )

  if __name__ == "__main__":
+     demo.launch()
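
For quick local testing outside the Space UI, the committed function can also be driven directly. The sketch below is illustrative only: it assumes the new app.py is importable from the working directory, that both model repos download successfully, and that "example.jpg" (a hypothetical path) exists; note that importing app loads both models.

# Minimal local driver (sketch): exercises generate_video_with_audio without Gradio.
from PIL import Image
import app  # the file committed above; importing it loads the video and audio models

if __name__ == "__main__":
    frame_source = Image.open("example.jpg")  # hypothetical input image
    result_path = app.generate_video_with_audio(frame_source, "ocean waves hitting rocks at sunset")
    print("Video with audio written to:", result_path)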