english translation
app.py CHANGED
@@ -131,7 +131,7 @@ def generate(image_input, audio_input, pose_input, width, height, length, steps,
     vae = AutoencoderKL.from_pretrained("./pretrained_weights/sd-vae-ft-mse").to(device, dtype=dtype)
     if quantization_input:
         quantize_(vae, int8_weight_only())
-        print("
+        print("Use int8 quantization.")
 
     ## reference net init
     reference_unet = UNet2DConditionModel.from_pretrained("./pretrained_weights/sd-image-variations-diffusers", subfolder="unet", use_safetensors=False).to(dtype=dtype, device=device)
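Side note on the quantization branch above: the torchao weight-only int8 path can be exercised on the VAE in isolation. A minimal sketch, assuming torchao and diffusers are installed and the sd-vae-ft-mse weights sit at the path the app uses; the device/dtype handling here is illustrative and not the app's own code:

import torch
from diffusers import AutoencoderKL
from torchao.quantization import quantize_, int8_weight_only

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

# Load the VAE the same way the app does, then swap eligible linear weights
# to int8 in place. Weight-only quantization shrinks the weight memory while
# activations stay in the original dtype.
vae = AutoencoderKL.from_pretrained("./pretrained_weights/sd-vae-ft-mse").to(device, dtype=dtype)
quantize_(vae, int8_weight_only())
print("Use int8 quantization.")

Keeping activations in the original dtype is why the app pairs this switch with the 12GB-VRAM hint on the quantization checkbox further down.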
@@ -287,7 +287,7 @@ def generate(image_input, audio_input, pose_input, width, height, length, steps,
     return video_output, seed_text
 
 
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
+with gr.Blocks() as demo:
     gr.Markdown("""
     <div>
         <h2 style="font-size: 30px;text-align: center;">EchoMimicV2</h2>
@@ -297,7 +297,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         <a href="https://arxiv.org/abs/2411.10061">📜 arXiv </a>
     </div>
     <div style="text-align: center; font-weight: bold; color: red;">
-        ⚠️
+        ⚠️ This demonstration is for academic research and experiential use only.
     </div>
 
     """)
@@ -305,29 +305,29 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
         with gr.Column():
             with gr.Group():
-                image_input = gr.Image(label="
-                audio_input = gr.Audio(label="
-                pose_input = gr.Textbox(label="
-            with gr.
+                image_input = gr.Image(label="Image Input (Auto Scaling)", type="filepath")
+                audio_input = gr.Audio(label="Audio Input", type="filepath")
+                pose_input = gr.Textbox(label="Pose Input (Directory Path)", placeholder="Please enter the directory path for pose data.", value="assets/halfbody_demo/pose/01")
+            with gr.Accordion("Advanced Settings", open=False):
                 with gr.Row():
-                    width = gr.Number(label="
-                    height = gr.Number(label="
-                    length = gr.Number(label="
+                    width = gr.Number(label="Width (multiple of 16, recommended: 768)", value=768)
+                    height = gr.Number(label="Height (multiple of 16, recommended: 768)", value=768)
+                    length = gr.Number(label="Video Length (recommended: 240)", value=240)
                 with gr.Row():
-                    steps = gr.Number(label="
-                    sample_rate = gr.Number(label="
-                    cfg = gr.Number(label="
+                    steps = gr.Number(label="Steps (recommended: 30)", value=20)
+                    sample_rate = gr.Number(label="Sampling Rate (recommended: 16000)", value=16000)
+                    cfg = gr.Number(label="CFG (recommended: 2.5)", value=2.5, step=0.1)
                 with gr.Row():
-                    fps = gr.Number(label="
-                    context_frames = gr.Number(label="
-                    context_overlap = gr.Number(label="
+                    fps = gr.Number(label="Frame Rate (recommended: 24)", value=24)
+                    context_frames = gr.Number(label="Context Frames (recommended: 12)", value=12)
+                    context_overlap = gr.Number(label="Context Overlap (recommended: 3)", value=3)
                 with gr.Row():
-                    quantization_input = gr.Checkbox(label="
-                    seed = gr.Number(label="
-            generate_button = gr.Button("🎬
+                    quantization_input = gr.Checkbox(label="Int8 Quantization (recommended for users with 12GB VRAM, use audio no longer than 5 seconds)", value=False)
+                    seed = gr.Number(label="Seed (-1 for random)", value=-1)
+            generate_button = gr.Button("🎬 Generate Video")
         with gr.Column():
-            video_output = gr.Video(label="
-            seed_text = gr.Textbox(label="
+            video_output = gr.Video(label="Output Video")
+            seed_text = gr.Textbox(label="Seed", interactive=False, visible=False)
             gr.Examples(
                 examples=[
                     ["EMTD_dataset/ref_imgs_by_FLUX/man/0001.png", "assets/halfbody_demo/audio/chinese/echomimicv2_man.wav"],
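The Seed control defaults to -1 and the hidden seed_text box is one of the values generate returns (see line 287), which suggests the usual "-1 means draw a random seed and report it back" convention. A hypothetical helper illustrating that pattern; resolve_seed is not a function in this app:

import random

def resolve_seed(seed: int) -> int:
    # -1 (or any negative value) means: pick a fresh random seed, so the run
    # can still be reproduced once the value is echoed back via seed_text.
    return random.randint(0, 2**32 - 1) if int(seed) < 0 else int(seed)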
@@ -339,7 +339,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                     ["EMTD_dataset/ref_imgs_by_FLUX/woman/0057.png", "assets/halfbody_demo/audio/chinese/ultraman.wav"]
                 ],
                 inputs=[image_input, audio_input],
-                label="
+                label="Preset Characters and Audio",
             )
 
     generate_button.click(
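The last hunk ends just as generate_button.click( opens. For orientation only, a plausible sketch of the wiring, assuming the remaining generate parameters follow the order of the controls above (the hunk header only confirms the signature through steps) and that the outputs match the return video_output, seed_text at line 287; the real call continues past the end of this diff:

    generate_button.click(
        fn=generate,
        inputs=[image_input, audio_input, pose_input, width, height, length, steps,
                sample_rate, cfg, fps, context_frames, context_overlap,
                quantization_input, seed],
        outputs=[video_output, seed_text],
    )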