english translation
app.py CHANGED
@@ -131,7 +131,7 @@ def generate(image_input, audio_input, pose_input, width, height, length, steps,
     vae = AutoencoderKL.from_pretrained("./pretrained_weights/sd-vae-ft-mse").to(device, dtype=dtype)
     if quantization_input:
         quantize_(vae, int8_weight_only())
-        print("
+        print("Use int8 quantization.")
 
     ## reference net init
     reference_unet = UNet2DConditionModel.from_pretrained("./pretrained_weights/sd-image-variations-diffusers", subfolder="unet", use_safetensors=False).to(dtype=dtype, device=device)
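Side note on the quantization branch above: the torchao weight-only int8 path can be exercised on the VAE in isolation. A minimal sketch, assuming torchao and diffusers are installed and the sd-vae-ft-mse weights sit at the path the app uses; the device/dtype handling here is illustrative and not the app's own code:

import torch
from diffusers import AutoencoderKL
from torchao.quantization import quantize_, int8_weight_only

device = "cuda" if torch.cuda.is_available() else "cpu"
dtype = torch.float16 if device == "cuda" else torch.float32

# Load the VAE the same way the app does, then swap eligible linear weights
# to int8 in place. Weight-only quantization shrinks the weight memory while
# activations stay in the original dtype.
vae = AutoencoderKL.from_pretrained("./pretrained_weights/sd-vae-ft-mse").to(device, dtype=dtype)
quantize_(vae, int8_weight_only())
print("Use int8 quantization.")

Keeping activations in the original dtype is why the app pairs this switch with the 12GB-VRAM hint on the quantization checkbox further down.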
@@ -287,7 +287,7 @@ def generate(image_input, audio_input, pose_input, width, height, length, steps,
     return video_output, seed_text
 
 
-with gr.Blocks(theme=gr.themes.Soft()) as demo:
+with gr.Blocks() as demo:
     gr.Markdown("""
     <div>
         <h2 style="font-size: 30px;text-align: center;">EchoMimicV2</h2>
@@ -297,7 +297,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
         <a href="https://arxiv.org/abs/2411.10061">📜 arXiv </a>
     </div>
     <div style="text-align: center; font-weight: bold; color: red;">
-        ⚠️
+        ⚠️ This demonstration is for academic research and experiential use only.
     </div>
 
     """)
@@ -305,29 +305,29 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
     with gr.Row():
         with gr.Column():
             with gr.Group():
-                image_input = gr.Image(label="
-                audio_input = gr.Audio(label="
-                pose_input = gr.Textbox(label="
-            with gr.
+                image_input = gr.Image(label="Image Input (Auto Scaling)", type="filepath")
+                audio_input = gr.Audio(label="Audio Input", type="filepath")
+                pose_input = gr.Textbox(label="Pose Input (Directory Path)", placeholder="Please enter the directory path for pose data.", value="assets/halfbody_demo/pose/01")
+            with gr.Accordion("Advanced Settings", open=False):
                 with gr.Row():
-                    width = gr.Number(label="
-                    height = gr.Number(label="
-                    length = gr.Number(label="
+                    width = gr.Number(label="Width (multiple of 16, recommended: 768)", value=768)
+                    height = gr.Number(label="Height (multiple of 16, recommended: 768)", value=768)
+                    length = gr.Number(label="Video Length (recommended: 240)", value=240)
                 with gr.Row():
-                    steps = gr.Number(label="
-                    sample_rate = gr.Number(label="
-                    cfg = gr.Number(label="
+                    steps = gr.Number(label="Steps (recommended: 30)", value=20)
+                    sample_rate = gr.Number(label="Sampling Rate (recommended: 16000)", value=16000)
+                    cfg = gr.Number(label="CFG (recommended: 2.5)", value=2.5, step=0.1)
                 with gr.Row():
-                    fps = gr.Number(label="
-                    context_frames = gr.Number(label="
-                    context_overlap = gr.Number(label="
+                    fps = gr.Number(label="Frame Rate (recommended: 24)", value=24)
+                    context_frames = gr.Number(label="Context Frames (recommended: 12)", value=12)
+                    context_overlap = gr.Number(label="Context Overlap (recommended: 3)", value=3)
                 with gr.Row():
-                    quantization_input = gr.Checkbox(label="
-                    seed = gr.Number(label="
-            generate_button = gr.Button("🎬
+                    quantization_input = gr.Checkbox(label="Int8 Quantization (recommended for users with 12GB VRAM, use audio no longer than 5 seconds)", value=False)
+                    seed = gr.Number(label="Seed (-1 for random)", value=-1)
+            generate_button = gr.Button("🎬 Generate Video")
         with gr.Column():
-            video_output = gr.Video(label="
-            seed_text = gr.Textbox(label="
+            video_output = gr.Video(label="Output Video")
+            seed_text = gr.Textbox(label="Seed", interactive=False, visible=False)
             gr.Examples(
                 examples=[
                     ["EMTD_dataset/ref_imgs_by_FLUX/man/0001.png", "assets/halfbody_demo/audio/chinese/echomimicv2_man.wav"],
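The Seed control defaults to -1 and the hidden seed_text box is one of the values generate returns (see line 287), which suggests the usual "-1 means draw a random seed and report it back" convention. A hypothetical helper illustrating that pattern; resolve_seed is not a function in this app:

import random

def resolve_seed(seed: int) -> int:
    # -1 (or any negative value) means: pick a fresh random seed, so the run
    # can still be reproduced once the value is echoed back via seed_text.
    return random.randint(0, 2**32 - 1) if int(seed) < 0 else int(seed)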
@@ -339,7 +339,7 @@ with gr.Blocks(theme=gr.themes.Soft()) as demo:
                     ["EMTD_dataset/ref_imgs_by_FLUX/woman/0057.png", "assets/halfbody_demo/audio/chinese/ultraman.wav"]
                 ],
                 inputs=[image_input, audio_input],
-                label="
+                label="Preset Characters and Audio",
             )
 
     generate_button.click(
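The last hunk ends just as generate_button.click( opens. For orientation only, a plausible sketch of the wiring, assuming the remaining generate parameters follow the order of the controls above (the hunk header only confirms the signature through steps) and that the outputs match the return video_output, seed_text at line 287; the real call continues past the end of this diff:

    generate_button.click(
        fn=generate,
        inputs=[image_input, audio_input, pose_input, width, height, length, steps,
                sample_rate, cfg, fps, context_frames, context_overlap,
                quantization_input, seed],
        outputs=[video_output, seed_text],
    )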