Spaces:
Running
on
Zero
Running
on
Zero
Commit
·
badacd0
1
Parent(s):
006ab10
new ui
Browse files
app.py
CHANGED
|
@@ -1,4 +1,4 @@
|
|
| 1 |
-
from typing import List
|
| 2 |
from pathlib import Path
|
| 3 |
from functools import partial
|
| 4 |
import spaces
|
|
@@ -137,7 +137,7 @@ def any_images_to_short_video(
|
|
| 137 |
pbar = CustomProgressBar(
|
| 138 |
gr.Progress(track_tqdm=True).tqdm(
|
| 139 |
iterable=None,
|
| 140 |
-
desc="Sampling",
|
| 141 |
total=dfot.sampling_timesteps,
|
| 142 |
)
|
| 143 |
)
|
|
@@ -200,7 +200,7 @@ def navigate_video(
|
|
| 200 |
pbar = CustomProgressBar(
|
| 201 |
gr.Progress(track_tqdm=True).tqdm(
|
| 202 |
iterable=None,
|
| 203 |
-
desc=f"Predicting next {n_prediction_frames} frames",
|
| 204 |
total=dfot.sampling_timesteps,
|
| 205 |
)
|
| 206 |
)
|
|
@@ -408,363 +408,237 @@ def smooth_navigation(
|
|
| 408 |
[(image, f"t={i}") for i, image in enumerate(images)],
|
| 409 |
)
|
| 410 |
|
| 411 |
-
|
| 412 |
-
# Create the Gradio Blocks
|
| 413 |
-
with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
|
| 414 |
-
gr.HTML(
|
| 415 |
-
"""
|
| 416 |
-
<style>
|
| 417 |
-
[data-tab-id="task-1"], [data-tab-id="task-2"], [data-tab-id="task-3"] {
|
| 418 |
-
font-size: 16px !important;
|
| 419 |
-
font-weight: bold;
|
| 420 |
-
}
|
| 421 |
-
#header-button .button-icon {
|
| 422 |
-
margin-right: 8px;
|
| 423 |
-
}
|
| 424 |
-
#basic-controls {
|
| 425 |
-
column-gap: 0px;
|
| 426 |
-
}
|
| 427 |
-
#basic-controls button {
|
| 428 |
-
border: 1px solid #e4e4e7;
|
| 429 |
-
}
|
| 430 |
-
#basic-controls-tab {
|
| 431 |
-
padding: 0px;
|
| 432 |
-
}
|
| 433 |
-
#advanced-controls-tab {
|
| 434 |
-
padding: 0px;
|
| 435 |
-
}
|
| 436 |
-
</style>
|
| 437 |
-
"""
|
| 438 |
-
)
|
| 439 |
-
|
| 440 |
-
gr.Markdown("# Diffusion Forcing Transformer with History Guidance")
|
| 441 |
gr.Markdown(
|
| 442 |
-
"
|
|
|
|
|
|
|
|
|
|
|
|
|
| 443 |
)
|
| 444 |
-
|
| 445 |
-
|
| 446 |
-
|
| 447 |
-
|
| 448 |
-
|
| 449 |
-
|
| 450 |
-
|
| 451 |
-
|
| 452 |
-
|
| 453 |
-
|
| 454 |
-
|
| 455 |
-
|
| 456 |
-
|
| 457 |
-
|
| 458 |
-
|
| 459 |
-
|
| 460 |
-
|
| 461 |
-
|
| 462 |
-
|
| 463 |
-
|
| 464 |
-
|
| 465 |
-
|
| 466 |
-
|
| 467 |
-
|
| 468 |
-
|
| 469 |
-
|
| 470 |
-
|
| 471 |
-
|
| 472 |
-
|
| 473 |
-
|
| 474 |
-
|
| 475 |
-
|
| 476 |
-
|
| 477 |
-
|
| 478 |
-
|
| 479 |
-
|
| 480 |
-
|
| 481 |
-
|
| 482 |
-
|
| 483 |
-
|
| 484 |
-
|
| 485 |
-
|
| 486 |
-
|
| 487 |
-
|
| 488 |
-
|
| 489 |
-
|
| 490 |
-
|
| 491 |
-
|
| 492 |
-
|
| 493 |
-
|
| 494 |
-
|
| 495 |
-
|
| 496 |
-
|
| 497 |
-
|
| 498 |
-
|
| 499 |
-
|
| 500 |
-
|
| 501 |
-
|
| 502 |
-
|
| 503 |
-
|
| 504 |
-
|
| 505 |
-
|
| 506 |
-
|
| 507 |
-
|
| 508 |
-
|
| 509 |
-
|
| 510 |
-
|
| 511 |
-
|
| 512 |
-
|
| 513 |
-
|
| 514 |
-
|
| 515 |
-
|
| 516 |
-
|
| 517 |
-
|
| 518 |
-
|
| 519 |
-
|
| 520 |
-
inputs=demo1_selected_scene_index, outputs=demo1_stage
|
| 521 |
-
)
|
| 522 |
-
def move_to_image_selection(scene_idx: int):
|
| 523 |
-
if scene_idx is None:
|
| 524 |
-
gr.Warning("Scene not selected!")
|
| 525 |
-
return "Scene"
|
| 526 |
-
else:
|
| 527 |
-
return "Image"
|
| 528 |
-
|
| 529 |
-
case "Image":
|
| 530 |
-
with gr.Group():
|
| 531 |
-
demo1_image_gallery = gr.Gallery(
|
| 532 |
-
height=150,
|
| 533 |
-
value=[
|
| 534 |
-
(image, f"t={i}")
|
| 535 |
-
for i, image in enumerate(
|
| 536 |
-
prepare_short_gt_video(scene_idx)
|
| 537 |
-
)
|
| 538 |
-
],
|
| 539 |
-
label="Select Input Images",
|
| 540 |
-
columns=[8],
|
| 541 |
-
)
|
| 542 |
-
|
| 543 |
-
demo1_selector = gr.CheckboxGroup(
|
| 544 |
-
label="Select Any Number of Input Images",
|
| 545 |
-
info="Image-to-Video: Select t=0; Interpolation: Select t=0 and t=7",
|
| 546 |
-
choices=[(f"t={i}", i) for i in range(8)],
|
| 547 |
-
value=[],
|
| 548 |
-
)
|
| 549 |
-
demo1_image_select_button = gr.Button(
|
| 550 |
-
"Select Input Images", variant="primary"
|
| 551 |
-
)
|
| 552 |
-
|
| 553 |
-
@demo1_image_select_button.click(
|
| 554 |
-
inputs=[demo1_selector],
|
| 555 |
-
outputs=[demo1_stage, demo1_selected_image_indices],
|
| 556 |
-
)
|
| 557 |
-
def generate_video(selected_indices):
|
| 558 |
-
if len(selected_indices) == 0:
|
| 559 |
-
gr.Warning("Select at least one image!")
|
| 560 |
-
return "Image", []
|
| 561 |
-
else:
|
| 562 |
-
gr.Info('Click "Generate Video" on the left to start generating now!')
|
| 563 |
-
return "Generation", selected_indices
|
| 564 |
-
|
| 565 |
-
case "Generation":
|
| 566 |
-
with gr.Group():
|
| 567 |
-
gt_video = prepare_short_gt_video(scene_idx)
|
| 568 |
-
|
| 569 |
-
demo1_input_image_gallery = gr.Gallery(
|
| 570 |
-
height=150,
|
| 571 |
-
value=video_to_gif_and_images(gt_video, image_indices),
|
| 572 |
-
label="Input Images",
|
| 573 |
-
columns=[9],
|
| 574 |
-
)
|
| 575 |
-
demo1_generated_gallery = gr.Gallery(
|
| 576 |
-
height=150,
|
| 577 |
-
value=[],
|
| 578 |
-
label="Generated Video",
|
| 579 |
-
columns=[9],
|
| 580 |
-
)
|
| 581 |
-
|
| 582 |
-
demo1_ground_truth_gallery = gr.Gallery(
|
| 583 |
-
height=150,
|
| 584 |
-
value=video_to_gif_and_images(gt_video, list(range(8))),
|
| 585 |
-
label="Ground Truth Video",
|
| 586 |
-
columns=[9],
|
| 587 |
-
)
|
| 588 |
-
with gr.Sidebar():
|
| 589 |
-
gr.Markdown("### Sampling Parameters")
|
| 590 |
-
demo1_guidance_scale = gr.Slider(
|
| 591 |
-
minimum=1,
|
| 592 |
-
maximum=6,
|
| 593 |
-
value=4,
|
| 594 |
-
step=0.5,
|
| 595 |
-
label="History Guidance Scale",
|
| 596 |
-
info="Without history guidance: 1.0; Recommended: 4.0",
|
| 597 |
-
interactive=True,
|
| 598 |
-
)
|
| 599 |
-
gr.Button("Generate Video", variant="primary").click(
|
| 600 |
-
fn=any_images_to_short_video,
|
| 601 |
-
inputs=[
|
| 602 |
-
demo1_selected_scene_index,
|
| 603 |
-
demo1_selected_image_indices,
|
| 604 |
-
demo1_guidance_scale,
|
| 605 |
-
],
|
| 606 |
-
outputs=demo1_generated_gallery,
|
| 607 |
-
)
|
| 608 |
-
|
| 609 |
-
with gr.Tab("Single Image → Long Video", id="task-2"):
|
| 610 |
-
gr.Markdown(
|
| 611 |
-
f"""
|
| 612 |
-
## Demo 2: Single Image → Long {LONG_LENGTH}-second Video
|
| 613 |
-
> #### _Diffusion Forcing Transformer, with History Guidance, can generate long videos via sliding window rollouts and temporal super-resolution._
|
| 614 |
"""
|
| 615 |
-
|
| 616 |
-
|
| 617 |
-
|
| 618 |
-
|
| 619 |
-
|
| 620 |
-
@gr.render(inputs=[demo2_stage, demo2_selected_index])
|
| 621 |
-
def render_stage(s, idx):
|
| 622 |
-
match s:
|
| 623 |
-
case "Selection":
|
| 624 |
-
with gr.Group():
|
| 625 |
-
demo2_image_gallery = gr.Gallery(
|
| 626 |
-
height=300,
|
| 627 |
-
value=first_frame_list,
|
| 628 |
-
label="Select an Image to Animate",
|
| 629 |
-
columns=[8],
|
| 630 |
-
selected_index=idx,
|
| 631 |
-
)
|
| 632 |
-
|
| 633 |
-
@demo2_image_gallery.select(
|
| 634 |
-
inputs=None, outputs=demo2_selected_index
|
| 635 |
-
)
|
| 636 |
-
def update_selection(selection: gr.SelectData):
|
| 637 |
-
return selection.index
|
| 638 |
-
|
| 639 |
-
demo2_select_button = gr.Button(
|
| 640 |
-
"Select Input Image", variant="primary"
|
| 641 |
-
)
|
| 642 |
-
|
| 643 |
-
@demo2_select_button.click(
|
| 644 |
-
inputs=demo2_selected_index, outputs=demo2_stage
|
| 645 |
-
)
|
| 646 |
-
def move_to_generation(idx: int):
|
| 647 |
-
if idx is None:
|
| 648 |
-
gr.Warning("Image not selected!")
|
| 649 |
-
return "Selection"
|
| 650 |
-
else:
|
| 651 |
-
gr.Info('Click "Generate Video" on the left to start generating now!')
|
| 652 |
-
return "Generation"
|
| 653 |
-
|
| 654 |
-
case "Generation":
|
| 655 |
-
with gr.Row():
|
| 656 |
-
gr.Image(
|
| 657 |
-
value=first_frame_list[idx],
|
| 658 |
-
label="Input Image",
|
| 659 |
-
width=256,
|
| 660 |
-
height=256,
|
| 661 |
-
)
|
| 662 |
-
gr.Video(
|
| 663 |
-
value=prepare_long_gt_video(idx),
|
| 664 |
-
label="Ground Truth Video",
|
| 665 |
-
width=256,
|
| 666 |
-
height=256,
|
| 667 |
-
autoplay=True,
|
| 668 |
-
loop=True,
|
| 669 |
-
)
|
| 670 |
-
demo2_video = gr.Video(
|
| 671 |
-
label="Generated Video",
|
| 672 |
-
width=256,
|
| 673 |
-
height=256,
|
| 674 |
-
autoplay=True,
|
| 675 |
-
loop=True,
|
| 676 |
-
show_share_button=True,
|
| 677 |
-
show_download_button=True,
|
| 678 |
-
)
|
| 679 |
-
|
| 680 |
-
with gr.Sidebar():
|
| 681 |
-
gr.Markdown("### Sampling Parameters")
|
| 682 |
|
| 683 |
-
|
| 684 |
-
|
| 685 |
-
|
| 686 |
-
|
| 687 |
-
|
| 688 |
-
|
| 689 |
-
|
| 690 |
-
|
| 691 |
-
|
| 692 |
-
|
| 693 |
-
|
| 694 |
-
|
| 695 |
-
|
| 696 |
-
|
| 697 |
-
|
| 698 |
-
|
| 699 |
-
|
| 700 |
-
|
| 701 |
-
|
| 702 |
-
|
| 703 |
-
|
| 704 |
-
|
| 705 |
-
|
| 706 |
-
|
| 707 |
-
|
| 708 |
-
|
|
|
|
| 709 |
)
|
| 710 |
-
|
| 711 |
-
|
| 712 |
-
|
| 713 |
-
|
| 714 |
-
|
| 715 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 716 |
"""
|
| 717 |
-
)
|
| 718 |
-
|
| 719 |
-
|
| 720 |
-
|
| 721 |
-
|
| 722 |
-
|
| 723 |
-
|
| 724 |
-
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
| 737 |
-
|
| 738 |
-
|
| 739 |
-
|
| 740 |
-
|
| 741 |
-
|
| 742 |
-
|
| 743 |
-
|
| 744 |
-
|
| 745 |
-
|
| 746 |
-
|
| 747 |
-
@demo3_select_button.click(
|
| 748 |
-
inputs=demo3_selected_index,
|
| 749 |
-
outputs=[
|
| 750 |
-
demo3_stage,
|
| 751 |
-
demo3_current_video,
|
| 752 |
-
demo3_current_poses,
|
| 753 |
-
],
|
| 754 |
-
)
|
| 755 |
-
def move_to_generation(idx: int):
|
| 756 |
-
if idx is None:
|
| 757 |
-
gr.Warning("Image not selected!")
|
| 758 |
-
return "Selection", None, None
|
| 759 |
-
else:
|
| 760 |
-
gr.Info('Start navigating with the "Let\'s Navigate!" sidebar on the left now!')
|
| 761 |
-
return (
|
| 762 |
-
"Generation",
|
| 763 |
-
video_list[idx][:1],
|
| 764 |
-
poses_list[idx][:1],
|
| 765 |
-
)
|
| 766 |
|
| 767 |
-
|
|
|
|
|
|
|
| 768 |
with gr.Row():
|
| 769 |
demo3_current_view = gr.Image(
|
| 770 |
value=first_frame_list[idx],
|
|
@@ -785,183 +659,86 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
|
|
| 785 |
demo3_generated_gallery = gr.Gallery(
|
| 786 |
value=[],
|
| 787 |
label="Generated Frames",
|
| 788 |
-
columns=[
|
| 789 |
)
|
| 790 |
|
| 791 |
-
|
| 792 |
-
|
| 793 |
-
|
| 794 |
-
|
| 795 |
- **The model will predict the next few frames based on your camera movements. Repeat the process to continue navigating through the scene.**
|
| 796 |
- **At the end of your navigation, apply temporal super-resolution to increase the FPS,** also utilizing the DFoT model.
|
| 797 |
-
- The most suitable history guidance scheme will be automatically selected based on your camera movements.
|
| 798 |
-
"""
|
| 799 |
-
|
| 800 |
-
with gr.
|
| 801 |
-
|
| 802 |
-
|
| 803 |
-
|
| 804 |
-
|
| 805 |
-
|
| 806 |
-
|
| 807 |
-
|
| 808 |
-
|
| 809 |
-
|
| 810 |
-
|
| 811 |
-
|
| 812 |
-
|
| 813 |
-
|
| 814 |
-
|
| 815 |
-
|
| 816 |
-
|
| 817 |
-
|
| 818 |
-
|
| 819 |
-
|
| 820 |
-
|
| 821 |
-
|
| 822 |
-
|
| 823 |
-
|
| 824 |
-
|
| 825 |
-
|
| 826 |
-
],
|
| 827 |
-
)
|
| 828 |
-
|
| 829 |
-
gr.Button(
|
| 830 |
-
"↖-30°\nVeer",
|
| 831 |
-
size="sm",
|
| 832 |
-
min_width=0,
|
| 833 |
-
variant="primary",
|
| 834 |
-
).click(
|
| 835 |
-
fn=partial(
|
| 836 |
-
navigate_video,
|
| 837 |
-
x_angle=0,
|
| 838 |
-
y_angle=-30,
|
| 839 |
-
distance=50,
|
| 840 |
-
),
|
| 841 |
-
inputs=[
|
| 842 |
-
demo3_current_video,
|
| 843 |
-
demo3_current_poses,
|
| 844 |
-
],
|
| 845 |
-
outputs=[
|
| 846 |
-
demo3_current_video,
|
| 847 |
-
demo3_current_poses,
|
| 848 |
-
demo3_current_view,
|
| 849 |
-
demo3_video,
|
| 850 |
-
demo3_generated_gallery,
|
| 851 |
-
],
|
| 852 |
-
)
|
| 853 |
-
|
| 854 |
-
gr.Button(
|
| 855 |
-
"↑0°\nAhead",
|
| 856 |
-
size="sm",
|
| 857 |
-
min_width=0,
|
| 858 |
-
variant="primary",
|
| 859 |
-
).click(
|
| 860 |
-
fn=partial(
|
| 861 |
-
navigate_video,
|
| 862 |
-
x_angle=0,
|
| 863 |
-
y_angle=0,
|
| 864 |
-
distance=100,
|
| 865 |
-
),
|
| 866 |
-
inputs=[
|
| 867 |
-
demo3_current_video,
|
| 868 |
-
demo3_current_poses,
|
| 869 |
-
],
|
| 870 |
-
outputs=[
|
| 871 |
-
demo3_current_video,
|
| 872 |
-
demo3_current_poses,
|
| 873 |
-
demo3_current_view,
|
| 874 |
-
demo3_video,
|
| 875 |
-
demo3_generated_gallery,
|
| 876 |
-
],
|
| 877 |
-
)
|
| 878 |
-
gr.Button(
|
| 879 |
-
"↗30°\nVeer",
|
| 880 |
-
size="sm",
|
| 881 |
-
min_width=0,
|
| 882 |
-
variant="primary",
|
| 883 |
-
).click(
|
| 884 |
-
fn=partial(
|
| 885 |
-
navigate_video,
|
| 886 |
-
x_angle=0,
|
| 887 |
-
y_angle=30,
|
| 888 |
-
distance=50,
|
| 889 |
-
),
|
| 890 |
-
inputs=[
|
| 891 |
-
demo3_current_video,
|
| 892 |
-
demo3_current_poses,
|
| 893 |
-
],
|
| 894 |
-
outputs=[
|
| 895 |
-
demo3_current_video,
|
| 896 |
-
demo3_current_poses,
|
| 897 |
-
demo3_current_view,
|
| 898 |
-
demo3_video,
|
| 899 |
-
demo3_generated_gallery,
|
| 900 |
-
],
|
| 901 |
-
)
|
| 902 |
-
gr.Button(
|
| 903 |
-
"↱\n60° Turn",
|
| 904 |
-
size="sm",
|
| 905 |
-
min_width=0,
|
| 906 |
-
variant="primary",
|
| 907 |
-
).click(
|
| 908 |
-
fn=partial(
|
| 909 |
-
navigate_video,
|
| 910 |
-
x_angle=0,
|
| 911 |
-
y_angle=60,
|
| 912 |
-
distance=0,
|
| 913 |
-
),
|
| 914 |
-
inputs=[
|
| 915 |
-
demo3_current_video,
|
| 916 |
-
demo3_current_poses,
|
| 917 |
-
],
|
| 918 |
-
outputs=[
|
| 919 |
-
demo3_current_video,
|
| 920 |
-
demo3_current_poses,
|
| 921 |
-
demo3_current_view,
|
| 922 |
-
demo3_video,
|
| 923 |
-
demo3_generated_gallery,
|
| 924 |
-
],
|
| 925 |
-
)
|
| 926 |
-
with gr.Tab("Advanced", elem_id="advanced-controls-tab"):
|
| 927 |
-
with gr.Group():
|
| 928 |
-
gr.Markdown("_**Select angles and distance:**_")
|
| 929 |
-
|
| 930 |
-
demo3_y_angle = gr.Slider(
|
| 931 |
-
minimum=-90,
|
| 932 |
-
maximum=90,
|
| 933 |
-
value=0,
|
| 934 |
-
step=10,
|
| 935 |
-
label="Horizontal Angle",
|
| 936 |
-
interactive=True,
|
| 937 |
-
)
|
| 938 |
-
demo3_x_angle = gr.Slider(
|
| 939 |
-
minimum=-40,
|
| 940 |
-
maximum=40,
|
| 941 |
-
value=0,
|
| 942 |
-
step=10,
|
| 943 |
-
label="Vertical Angle",
|
| 944 |
-
interactive=True,
|
| 945 |
)
|
| 946 |
-
|
| 947 |
-
|
| 948 |
-
|
| 949 |
-
|
| 950 |
-
|
| 951 |
-
|
| 952 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 953 |
)
|
| 954 |
|
| 955 |
gr.Button(
|
| 956 |
-
"
|
|
|
|
|
|
|
|
|
|
| 957 |
).click(
|
| 958 |
-
fn=
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 959 |
inputs=[
|
| 960 |
demo3_current_video,
|
| 961 |
demo3_current_poses,
|
| 962 |
-
demo3_x_angle,
|
| 963 |
-
demo3_y_angle,
|
| 964 |
-
demo3_distance,
|
| 965 |
],
|
| 966 |
outputs=[
|
| 967 |
demo3_current_video,
|
|
@@ -971,37 +748,93 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
|
|
| 971 |
demo3_generated_gallery,
|
| 972 |
],
|
| 973 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 974 |
with gr.Group():
|
| 975 |
-
gr.Markdown("
|
| 976 |
-
|
| 977 |
-
|
| 978 |
-
|
| 979 |
-
|
| 980 |
-
|
| 981 |
-
|
| 982 |
-
|
| 983 |
-
|
| 984 |
-
demo3_generated_gallery,
|
| 985 |
-
],
|
| 986 |
)
|
| 987 |
-
|
| 988 |
-
|
| 989 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
| 990 |
)
|
| 991 |
-
|
| 992 |
-
minimum=
|
| 993 |
-
maximum=
|
| 994 |
-
value=
|
| 995 |
-
step=
|
| 996 |
-
label="
|
| 997 |
interactive=True,
|
| 998 |
)
|
| 999 |
-
|
| 1000 |
-
|
|
|
|
|
|
|
|
|
|
| 1001 |
inputs=[
|
| 1002 |
demo3_current_video,
|
| 1003 |
demo3_current_poses,
|
| 1004 |
-
|
|
|
|
|
|
|
| 1005 |
],
|
| 1006 |
outputs=[
|
| 1007 |
demo3_current_video,
|
|
@@ -1011,7 +844,206 @@ with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
|
|
| 1011 |
demo3_generated_gallery,
|
| 1012 |
],
|
| 1013 |
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1014 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1015 |
|
| 1016 |
if __name__ == "__main__":
|
| 1017 |
demo.launch()
|
|
|
|
| 1 |
+
from typing import List, Literal
|
| 2 |
from pathlib import Path
|
| 3 |
from functools import partial
|
| 4 |
import spaces
|
|
|
|
| 137 |
pbar = CustomProgressBar(
|
| 138 |
gr.Progress(track_tqdm=True).tqdm(
|
| 139 |
iterable=None,
|
| 140 |
+
desc="Sampling with DFoT",
|
| 141 |
total=dfot.sampling_timesteps,
|
| 142 |
)
|
| 143 |
)
|
|
|
|
| 200 |
pbar = CustomProgressBar(
|
| 201 |
gr.Progress(track_tqdm=True).tqdm(
|
| 202 |
iterable=None,
|
| 203 |
+
desc=f"Predicting next {n_prediction_frames} frames with DFoT",
|
| 204 |
total=dfot.sampling_timesteps,
|
| 205 |
)
|
| 206 |
)
|
|
|
|
| 408 |
[(image, f"t={i}") for i, image in enumerate(images)],
|
| 409 |
)
|
| 410 |
|
| 411 |
+
def render_demo1(s: Literal["Selection", "Generation"], idx: int, demo1_stage: gr.State, demo1_selected_index: gr.State):
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 412 |
gr.Markdown(
|
| 413 |
+
f"""
|
| 414 |
+
## Demo 1: Single Image → Long {LONG_LENGTH}-second Video
|
| 415 |
+
> #### _Diffusion Forcing Transformer can generate long videos via sliding window rollouts and temporal super-resolution._
|
| 416 |
+
""",
|
| 417 |
+
elem_classes=["task-title"]
|
| 418 |
)
|
| 419 |
+
match s:
|
| 420 |
+
case "Selection":
|
| 421 |
+
with gr.Group():
|
| 422 |
+
demo1_image_gallery = gr.Gallery(
|
| 423 |
+
height=300,
|
| 424 |
+
value=first_frame_list,
|
| 425 |
+
label="Select an Image to Animate",
|
| 426 |
+
columns=[8],
|
| 427 |
+
selected_index=idx,
|
| 428 |
+
allow_preview=False,
|
| 429 |
+
preview=False,
|
| 430 |
+
)
|
| 431 |
+
|
| 432 |
+
@demo1_image_gallery.select(
|
| 433 |
+
inputs=None, outputs=[demo1_stage, demo1_selected_index]
|
| 434 |
+
)
|
| 435 |
+
def move_to_generation(selection: gr.SelectData):
|
| 436 |
+
return "Generation", selection.index
|
| 437 |
+
|
| 438 |
+
case "Generation":
|
| 439 |
+
with gr.Row():
|
| 440 |
+
gr.Image(
|
| 441 |
+
value=first_frame_list[idx],
|
| 442 |
+
label="Input Image",
|
| 443 |
+
width=256,
|
| 444 |
+
height=256,
|
| 445 |
+
)
|
| 446 |
+
gr.Video(
|
| 447 |
+
value=prepare_long_gt_video(idx),
|
| 448 |
+
label="Ground Truth Video",
|
| 449 |
+
width=256,
|
| 450 |
+
height=256,
|
| 451 |
+
autoplay=True,
|
| 452 |
+
loop=True,
|
| 453 |
+
)
|
| 454 |
+
demo1_video = gr.Video(
|
| 455 |
+
label="Generated Video",
|
| 456 |
+
width=256,
|
| 457 |
+
height=256,
|
| 458 |
+
autoplay=True,
|
| 459 |
+
loop=True,
|
| 460 |
+
show_share_button=True,
|
| 461 |
+
show_download_button=True,
|
| 462 |
+
)
|
| 463 |
+
|
| 464 |
+
gr.Markdown("### Generation Controls ↓")
|
| 465 |
+
demo1_guidance_scale = gr.Slider(
|
| 466 |
+
minimum=1,
|
| 467 |
+
maximum=6,
|
| 468 |
+
value=4,
|
| 469 |
+
step=0.5,
|
| 470 |
+
label="History Guidance Scale",
|
| 471 |
+
info="Without history guidance: 1.0; Recommended: 4.0",
|
| 472 |
+
interactive=True,
|
| 473 |
+
)
|
| 474 |
+
demo1_fps = gr.Slider(
|
| 475 |
+
minimum=4,
|
| 476 |
+
maximum=20,
|
| 477 |
+
value=4,
|
| 478 |
+
step=1,
|
| 479 |
+
label="FPS",
|
| 480 |
+
info=f"A {LONG_LENGTH}-second video will be generated at this FPS; Decrease for faster generation; Increase for a smoother video",
|
| 481 |
+
interactive=True,
|
| 482 |
+
)
|
| 483 |
+
gr.Button("Generate Video", variant="primary").click(
|
| 484 |
+
fn=single_image_to_long_video,
|
| 485 |
+
inputs=[
|
| 486 |
+
demo1_selected_index,
|
| 487 |
+
demo1_guidance_scale,
|
| 488 |
+
demo1_fps,
|
| 489 |
+
],
|
| 490 |
+
outputs=demo1_video,
|
| 491 |
+
)
|
| 492 |
+
|
| 493 |
+
def render_demo2(s: Literal["Scene", "Image", "Generation"], scene_idx: int, image_indices: List[int], demo2_stage: gr.State, demo2_selected_scene_index: gr.State, demo2_selected_image_indices: gr.State):
|
| 494 |
+
gr.Markdown(
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 495 |
"""
|
| 496 |
+
## Demo 2: Any Number of Images → Short 2-second Video
|
| 497 |
+
> #### _Diffusion Forcing Transformer is a flexible model that can generate videos given variable number of context frames._
|
| 498 |
+
""",
|
| 499 |
+
elem_classes=["task-title"]
|
| 500 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 501 |
|
| 502 |
+
match s:
|
| 503 |
+
case "Scene":
|
| 504 |
+
with gr.Group():
|
| 505 |
+
demo2_scene_gallery = gr.Gallery(
|
| 506 |
+
height=300,
|
| 507 |
+
value=gif_paths,
|
| 508 |
+
label="Select a Scene to Generate Video",
|
| 509 |
+
columns=[8],
|
| 510 |
+
selected_index=scene_idx,
|
| 511 |
+
allow_preview=False,
|
| 512 |
+
preview=False,
|
| 513 |
+
)
|
| 514 |
+
|
| 515 |
+
@demo2_scene_gallery.select(
|
| 516 |
+
inputs=None, outputs=[demo2_stage, demo2_selected_scene_index]
|
| 517 |
+
)
|
| 518 |
+
def move_to_image_selection(selection: gr.SelectData):
|
| 519 |
+
return "Image", selection.index
|
| 520 |
+
|
| 521 |
+
case "Image":
|
| 522 |
+
with gr.Group():
|
| 523 |
+
demo2_image_gallery = gr.Gallery(
|
| 524 |
+
height=150,
|
| 525 |
+
value=[
|
| 526 |
+
(image, f"t={i}")
|
| 527 |
+
for i, image in enumerate(
|
| 528 |
+
prepare_short_gt_video(scene_idx)
|
| 529 |
)
|
| 530 |
+
],
|
| 531 |
+
label="Select Input Images",
|
| 532 |
+
columns=[8],
|
| 533 |
+
)
|
| 534 |
+
|
| 535 |
+
demo2_selector = gr.CheckboxGroup(
|
| 536 |
+
label="Select Any Number of Input Images",
|
| 537 |
+
info="Image-to-Video: Select t=0; Interpolation: Select t=0 and t=7",
|
| 538 |
+
choices=[(f"t={i}", i) for i in range(8)],
|
| 539 |
+
value=[],
|
| 540 |
+
)
|
| 541 |
+
demo2_image_select_button = gr.Button(
|
| 542 |
+
"Next Step", variant="primary"
|
| 543 |
+
)
|
| 544 |
+
|
| 545 |
+
@demo2_image_select_button.click(
|
| 546 |
+
inputs=[demo2_selector],
|
| 547 |
+
outputs=[demo2_stage, demo2_selected_image_indices],
|
| 548 |
+
)
|
| 549 |
+
def generate_video(selected_indices):
|
| 550 |
+
if len(selected_indices) == 0:
|
| 551 |
+
gr.Warning("Select at least one image!")
|
| 552 |
+
return "Image", []
|
| 553 |
+
else:
|
| 554 |
+
return "Generation", selected_indices
|
| 555 |
+
|
| 556 |
+
case "Generation":
|
| 557 |
+
with gr.Group():
|
| 558 |
+
gt_video = prepare_short_gt_video(scene_idx)
|
| 559 |
+
|
| 560 |
+
demo2_input_image_gallery = gr.Gallery(
|
| 561 |
+
height=150,
|
| 562 |
+
value=video_to_gif_and_images(gt_video, image_indices),
|
| 563 |
+
label="Input Images",
|
| 564 |
+
columns=[9],
|
| 565 |
+
)
|
| 566 |
+
demo2_generated_gallery = gr.Gallery(
|
| 567 |
+
height=150,
|
| 568 |
+
value=[],
|
| 569 |
+
label="Generated Video",
|
| 570 |
+
columns=[9],
|
| 571 |
+
)
|
| 572 |
+
|
| 573 |
+
demo2_ground_truth_gallery = gr.Gallery(
|
| 574 |
+
height=150,
|
| 575 |
+
value=video_to_gif_and_images(gt_video, list(range(8))),
|
| 576 |
+
label="Ground Truth Video",
|
| 577 |
+
columns=[9],
|
| 578 |
+
)
|
| 579 |
+
gr.Markdown("### Generation Controls ↓")
|
| 580 |
+
demo2_guidance_scale = gr.Slider(
|
| 581 |
+
minimum=1,
|
| 582 |
+
maximum=6,
|
| 583 |
+
value=4,
|
| 584 |
+
step=0.5,
|
| 585 |
+
label="History Guidance Scale",
|
| 586 |
+
info="Without history guidance: 1.0; Recommended: 4.0",
|
| 587 |
+
interactive=True,
|
| 588 |
+
)
|
| 589 |
+
gr.Button("Generate Video", variant="primary").click(
|
| 590 |
+
fn=any_images_to_short_video,
|
| 591 |
+
inputs=[
|
| 592 |
+
demo2_selected_scene_index,
|
| 593 |
+
demo2_selected_image_indices,
|
| 594 |
+
demo2_guidance_scale,
|
| 595 |
+
],
|
| 596 |
+
outputs=demo2_generated_gallery,
|
| 597 |
+
)
|
| 598 |
+
|
| 599 |
+
def render_demo3(
|
| 600 |
+
s: Literal["Selection", "Generation"],
|
| 601 |
+
idx: int,
|
| 602 |
+
demo3_stage: gr.State,
|
| 603 |
+
demo3_selected_index: gr.State,
|
| 604 |
+
demo3_current_video: gr.State,
|
| 605 |
+
demo3_current_poses: gr.State
|
| 606 |
+
):
|
| 607 |
+
gr.Markdown(
|
| 608 |
"""
|
| 609 |
+
## Demo 3: Single Image → Extremely Long Video _(Navigate with Your Camera Movements!)_
|
| 610 |
+
> #### _History Guidance significantly improves quality and temporal consistency, enabling stable rollouts for extremely long videos._
|
| 611 |
+
""",
|
| 612 |
+
elem_classes=["task-title"]
|
| 613 |
+
)
|
| 614 |
+
match s:
|
| 615 |
+
case "Selection":
|
| 616 |
+
with gr.Group():
|
| 617 |
+
demo3_image_gallery = gr.Gallery(
|
| 618 |
+
height=300,
|
| 619 |
+
value=first_frame_list,
|
| 620 |
+
label="Select an Image to Start Navigation",
|
| 621 |
+
columns=[8],
|
| 622 |
+
selected_index=idx,
|
| 623 |
+
allow_preview=False,
|
| 624 |
+
preview=False,
|
| 625 |
+
)
|
| 626 |
+
|
| 627 |
+
@demo3_image_gallery.select(
|
| 628 |
+
inputs=None, outputs=[demo3_stage, demo3_selected_index, demo3_current_video, demo3_current_poses]
|
| 629 |
+
)
|
| 630 |
+
def move_to_generation(selection: gr.SelectData):
|
| 631 |
+
idx = selection.index
|
| 632 |
+
return (
|
| 633 |
+
"Generation",
|
| 634 |
+
idx,
|
| 635 |
+
video_list[idx][:1],
|
| 636 |
+
poses_list[idx][:1],
|
| 637 |
+
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 638 |
|
| 639 |
+
case "Generation":
|
| 640 |
+
with gr.Row():
|
| 641 |
+
with gr.Column(scale=3):
|
| 642 |
with gr.Row():
|
| 643 |
demo3_current_view = gr.Image(
|
| 644 |
value=first_frame_list[idx],
|
|
|
|
| 659 |
demo3_generated_gallery = gr.Gallery(
|
| 660 |
value=[],
|
| 661 |
label="Generated Frames",
|
| 662 |
+
columns=[6],
|
| 663 |
)
|
| 664 |
|
| 665 |
+
with gr.Column():
|
| 666 |
+
gr.Markdown("### Navigation Controls ↓")
|
| 667 |
+
with gr.Accordion("Instructions", open=False):
|
| 668 |
+
gr.Markdown("""
|
| 669 |
- **The model will predict the next few frames based on your camera movements. Repeat the process to continue navigating through the scene.**
|
| 670 |
- **At the end of your navigation, apply temporal super-resolution to increase the FPS,** also utilizing the DFoT model.
|
| 671 |
+
- The most suitable history guidance scheme will be automatically selected based on your camera movements.
|
| 672 |
+
""")
|
| 673 |
+
with gr.Tab("Basic", elem_id="basic-controls-tab"):
|
| 674 |
+
with gr.Group():
|
| 675 |
+
gr.Markdown("_**Select a direction to move:**_")
|
| 676 |
+
with gr.Row(elem_id="basic-controls"):
|
| 677 |
+
gr.Button(
|
| 678 |
+
"↰-60°\nVeer",
|
| 679 |
+
size="sm",
|
| 680 |
+
min_width=0,
|
| 681 |
+
variant="primary",
|
| 682 |
+
).click(
|
| 683 |
+
fn=partial(
|
| 684 |
+
navigate_video,
|
| 685 |
+
x_angle=0,
|
| 686 |
+
y_angle=-60,
|
| 687 |
+
distance=0,
|
| 688 |
+
),
|
| 689 |
+
inputs=[
|
| 690 |
+
demo3_current_video,
|
| 691 |
+
demo3_current_poses,
|
| 692 |
+
],
|
| 693 |
+
outputs=[
|
| 694 |
+
demo3_current_video,
|
| 695 |
+
demo3_current_poses,
|
| 696 |
+
demo3_current_view,
|
| 697 |
+
demo3_video,
|
| 698 |
+
demo3_generated_gallery,
|
| 699 |
+
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 700 |
)
|
| 701 |
+
|
| 702 |
+
gr.Button(
|
| 703 |
+
"↖-30°\nTurn",
|
| 704 |
+
size="sm",
|
| 705 |
+
min_width=0,
|
| 706 |
+
variant="primary",
|
| 707 |
+
).click(
|
| 708 |
+
fn=partial(
|
| 709 |
+
navigate_video,
|
| 710 |
+
x_angle=0,
|
| 711 |
+
y_angle=-30,
|
| 712 |
+
distance=50,
|
| 713 |
+
),
|
| 714 |
+
inputs=[
|
| 715 |
+
demo3_current_video,
|
| 716 |
+
demo3_current_poses,
|
| 717 |
+
],
|
| 718 |
+
outputs=[
|
| 719 |
+
demo3_current_video,
|
| 720 |
+
demo3_current_poses,
|
| 721 |
+
demo3_current_view,
|
| 722 |
+
demo3_video,
|
| 723 |
+
demo3_generated_gallery,
|
| 724 |
+
],
|
| 725 |
)
|
| 726 |
|
| 727 |
gr.Button(
|
| 728 |
+
"↑0°\nAhead",
|
| 729 |
+
size="sm",
|
| 730 |
+
min_width=0,
|
| 731 |
+
variant="primary",
|
| 732 |
).click(
|
| 733 |
+
fn=partial(
|
| 734 |
+
navigate_video,
|
| 735 |
+
x_angle=0,
|
| 736 |
+
y_angle=0,
|
| 737 |
+
distance=100,
|
| 738 |
+
),
|
| 739 |
inputs=[
|
| 740 |
demo3_current_video,
|
| 741 |
demo3_current_poses,
|
|
|
|
|
|
|
|
|
|
| 742 |
],
|
| 743 |
outputs=[
|
| 744 |
demo3_current_video,
|
|
|
|
| 748 |
demo3_generated_gallery,
|
| 749 |
],
|
| 750 |
)
|
| 751 |
+
gr.Button(
|
| 752 |
+
"↗30°\nTurn",
|
| 753 |
+
size="sm",
|
| 754 |
+
min_width=0,
|
| 755 |
+
variant="primary",
|
| 756 |
+
).click(
|
| 757 |
+
fn=partial(
|
| 758 |
+
navigate_video,
|
| 759 |
+
x_angle=0,
|
| 760 |
+
y_angle=30,
|
| 761 |
+
distance=50,
|
| 762 |
+
),
|
| 763 |
+
inputs=[
|
| 764 |
+
demo3_current_video,
|
| 765 |
+
demo3_current_poses,
|
| 766 |
+
],
|
| 767 |
+
outputs=[
|
| 768 |
+
demo3_current_video,
|
| 769 |
+
demo3_current_poses,
|
| 770 |
+
demo3_current_view,
|
| 771 |
+
demo3_video,
|
| 772 |
+
demo3_generated_gallery,
|
| 773 |
+
],
|
| 774 |
+
)
|
| 775 |
+
gr.Button(
|
| 776 |
+
"↱\n60° Veer",
|
| 777 |
+
size="sm",
|
| 778 |
+
min_width=0,
|
| 779 |
+
variant="primary",
|
| 780 |
+
).click(
|
| 781 |
+
fn=partial(
|
| 782 |
+
navigate_video,
|
| 783 |
+
x_angle=0,
|
| 784 |
+
y_angle=60,
|
| 785 |
+
distance=0,
|
| 786 |
+
),
|
| 787 |
+
inputs=[
|
| 788 |
+
demo3_current_video,
|
| 789 |
+
demo3_current_poses,
|
| 790 |
+
],
|
| 791 |
+
outputs=[
|
| 792 |
+
demo3_current_video,
|
| 793 |
+
demo3_current_poses,
|
| 794 |
+
demo3_current_view,
|
| 795 |
+
demo3_video,
|
| 796 |
+
demo3_generated_gallery,
|
| 797 |
+
],
|
| 798 |
+
)
|
| 799 |
+
with gr.Tab("Advanced", elem_id="advanced-controls-tab"):
|
| 800 |
with gr.Group():
|
| 801 |
+
gr.Markdown("_**Select angles and distance:**_")
|
| 802 |
+
|
| 803 |
+
demo3_y_angle = gr.Slider(
|
| 804 |
+
minimum=-90,
|
| 805 |
+
maximum=90,
|
| 806 |
+
value=0,
|
| 807 |
+
step=10,
|
| 808 |
+
label="Horizontal Angle",
|
| 809 |
+
interactive=True,
|
|
|
|
|
|
|
| 810 |
)
|
| 811 |
+
demo3_x_angle = gr.Slider(
|
| 812 |
+
minimum=-40,
|
| 813 |
+
maximum=40,
|
| 814 |
+
value=0,
|
| 815 |
+
step=10,
|
| 816 |
+
label="Vertical Angle",
|
| 817 |
+
interactive=True,
|
| 818 |
)
|
| 819 |
+
demo3_distance = gr.Slider(
|
| 820 |
+
minimum=0,
|
| 821 |
+
maximum=200,
|
| 822 |
+
value=100,
|
| 823 |
+
step=10,
|
| 824 |
+
label="Distance",
|
| 825 |
interactive=True,
|
| 826 |
)
|
| 827 |
+
|
| 828 |
+
gr.Button(
|
| 829 |
+
"Generate Next Move", variant="primary"
|
| 830 |
+
).click(
|
| 831 |
+
fn=navigate_video,
|
| 832 |
inputs=[
|
| 833 |
demo3_current_video,
|
| 834 |
demo3_current_poses,
|
| 835 |
+
demo3_x_angle,
|
| 836 |
+
demo3_y_angle,
|
| 837 |
+
demo3_distance,
|
| 838 |
],
|
| 839 |
outputs=[
|
| 840 |
demo3_current_video,
|
|
|
|
| 844 |
demo3_generated_gallery,
|
| 845 |
],
|
| 846 |
)
|
| 847 |
+
gr.Markdown("---")
|
| 848 |
+
with gr.Group():
|
| 849 |
+
gr.Markdown("_You can always undo your last move:_")
|
| 850 |
+
gr.Button("Undo Last Move", variant="huggingface").click(
|
| 851 |
+
fn=undo_navigation,
|
| 852 |
+
inputs=[demo3_current_video, demo3_current_poses],
|
| 853 |
+
outputs=[
|
| 854 |
+
demo3_current_video,
|
| 855 |
+
demo3_current_poses,
|
| 856 |
+
demo3_current_view,
|
| 857 |
+
demo3_video,
|
| 858 |
+
demo3_generated_gallery,
|
| 859 |
+
],
|
| 860 |
+
)
|
| 861 |
+
with gr.Group():
|
| 862 |
+
gr.Markdown(
|
| 863 |
+
"_At the end, apply temporal super-resolution to obtain a smoother video:_"
|
| 864 |
+
)
|
| 865 |
+
demo3_interpolation_factor = gr.Slider(
|
| 866 |
+
minimum=2,
|
| 867 |
+
maximum=10,
|
| 868 |
+
value=2,
|
| 869 |
+
step=1,
|
| 870 |
+
label="By a Factor of",
|
| 871 |
+
interactive=True,
|
| 872 |
+
)
|
| 873 |
+
gr.Button("Smooth Out Video", variant="huggingface").click(
|
| 874 |
+
fn=smooth_navigation,
|
| 875 |
+
inputs=[
|
| 876 |
+
demo3_current_video,
|
| 877 |
+
demo3_current_poses,
|
| 878 |
+
demo3_interpolation_factor,
|
| 879 |
+
],
|
| 880 |
+
outputs=[
|
| 881 |
+
demo3_current_video,
|
| 882 |
+
demo3_current_poses,
|
| 883 |
+
demo3_current_view,
|
| 884 |
+
demo3_video,
|
| 885 |
+
demo3_generated_gallery,
|
| 886 |
+
],
|
| 887 |
+
)
|
| 888 |
+
|
| 889 |
+
|
| 890 |
|
| 891 |
+
# Sidebar link buttons, grouped by the column they are rendered in.
# Each entry is (button label, target URL, simpleicons.org icon slug).
_HEADER_LINK_COLUMNS = (
    (
        ("Website", "https://boyuan.space/history-guidance", "googlechrome"),
        ("Paper", "https://arxiv.org/abs/2502.06764", "arxiv"),
    ),
    (
        ("Code", "https://github.com/kwsong0113/diffusion-forcing-transformer", "github"),
        ("Weights", "https://huggingface.co/kiwhansong/DFoT", "huggingface"),
    ),
)

# Demo number -> label of the sidebar button that selects it.
_DEMO_TAB_LABELS = {
    1: "1: Image → Long Video",
    2: "2: Any # of Images → Short Video",
    3: "3: Image → Extremely Long Video",
}


def _constant(value):
    """Return a zero-argument callable that always returns *value*.

    Used for click handlers built in a loop: going through a factory binds
    the value per button instead of late-binding the loop variable.
    """
    return lambda: value


# Create the Gradio Blocks
with gr.Blocks(theme=gr.themes.Base(primary_hue="teal")) as demo:
    # Page-wide CSS overrides, injected through an HTML <style> element.
    gr.HTML(
        """
        <style>
            [data-tab-id="task-1"], [data-tab-id="task-2"], [data-tab-id="task-3"] {
                font-size: 16px !important;
                font-weight: bold;
            }
            #page-title h1 {
                color: #0D9488 !important;
            }
            .task-title h2 {
                color: #F59E0C !important;
            }
            .header-button-row {
                gap: 4px !important;
            }
            .header-button-row div {
                width: 131.0px !important;
            }

            .header-button-column {
                width: 131.0px !important;
                gap: 5px !important;
            }
            .header-button a {
                border: 1px solid #e4e4e7;
            }
            .header-button .button-icon {
                margin-right: 8px;
            }
            #basic-controls {
                column-gap: 0px;
            }
            #basic-controls-tab {
                padding: 0px;
            }
            #advanced-controls-tab {
                padding: 0px;
            }
            #selected-demo-button {
                color: #F59E0C;
                text-decoration: underline;
            }
            .demo-button {
                text-align: left !important;
                display: block !important;
            }
        </style>
        """
    )

    # Number (1-3) of the demo currently shown; written by the sidebar buttons.
    demo_idx = gr.State(value=1)

    with gr.Sidebar():
        gr.Markdown("# Diffusion Forcing Transformer with History Guidance", elem_id="page-title")
        gr.Markdown(
            "### Official Interactive Demo for [_History-Guided Video Diffusion_](https://arxiv.org/abs/2502.06764)"
        )
        gr.Markdown("---")
        gr.Markdown("#### Links ↓")
        with gr.Row(elem_classes=["header-button-row"]):
            for column_links in _HEADER_LINK_COLUMNS:
                with gr.Column(elem_classes=["header-button-column"], min_width=0):
                    for label, url, icon_slug in column_links:
                        gr.Button(
                            value=label,
                            link=url,
                            icon=f"https://simpleicons.org/icons/{icon_slug}.svg",
                            elem_classes=["header-button"],
                            size="md",
                            min_width=0,
                        )
        gr.Markdown("---")
        gr.Markdown("#### Choose a Demo ↓")
        with gr.Group():

            @gr.render(inputs=[demo_idx])
            def render_demo_tabs(idx):
                """Draw the three demo-selector buttons.

                The button whose number equals ``idx`` receives the
                ``selected-demo-button`` element id, which the stylesheet
                above colours and underlines. Clicking a button writes its
                number into ``demo_idx``.
                """
                for number, label in _DEMO_TAB_LABELS.items():
                    gr.Button(
                        label,
                        size="md",
                        elem_classes=["demo-button"],
                        elem_id="selected-demo-button" if idx == number else None,
                    ).click(
                        fn=_constant(number),
                        outputs=demo_idx,
                    )

        gr.Markdown("---")
        gr.Markdown("#### Troubleshooting ↓")
        with gr.Group():
            with gr.Accordion("Error or Unexpected Results?", open=False):
                gr.Markdown("Please try again after refreshing the page and ensure you do not click the same button multiple times.")
            with gr.Accordion("Too Slow or No GPU Allocation?", open=False):
                gr.Markdown(
                    "Consider running the demo locally (click the dots in the top-right corner). Alternatively, you can subscribe to Hugging Face Pro for an increased GPU quota."
                )

    # Per-demo state. The stage and selection values are inputs of
    # ``render_demo`` below; the demo 3 video/poses states are only handed
    # through to ``render_demo3`` as components.
    demo1_stage = gr.State(value="Selection")
    demo1_selected_index = gr.State(value=None)
    demo2_stage = gr.State(value="Scene")
    demo2_selected_scene_index = gr.State(value=None)
    demo2_selected_image_indices = gr.State(value=[])
    demo3_stage = gr.State(value="Selection")
    demo3_selected_index = gr.State(value=None)
    demo3_current_video = gr.State(value=None)
    demo3_current_poses = gr.State(value=None)

    @gr.render(
        inputs=[
            demo_idx,
            demo1_stage,
            demo1_selected_index,
            demo2_stage,
            demo2_selected_scene_index,
            demo2_selected_image_indices,
            demo3_stage,
            demo3_selected_index,
        ]
    )
    def render_demo(
        _demo_idx,
        _demo1_stage,
        _demo1_selected_index,
        _demo2_stage,
        _demo2_selected_scene_index,
        _demo2_selected_image_indices,
        _demo3_stage,
        _demo3_selected_index,
    ):
        """Render the main panel for the currently selected demo.

        The underscore-prefixed parameters are the current *values* of the
        states listed in ``inputs``. The un-prefixed names passed alongside
        them are the ``gr.State`` components themselves, handed to the
        per-demo render function (presumably so its event handlers can use
        them as inputs/outputs - confirm against ``render_demo1/2/3``).
        Any other ``_demo_idx`` value renders nothing.
        """
        if _demo_idx == 1:
            render_demo1(_demo1_stage, _demo1_selected_index, demo1_stage, demo1_selected_index)
        elif _demo_idx == 2:
            render_demo2(
                _demo2_stage,
                _demo2_selected_scene_index,
                _demo2_selected_image_indices,
                demo2_stage,
                demo2_selected_scene_index,
                demo2_selected_image_indices,
            )
        elif _demo_idx == 3:
            render_demo3(
                _demo3_stage,
                _demo3_selected_index,
                demo3_stage,
                demo3_selected_index,
                demo3_current_video,
                demo3_current_poses,
            )


# Launch the app only when this file is run as a script.
if __name__ == "__main__":
    demo.launch()
|