Commit 609badf
Parent(s): 1ad8665

Fix pipeline

Files changed:
- .gitignore +2 -1
- bapp.py +2 -1
- requirements.txt +1 -0
- test_video.py +1 -1
- visual_anagrams/animate.py +23 -20
- visual_anagrams/samplers.py +5 -4
.gitignore
CHANGED

@@ -1,4 +1,5 @@
 env/
 __pycache__/
 *.png
-*.mp4
+*.mp4
+*.gif
bapp.py
CHANGED

@@ -75,12 +75,13 @@ def generate_content(
 choices = list(VIEW_MAP_NAMES.keys())
 gradio_app = gr.Interface(
     fn=generate_content,
+    title="Multi-View Illusion Diffusion",
     inputs=[
         gr.Textbox(label="Style", placeholder="an oil painting of"),
         gr.Textbox(label="Prompt for original view", placeholder="a dress"),
         gr.Textbox(label="Prompt for transformed view", placeholder="an old man"),
         gr.Dropdown(label="View transformation", choices=choices, value=choices[0]),
-        gr.Number(label="Number of diffusion steps", value=
+        gr.Number(label="Number of diffusion steps", value=50, step=1, minimum=1, maximum=300),
         gr.Number(label="Random seed", value=0, step=1, minimum=0, maximum=100000)
     ],
     outputs=[gr.Video(label="Illusion"), gr.Image(label="Original"), gr.Image(label="Transformed")],
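Note on the new number inputs: gr.Number delivers its value to the callback as a Python float by default (even with step=1), which is presumably why the sampler changes below wrap num_inference_steps in int(). A minimal sketch of that behavior, with a hypothetical echo_steps callback; setting precision=0 on gr.Number is an alternative way to get an integer directly:

import gradio as gr

def echo_steps(steps):
    # By default gr.Number passes a float (e.g. 50.0), so integer-only
    # consumers such as scheduler.set_timesteps need an explicit cast.
    return int(steps)

demo = gr.Interface(
    fn=echo_steps,
    inputs=gr.Number(label="Number of diffusion steps", value=50, step=1, minimum=1, maximum=300),
    outputs=gr.Number(label="Steps as int"),
)

if __name__ == "__main__":
    demo.launch()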
requirements.txt
CHANGED

@@ -7,6 +7,7 @@ imageio
 imageio[ffmpeg]
 imageio[pyav]
 opencv-python
+pygifsicle
 safetensors
 sentencepiece
 transformers
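pygifsicle is only a thin Python wrapper around the gifsicle command-line tool, so the system binary also has to be available in the Space (on Hugging Face Spaces this is usually done by listing gifsicle in packages.txt). A small sketch of a runtime check, assuming the GIF path tmp.gif already exists:

import shutil
from pygifsicle import optimize

# pygifsicle shells out to gifsicle; fail early if the binary is missing.
if shutil.which("gifsicle") is None:
    raise RuntimeError("gifsicle not found on PATH; install it, e.g. apt-get install gifsicle")

optimize("tmp.gif")  # rewrites tmp.gif in place with a smaller encoding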
test_video.py
CHANGED

@@ -7,5 +7,5 @@ if __name__ == "__main__":
         get_views(["identity", "flip"])[1],
         "a painting of vases",
         "a painting of a sloth",
-        save_video_path="
+        save_video_path="tmp.mp4",
     )
visual_anagrams/animate.py
CHANGED

@@ -1,8 +1,9 @@
 import cv2
 from tqdm import tqdm
 import numpy as np
-from PIL import Image, ImageDraw, ImageFont
+from PIL import Image, ImageDraw, ImageFont, ImageChops
 import imageio
+from pygifsicle import optimize

 import torchvision.transforms.functional as TF

@@ -14,11 +15,12 @@ def draw_text(image, text, fill=(0,0,0), frame_size=384, im_size=256):
     image = image.copy()

     # Font info
+    font_path = get_courier_font_path()
     font_size = 16

     # Make PIL objects
     draw = ImageDraw.Draw(image)
-    font = ImageFont.
+    font = ImageFont.truetype(font_path, font_size)

     # Center text horizontally, and vertically between
     # illusion bottom and frame bottom

@@ -50,9 +52,9 @@ def animate_two_view(
     prompt_1,
     prompt_2,
     save_video_path='tmp.mp4',
-    hold_duration=
+    hold_duration=60,
     text_fade_duration=10,
-    transition_duration=
+    transition_duration=80,
     im_size=256,
     frame_size=384,
 ):

@@ -114,22 +116,23 @@ def animate_two_view(

     # Move last bit of clip to front
     frames = frames[-hold_duration//2:] + frames[:-hold_duration//2]
-    [16 removed lines: content not shown in this view]
+    images = frames
+
+    processed_frames = [images[0]]
+
+    for i in range(1, len(images)):
+        # Calculate the difference between current and previous frame
+        diff = ImageChops.difference(images[i], images[i - 1])
+        # Create a mask to isolate changes
+        mask = diff.convert("L").point(lambda x: 0 if x < 5 else 255, "1")
+        # Apply the mask to the current frame
+        new_frame = ImageChops.composite(images[i], processed_frames[-1], mask)
+        processed_frames.append(new_frame)
+
+    # Save the frames as a GIF
+    imageio.mimsave(save_video_path,
+                    [np.array(frame) for frame in processed_frames],
+                    fps=30)
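The new loop is a common GIF-shrinking trick: pixels that change by less than a small threshold between consecutive frames are copied from the previous processed frame, so large regions stay byte-identical and compress well. The hunk imports optimize from pygifsicle but does not call it in the lines shown; a hedged sketch of how the same pieces could be wired together as a standalone helper (write_optimized_gif and its defaults are illustrative, not part of the commit):

import imageio
import numpy as np
from PIL import ImageChops
from pygifsicle import optimize

def write_optimized_gif(frames, path="tmp.gif", fps=30, threshold=5):
    """Reuse near-unchanged pixels across frames, then write and shrink a GIF.

    frames: list of same-size PIL Images. threshold: per-pixel difference
    below which a pixel is treated as unchanged (mirrors the commit's value of 5).
    """
    processed = [frames[0]]
    for prev, cur in zip(frames, frames[1:]):
        # Difference against the previous raw frame, as in the commit
        diff = ImageChops.difference(cur, prev)
        # 1-bit mask: 255 where the pixel really changed, 0 where it did not
        mask = diff.convert("L").point(lambda x: 0 if x < threshold else 255, "1")
        # Changed pixels come from the current frame, the rest from the last processed frame
        processed.append(ImageChops.composite(cur, processed[-1], mask))

    imageio.mimsave(path, [np.array(f) for f in processed], fps=fps)
    optimize(path)  # needs the gifsicle binary on PATH
    return path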
visual_anagrams/samplers.py
CHANGED

@@ -30,7 +30,7 @@ def sample_stage_1(model,
     prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

     # Setup timesteps
-    model.scheduler.set_timesteps(num_inference_steps, device=device)
+    model.scheduler.set_timesteps(int(num_inference_steps), device=device)
     timesteps = model.scheduler.timesteps

     # Make intermediate_images

@@ -45,7 +45,7 @@ def sample_stage_1(model,
     )
     # ic(noisy_images.shape)

-    for i, t in tqdm(
+    for i, t in enumerate(tqdm(timesteps)):
         # Apply views to noisy_image
         viewed_noisy_images = []
         for view_fn in views:

@@ -109,6 +109,7 @@ def sample_stage_1(model,
     # ic(noise_pred.shape)

     # ic(t.shape)
+    # ic(t.dtype)
     # compute the previous noisy sample x_t -> x_t-1
     noisy_images = model.scheduler.step(
         noise_pred.to('cuda'), t, noisy_images.to('cuda'), generator=generator, return_dict=False

@@ -148,7 +149,7 @@ def sample_stage_2(model,
     prompt_embeds = torch.cat([negative_prompt_embeds, prompt_embeds])

     # Get timesteps
-    model.scheduler.set_timesteps(num_inference_steps, device=device)
+    model.scheduler.set_timesteps(int(num_inference_steps), device=device)
     timesteps = model.scheduler.timesteps

     num_channels = model.unet.config.in_channels // 2

@@ -236,7 +237,7 @@ def sample_stage_2(model,

     # compute the previous noisy sample x_t -> x_t-1
     noisy_images = model.scheduler.step(
-        noise_pred, t, noisy_images, generator=generator, return_dict=False
+        noise_pred.to('cuda'), t, noisy_images.to('cuda'), generator=generator, return_dict=False
     )[0]

     # Return denoised images
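Both sampler fixes are input normalization: set_timesteps gets an explicit int (the value arrives from gr.Number as a float), and scheduler.step now receives noise_pred and noisy_images moved onto the same CUDA device as the rest of the pipeline. A minimal sketch of that pattern with a hypothetical helper; the device default is an assumption and the tensors are placeholders:

import torch

def normalize_step_inputs(num_inference_steps, noise_pred, noisy_images, device="cuda"):
    # The step count may arrive as a float (e.g. 50.0 from gr.Number); schedulers expect an int.
    steps = int(num_inference_steps)
    # scheduler.step does tensor math between the model output and the latents,
    # so both must live on the same device.
    return steps, noise_pred.to(device), noisy_images.to(device)

# Illustrative usage on CPU so the sketch runs without a GPU:
steps, pred, latents = normalize_step_inputs(
    50.0, torch.randn(2, 3, 64, 64), torch.randn(2, 3, 64, 64), device="cpu"
)
assert isinstance(steps, int) and pred.device == latents.device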