Spaces:
Runtime error
Update app.py
app.py
CHANGED
@@ -84,15 +84,6 @@ def draw_contours_on_image(img, index_mask, color_mask, brightness_factor=1.6, a
     return np.clip(blended, 0, 255).astype("uint8")
 
 
-def extract_first_frame_from_video(video):
-    cap = cv2.VideoCapture(video)
-    success, frame = cap.read()
-    cap.release()
-    if success:
-        return Image.fromarray(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB))
-    return None
-
-
 def extract_points_from_mask(mask_pil):
     mask = np.asarray(mask_pil)[..., 0]
     coords = np.nonzero(mask)
@@ -100,26 +91,6 @@ def extract_points_from_mask(mask_pil):
 
     return coords
 
-def add_contour(img, mask, color=(1., 1., 1.)):
-    img = img.copy()
-
-    mask = mask.astype(np.uint8) * 255
-    contours, _ = cv2.findContours(mask, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)
-    cv2.drawContours(img, contours, -1, color, thickness=8)
-
-    return img
-
-
-def load_first_frame(video_path):
-    cap = cv2.VideoCapture(video_path)
-    ret, frame = cap.read()
-    cap.release()
-    if not ret:
-        raise gr.Error("Could not read the video file.")
-    frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
-    image = Image.fromarray(frame)
-    return image
-
 
 def clear_masks():
     return [], [], [], []
@@ -146,11 +117,11 @@ def apply_sam(image, input_points):
 
 
 @spaces.GPU(duration=120)
-def run(mode, images, timestamps, masks, mask_ids, instruction, mask_output_video):
+def run(mode, images, timestamps, masks, mask_ids, instruction, mask_output_video, mask_threshold):
     if mode == "QA":
         response = run_text_inference(images, timestamps, masks, mask_ids, instruction)
     else:
-        response, mask_output_video = run_seg_inference(images, timestamps, instruction)
+        response, mask_output_video = run_seg_inference(images, timestamps, instruction, mask_threshold)
     return response, mask_output_video
 
 
@@ -181,7 +152,7 @@ def run_text_inference(images, timestamps, masks, mask_ids, instruction):
     return output
 
 
-def run_seg_inference(images, timestamps, instruction):
+def run_seg_inference(images, timestamps, instruction, mask_threshold):
     output, masks = mm_infer_segmentation(
         (images, timestamps),
         seg_processor,
@@ -190,6 +161,7 @@ def run_seg_inference(images, timestamps, instruction):
         tokenizer=processor.tokenizer,
         do_sample=False,
         modal='video',
+        mask_threshold=mask_threshold,
     )
 
     w, h = images[0].size
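The threshold added here is only forwarded to mm_infer_segmentation; how it is consumed is not visible in this diff. A minimal sketch, assuming the call yields per-frame mask logits that are binarized against mask_threshold (the helper name and return format below are assumptions, not the app's code):

```python
# Hypothetical sketch: assumes mm_infer_segmentation returns per-frame mask logits.
import numpy as np

def binarize_masks(mask_logits, mask_threshold=0.5):
    """Turn raw mask logits into uint8 binary masks (1 = foreground)."""
    probs = 1.0 / (1.0 + np.exp(-np.asarray(mask_logits, dtype=np.float32)))  # sigmoid
    return (probs > mask_threshold).astype(np.uint8)
```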
@@ -255,7 +227,7 @@ if __name__ == "__main__":
     <h1 align="center"><img src="https://github.com/alibaba-damo-academy/RynnEC/blob/main/assets/logo.jpg?raw=true" style="vertical-align: middle; width: 45px; height: auto;"> RynnEC Demo</h1>
     <h5 align="center" style="margin: 0;">Feel free to click on anything that grabs your interest!</h5>
     <h5 align="center" style="margin: 0;">If this demo pleases you, please give us a star ⭐ on Github or ❤️ on this space.</h5>
-    <div style="display: flex; justify-content:
+    <div style="display: flex; justify-content: center; margin-top: 10px;">
     <a href="https://huggingface.co/Alibaba-DAMO-Academy/RynnEC-2B"><img src="https://img.shields.io/badge/🤗-Checkpoints-FBD49F.svg" style="margin-right: 5px;"></a>
     <a href="https://huggingface.co/datasets/Alibaba-DAMO-Academy/RynnEC-Bench"><img src="https://img.shields.io/badge/🤗-Benchmark-FBD49F.svg" style="margin-right: 5px;"></a>
     <a href="https://www.youtube.com/watch?v=vsMxbzsmrQc"><img src="https://img.shields.io/badge/Video-36600E?logo=youtube&logoColor=green" style="margin-right: 5px;"></a>
@@ -265,7 +237,6 @@ if __name__ == "__main__":
 
     TIPS = """
     ### 💡 Tips:
-
     🧸 Upload a video, and select a frame using the slider.
 
     ✏️ Use the drawing tool to highlight the areas you're interested in.
@@ -274,51 +245,51 @@
 
     🗑️ Click the button 'Clear Masks' to clear the current generated masks.
 
+    ⚙️ If you change the settings, you need to re-upload the video to apply the new settings.
     """
 
-    … (previous UI-layout block; not legible in this render)
-        frames, timestamps = load_video(video_path, fps=1, max_frames=128)
+    gr.HTML(HEADER)
+
+    with gr.Tab("Demo"):
+        with gr.Row():
+            with gr.Column():
+                video_input = gr.Video(label="Video", interactive=True)
+                frame_idx = gr.Slider(minimum=0, maximum=0, value=0, step=1, label="Select Frame", interactive=False)
+                selected_frame = gr.ImageEditor(
+                    label="Annotate Frame",
+                    type="pil",
+                    sources=[],
+                    interactive=True,
+                )
+                generate_mask_btn_video = gr.Button("1️⃣ Generate Mask", visible=True, variant="primary")
+                gr.Examples([f"./demo/videos/{i+1}.mp4" for i in range(4)], inputs=video_input, label="Examples")
+
+            with gr.Column():
+                mode_video = gr.Radio(label="Mode", choices=["QA", "Seg"], value="QA")
+                mask_output_video = gr.Gallery(label="Referred Masks", object_fit='scale-down')
+
+                query_video = gr.Textbox(label="Question", value="What's the function of <object0>?", interactive=True, visible=True)
+                response_video = gr.Textbox(label="Answer", interactive=False)
+
+                submit_btn_video = gr.Button("Generate Caption", variant="primary", visible=False)
+                submit_btn_video1 = gr.Button("2️⃣ Generate Answer", variant="primary", visible=True)
+                description_video = gr.Textbox(label="Output", visible=False)
+
+                clear_masks_btn_video = gr.Button("Clear Masks", variant="secondary")
+
+    with gr.Tab("Settings"):
+        fps = gr.Slider(label="FPS", minimum=1, maximum=30, value=1, step=1)
+        max_frames = gr.Slider(label="Max Frames", minimum=1, maximum=128, value=80, step=1)
+        mask_threshold = gr.Slider(label="Mask Threshold", minimum=0.0, maximum=1.0, value=0.5, step=0.01)
+
+    gr.Markdown(TIPS)
+
+    frames = gr.State(value=[])
+    timestamps = gr.State(value=[])
+    mask_ids = gr.State(value=[])
+
+    def on_video_upload(video_path, fps, max_frames):
+        frames, timestamps = load_video(video_path, fps=fps, max_frames=max_frames)
         frames = [Image.fromarray(x.transpose(1, 2, 0)) for x in frames]
         return frames, timestamps, frames[0], gr.update(value=0, maximum=len(frames) - 1, interactive=True)
 
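The new Settings tab is what the added tip refers to: fps and max_frames are read only when on_video_upload fires, so changed slider values take effect on the next upload. A stripped-down sketch of that wiring, with component names following the diff and the loader stubbed out:

```python
import gradio as gr

def on_video_upload(video_path, fps, max_frames):
    # Stub: the real app calls load_video(video_path, fps=fps, max_frames=max_frames)
    return f"loaded {video_path!r} at {fps} fps, capped at {max_frames} frames"

with gr.Blocks() as demo:
    with gr.Tab("Demo"):
        video_input = gr.Video(label="Video")
        status = gr.Textbox(label="Status")
    with gr.Tab("Settings"):
        fps = gr.Slider(1, 30, value=1, step=1, label="FPS")
        max_frames = gr.Slider(1, 128, value=80, step=1, label="Max Frames")

    # Slider values are snapshotted only when this event fires, i.e. on (re-)upload.
    video_input.change(on_video_upload, inputs=[video_input, fps, max_frames], outputs=[status])

demo.launch()
```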
@@ -328,13 +299,15 @@
     def to_seg_mode():
         return (
             *[gr.update(visible=False) for _ in range(4)],
-            []
+            [],
+            "Please segment the rubbish bin.",
         )
 
     def to_qa_mode():
         return (
             *[gr.update(visible=True) for _ in range(4)],
-            []
+            [],
+            "What's the function of <object0>?",
        )
 
     def on_mode_change(mode):
@@ -342,8 +315,8 @@
             return to_qa_mode()
         return to_seg_mode()
 
-    mode_video.change(on_mode_change, inputs=[mode_video], outputs=[frame_idx, selected_frame, generate_mask_btn_video, response_video, mask_output_video])
-    video_input.change(on_video_upload, inputs=[video_input], outputs=[frames, timestamps, selected_frame, frame_idx])
+    mode_video.change(on_mode_change, inputs=[mode_video], outputs=[frame_idx, selected_frame, generate_mask_btn_video, response_video, mask_output_video, query_video])
+    video_input.change(on_video_upload, inputs=[video_input, fps, max_frames], outputs=[frames, timestamps, selected_frame, frame_idx])
     frame_idx.change(on_frame_idx_change, inputs=[frame_idx, frames], outputs=[selected_frame])
 
     generate_mask_btn_video.click(
@@ -354,7 +327,7 @@
 
     submit_btn_video1.click(
         fn=run,
-        inputs=[mode_video, frames, timestamps, mask_raw_list_video, mask_ids, query_video, mask_output_video],
+        inputs=[mode_video, frames, timestamps, mask_raw_list_video, mask_ids, query_video, mask_output_video, mask_threshold],
         outputs=[response_video, mask_output_video],
         api_name="describe_video"
     )
@@ -372,10 +345,8 @@
     device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
     sam_model = SamModel.from_pretrained("facebook/sam-vit-huge").to(device)
     sam_processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")
-    # sam_model = sam_processor = None
     disable_torch_init()
     model, processor = model_init(args_cli.model_path)
     seg_model, seg_processor = model_init(args_cli.seg_model_path)
-    # model = processor = None
 
     demo.launch()
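apply_sam itself is untouched by this commit and its body is not shown above. For reference only, a minimal point-prompt call with the same facebook/sam-vit-huge checkpoint, following the standard transformers SAM usage; the frame path and click coordinates are placeholders, and the app's actual helper may differ:

```python
import torch
from PIL import Image
from transformers import SamModel, SamProcessor

sam_processor = SamProcessor.from_pretrained("facebook/sam-vit-huge")
sam_model = SamModel.from_pretrained("facebook/sam-vit-huge")

image = Image.open("frame.png").convert("RGB")  # placeholder frame
input_points = [[[450, 600]]]                   # one (x, y) click for this image

inputs = sam_processor(image, input_points=input_points, return_tensors="pt")
with torch.no_grad():
    outputs = sam_model(**inputs)

# Upscale the low-resolution predicted masks back to the original frame size.
masks = sam_processor.image_processor.post_process_masks(
    outputs.pred_masks.cpu(),
    inputs["original_sizes"].cpu(),
    inputs["reshaped_input_sizes"].cpu(),
)
best_mask = masks[0][0][outputs.iou_scores.argmax()]  # keep the highest-IoU proposal
```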