Mirko Trasciatti committed on
Commit
4668c44
·
1 Parent(s): cccfb86

Add kick diagnostics plot and smoothing overlay

Browse files
Files changed (2) hide show
  1. app.py +160 -23
  2. requirements.txt +1 -0
app.py CHANGED
@@ -5,6 +5,8 @@ import base64
5
  import math
6
  import statistics
7
  from pathlib import Path
 
 
8
  BASE64_VIDEO_PATH = Path("Kickit-Video-2025-07-09-13-47-18-389.b64")
9
  EXAMPLE_VIDEO_PATH = Path("Kickit-Video-2025-07-09-13-47-18-389.mp4")
10
 
@@ -219,6 +221,13 @@ class AppState:
219
  self.smoothed_centers: dict[int, dict[int, tuple[float, float]]] = {}
220
  self.ball_speeds: dict[int, dict[int, float]] = {}
221
  self.kick_frame: int | None = None
 
 
 
 
 
 
 
222
  # Model selection
223
  self.model_repo_key: str = "tiny"
224
  self.model_repo_id: str | None = None
@@ -435,28 +444,35 @@ def compose_frame(state: AppState, frame_idx: int, remove_bg: bool = False) -> I
435
  for obj_id, centers in state.ball_centers.items():
436
  if not centers:
437
  continue
438
- items = sorted(centers.items())
 
 
 
 
 
 
 
439
  distances: list[float] = []
440
  prev_center = None
441
- for _, (cx, cy) in items:
442
  if prev_center is None:
443
  distances.append(0.0)
444
  else:
445
- dx = cx - prev_center[0]
446
- dy = cy - prev_center[1]
447
  distances.append(float(np.hypot(dx, dy)))
448
- prev_center = (cx, cy)
449
  max_dist = max(distances[1:], default=0.0)
450
  color_by_frame: dict[int, tuple[int, int, int]] = {}
451
- for (f_idx, _), dist in zip(items, distances):
452
  ratio = dist / max_dist if max_dist > 0 else 0.0
453
  color_by_frame[f_idx] = _speed_to_color(ratio)
454
- for f_idx, (cx, cy) in reversed(items):
455
  highlight = (f_idx == frame_idx)
456
  color = (255, 0, 0) if highlight else color_by_frame.get(f_idx, (255, 255, 0))
457
  line_width = 1 if not highlight else 2
458
- draw.line([(cx - cross_half, cy), (cx + cross_half, cy)], fill=color, width=line_width)
459
- draw.line([(cx, cy - cross_half), (cx, cy + cross_half)], fill=color, width=line_width)
460
  # Save to cache and return
461
  state.composited_frames[frame_idx] = out_img
462
  return out_img
@@ -526,12 +542,100 @@ def _update_centroids_for_frame(state: AppState, frame_idx: int):
526
  _recompute_motion_metrics(state)
527
 
528
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
529
  def _recompute_motion_metrics(state: AppState, target_obj_id: int = 1):
530
  centers = state.ball_centers.get(target_obj_id)
531
  if not centers or len(centers) < 3:
532
  state.smoothed_centers[target_obj_id] = {}
533
  state.ball_speeds[target_obj_id] = {}
534
  state.kick_frame = None
 
 
 
 
 
 
 
535
  return
536
 
537
  items = sorted(centers.items())
@@ -591,6 +695,14 @@ def _detect_kick_frame(state: AppState, target_obj_id: int) -> int | None:
591
  areas_dict = state.mask_areas.get(target_obj_id, {})
592
  initial_center = smoothed[frames[0]]
593
 
 
 
 
 
 
 
 
 
594
  for idx in range(baseline_window, len(frames)):
595
  frame = frames[idx]
596
  speed = speed_series[idx]
@@ -634,6 +746,7 @@ def _detect_kick_frame(state: AppState, target_obj_id: int) -> int | None:
634
  if not moved_far:
635
  continue
636
 
 
637
  return frame
638
 
639
  return None
@@ -778,7 +891,7 @@ def on_image_click(
778
  def propagate_masks(GLOBAL_STATE: gr.State):
779
  if GLOBAL_STATE is None or GLOBAL_STATE.inference_session is None:
780
  # yield GLOBAL_STATE, "Load a video first.", gr.update()
781
- return GLOBAL_STATE, "Load a video first.", gr.update()
782
 
783
  processor = deepcopy(GLOBAL_STATE.processor)
784
  model = deepcopy(GLOBAL_STATE.model)
@@ -792,7 +905,7 @@ def propagate_masks(GLOBAL_STATE: gr.State):
792
  processed = 0
793
 
794
  # Initial status; no slider change yet
795
- yield GLOBAL_STATE, f"Propagating masks: {processed}/{total}", gr.update()
796
 
797
  last_frame_idx = 0
798
  with torch.inference_mode():
@@ -819,19 +932,27 @@ def propagate_masks(GLOBAL_STATE: gr.State):
819
  processed += 1
820
  # Every 30th frame (or last), move slider to current frame to update preview via slider binding
821
  if processed % 30 == 0 or processed == total:
822
- yield GLOBAL_STATE, f"Propagating masks: {processed}/{total}", gr.update(value=frame_idx)
823
 
824
  text = f"Propagated masks across {processed} frames for {len(inference_session.obj_ids)} objects."
825
 
826
  # Final status; ensure slider points to last processed frame
827
- yield GLOBAL_STATE, text, gr.update(value=last_frame_idx)
828
 
829
 
830
- def reset_session(GLOBAL_STATE: gr.State) -> tuple[AppState, Image.Image, int, int, str]:
831
  # Reset only session-related state, keep uploaded video and model
832
  if not GLOBAL_STATE.video_frames:
833
  # Nothing loaded; keep behavior
834
- return GLOBAL_STATE, None, 0, 0, "Session reset. Load a new video."
 
 
 
 
 
 
 
 
835
 
836
  # Clear prompts and caches
837
  GLOBAL_STATE.masks_by_frame.clear()
@@ -866,7 +987,15 @@ def reset_session(GLOBAL_STATE: gr.State) -> tuple[AppState, Image.Image, int, i
866
  slider_value = gr.update(value=current_idx)
867
  status = "Session reset. Prompts cleared; video preserved."
868
  # clear and reload model and processor
869
- return GLOBAL_STATE, preview_img, slider_minmax, slider_value, status, gr.update(visible=False, value="")
 
 
 
 
 
 
 
 
870
 
871
 
872
  def create_annotation_preview(video_file, annotations):
@@ -1135,6 +1264,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
1135
  label_radio = gr.Radio(choices=["positive", "negative"], value="positive", label="Point label")
1136
  clear_old_chk = gr.Checkbox(value=False, label="Clear old inputs for this object")
1137
  prompt_type = gr.Radio(choices=["Points", "Boxes"], value="Points", label="Prompt type")
 
1138
 
1139
  # Wire events
1140
  def _on_video_change(GLOBAL_STATE: gr.State, video):
@@ -1144,13 +1274,14 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
1144
  gr.update(minimum=min_idx, maximum=max_idx, value=min_idx, interactive=True),
1145
  first_frame,
1146
  status,
1147
- gr.update(visible=False, value="")
 
1148
  )
1149
 
1150
  video_in.change(
1151
  _on_video_change,
1152
  inputs=[GLOBAL_STATE, video_in],
1153
- outputs=[GLOBAL_STATE, frame_slider, preview, load_status, ball_status],
1154
  show_progress=True,
1155
  )
1156
 
@@ -1165,7 +1296,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
1165
  examples=examples_list,
1166
  inputs=[GLOBAL_STATE, video_in],
1167
  fn=_on_video_change,
1168
- outputs=[GLOBAL_STATE, frame_slider, preview, load_status, ball_status],
1169
  label="Examples",
1170
  cache_examples=False,
1171
  examples_per_page=5,
@@ -1265,6 +1396,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
1265
  visible=True,
1266
  ),
1267
  gr.update(value=frame_idx),
 
1268
  )
1269
 
1270
  x_center, y_center, _, _, conf = detection
@@ -1297,12 +1429,17 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
1297
  status_text += f" | Kick frame ≈ {state_in.kick_frame}"
1298
  else:
1299
  status_text += " | Kick frame: not detected"
1300
- return preview_img, gr.update(value=status_text, visible=True), gr.update(value=frame_idx)
 
 
 
 
 
1301
 
1302
  detect_ball_btn.click(
1303
  _auto_detect_ball,
1304
  inputs=[GLOBAL_STATE, obj_id_inp, label_radio, clear_old_chk],
1305
- outputs=[preview, ball_status, frame_slider],
1306
  )
1307
 
1308
  # Image click to add a point and run forward on that frame
@@ -1352,13 +1489,13 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
1352
  propagate_btn.click(
1353
  propagate_masks,
1354
  inputs=[GLOBAL_STATE],
1355
- outputs=[GLOBAL_STATE, propagate_status, frame_slider],
1356
  )
1357
 
1358
  reset_btn.click(
1359
  reset_session,
1360
  inputs=GLOBAL_STATE,
1361
- outputs=[GLOBAL_STATE, preview, frame_slider, frame_slider, load_status, ball_status],
1362
  )
1363
 
1364
  # ============================================================================
 
5
  import math
6
  import statistics
7
  from pathlib import Path
8
+
9
+ import plotly.graph_objects as go
10
  BASE64_VIDEO_PATH = Path("Kickit-Video-2025-07-09-13-47-18-389.b64")
11
  EXAMPLE_VIDEO_PATH = Path("Kickit-Video-2025-07-09-13-47-18-389.mp4")
12
 
 
221
  self.smoothed_centers: dict[int, dict[int, tuple[float, float]]] = {}
222
  self.ball_speeds: dict[int, dict[int, float]] = {}
223
  self.kick_frame: int | None = None
224
+ self.kick_debug_frames: list[int] = []
225
+ self.kick_debug_speeds: list[float] = []
226
+ self.kick_debug_threshold: float | None = None
227
+ self.kick_debug_baseline: float | None = None
228
+ self.kick_debug_speed_std: float | None = None
229
+ self.kick_debug_area: list[float] = []
230
+ self.kick_debug_kick_frame: int | None = None
231
  # Model selection
232
  self.model_repo_key: str = "tiny"
233
  self.model_repo_id: str | None = None
 
444
  for obj_id, centers in state.ball_centers.items():
445
  if not centers:
446
  continue
447
+ raw_items = sorted(centers.items())
448
+ for _, (rx, ry) in raw_items:
449
+ draw.line([(rx - cross_half, ry), (rx + cross_half, ry)], fill=(160, 160, 160), width=1)
450
+ draw.line([(rx, ry - cross_half), (rx, ry + cross_half)], fill=(160, 160, 160), width=1)
451
+ smooth_dict = state.smoothed_centers.get(obj_id, {})
452
+ if not smooth_dict:
453
+ continue
454
+ smooth_items = sorted(smooth_dict.items())
455
  distances: list[float] = []
456
  prev_center = None
457
+ for _, (sx, sy) in smooth_items:
458
  if prev_center is None:
459
  distances.append(0.0)
460
  else:
461
+ dx = sx - prev_center[0]
462
+ dy = sy - prev_center[1]
463
  distances.append(float(np.hypot(dx, dy)))
464
+ prev_center = (sx, sy)
465
  max_dist = max(distances[1:], default=0.0)
466
  color_by_frame: dict[int, tuple[int, int, int]] = {}
467
+ for (f_idx, _), dist in zip(smooth_items, distances):
468
  ratio = dist / max_dist if max_dist > 0 else 0.0
469
  color_by_frame[f_idx] = _speed_to_color(ratio)
470
+ for f_idx, (sx, sy) in reversed(smooth_items):
471
  highlight = (f_idx == frame_idx)
472
  color = (255, 0, 0) if highlight else color_by_frame.get(f_idx, (255, 255, 0))
473
  line_width = 1 if not highlight else 2
474
+ draw.line([(sx - cross_half, sy), (sx + cross_half, sy)], fill=color, width=line_width)
475
+ draw.line([(sx, sy - cross_half), (sx, sy + cross_half)], fill=color, width=line_width)
476
  # Save to cache and return
477
  state.composited_frames[frame_idx] = out_img
478
  return out_img
 
542
  _recompute_motion_metrics(state)
543
 
544
 
545
+ def _build_kick_plot(state: AppState):
546
+ fig = go.Figure()
547
+ if state is None or not state.kick_debug_frames or not state.kick_debug_speeds:
548
+ fig.update_layout(
549
+ title="Kick speed diagnostics",
550
+ xaxis_title="Frame",
551
+ yaxis_title="Speed (px/s)",
552
+ )
553
+ return fig
554
+
555
+ frames = state.kick_debug_frames
556
+ speeds = state.kick_debug_speeds
557
+ areas = state.kick_debug_area if state.kick_debug_area else [0.0] * len(frames)
558
+ threshold = state.kick_debug_threshold or 0.0
559
+ baseline = state.kick_debug_baseline or 0.0
560
+ kick_frame = state.kick_debug_kick_frame
561
+
562
+ fig.add_trace(
563
+ go.Scatter(
564
+ x=frames,
565
+ y=speeds,
566
+ mode="lines+markers",
567
+ name="Speed (px/s)",
568
+ line=dict(color="#1f77b4"),
569
+ )
570
+ )
571
+ fig.add_trace(
572
+ go.Scatter(
573
+ x=[frames[0], frames[-1]],
574
+ y=[threshold, threshold],
575
+ mode="lines",
576
+ name="Adaptive threshold",
577
+ line=dict(color="#d62728", dash="dash"),
578
+ )
579
+ )
580
+ fig.add_trace(
581
+ go.Scatter(
582
+ x=[frames[0], frames[-1]],
583
+ y=[baseline, baseline],
584
+ mode="lines",
585
+ name="Baseline speed",
586
+ line=dict(color="#ff7f0e", dash="dot"),
587
+ )
588
+ )
589
+ fig.add_trace(
590
+ go.Scatter(
591
+ x=frames,
592
+ y=areas,
593
+ mode="lines",
594
+ name="Mask area",
595
+ line=dict(color="#2ca02c"),
596
+ yaxis="y2",
597
+ )
598
+ )
599
+ if kick_frame is not None:
600
+ fig.add_trace(
601
+ go.Scatter(
602
+ x=[kick_frame, kick_frame],
603
+ y=[min(speeds), max(max(speeds), threshold)],
604
+ mode="lines",
605
+ name="Detected kick",
606
+ line=dict(color="#9467bd", dash="dashdot"),
607
+ )
608
+ )
609
+ fig.update_layout(
610
+ title="Kick speed diagnostics",
611
+ xaxis_title="Frame",
612
+ yaxis_title="Speed (px/s)",
613
+ yaxis=dict(side="left"),
614
+ yaxis2=dict(
615
+ title="Mask area (px)",
616
+ overlaying="y",
617
+ side="right",
618
+ showgrid=False,
619
+ ),
620
+ legend=dict(orientation="h"),
621
+ margin=dict(t=40, l=40, r=40, b=40),
622
+ )
623
+ return fig
624
+
625
+
626
  def _recompute_motion_metrics(state: AppState, target_obj_id: int = 1):
627
  centers = state.ball_centers.get(target_obj_id)
628
  if not centers or len(centers) < 3:
629
  state.smoothed_centers[target_obj_id] = {}
630
  state.ball_speeds[target_obj_id] = {}
631
  state.kick_frame = None
632
+ state.kick_debug_frames = []
633
+ state.kick_debug_speeds = []
634
+ state.kick_debug_threshold = None
635
+ state.kick_debug_baseline = None
636
+ state.kick_debug_speed_std = None
637
+ state.kick_debug_area = []
638
+ state.kick_debug_kick_frame = None
639
  return
640
 
641
  items = sorted(centers.items())
 
695
  areas_dict = state.mask_areas.get(target_obj_id, {})
696
  initial_center = smoothed[frames[0]]
697
 
698
+ state.kick_debug_frames = frames
699
+ state.kick_debug_speeds = speed_series
700
+ state.kick_debug_threshold = speed_threshold
701
+ state.kick_debug_baseline = baseline_speed
702
+ state.kick_debug_speed_std = speed_std
703
+ state.kick_debug_area = [areas_dict.get(f, 0.0) for f in frames]
704
+ state.kick_debug_kick_frame = None
705
+
706
  for idx in range(baseline_window, len(frames)):
707
  frame = frames[idx]
708
  speed = speed_series[idx]
 
746
  if not moved_far:
747
  continue
748
 
749
+ state.kick_debug_kick_frame = frame
750
  return frame
751
 
752
  return None
 
891
  def propagate_masks(GLOBAL_STATE: gr.State):
892
  if GLOBAL_STATE is None or GLOBAL_STATE.inference_session is None:
893
  # yield GLOBAL_STATE, "Load a video first.", gr.update()
894
+ return GLOBAL_STATE, "Load a video first.", gr.update(), _build_kick_plot(GLOBAL_STATE)
895
 
896
  processor = deepcopy(GLOBAL_STATE.processor)
897
  model = deepcopy(GLOBAL_STATE.model)
 
905
  processed = 0
906
 
907
  # Initial status; no slider change yet
908
+ yield GLOBAL_STATE, f"Propagating masks: {processed}/{total}", gr.update(), _build_kick_plot(GLOBAL_STATE)
909
 
910
  last_frame_idx = 0
911
  with torch.inference_mode():
 
932
  processed += 1
933
  # Every 30th frame (or last), move slider to current frame to update preview via slider binding
934
  if processed % 30 == 0 or processed == total:
935
+ yield GLOBAL_STATE, f"Propagating masks: {processed}/{total}", gr.update(value=frame_idx), _build_kick_plot(GLOBAL_STATE)
936
 
937
  text = f"Propagated masks across {processed} frames for {len(inference_session.obj_ids)} objects."
938
 
939
  # Final status; ensure slider points to last processed frame
940
+ yield GLOBAL_STATE, text, gr.update(value=last_frame_idx), _build_kick_plot(GLOBAL_STATE)
941
 
942
 
943
+ def reset_session(GLOBAL_STATE: gr.State) -> tuple[AppState, Image.Image, int, int, str, gr.Update, go.Figure]:
944
  # Reset only session-related state, keep uploaded video and model
945
  if not GLOBAL_STATE.video_frames:
946
  # Nothing loaded; keep behavior
947
+ return (
948
+ GLOBAL_STATE,
949
+ None,
950
+ 0,
951
+ 0,
952
+ "Session reset. Load a new video.",
953
+ gr.update(visible=False, value=""),
954
+ _build_kick_plot(GLOBAL_STATE),
955
+ )
956
 
957
  # Clear prompts and caches
958
  GLOBAL_STATE.masks_by_frame.clear()
 
987
  slider_value = gr.update(value=current_idx)
988
  status = "Session reset. Prompts cleared; video preserved."
989
  # clear and reload model and processor
990
+ return (
991
+ GLOBAL_STATE,
992
+ preview_img,
993
+ slider_minmax,
994
+ slider_value,
995
+ status,
996
+ gr.update(visible=False, value=""),
997
+ _build_kick_plot(GLOBAL_STATE),
998
+ )
999
 
1000
 
1001
  def create_annotation_preview(video_file, annotations):
 
1264
  label_radio = gr.Radio(choices=["positive", "negative"], value="positive", label="Point label")
1265
  clear_old_chk = gr.Checkbox(value=False, label="Clear old inputs for this object")
1266
  prompt_type = gr.Radio(choices=["Points", "Boxes"], value="Points", label="Prompt type")
1267
+ kick_plot = gr.Plot(label="Kick diagnostics", show_label=True)
1268
 
1269
  # Wire events
1270
  def _on_video_change(GLOBAL_STATE: gr.State, video):
 
1274
  gr.update(minimum=min_idx, maximum=max_idx, value=min_idx, interactive=True),
1275
  first_frame,
1276
  status,
1277
+ gr.update(visible=False, value=""),
1278
+ _build_kick_plot(GLOBAL_STATE)
1279
  )
1280
 
1281
  video_in.change(
1282
  _on_video_change,
1283
  inputs=[GLOBAL_STATE, video_in],
1284
+ outputs=[GLOBAL_STATE, frame_slider, preview, load_status, ball_status, kick_plot],
1285
  show_progress=True,
1286
  )
1287
 
 
1296
  examples=examples_list,
1297
  inputs=[GLOBAL_STATE, video_in],
1298
  fn=_on_video_change,
1299
+ outputs=[GLOBAL_STATE, frame_slider, preview, load_status, ball_status, kick_plot],
1300
  label="Examples",
1301
  cache_examples=False,
1302
  examples_per_page=5,
 
1396
  visible=True,
1397
  ),
1398
  gr.update(value=frame_idx),
1399
+ _build_kick_plot(state_in),
1400
  )
1401
 
1402
  x_center, y_center, _, _, conf = detection
 
1429
  status_text += f" | Kick frame ≈ {state_in.kick_frame}"
1430
  else:
1431
  status_text += " | Kick frame: not detected"
1432
+ return (
1433
+ preview_img,
1434
+ gr.update(value=status_text, visible=True),
1435
+ gr.update(value=frame_idx),
1436
+ _build_kick_plot(state_in),
1437
+ )
1438
 
1439
  detect_ball_btn.click(
1440
  _auto_detect_ball,
1441
  inputs=[GLOBAL_STATE, obj_id_inp, label_radio, clear_old_chk],
1442
+ outputs=[preview, ball_status, frame_slider, kick_plot],
1443
  )
1444
 
1445
  # Image click to add a point and run forward on that frame
 
1489
  propagate_btn.click(
1490
  propagate_masks,
1491
  inputs=[GLOBAL_STATE],
1492
+ outputs=[GLOBAL_STATE, propagate_status, frame_slider, kick_plot],
1493
  )
1494
 
1495
  reset_btn.click(
1496
  reset_session,
1497
  inputs=GLOBAL_STATE,
1498
+ outputs=[GLOBAL_STATE, preview, frame_slider, frame_slider, load_status, ball_status, kick_plot],
1499
  )
1500
 
1501
  # ============================================================================
requirements.txt CHANGED
@@ -7,5 +7,6 @@ opencv-python
7
  imageio[pyav]
8
  spaces
9
  git+https://github.com/iMoonLab/yolov13
 
10
 
11
 
 
7
  imageio[pyav]
8
  spaces
9
  git+https://github.com/iMoonLab/yolov13
10
+ plotly
11
 
12