Segment-Anything-2-video-tracking

Running

App Files Files Community

Mirko Trasciatti committed on Nov 14, 2025

Commit

778b9c7

1 Parent(s): 697126c

Add kick detection from SAM2 trajectories

Browse files

Files changed (1) hide show

app.py +136 -0

app.py CHANGED Viewed

@@ -2,6 +2,8 @@ import colorsys
 import gc
 from copy import deepcopy
 import base64
 from pathlib import Path
 BASE64_VIDEO_PATH = Path("Kickit-Video-2025-07-09-13-47-18-389.b64")
 EXAMPLE_VIDEO_PATH = Path("Kickit-Video-2025-07-09-13-47-18-389.mp4")
@@ -213,6 +215,10 @@ class AppState:
         self.pending_box_start_obj_id: int | None = None
         self.is_switching_model: bool = False
         self.ball_centers: dict[int, dict[int, tuple[int, int]]] = {}
         # Model selection
         self.model_repo_key: str = "tiny"
         self.model_repo_id: str | None = None
@@ -288,6 +294,10 @@ def init_video_session(GLOBAL_STATE: gr.State, video: str | dict) -> tuple[AppSt
     GLOBAL_STATE.masks_by_frame = {}
     GLOBAL_STATE.color_by_obj = {}
     GLOBAL_STATE.ball_centers = {}
     load_model_if_needed(GLOBAL_STATE)
@@ -499,10 +509,129 @@ def _update_centroids_for_frame(state: AppState, frame_idx: int):
             centers.pop(int(frame_idx), None)
         seen_obj_ids.add(int(obj_id))
         _ensure_color_for_obj(state, int(obj_id))
     # Remove frames for objects without masks at this frame
     for obj_id, centers in state.ball_centers.items():
         if obj_id not in seen_obj_ids:
             centers.pop(int(frame_idx), None)
 def on_image_click(
@@ -708,6 +837,11 @@ def reset_session(GLOBAL_STATE: gr.State) -> tuple[AppState, Image.Image, int, i
     GLOBAL_STATE.pending_box_start_frame_idx = None
     GLOBAL_STATE.pending_box_start_obj_id = None
     GLOBAL_STATE.ball_centers.clear()
     # Dispose and re-init inference session for current model with existing frames
     try:
@@ -1154,6 +1288,8 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
         )
         status_text = f"✅ Auto-detected ball at ({x_center}, {y_center}) (conf={conf:.2f})"
         return preview_img, gr.update(value=status_text, visible=True), gr.update(value=frame_idx)
     detect_ball_btn.click(

 import gc
 from copy import deepcopy
 import base64
+import math
+import statistics
 from pathlib import Path
 BASE64_VIDEO_PATH = Path("Kickit-Video-2025-07-09-13-47-18-389.b64")
 EXAMPLE_VIDEO_PATH = Path("Kickit-Video-2025-07-09-13-47-18-389.mp4")
         self.pending_box_start_obj_id: int | None = None
         self.is_switching_model: bool = False
         self.ball_centers: dict[int, dict[int, tuple[int, int]]] = {}
+        self.mask_areas: dict[int, dict[int, float]] = {}
+        self.smoothed_centers: dict[int, dict[int, tuple[float, float]]] = {}
+        self.ball_speeds: dict[int, dict[int, float]] = {}
+        self.kick_frame: int | None = None
         # Model selection
         self.model_repo_key: str = "tiny"
         self.model_repo_id: str | None = None
     GLOBAL_STATE.masks_by_frame = {}
     GLOBAL_STATE.color_by_obj = {}
     GLOBAL_STATE.ball_centers = {}
+    GLOBAL_STATE.mask_areas = {}
+    GLOBAL_STATE.smoothed_centers = {}
+    GLOBAL_STATE.ball_speeds = {}
+    GLOBAL_STATE.kick_frame = None
     load_model_if_needed(GLOBAL_STATE)
             centers.pop(int(frame_idx), None)
         seen_obj_ids.add(int(obj_id))
         _ensure_color_for_obj(state, int(obj_id))
+        mask_np = np.array(mask)
+        if mask_np.ndim == 3:
+            mask_np = mask_np.squeeze()
+        mask_np = np.clip(mask_np, 0.0, 1.0)
+        area = float(np.count_nonzero(mask_np > 0.3))
+        areas = state.mask_areas.setdefault(int(obj_id), {})
+        areas[int(frame_idx)] = area
     # Remove frames for objects without masks at this frame
     for obj_id, centers in state.ball_centers.items():
         if obj_id not in seen_obj_ids:
             centers.pop(int(frame_idx), None)
+    for obj_id, areas in state.mask_areas.items():
+        if obj_id not in seen_obj_ids:
+            areas.pop(int(frame_idx), None)
+    _recompute_motion_metrics(state)
def _recompute_motion_metrics(
    state: "AppState",
    target_obj_id: int = 1,
    *,
    alpha: float = 0.35,
) -> None:
    """Rebuild smoothed centers, speeds and the kick frame for one object.

    Reads ``state.ball_centers[target_obj_id]`` (frame index -> center) and
    overwrites ``state.smoothed_centers[target_obj_id]``,
    ``state.ball_speeds[target_obj_id]`` and ``state.kick_frame``.

    Centers are smoothed with an exponential moving average taken in frame
    order. The speed at a frame is the distance between consecutive smoothed
    centers divided by the elapsed time; the first frame always gets 0.0.

    Args:
        state: Application state holding the tracking dictionaries and
            ``video_fps``.
        target_obj_id: Object whose trajectory is analysed.
        alpha: Weight of the newest raw center in the moving average
            (1.0 disables smoothing).

    Note:
        ``state.kick_frame`` is a single value, so it is overwritten no
        matter which ``target_obj_id`` is passed.
    """
    centers = state.ball_centers.get(target_obj_id)
    if not centers or len(centers) < 3:
        # Too few samples for a meaningful trajectory: reset derived data.
        state.smoothed_centers[target_obj_id] = {}
        state.ball_speeds[target_obj_id] = {}
        state.kick_frame = None
        return

    # Seconds per frame; without a usable fps, speeds are per frame instead.
    fps = state.video_fps
    dt = 1.0 / fps if fps and fps > 1e-3 else 1.0

    smoothed: dict[int, tuple[float, float]] = {}
    speeds: dict[int, float] = {}
    prev_frame = None
    prev_x = prev_y = 0.0
    for frame_idx, (cx, cy) in sorted(centers.items()):
        if prev_frame is None:
            # The first sample seeds the filter and has no speed.
            smooth_x, smooth_y = float(cx), float(cy)
            speeds[frame_idx] = 0.0
        else:
            smooth_x = prev_x + alpha * (cx - prev_x)
            smooth_y = prev_y + alpha * (cy - prev_y)
            # Frames may be missing, so scale by the real gap. The floor of 1
            # with dt > 0 keeps the divisor strictly positive.
            frame_delta = max(1, frame_idx - prev_frame)
            dist = math.hypot(smooth_x - prev_x, smooth_y - prev_y)
            speeds[frame_idx] = dist / (frame_delta * dt)
        smoothed[frame_idx] = (smooth_x, smooth_y)
        prev_frame, prev_x, prev_y = frame_idx, smooth_x, smooth_y

    state.smoothed_centers[target_obj_id] = smoothed
    state.ball_speeds[target_obj_id] = speeds
    state.kick_frame = _detect_kick_frame(state, target_obj_id)
+def _detect_kick_frame(state: AppState, target_obj_id: int) -> int | None:
+    smoothed = state.smoothed_centers.get(target_obj_id, {})
+    speeds = state.ball_speeds.get(target_obj_id, {})
+    if len(smoothed) < 5:
+        return None
+    frames = sorted(smoothed.keys())
+    speed_series = [speeds.get(f, 0.0) for f in frames]
+    baseline_window = min(5, len(frames) // 3 or 1)
+    baseline_speeds = speed_series[:baseline_window]
+    baseline_speed = statistics.median(baseline_speeds) if baseline_speeds else 0.0
+    speed_threshold = baseline_speed + 80.0  # pixels/second
+    sustain_frames = 3
+    holdout_frames = 8
+    return_distance = 12.0
+    area_window = 4
+    area_drop_ratio = 0.75
+    areas_dict = state.mask_areas.get(target_obj_id, {})
+    initial_center = smoothed[frames[0]]
+    for idx in range(baseline_window, len(frames)):
+        frame = frames[idx]
+        speed = speed_series[idx]
+        if speed < speed_threshold:
+            continue
+        sustain_ok = True
+        for j in range(1, sustain_frames + 1):
+            if idx + j >= len(frames):
+                break
+            if speed_series[idx + j] < speed_threshold * 0.8:
+                sustain_ok = False
+                break
+        if not sustain_ok:
+            continue
+        area_pass = True
+        current_area = areas_dict.get(frame)
+        if current_area:
+            prev_areas = [
+                areas_dict.get(f)
+                for f in frames[max(0, idx - area_window):idx]
+                if areas_dict.get(f) is not None
+            ]
+            if prev_areas:
+                median_prev = statistics.median(prev_areas)
+                if median_prev > 0 and current_area / median_prev > area_drop_ratio:
+                    area_pass = False
+        if not area_pass:
+            continue
+        moved_far = True
+        for future_frame in frames[idx:min(len(frames), idx + holdout_frames)]:
+            cx, cy = smoothed[future_frame]
+            dist = math.hypot(cx - initial_center[0], cy - initial_center[1])
+            if dist < return_distance:
+                moved_far = False
+                break
+        if not moved_far:
+            continue
+        return frame
+    return None
 def on_image_click(
     GLOBAL_STATE.pending_box_start_frame_idx = None
     GLOBAL_STATE.pending_box_start_obj_id = None
     GLOBAL_STATE.ball_centers.clear()
+    GLOBAL_STATE.mask_areas.clear()
+    GLOBAL_STATE.smoothed_centers.clear()
+    GLOBAL_STATE.ball_speeds.clear()
+    GLOBAL_STATE.kick_frame = None
+    GLOBAL_STATE.ball_centers.clear()
     # Dispose and re-init inference session for current model with existing frames
     try:
         )
         status_text = f"✅ Auto-detected ball at ({x_center}, {y_center}) (conf={conf:.2f})"
+        if state_in.kick_frame is not None:
+            status_text += f" | Kick frame ≈ {state_in.kick_frame}"
         return preview_img, gr.update(value=status_text, visible=True), gr.update(value=frame_idx)
     detect_ball_btn.click(