Segment-Anything-2-video-tracking

Running

App Files Files Community

Mirko Trasciatti committed 27 days ago

Commit

fb2fd45

1 Parent(s): 2feeac4

Add YOLO-driven kick detection and chart

Browse files

Files changed (1) hide show

app.py +507 -22

app.py CHANGED Viewed

@@ -1,3 +1,5 @@
 import colorsys
 import gc
 from copy import deepcopy
@@ -161,6 +163,236 @@ def detect_person_box(
     return x_min, y_min, x_max, y_max, conf
 def pastel_color_for_object(obj_id: int) -> tuple[int, int, int]:
     """Generate a deterministic pastel RGB color for a given object id.
@@ -305,6 +537,25 @@ class AppState:
         self.player_obj_id: int | None = None
         self.player_detection_frame: int | None = None
         self.player_detection_conf: float | None = None
     def __repr__(self):
         return f"AppState(video_frames={self.video_frames}, inference_session={self.inference_session is not None}, model={self.model is not None}, processor={self.processor is not None}, device={self.device}, dtype={self.dtype}, video_fps={self.video_fps}, masks_by_frame={self.masks_by_frame}, color_by_obj={self.color_by_obj}, clicks_by_frame_obj={self.clicks_by_frame_obj}, boxes_by_frame_obj={self.boxes_by_frame_obj}, composited_frames={self.composited_frames}, current_frame_idx={self.current_frame_idx}, current_obj_id={self.current_obj_id}, current_label={self.current_label}, current_clear_old={self.current_clear_old}, current_prompt_type={self.current_prompt_type}, pending_box_start={self.pending_box_start}, pending_box_start_frame_idx={self.pending_box_start_frame_idx}, pending_box_start_obj_id={self.pending_box_start_obj_id}, is_switching_model={self.is_switching_model}, model_repo_key={self.model_repo_key}, model_repo_id={self.model_repo_id}, session_repo_id={self.session_repo_id})"
@@ -403,9 +654,43 @@ def init_video_session(GLOBAL_STATE: gr.State, video: str | dict) -> tuple[AppSt
     GLOBAL_STATE.impact_debug_speed_kmh = []
     GLOBAL_STATE.impact_debug_speed_threshold_px = None
     GLOBAL_STATE.impact_meters_per_px = None
     GLOBAL_STATE.player_obj_id = None
     GLOBAL_STATE.player_detection_frame = None
     GLOBAL_STATE.player_detection_conf = None
     load_model_if_needed(GLOBAL_STATE)
@@ -957,6 +1242,137 @@ def _build_kick_plot(state: AppState):
     return fig
 def _format_impact_status(state: AppState) -> str:
     if state is None:
         return "Impact frame: not computed"
@@ -1018,15 +1434,10 @@ def _player_has_masks(state: AppState) -> bool:
 def _button_updates(state: AppState) -> tuple[Any, Any, Any]:
-    propagate_main_enabled = _ball_has_masks(state)
-    detect_player_enabled = False
-    propagate_player_enabled = False
-    if isinstance(state, AppState):
-        kick_candidate = state.kick_frame or getattr(state, "kick_debug_kick_frame", None)
-        if kick_candidate is not None:
-            detect_player_enabled = True
-        if detect_player_enabled and _player_has_masks(state):
-            propagate_player_enabled = True
     return (
         gr.update(interactive=propagate_main_enabled),
         gr.update(interactive=detect_player_enabled),
@@ -1468,6 +1879,7 @@ def propagate_masks(GLOBAL_STATE: gr.State):
             "Load a video first.",
             gr.update(),
             _build_kick_plot(GLOBAL_STATE),
             _format_impact_status(GLOBAL_STATE),
             gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
             propagate_main_update,
@@ -1475,6 +1887,8 @@ def propagate_masks(GLOBAL_STATE: gr.State):
             propagate_player_update,
         )
     processor = deepcopy(GLOBAL_STATE.processor)
     model = deepcopy(GLOBAL_STATE.model)
     inference_session = deepcopy(GLOBAL_STATE.inference_session)
@@ -1483,9 +1897,19 @@ def propagate_masks(GLOBAL_STATE: gr.State):
     inference_session.cache.inference_device = "cuda"
     model.to("cuda")
-    total = max(1, GLOBAL_STATE.num_frames)
     processed = 0
     # Initial status; no slider change yet
     propagate_main_update, detect_btn_update, propagate_player_update = _button_updates(GLOBAL_STATE)
     yield (
@@ -1493,6 +1917,7 @@ def propagate_masks(GLOBAL_STATE: gr.State):
         f"Propagating masks: {processed}/{total}",
         gr.update(),
         _build_kick_plot(GLOBAL_STATE),
         _format_impact_status(GLOBAL_STATE),
         gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
         propagate_main_update,
@@ -1500,9 +1925,10 @@ def propagate_masks(GLOBAL_STATE: gr.State):
         propagate_player_update,
     )
-    last_frame_idx = 0
     with torch.inference_mode():
-        for frame_idx, frame in enumerate(GLOBAL_STATE.video_frames):
             pixel_values = None
             if inference_session.processed_frames is None or frame_idx not in inference_session.processed_frames:
                 pixel_values = processor(images=frame, device="cuda", return_tensors="pt").pixel_values[0]
@@ -1531,6 +1957,7 @@ def propagate_masks(GLOBAL_STATE: gr.State):
                     f"Propagating masks: {processed}/{total}",
                     gr.update(value=frame_idx),
                     _build_kick_plot(GLOBAL_STATE),
                     _format_impact_status(GLOBAL_STATE),
                     gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
                     propagate_main_update,
@@ -1554,6 +1981,7 @@ def propagate_masks(GLOBAL_STATE: gr.State):
         text,
         gr.update(value=target_frame),
         _build_kick_plot(GLOBAL_STATE),
         _format_impact_status(GLOBAL_STATE),
         gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
         propagate_main_update,
@@ -1646,6 +2074,7 @@ def reset_session(GLOBAL_STATE: gr.State) -> tuple[AppState, Image.Image, int, i
         status,
         gr.update(visible=False, value=""),
         _build_kick_plot(GLOBAL_STATE),
         _format_impact_status(GLOBAL_STATE),
         propagate_main_update,
         detect_btn_update,
@@ -1893,7 +2322,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
                 """
                 **Working with results**
                 - **Preview**: Use the slider to navigate frames and see the current masks.
-                - **Propagate**: Click “Propagate across video” to track all defined objects through the entire video. The preview follows progress periodically to keep things responsive.
                 - **Export**: Render an MP4 for smooth playback using the original video FPS.
                 - **Note**: More info on the Hugging Face 🤗 Transformers implementation of SAM2 can be found [here](https://huggingface.co/docs/transformers/en/main/en/model_doc/sam2_video).
                 """
@@ -1951,7 +2380,8 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
                 )
             with gr.Row():
                 detect_ball_btn = gr.Button("Detect Ball", variant="secondary")
-                propagate_btn = gr.Button("Propagate across video", variant="primary", interactive=False)
             detect_player_btn = gr.Button("Detect Player", variant="secondary", interactive=False)
             propagate_player_btn = gr.Button("Propagate Player", variant="primary", interactive=False)
             ball_status = gr.Markdown(visible=False)
@@ -1963,6 +2393,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
                 clear_old_chk = gr.Checkbox(value=False, label="Clear old inputs for this object")
                 prompt_type = gr.Radio(choices=["Points", "Boxes"], value="Points", label="Prompt type")
             kick_plot = gr.Plot(label="Kick & impact diagnostics", show_label=True)
     # Wire events
     def _on_video_change(GLOBAL_STATE: gr.State, video):
@@ -1975,6 +2406,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
             status,
             gr.update(visible=False, value=""),
             _build_kick_plot(GLOBAL_STATE),
             _format_impact_status(GLOBAL_STATE),
             propagate_main_update,
             detect_btn_update,
@@ -1984,7 +2416,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
     video_in.change(
         _on_video_change,
         inputs=[GLOBAL_STATE, video_in],
-        outputs=[GLOBAL_STATE, frame_slider, preview, load_status, ball_status, kick_plot, impact_status, propagate_btn, detect_player_btn, propagate_player_btn],
         show_progress=True,
     )
@@ -1997,7 +2429,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
             examples=examples_list,
             inputs=[GLOBAL_STATE, video_in],
             fn=_on_video_change,
-            outputs=[GLOBAL_STATE, frame_slider, preview, load_status, ball_status, kick_plot, impact_status, propagate_btn, detect_player_btn, propagate_player_btn],
             label="Examples",
             cache_examples=False,
             examples_per_page=5,
@@ -2187,6 +2619,43 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
         outputs=[preview, ball_status, frame_slider, kick_plot, propagate_btn, detect_player_btn, propagate_player_btn],
     )
     def _auto_detect_player(state_in: AppState):
         if state_in is None or state_in.num_frames == 0:
             raise gr.Error("Load a video first, then try auto-detect.")
@@ -2303,6 +2772,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
                 "Load a video first.",
                 gr.update(),
                 _build_kick_plot(GLOBAL_STATE),
                 _format_impact_status(GLOBAL_STATE),
                 gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
                 propagate_main_update,
@@ -2316,6 +2786,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
                 "Detect the player before propagating.",
                 gr.update(),
                 _build_kick_plot(GLOBAL_STATE),
                 _format_impact_status(GLOBAL_STATE),
                 gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
                 propagate_main_update,
@@ -2330,8 +2801,17 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
         inference_session.cache.inference_device = "cuda"
         model.to("cuda")
-        total = max(1, GLOBAL_STATE.num_frames)
         processed = 0
         propagate_main_update, detect_btn_update, propagate_player_update = _button_updates(GLOBAL_STATE)
         yield (
@@ -2339,6 +2819,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
             f"Propagating player: {processed}/{total}",
             gr.update(),
             _build_kick_plot(GLOBAL_STATE),
             _format_impact_status(GLOBAL_STATE),
             gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
             propagate_main_update,
@@ -2349,7 +2830,8 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
         player_id = GLOBAL_STATE.player_obj_id or PLAYER_OBJECT_ID
         with torch.inference_mode():
-            for frame_idx, frame in enumerate(GLOBAL_STATE.video_frames):
                 pixel_values = None
                 if (
                     inference_session.processed_frames is None
@@ -2375,6 +2857,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
                 GLOBAL_STATE.composited_frames.pop(frame_idx, None)
                 processed += 1
                 if processed % 30 == 0 or processed == total:
                     propagate_main_update, detect_btn_update, propagate_player_update = _button_updates(GLOBAL_STATE)
                     yield (
@@ -2382,6 +2865,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
                         f"Propagating player: {processed}/{total}",
                         gr.update(value=frame_idx),
                         _build_kick_plot(GLOBAL_STATE),
                         _format_impact_status(GLOBAL_STATE),
                         gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
                         propagate_main_update,
@@ -2394,7 +2878,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
         if target_frame is None:
             target_frame = GLOBAL_STATE.kick_frame or getattr(GLOBAL_STATE, "kick_debug_kick_frame", None)
         if target_frame is None:
-            target_frame = max(0, processed - 1)
         target_frame = int(np.clip(target_frame, 0, max(0, GLOBAL_STATE.num_frames - 1)))
         GLOBAL_STATE.current_frame_idx = target_frame
@@ -2404,6 +2888,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
             text,
             gr.update(value=target_frame),
             _build_kick_plot(GLOBAL_STATE),
             _format_impact_status(GLOBAL_STATE),
             gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
             propagate_main_update,
@@ -2414,7 +2899,7 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
     propagate_player_btn.click(
         propagate_player_masks,
         inputs=[GLOBAL_STATE],
-        outputs=[GLOBAL_STATE, propagate_status, frame_slider, kick_plot, impact_status, ball_status, propagate_btn, detect_player_btn, propagate_player_btn],
     )
     # Image click to add a point and run forward on that frame
@@ -2483,13 +2968,13 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation", the
     propagate_btn.click(
         propagate_masks,
         inputs=[GLOBAL_STATE],
-        outputs=[GLOBAL_STATE, propagate_status, frame_slider, kick_plot, impact_status, ball_status, propagate_btn, detect_player_btn, propagate_player_btn],
     )
     reset_btn.click(
         reset_session,
         inputs=GLOBAL_STATE,
-        outputs=[GLOBAL_STATE, preview, frame_slider, frame_slider, load_status, ball_status, kick_plot, impact_status, propagate_btn, detect_player_btn, propagate_player_btn],
     )
 # ============================================================================

+from __future__ import annotations
 import colorsys
 import gc
 from copy import deepcopy
     return x_min, y_min, x_max, y_max, conf
def _compute_sam_window_from_kick(state: AppState, kick_frame: int | None) -> tuple[int, int]:
    """Choose the frame window that SAM propagation should cover around a kick.

    The window is about four seconds long (based on ``state.video_fps``, or
    25 fps when that value is missing or not positive) and starts half a
    window before ``kick_frame``.  When no kick frame is known the window
    starts at frame 0.  The window is truncated at the end of the video.

    Args:
        state: Application state.  ``num_frames`` and ``video_fps`` are read;
            ``sam_window`` is overwritten with the result.
        kick_frame: Frame index of the kick, or ``None`` when unknown.

    Returns:
        ``(start_idx, end_idx)`` where ``start_idx`` is inclusive and
        ``end_idx`` is exclusive.  For a video with no frames ``(0, 0)`` is
        returned and ``state.sam_window`` is left untouched.
    """
    total_frames = state.num_frames
    if total_frames == 0:
        return 0, 0
    fps = state.video_fps if state.video_fps and state.video_fps > 0 else 25.0
    target_window_frames = max(1, int(round(fps * 4.0)))
    half_window = target_window_frames // 2
    if kick_frame is None:
        start_idx = 0
    else:
        # Clamp to the last valid frame: a kick index at or past the end of
        # the video would otherwise yield start_idx >= end_idx (an inverted,
        # empty window).
        start_idx = min(total_frames - 1, max(0, int(kick_frame) - half_window))
    # start_idx <= total_frames - 1 and the target length is >= 1, so the
    # window always contains at least one frame.
    end_idx = min(total_frames, start_idx + target_window_frames)
    state.sam_window = (start_idx, end_idx)
    return start_idx, end_idx
def _perform_yolo_ball_tracking(state: AppState, progress: gr.Progress | None = None) -> None:
    """Track the ball with YOLO on every frame and estimate the kick frame.

    The function works in three stages and stores all results on ``state``:

    1. Detection: YOLO is run on each frame (one detection at most, restricted
       to the classes whose name equals ``YOLO_TARGET_NAME``).  The centre,
       the box clamped to the frame, the confidence and the box area are
       recorded for every frame with a usable detection.
    2. Kinematics: centres are exponentially smoothed and a speed in px/s is
       derived between consecutive detections.  A baseline (median and
       population standard deviation of the first few speeds) yields an
       adaptive speed threshold.
    3. Kick search: the first detection after the baseline window is accepted
       as the kick when its speed reaches the threshold, the next few speeds
       stay above 70% of the threshold, the box has shrunk relative to the
       preceding boxes (or the speed is at least 1.2x the threshold), and the
       ball gets far enough from its first smoothed position within the
       following detections.

    With fewer than three detections every derived cache is cleared and the
    function returns early with an error status.

    Args:
        state: Application state; must have a loaded video.
        progress: Optional progress callback, called with a fraction in
            ``(0, 1]`` once per frame.

    Raises:
        gr.Error: If no video is loaded or the YOLO model has no class named
            ``YOLO_TARGET_NAME``.
    """
    if state is None or state.num_frames == 0:
        raise gr.Error("Load a video first, then track with YOLO.")
    model = get_yolo_model()
    class_ids = [
        idx for idx, name in model.names.items() if name.lower() == YOLO_TARGET_NAME
    ]
    if not class_ids:
        raise gr.Error("YOLO model does not contain the sports ball class.")

    # ------------------------------------------------------------------
    # Stage 1: per-frame detection
    # ------------------------------------------------------------------
    frames = state.video_frames
    total = len(frames)
    centers: dict[int, tuple[float, float]] = {}
    boxes: dict[int, tuple[int, int, int, int]] = {}
    confs: dict[int, float] = {}
    areas: dict[int, float] = {}
    first_detection_frame: int | None = None
    for idx, frame in enumerate(frames):
        if progress is not None:
            progress((idx + 1) / total)
        results = model.predict(
            source=frame,
            conf=YOLO_CONF_THRESHOLD,
            iou=YOLO_IOU_THRESHOLD,
            max_det=1,
            classes=class_ids,
            imgsz=640,
            device="cpu",
            verbose=False,
        )
        if not results:
            continue
        boxes_result = results[0].boxes
        if boxes_result is None or len(boxes_result) == 0:
            continue
        box = boxes_result[0]
        xywh = box.xywh[0].cpu().tolist()
        conf = float(box.conf[0].cpu().item()) if box.conf is not None else 0.0
        x_center, y_center, width, height = xywh
        x_center = float(x_center)
        y_center = float(y_center)
        width = max(1.0, float(width))
        height = max(1.0, float(height))
        frame_width, frame_height = frame.size
        # Convert centre/size to corner coordinates clamped to the frame.
        x_min = int(round(max(0.0, x_center - width / 2.0)))
        y_min = int(round(max(0.0, y_center - height / 2.0)))
        x_max = int(round(min(frame_width - 1.0, x_center + width / 2.0)))
        y_max = int(round(min(frame_height - 1.0, y_center + height / 2.0)))
        if x_max <= x_min or y_max <= y_min:
            # Degenerate after clamping/rounding: treat as "no detection".
            continue
        centers[idx] = (x_center, y_center)
        boxes[idx] = (x_min, y_min, x_max, y_max)
        confs[idx] = conf
        areas[idx] = float((x_max - x_min) * (y_max - y_min))
        if first_detection_frame is None:
            first_detection_frame = idx

    state.yolo_ball_centers = centers
    state.yolo_ball_boxes = boxes
    state.yolo_ball_conf = confs
    state.yolo_mask_area_proxy = [areas.get(k, 0.0) for k in sorted(centers.keys())]
    state.yolo_initial_frame = first_detection_frame

    if len(centers) < 3:
        state.yolo_smoothed_centers = {}
        state.yolo_speeds = {}
        state.yolo_distance_from_start = {}
        state.yolo_threshold = None
        state.yolo_baseline_speed = None
        state.yolo_speed_std = None
        state.yolo_kick_frame = None
        # Also drop the plot series; otherwise series from an earlier
        # successful run would stay behind and be drawn against the new,
        # shorter area proxy.
        state.yolo_kick_frames = []
        state.yolo_kick_speeds = []
        state.yolo_kick_distance = []
        state.yolo_status = "❌ YOLO13: insufficient detections to estimate kick. Please retry or annotate manually."
        state.sam_window = None
        return

    # ------------------------------------------------------------------
    # Stage 2: smoothing, speed and adaptive threshold
    # ------------------------------------------------------------------
    items = sorted(centers.items())
    dt = 1.0 / state.video_fps if state.video_fps and state.video_fps > 1e-3 else 1.0
    alpha = 0.35  # weight of the new sample in the exponential smoothing
    smoothed: dict[int, tuple[float, float]] = {}
    speeds: dict[int, float] = {}
    prev_frame = None
    prev_smooth = None
    for frame_idx, (cx, cy) in items:
        if prev_smooth is None:
            smooth_x, smooth_y = float(cx), float(cy)
        else:
            smooth_x = prev_smooth[0] + alpha * (cx - prev_smooth[0])
            smooth_y = prev_smooth[1] + alpha * (cy - prev_smooth[1])
        smoothed[frame_idx] = (smooth_x, smooth_y)
        if prev_smooth is None or prev_frame is None:
            speeds[frame_idx] = 0.0
        else:
            # Detections may skip frames, so scale by the real frame gap.
            frame_delta = max(1, frame_idx - prev_frame)
            time_delta = frame_delta * dt
            dist = math.hypot(smooth_x - prev_smooth[0], smooth_y - prev_smooth[1])
            speeds[frame_idx] = dist / time_delta if time_delta > 0 else dist
        prev_smooth = (smooth_x, smooth_y)
        prev_frame = frame_idx

    frames_ordered = [frame_idx for frame_idx, _ in items]
    speed_series = [speeds.get(f, 0.0) for f in frames_ordered]
    baseline_window = min(10, len(frames_ordered) // 3 or 1)
    baseline_speeds = speed_series[:baseline_window]
    baseline_speed = statistics.median(baseline_speeds) if baseline_speeds else 0.0
    speed_std = statistics.pstdev(baseline_speeds) if len(baseline_speeds) > 1 else 0.0
    # Threshold: baseline + 4 sigma, but at least 3x the baseline and 15 px/s.
    base_threshold = baseline_speed + 4.0 * speed_std
    if base_threshold < baseline_speed * 3.0:
        base_threshold = baseline_speed * 3.0
    speed_threshold = max(base_threshold, 15.0)

    distance_dict: dict[int, float] = {}
    if smoothed:
        origin = smoothed[frames_ordered[0]]
        for frame_idx, (sx, sy) in smoothed.items():
            distance_dict[frame_idx] = math.hypot(sx - origin[0], sy - origin[1])

    areas_dict = {idx: areas.get(idx, 0.0) for idx in frames_ordered}
    initial_area = areas_dict.get(frames_ordered[0], 1.0) or 1.0
    # Radius of a circle with the same area as the first box.
    radius_estimate = math.sqrt(initial_area / math.pi)
    adaptive_return_distance = max(8.0, min(radius_estimate * 1.5, 40.0))
    sustain_frames = 3
    holdout_frames = 8
    area_window = 4
    area_drop_ratio = 0.75
    kalman_pos, kalman_speed, _ = _run_kalman_filter(items, dt)

    # ------------------------------------------------------------------
    # Stage 3: kick search
    # ------------------------------------------------------------------
    sustain_floor = speed_threshold * 0.7
    kick_frame: int | None = None
    for idx, frame in enumerate(frames_ordered[baseline_window:], start=baseline_window):
        speed = speed_series[idx]
        if speed < speed_threshold:
            continue

        # The speed must stay high for the next few detections (fewer when
        # the series ends early).
        following = speed_series[idx + 1: idx + 1 + sustain_frames]
        if any(s < sustain_floor for s in following):
            continue

        # Box-area test: the candidate passes when its box is at most 75% of
        # the median of the preceding boxes.
        area_pass = True
        current_area = areas_dict.get(frame)
        if current_area:
            prev_areas = [
                areas_dict.get(f)
                for f in frames_ordered[max(0, idx - area_window):idx]
                if areas_dict.get(f) is not None
            ]
            if prev_areas:
                median_prev = statistics.median(prev_areas)
                if median_prev > 0 and current_area / median_prev > area_drop_ratio:
                    area_pass = False
        # A failed area test is only fatal for marginal speeds.
        if not area_pass and speed < speed_threshold * 1.2:
            continue

        # The ball must actually leave its starting position soon after.
        future_slice = frames_ordered[idx: min(len(frames_ordered), idx + holdout_frames)]
        max_future_dist = max(
            (distance_dict.get(future_frame, 0.0) for future_frame in future_slice),
            default=0.0,
        )
        if max_future_dist < adaptive_return_distance:
            continue

        kick_frame = frame
        break

    state.yolo_smoothed_centers = smoothed
    state.yolo_speeds = speeds
    state.yolo_distance_from_start = distance_dict
    state.yolo_threshold = speed_threshold
    state.yolo_baseline_speed = baseline_speed
    state.yolo_speed_std = speed_std
    state.yolo_kick_frames = frames_ordered
    state.yolo_kick_speeds = speed_series
    state.yolo_kick_distance = [distance_dict.get(f, 0.0) for f in frames_ordered]
    state.yolo_mask_area_proxy = [areas_dict.get(f, 0.0) for f in frames_ordered]
    state.yolo_kick_frame = kick_frame

    coverage = len(centers) / total if total else 0.0
    if kick_frame is not None:
        state.yolo_status = f"✅ YOLO13 tracked {len(centers)}/{total} frames ({coverage:.0%})."
    else:
        state.yolo_status = (
            f"⚠️ YOLO13 tracked {len(centers)}/{total} frames ({coverage:.0%}) but did not find a definitive kick."
        )

    state.kalman_centers[BALL_OBJECT_ID] = kalman_pos
    state.kalman_speeds[BALL_OBJECT_ID] = kalman_speed

    if kick_frame is not None:
        state.kick_frame = kick_frame
        _compute_sam_window_from_kick(state, kick_frame)
    else:
        state.sam_window = None
 def pastel_color_for_object(obj_id: int) -> tuple[int, int, int]:
     """Generate a deterministic pastel RGB color for a given object id.
         self.player_obj_id: int | None = None
         self.player_detection_frame: int | None = None
         self.player_detection_conf: float | None = None
+        # YOLO tracking caches
+        self.yolo_ball_centers: dict[int, tuple[float, float]] = {}
+        self.yolo_ball_boxes: dict[int, tuple[int, int, int, int]] = {}
+        self.yolo_ball_conf: dict[int, float] = {}
+        self.yolo_smoothed_centers: dict[int, tuple[float, float]] = {}
+        self.yolo_speeds: dict[int, float] = {}
+        self.yolo_distance_from_start: dict[int, float] = {}
+        self.yolo_threshold: float | None = None
+        self.yolo_baseline_speed: float | None = None
+        self.yolo_speed_std: float | None = None
+        self.yolo_kick_frame: int | None = None
+        self.yolo_status: str = ""
+        self.yolo_kick_frames: list[int] = []
+        self.yolo_kick_speeds: list[float] = []
+        self.yolo_kick_distance: list[float] = []
+        self.yolo_mask_area_proxy: list[float] = []
+        self.yolo_initial_frame: int | None = None
+        # SAM window (start_idx inclusive, end_idx exclusive)
+        self.sam_window: tuple[int, int] | None = None
     def __repr__(self):
         return f"AppState(video_frames={self.video_frames}, inference_session={self.inference_session is not None}, model={self.model is not None}, processor={self.processor is not None}, device={self.device}, dtype={self.dtype}, video_fps={self.video_fps}, masks_by_frame={self.masks_by_frame}, color_by_obj={self.color_by_obj}, clicks_by_frame_obj={self.clicks_by_frame_obj}, boxes_by_frame_obj={self.boxes_by_frame_obj}, composited_frames={self.composited_frames}, current_frame_idx={self.current_frame_idx}, current_obj_id={self.current_obj_id}, current_label={self.current_label}, current_clear_old={self.current_clear_old}, current_prompt_type={self.current_prompt_type}, pending_box_start={self.pending_box_start}, pending_box_start_frame_idx={self.pending_box_start_frame_idx}, pending_box_start_obj_id={self.pending_box_start_obj_id}, is_switching_model={self.is_switching_model}, model_repo_key={self.model_repo_key}, model_repo_id={self.model_repo_id}, session_repo_id={self.session_repo_id})"
     GLOBAL_STATE.impact_debug_speed_kmh = []
     GLOBAL_STATE.impact_debug_speed_threshold_px = None
     GLOBAL_STATE.impact_meters_per_px = None
+    GLOBAL_STATE.yolo_ball_centers = {}
+    GLOBAL_STATE.yolo_ball_boxes = {}
+    GLOBAL_STATE.yolo_ball_conf = {}
+    GLOBAL_STATE.yolo_smoothed_centers = {}
+    GLOBAL_STATE.yolo_speeds = {}
+    GLOBAL_STATE.yolo_distance_from_start = {}
+    GLOBAL_STATE.yolo_threshold = None
+    GLOBAL_STATE.yolo_baseline_speed = None
+    GLOBAL_STATE.yolo_speed_std = None
+    GLOBAL_STATE.yolo_kick_frame = None
+    GLOBAL_STATE.yolo_status = ""
+    GLOBAL_STATE.yolo_kick_frames = []
+    GLOBAL_STATE.yolo_kick_speeds = []
+    GLOBAL_STATE.yolo_kick_distance = []
+    GLOBAL_STATE.yolo_mask_area_proxy = []
+    GLOBAL_STATE.yolo_initial_frame = None
+    GLOBAL_STATE.sam_window = None
     GLOBAL_STATE.player_obj_id = None
     GLOBAL_STATE.player_detection_frame = None
     GLOBAL_STATE.player_detection_conf = None
+    GLOBAL_STATE.yolo_ball_centers = {}
+    GLOBAL_STATE.yolo_ball_boxes = {}
+    GLOBAL_STATE.yolo_ball_conf = {}
+    GLOBAL_STATE.yolo_smoothed_centers = {}
+    GLOBAL_STATE.yolo_speeds = {}
+    GLOBAL_STATE.yolo_distance_from_start = {}
+    GLOBAL_STATE.yolo_threshold = None
+    GLOBAL_STATE.yolo_baseline_speed = None
+    GLOBAL_STATE.yolo_speed_std = None
+    GLOBAL_STATE.yolo_kick_frame = None
+    GLOBAL_STATE.yolo_status = ""
+    GLOBAL_STATE.yolo_kick_frames = []
+    GLOBAL_STATE.yolo_kick_speeds = []
+    GLOBAL_STATE.yolo_kick_distance = []
+    GLOBAL_STATE.yolo_mask_area_proxy = []
+    GLOBAL_STATE.yolo_initial_frame = None
+    GLOBAL_STATE.sam_window = None
     load_model_if_needed(GLOBAL_STATE)
     return fig
def _ensure_ball_prompt_from_yolo(state: AppState):
    """Seed a positive ball click from the YOLO track if the ball has no clicks.

    Does nothing when there is no state, no inference session, no YOLO
    centres, when any frame already holds clicks for ``BALL_OBJECT_ID``, or
    when the chosen seed frame is outside the video or has no centre.

    Otherwise the centre of ``state.yolo_initial_frame`` (or of the earliest
    tracked frame) is rounded, clamped to the frame and passed to
    ``on_image_click`` as a synthetic click event.  ``current_obj_id``,
    ``current_label`` and ``current_frame_idx`` on ``state`` are updated first.
    """
    if state is None:
        return
    if state.inference_session is None or not state.yolo_ball_centers:
        return

    # An existing ball click on any frame means the prompt is already there.
    ball_already_prompted = any(
        per_object_clicks.get(BALL_OBJECT_ID)
        for per_object_clicks in state.clicks_by_frame_obj.values()
    )
    if ball_already_prompted:
        return

    seed_frame = state.yolo_initial_frame
    if seed_frame is None:
        # Fall back to the earliest frame that has a YOLO centre.
        seed_frame = min(state.yolo_ball_centers.keys())
    if seed_frame >= state.num_frames:
        return

    seed_center = state.yolo_ball_centers.get(seed_frame)
    if seed_center is None:
        return

    raw_x, raw_y = seed_center
    width_px, height_px = state.video_frames[seed_frame].size
    click_x = int(np.clip(round(raw_x), 0, width_px - 1))
    click_y = int(np.clip(round(raw_y), 0, height_px - 1))

    # Minimal stand-in for the click event object that on_image_click receives.
    synthetic_click = SimpleNamespace(
        index=(click_x, click_y),
        value={"x": click_x, "y": click_y},
    )

    state.current_obj_id = BALL_OBJECT_ID
    state.current_label = "positive"
    state.current_frame_idx = seed_frame

    preview_image = update_frame_display(state, seed_frame)
    on_image_click(
        preview_image,
        state,
        seed_frame,
        BALL_OBJECT_ID,
        "positive",
        False,
        synthetic_click,
    )
def _build_yolo_plot(state: AppState):
    """Build the Plotly figure with the YOLO kick diagnostics.

    Without state, or without recorded frames and speeds, an empty figure
    carrying only the title and axis titles is returned.  Otherwise the figure
    shows speed, the threshold and the baseline speed on the left axis,
    distance from start and the box-area proxy on a secondary right axis, and
    a vertical marker at the kick frame when one was found.
    """
    figure = go.Figure()

    has_series = state is not None and state.yolo_kick_frames and state.yolo_kick_speeds
    if not has_series:
        figure.update_layout(
            title="YOLO kick diagnostics",
            xaxis_title="Frame",
            yaxis_title="Speed (px/s)",
        )
        return figure

    frame_axis = state.yolo_kick_frames
    point_count = len(frame_axis)
    # Missing series are drawn as flat zero lines of matching length.
    distance_series = state.yolo_kick_distance or [0.0] * point_count
    area_series = state.yolo_mask_area_proxy or [0.0] * point_count
    threshold_level = state.yolo_threshold or 0.0
    baseline_level = state.yolo_baseline_speed or 0.0
    kick_at = state.yolo_kick_frame

    # One entry per trace, in drawing order; all traces share the frame axis.
    trace_specs = (
        dict(
            y=state.yolo_kick_speeds,
            mode="lines+markers",
            name="YOLO speed",
            line=dict(color="#4caf50"),
        ),
        dict(
            y=[threshold_level] * point_count,
            mode="lines",
            name="Adaptive threshold",
            line=dict(color="#ff9800", dash="dash"),
        ),
        dict(
            y=[baseline_level] * point_count,
            mode="lines",
            name="Baseline speed",
            line=dict(color="#9e9e9e", dash="dot"),
        ),
        dict(
            y=distance_series,
            mode="lines",
            name="Distance from start",
            line=dict(color="#03a9f4"),
            yaxis="y2",
        ),
        dict(
            y=area_series,
            mode="lines",
            name="Box area proxy",
            line=dict(color="#ab47bc", dash="dot"),
            yaxis="y2",
        ),
    )
    for spec in trace_specs:
        figure.add_trace(go.Scatter(x=frame_axis, **spec))

    if kick_at is not None:
        figure.add_vline(
            x=kick_at,
            line=dict(color="#e91e63", width=2),
            annotation_text=f"Kick {kick_at}",
            annotation_position="top right",
        )

    secondary_axis = dict(
        title="Distance / Area",
        overlaying="y",
        side="right",
        showgrid=False,
    )
    figure.update_layout(
        title="YOLO kick diagnostics",
        xaxis=dict(title="Frame"),
        yaxis=dict(title="Speed (px/s)"),
        yaxis2=secondary_axis,
        legend=dict(orientation="h"),
        margin=dict(t=40, l=40, r=40, b=40),
    )
    return figure
 def _format_impact_status(state: AppState) -> str:
     if state is None:
         return "Impact frame: not computed"
 def _button_updates(state: AppState) -> tuple[Any, Any, Any]:
+    yolo_ready = isinstance(state, AppState) and state.yolo_kick_frame is not None
+    propagate_main_enabled = _ball_has_masks(state) or yolo_ready
+    detect_player_enabled = yolo_ready
+    propagate_player_enabled = _player_has_masks(state)
     return (
         gr.update(interactive=propagate_main_enabled),
         gr.update(interactive=detect_player_enabled),
             "Load a video first.",
             gr.update(),
             _build_kick_plot(GLOBAL_STATE),
+            _build_yolo_plot(GLOBAL_STATE),
             _format_impact_status(GLOBAL_STATE),
             gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
             propagate_main_update,
             propagate_player_update,
         )
+    _ensure_ball_prompt_from_yolo(GLOBAL_STATE)
     processor = deepcopy(GLOBAL_STATE.processor)
     model = deepcopy(GLOBAL_STATE.model)
     inference_session = deepcopy(GLOBAL_STATE.inference_session)
     inference_session.cache.inference_device = "cuda"
     model.to("cuda")
+    if not GLOBAL_STATE.sam_window:
+        _compute_sam_window_from_kick(
+            GLOBAL_STATE,
+            GLOBAL_STATE.kick_frame or getattr(GLOBAL_STATE, "kick_debug_kick_frame", None),
+        )
+    start_idx, end_idx = GLOBAL_STATE.sam_window or (0, GLOBAL_STATE.num_frames)
+    start_idx = max(0, int(start_idx))
+    end_idx = min(GLOBAL_STATE.num_frames, max(start_idx + 1, int(end_idx)))
+    total = max(1, end_idx - start_idx)
     processed = 0
+    _ensure_ball_prompt_from_yolo(GLOBAL_STATE)
     # Initial status; no slider change yet
     propagate_main_update, detect_btn_update, propagate_player_update = _button_updates(GLOBAL_STATE)
     yield (
         f"Propagating masks: {processed}/{total}",
         gr.update(),
         _build_kick_plot(GLOBAL_STATE),
+        _build_yolo_plot(GLOBAL_STATE),
         _format_impact_status(GLOBAL_STATE),
         gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
         propagate_main_update,
         propagate_player_update,
     )
+    last_frame_idx = start_idx
     with torch.inference_mode():
+        for frame_idx in range(start_idx, end_idx):
+            frame = GLOBAL_STATE.video_frames[frame_idx]
             pixel_values = None
             if inference_session.processed_frames is None or frame_idx not in inference_session.processed_frames:
                 pixel_values = processor(images=frame, device="cuda", return_tensors="pt").pixel_values[0]
                     f"Propagating masks: {processed}/{total}",
                     gr.update(value=frame_idx),
                     _build_kick_plot(GLOBAL_STATE),
+                    _build_yolo_plot(GLOBAL_STATE),
                     _format_impact_status(GLOBAL_STATE),
                     gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
                     propagate_main_update,
         text,
         gr.update(value=target_frame),
         _build_kick_plot(GLOBAL_STATE),
+        _build_yolo_plot(GLOBAL_STATE),
         _format_impact_status(GLOBAL_STATE),
         gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
         propagate_main_update,
         status,
         gr.update(visible=False, value=""),
         _build_kick_plot(GLOBAL_STATE),
+        _build_yolo_plot(GLOBAL_STATE),
         _format_impact_status(GLOBAL_STATE),
         propagate_main_update,
         detect_btn_update,
                 """
                 **Working with results**
                 - **Preview**: Use the slider to navigate frames and see the current masks.
+                - **Track**: Click “Track ball (SAM2)” to track all defined objects across the selected window. The preview follows progress periodically to keep things responsive.
                 - **Export**: Render an MP4 for smooth playback using the original video FPS.
                 - **Note**: More info on the Hugging Face 🤗 Transformers implementation of SAM2 can be found [here](https://huggingface.co/docs/transformers/en/main/en/model_doc/sam2_video).
                 """
                 )
             with gr.Row():
                 detect_ball_btn = gr.Button("Detect Ball", variant="secondary")
+                track_ball_yolo_btn = gr.Button("Track ball (YOLO13)", variant="secondary")
+                propagate_btn = gr.Button("Track ball (SAM2)", variant="primary", interactive=False)
             detect_player_btn = gr.Button("Detect Player", variant="secondary", interactive=False)
             propagate_player_btn = gr.Button("Propagate Player", variant="primary", interactive=False)
             ball_status = gr.Markdown(visible=False)
                 clear_old_chk = gr.Checkbox(value=False, label="Clear old inputs for this object")
                 prompt_type = gr.Radio(choices=["Points", "Boxes"], value="Points", label="Prompt type")
             kick_plot = gr.Plot(label="Kick & impact diagnostics", show_label=True)
+            yolo_plot = gr.Plot(label="YOLO kick diagnostics", show_label=True)
     # Wire events
     def _on_video_change(GLOBAL_STATE: gr.State, video):
             status,
             gr.update(visible=False, value=""),
             _build_kick_plot(GLOBAL_STATE),
+            _build_yolo_plot(GLOBAL_STATE),
             _format_impact_status(GLOBAL_STATE),
             propagate_main_update,
             detect_btn_update,
     video_in.change(
         _on_video_change,
         inputs=[GLOBAL_STATE, video_in],
+        outputs=[GLOBAL_STATE, frame_slider, preview, load_status, ball_status, kick_plot, yolo_plot, impact_status, propagate_btn, detect_player_btn, propagate_player_btn],
         show_progress=True,
     )
             examples=examples_list,
             inputs=[GLOBAL_STATE, video_in],
             fn=_on_video_change,
+            outputs=[GLOBAL_STATE, frame_slider, preview, load_status, ball_status, kick_plot, yolo_plot, impact_status, propagate_btn, detect_player_btn, propagate_player_btn],
             label="Examples",
             cache_examples=False,
             examples_per_page=5,
         outputs=[preview, ball_status, frame_slider, kick_plot, propagate_btn, detect_player_btn, propagate_player_btn],
     )
+    def _track_ball_yolo(state_in: AppState):
+        if state_in is None or state_in.num_frames == 0:
+            raise gr.Error("Load a video first, then track the ball with YOLO.")
+        progress = gr.Progress(track_tqdm=False)
+        _perform_yolo_ball_tracking(state_in, progress=progress)
+        target_frame = (
+            state_in.yolo_kick_frame
+            if state_in.yolo_kick_frame is not None
+            else state_in.yolo_initial_frame
+            if state_in.yolo_initial_frame is not None
+            else 0
+        )
+        if state_in.num_frames:
+            target_frame = int(np.clip(target_frame, 0, state_in.num_frames - 1))
+        state_in.current_frame_idx = target_frame
+        preview_img = update_frame_display(state_in, target_frame)
+        base_msg = state_in.yolo_status or ""
+        kick_msg = _format_kick_status(state_in)
+        status_text = f"{base_msg} | {kick_msg}" if base_msg else kick_msg
+        propagate_main_update, detect_btn_update, propagate_player_update = _button_updates(state_in)
+        return (
+            preview_img,
+            gr.update(value=status_text, visible=True),
+            gr.update(value=target_frame),
+            _build_kick_plot(state_in),
+            _build_yolo_plot(state_in),
+            propagate_main_update,
+            detect_btn_update,
+            propagate_player_update,
+        )
+    # Wire the YOLO tracking button. The components in `outputs` are positional:
+    # they line up one-to-one with the 8-tuple returned by `_track_ball_yolo`
+    # (preview, status, slider, kick plot, YOLO plot, then the three buttons).
+    track_ball_yolo_btn.click(
+        _track_ball_yolo,
+        inputs=[GLOBAL_STATE],
+        outputs=[preview, ball_status, frame_slider, kick_plot, yolo_plot, propagate_btn, detect_player_btn, propagate_player_btn],
+    )
     def _auto_detect_player(state_in: AppState):
         if state_in is None or state_in.num_frames == 0:
             raise gr.Error("Load a video first, then try auto-detect.")
                 "Load a video first.",
                 gr.update(),
                 _build_kick_plot(GLOBAL_STATE),
+                _build_yolo_plot(GLOBAL_STATE),
                 _format_impact_status(GLOBAL_STATE),
                 gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
                 propagate_main_update,
                 "Detect the player before propagating.",
                 gr.update(),
                 _build_kick_plot(GLOBAL_STATE),
+                _build_yolo_plot(GLOBAL_STATE),
                 _format_impact_status(GLOBAL_STATE),
                 gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
                 propagate_main_update,
         inference_session.cache.inference_device = "cuda"
         model.to("cuda")
+        if not GLOBAL_STATE.sam_window:
+            _compute_sam_window_from_kick(
+                GLOBAL_STATE,
+                GLOBAL_STATE.kick_frame or getattr(GLOBAL_STATE, "kick_debug_kick_frame", None),
+            )
+        start_idx, end_idx = GLOBAL_STATE.sam_window or (0, GLOBAL_STATE.num_frames)
+        start_idx = max(0, int(start_idx))
+        end_idx = min(GLOBAL_STATE.num_frames, max(start_idx + 1, int(end_idx)))
+        total = max(1, end_idx - start_idx)
         processed = 0
+        last_frame_idx = start_idx
         propagate_main_update, detect_btn_update, propagate_player_update = _button_updates(GLOBAL_STATE)
         yield (
             f"Propagating player: {processed}/{total}",
             gr.update(),
             _build_kick_plot(GLOBAL_STATE),
+            _build_yolo_plot(GLOBAL_STATE),
             _format_impact_status(GLOBAL_STATE),
             gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
             propagate_main_update,
         player_id = GLOBAL_STATE.player_obj_id or PLAYER_OBJECT_ID
         with torch.inference_mode():
+            for frame_idx in range(start_idx, end_idx):
+                frame = GLOBAL_STATE.video_frames[frame_idx]
                 pixel_values = None
                 if (
                     inference_session.processed_frames is None
                 GLOBAL_STATE.composited_frames.pop(frame_idx, None)
                 processed += 1
+                last_frame_idx = frame_idx
                 if processed % 30 == 0 or processed == total:
                     propagate_main_update, detect_btn_update, propagate_player_update = _button_updates(GLOBAL_STATE)
                     yield (
                         f"Propagating player: {processed}/{total}",
                         gr.update(value=frame_idx),
                         _build_kick_plot(GLOBAL_STATE),
+                        _build_yolo_plot(GLOBAL_STATE),
                         _format_impact_status(GLOBAL_STATE),
                         gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
                         propagate_main_update,
         if target_frame is None:
             target_frame = GLOBAL_STATE.kick_frame or getattr(GLOBAL_STATE, "kick_debug_kick_frame", None)
         if target_frame is None:
+            target_frame = last_frame_idx
         target_frame = int(np.clip(target_frame, 0, max(0, GLOBAL_STATE.num_frames - 1)))
         GLOBAL_STATE.current_frame_idx = target_frame
             text,
             gr.update(value=target_frame),
             _build_kick_plot(GLOBAL_STATE),
+            _build_yolo_plot(GLOBAL_STATE),
             _format_impact_status(GLOBAL_STATE),
             gr.update(value=_format_kick_status(GLOBAL_STATE), visible=True),
             propagate_main_update,
     propagate_player_btn.click(
         propagate_player_masks,
         inputs=[GLOBAL_STATE],
+        outputs=[GLOBAL_STATE, propagate_status, frame_slider, kick_plot, yolo_plot, impact_status, ball_status, propagate_btn, detect_player_btn, propagate_player_btn],
     )
     # Image click to add a point and run forward on that frame
     propagate_btn.click(
         propagate_masks,
         inputs=[GLOBAL_STATE],
+        outputs=[GLOBAL_STATE, propagate_status, frame_slider, kick_plot, yolo_plot, impact_status, ball_status, propagate_btn, detect_player_btn, propagate_player_btn],
     )
     reset_btn.click(
         reset_session,
         inputs=GLOBAL_STATE,
+        outputs=[GLOBAL_STATE, preview, frame_slider, frame_slider, load_status, ball_status, kick_plot, yolo_plot, impact_status, propagate_btn, detect_player_btn, propagate_player_btn],
     )
 # ============================================================================