Mirko Trasciatti committed
Commit 996c8dd · Parent: b230fd1
Sync API tab with YOLO→SAM2 workflow

app.py CHANGED
@@ -3230,123 +3230,182 @@ def process_video_api(
     video_file,
     annotations_json_str: str,
     checkpoint: str = "base_plus",
-    remove_background: bool = True
+    remove_background: bool = True,
 ):
     """
     Single-endpoint API for programmatic video processing.
-
+
     Args:
         video_file: Uploaded video file
-        annotations_json_str: JSON string
-            {
-                "annotations": [
-                    {"object_id": 1, "frame": 139, "x": 369, "y": 652, "label": "positive"},
-                    {"object_id": 1, "frame": 156, "x": 374, "y": 513, "label": "positive"},
-                    {"object_id": 2, "frame": 156, "x": 374, "y": 257, "label": "positive"}
-                ]
-            }
+        annotations_json_str: Optional JSON string containing helper annotations
         checkpoint: SAM2 model checkpoint (tiny, small, base_plus, large)
-        remove_background: Whether to remove background
-
+        remove_background: Whether to remove the background in the render
+
     Returns:
-        Tuple of (preview_image, processed_video_path)
+        Tuple of (preview_image, processed_video_path, progress_log)
     """
     import json

     try:
-
-        annotations_data = json.loads(annotations_json_str)
+        log_entries: list[str] = []
+
+        def log_msg(message: str):
+            text = f"[API] {message}"
+            print(text)
+            log_entries.append(text)
+
+        # Parse annotations (optional)
+        annotations_payload = annotations_json_str or ""
+        annotations_data = json.loads(annotations_payload) if annotations_payload.strip() else {}
         annotations = annotations_data.get("annotations", [])
-        client_fps = annotations_data.get("fps", None)
-        [... 6 deleted lines not recoverable from the extract ...]
-        # Create preview of annotation points
-        preview_img = create_annotation_preview(video_file, annotations)
-
+        client_fps = annotations_data.get("fps", None)
+
+        log_msg(f"Received {len(annotations)} annotations")
+        log_msg(f"Checkpoint: {checkpoint} | Remove background: {remove_background}")
+
+        preview_img = create_annotation_preview(video_file, annotations) if annotations else None
+
         # Create a temporary state for this API call
         api_state = AppState()
         api_state.model_repo_key = checkpoint

         # Step 1: Initialize session with video
+        log_msg("Loading video...")
         api_state, min_idx, max_idx, first_frame, status = init_video_session(api_state, video_file)
         space_fps = api_state.video_fps
-
-
+        log_msg(status)
+        log_msg(f"Client FPS={client_fps} | Space FPS={space_fps}")

         # If FPS mismatch, warn about potential frame offset
         if client_fps and space_fps and abs(client_fps - space_fps) > 0.5:
             offset_estimate = abs(int((client_fps - space_fps) * (api_state.num_frames / client_fps)))
-
-
+            log_msg(f"⚠️ FPS mismatch detected. Frame indices may be off by ~{offset_estimate} frames.")
+            log_msg("ℹ️ Recommendation: Use timestamps instead of frame indices for accuracy.")

         # Step 2: Apply each annotation
-        [... ~45 deleted lines of the old annotation-application block, not recoverable from the extract ...]
+        if annotations:
+            for i, ann in enumerate(annotations):
+                object_id = ann.get("object_id", 1)
+                timestamp_ms = ann.get("timestamp_ms", None)
+                frame_idx = ann.get("frame", None)
+                x = ann.get("x", 0)
+                y = ann.get("y", 0)
+                label = ann.get("label", "positive")
+
+                # Calculate frame from timestamp using Space's FPS (more accurate)
+                if timestamp_ms is not None and space_fps and space_fps > 0:
+                    calculated_frame = int((timestamp_ms / 1000.0) * space_fps)
+                    if frame_idx is not None and calculated_frame != frame_idx:
+                        log_msg(f"Annotation {i+1}: using timestamp {timestamp_ms}ms → Frame {calculated_frame} (client sent {frame_idx})")
+                    else:
+                        log_msg(f"Annotation {i+1}: timestamp {timestamp_ms}ms → Frame {calculated_frame}")
+                    frame_idx = calculated_frame
+                elif frame_idx is None:
+                    log_msg(f"Annotation {i+1}: ⚠️ No timestamp/frame provided, defaulting to frame 0")
+                    frame_idx = 0
+
+                log_msg(f"Adding annotation {i+1}/{len(annotations)} | Obj {object_id} | Frame {frame_idx}")
+
+                # Sync state
+                api_state.current_frame_idx = int(frame_idx)
+                api_state.current_obj_id = int(object_id)
+                api_state.current_label = str(label)
+
+                # Create a mock event with coordinates
+                class MockEvent:
+                    def __init__(self, x, y):
+                        self.index = (x, y)
+
+                mock_evt = MockEvent(x, y)
+
+                # Add the point annotation
+                preview_img = on_image_click(
+                    first_frame,
+                    api_state,
+                    frame_idx,
+                    object_id,
+                    label,
+                    clear_old=False,
+                    evt=mock_evt
+                )

-
-
-        # We need to consume the generator
-        for outputs in propagate_masks(api_state):
-            if not outputs:
-                continue
-            api_state = outputs[0]
-            status_msg = outputs[1] if len(outputs) > 1 else ""
-            if status_msg:
-                print(f"[API] Progress: {status_msg}")
+        if preview_img is None:
+            preview_img = first_frame

-        #
-
+        # Helper to consume generator-based steps and capture log messages
+        def _run_generator(gen, label: str):
+            final_state = None
+            for outputs in gen:
+                if not outputs:
+                    continue
+                final_state = outputs[0]
+                status_msg = outputs[1] if len(outputs) > 1 else ""
+                if status_msg:
+                    log_msg(f"{label}: {status_msg}")
+            if final_state is not None:
+                return final_state
+            raise gr.Error(f"{label} did not produce any output.")
+
+        # Step 3: YOLO13 detect ball
+        api_state.current_obj_id = BALL_OBJECT_ID
+        api_state.current_label = "positive"
+        log_msg("YOLO13 · Detect ball (single-frame search)")
+        _auto_detect_ball(api_state, BALL_OBJECT_ID, "positive", False)
+        if not api_state.is_ball_detected:
+            raise gr.Error("YOLO13 could not detect the ball automatically.")
+
+        # Step 4: YOLO13 track ball
+        log_msg("YOLO13 · Track ball across clip")
+        _track_ball_yolo(api_state)
+        if not api_state.is_yolo_tracked:
+            raise gr.Error("YOLO13 tracking failed.")
+
+        # Step 5: SAM2 track ball around kick window
+        log_msg("SAM2 · Track ball around kick window")
+        api_state = _run_generator(propagate_masks(api_state), "SAM2 · Ball")
+        sam_kick = _get_prioritized_kick_frame(api_state)
+        yolo_kick = api_state.yolo_kick_frame
+        if sam_kick is not None:
+            log_msg(f"SAM2 kick frame ≈ {sam_kick}")
+        if yolo_kick is not None:
+            log_msg(f"YOLO kick frame ≈ {yolo_kick}")
+
+        # Fallback: re-run SAM2 on entire video if kicks disagree
+        if (
+            yolo_kick is not None
+            and sam_kick is not None
+            and int(yolo_kick) != int(sam_kick)
+        ):
+            log_msg("Kick disagreement detected → re-running SAM2 across entire video.")
+            api_state.sam_window = (0, api_state.num_frames)
+            api_state = _run_generator(propagate_masks(api_state), "SAM2 · Full sweep")
+            sam_kick = _get_prioritized_kick_frame(api_state)
+            log_msg(f"SAM2 full sweep kick frame ≈ {sam_kick}")
+        else:
+            log_msg("Kick frames aligned. No full sweep required.")
+
+        # Step 6: YOLO detect player on SAM2 kick frame
+        log_msg("YOLO13 · Detect player on SAM2 kick frame")
+        _auto_detect_player(api_state)
+        if api_state.is_player_detected:
+            log_msg("YOLO13 · Player detected successfully.")
+        else:
+            log_msg("YOLO13 · Player detection failed; continuing without player propagation.")
+
+        # Step 7: SAM2 track player if detection succeeded
+        if api_state.is_player_detected:
+            log_msg("SAM2 · Track player around kick window")
+            try:
+                api_state = _run_generator(propagate_player_masks(api_state), "SAM2 · Player")
+            except gr.Error as player_error:
+                log_msg(f"SAM2 player propagation warning: {player_error}")
+
+        # Step 8: Render the final video
+        log_msg(f"Rendering video (remove_background={remove_background})")
         result_video_path = _render_video(api_state, remove_background)

-
-        return preview_img, result_video_path
+        log_msg("Processing complete 🎉")
+        return preview_img, result_video_path, "\n".join(log_entries)

     except Exception as e:
         print(f"[API] ❌ Error: {str(e)}")
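For orientation (not part of the commit): a minimal client-side sketch of calling the endpoint this function backs, assuming a placeholder Space URL and the stock `gradio_client` package. The payload shape mirrors the docstring above; after this commit the endpoint returns three values.

```python
# Illustrative sketch only — the Space URL and file name are placeholders.
import json
from gradio_client import Client, handle_file

client = Client("https://example-kicktrimmer.hf.space")  # hypothetical Space

payload = {
    "fps": 30.0,  # client-side FPS so the Space can flag rate mismatches
    "annotations": [
        # timestamp_ms is preferred; the API reconciles it with "frame"
        {"object_id": 1, "timestamp_ms": 4633, "x": 369, "y": 652, "label": "positive"}
    ],
}

preview, video_path, log = client.predict(
    handle_file("kick.mp4"),  # video_file
    json.dumps(payload),      # annotations_json_str (may be empty)
    "base_plus",              # checkpoint
    True,                     # remove_background
    api_name="/predict",
)
print(log)
```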
@@ -4797,7 +4856,7 @@ api_interface = gr.Interface(
     inputs=[
         gr.Video(label="Video File"),
         gr.Textbox(
-            label="Annotations JSON",
+            label="Annotations JSON (optional)",
            placeholder='{"annotations": [{"object_id": 1, "frame": 139, "x": 369, "y": 652, "label": "positive"}]}',
             lines=5
         ),
@@ -4809,16 +4868,24 @@ api_interface = gr.Interface(
         gr.Checkbox(label="Remove Background", value=True)
     ],
     outputs=[
-        gr.Image(label="Annotation Preview"),
-        gr.Video(label="Processed Video")
+        gr.Image(label="Annotation Preview / First Frame"),
+        gr.Video(label="Processed Video"),
+        gr.Textbox(label="Processing Log", lines=12)
     ],
     title="SAM2 API",
     description="""
-    ## Programmatic
+    ## Programmatic KickTrimmer Pipeline

-
+    Submitting a video here runs the same automated workflow as the Interactive UI:

-    **
+    1. **Upload** the raw MP4.
+    2. `YOLO13` **detects** and **tracks** the ball to get the first kick estimate.
+    3. `SAM2` **tracks the ball** around that kick window. If SAM2's kick disagrees with YOLO's, it automatically re-tracks **the entire clip** for better accuracy.
+    4. `YOLO13` **detects the player** on the SAM2 kick frame, then `SAM2` propagates the player masks around that window.
+    5. The Space **renders a default cutout video** and returns it together with the processing log below.
+
+    ### Optional annotations
+    You can still send helper points via JSON:
     ```json
     {
       "annotations": [
@@ -4828,10 +4895,7 @@ api_interface = gr.Interface(
       ]
     }
    ```
-
-    - **Object 1** (Ball): Frame 0 + Impact frame
-    - **Object 2** (Player): Impact frame
-    - Colors represent different objects
+    - **Object 1** = ball, **Object 2** = player. Use timestamps when possible; the API will reconcile timestamps and frame indices for you.
     """
 )

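The reconciliation promised in that bullet is the FPS conversion added in the first hunk; a quick worked sketch with invented numbers:

```python
# Worked example (invented numbers): why timestamps beat frame indices.
timestamp_ms = 4633   # annotation placed at ~4.63 s (frame 139 at 30 fps)
space_fps = 25.0      # but the Space decoded the clip at 25 fps
space_frame = int((timestamp_ms / 1000.0) * space_fps)
print(space_frame)    # -> 115: the frame actually used, not the client's 139
```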
@@ -4854,13 +4918,14 @@ with gr.Blocks(title="SAM2 Video Tracking") as combined_demo:
     api_remove_bg_input_hidden = gr.Checkbox(visible=False)
     api_preview_output_hidden = gr.Image(visible=False)
     api_video_output_hidden = gr.Video(visible=False)
+    api_logs_output_hidden = gr.Textbox(visible=False)

     # This dummy component creates the external API endpoint
     api_dummy_btn = gr.Button("API", visible=False)
     api_dummy_btn.click(
         fn=process_video_api,
         inputs=[api_video_input_hidden, api_annotations_input_hidden, api_checkpoint_input_hidden, api_remove_bg_input_hidden],
-        outputs=[api_preview_output_hidden, api_video_output_hidden],
+        outputs=[api_preview_output_hidden, api_video_output_hidden, api_logs_output_hidden],
         api_name="predict"  # This creates /api/predict for external calls
     )

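To sanity-check this wiring, `gradio_client` can dump the endpoint schema; after this commit `/predict` should report four inputs and three outputs (URL again a placeholder):

```python
from gradio_client import Client

# view_api() prints the named endpoints with their input/output signatures.
Client("https://example-kicktrimmer.hf.space").view_api()
```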