Spaces:

yonigozlan
/

Segment-Anything-2-video-tracking

Runtime error

App Files Files Community

yonigozlan HF Staff committed on Aug 12

Commit

06c9ffc

1 Parent(s): 189fb9e

update app

Browse files

Files changed (1) hide show

app.py +184 -44

app.py CHANGED Viewed

@@ -8,11 +8,17 @@ import spaces
 import torch
 from PIL import Image, ImageDraw
 from transformers import Sam2VideoModel, Sam2VideoProcessor
 def pastel_color_for_object(obj_id: int) -> tuple[int, int, int]:
     golden_ratio_conjugate = 0.61803398875
     hue = (obj_id * golden_ratio_conjugate) % 1.0
     saturation = 0.45
     value = 1.0
@@ -21,10 +27,14 @@ def pastel_color_for_object(obj_id: int) -> tuple[int, int, int]:
 def try_load_video_frames(video_path_or_url: str) -> tuple[list[Image.Image], dict]:
     try:
         from transformers.video_utils import load_video  # type: ignore
         frames, info = load_video(video_path_or_url)
         pil_frames = []
         for fr in frames:
             if isinstance(fr, Image.Image):
@@ -32,6 +42,7 @@ def try_load_video_frames(video_path_or_url: str) -> tuple[list[Image.Image], di
             else:
                 pil_frames.append(Image.fromarray(fr).convert("RGB"))
         info = info if info is not None else {}
         if "fps" not in info or not info.get("fps"):
             try:
                 import cv2  # type: ignore
@@ -45,6 +56,7 @@ def try_load_video_frames(video_path_or_url: str) -> tuple[list[Image.Image], di
                 pass
         return pil_frames, info
     except Exception:
         try:
             import cv2  # type: ignore
@@ -56,6 +68,7 @@ def try_load_video_frames(video_path_or_url: str) -> tuple[list[Image.Image], di
                     break
                 frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                 frames.append(Image.fromarray(frame_rgb))
             fps_val = cap.get(cv2.CAP_PROP_FPS)
             cap.release()
             info = {
@@ -71,28 +84,40 @@ def overlay_masks_on_frame(
     frame: Image.Image,
     masks_per_object: dict[int, np.ndarray],
     color_by_obj: dict[int, tuple[int, int, int]],
-    alpha: float = 0.65,
 ) -> Image.Image:
-    base = np.array(frame).astype(np.float32) / 255.0
     overlay = base.copy()
     for obj_id, mask in masks_per_object.items():
         if mask is None:
             continue
         if mask.dtype != np.float32:
             mask = mask.astype(np.float32)
         if mask.ndim == 3:
             mask = mask.squeeze()
         mask = np.clip(mask, 0.0, 1.0)
         color = np.array(color_by_obj.get(obj_id, (255, 0, 0)), dtype=np.float32) / 255.0
         m = mask[..., None]
-        overlay = (1.0 - alpha * m) * overlay + (alpha * m) * color
     out = np.clip(overlay * 255.0, 0, 255).astype(np.uint8)
     return Image.fromarray(out)
 def get_device_and_dtype() -> tuple[str, torch.dtype]:
-    # Force CPU-only on Spaces with zero GPU
-    return "cpu", torch.float32
 class AppState:
@@ -105,22 +130,25 @@ class AppState:
         self.model: Optional[Sam2VideoModel] = None
         self.processor: Optional[Sam2VideoProcessor] = None
         self.device: str = "cpu"
-        self.dtype: torch.dtype = torch.float32
         self.video_fps: float | None = None
         self.masks_by_frame: dict[int, dict[int, np.ndarray]] = {}
         self.color_by_obj: dict[int, tuple[int, int, int]] = {}
         self.clicks_by_frame_obj: dict[int, dict[int, list[tuple[int, int, int]]]] = {}
         self.boxes_by_frame_obj: dict[int, dict[int, list[tuple[int, int, int, int]]]] = {}
         self.composited_frames: dict[int, Image.Image] = {}
         self.current_frame_idx: int = 0
         self.current_obj_id: int = 1
         self.current_label: str = "positive"
         self.current_clear_old: bool = True
-        self.current_prompt_type: str = "Points"
         self.pending_box_start: tuple[int, int] | None = None
         self.pending_box_start_frame_idx: int | None = None
         self.pending_box_start_obj_id: int | None = None
         self.is_switching_model: bool = False
         self.model_repo_key: str = "tiny"
         self.model_repo_id: str | None = None
         self.session_repo_id: str | None = None
@@ -149,6 +177,7 @@ def load_model_if_needed() -> tuple[Sam2VideoModel, Sam2VideoProcessor, str, tor
     if GLOBAL_STATE.model is not None and GLOBAL_STATE.processor is not None:
         if GLOBAL_STATE.model_repo_id == desired_repo:
             return GLOBAL_STATE.model, GLOBAL_STATE.processor, GLOBAL_STATE.device, GLOBAL_STATE.dtype
         try:
             del GLOBAL_STATE.model
         except Exception:
@@ -159,28 +188,37 @@ def load_model_if_needed() -> tuple[Sam2VideoModel, Sam2VideoProcessor, str, tor
             pass
         GLOBAL_STATE.model = None
         GLOBAL_STATE.processor = None
     device, dtype = get_device_and_dtype()
     model = Sam2VideoModel.from_pretrained(desired_repo, torch_dtype=dtype)
     processor = Sam2VideoProcessor.from_pretrained(desired_repo)
     model.to(device)
     GLOBAL_STATE.model = model
     GLOBAL_STATE.processor = processor
     GLOBAL_STATE.device = device
     GLOBAL_STATE.dtype = dtype
     GLOBAL_STATE.model_repo_id = desired_repo
     return model, processor, device, dtype
 def ensure_session_for_current_model() -> None:
     model, processor, device, dtype = load_model_if_needed()
     desired_repo = _model_repo_from_key(GLOBAL_STATE.model_repo_key)
     if GLOBAL_STATE.inference_session is None or GLOBAL_STATE.session_repo_id != desired_repo:
         if GLOBAL_STATE.video_frames:
             GLOBAL_STATE.masks_by_frame.clear()
             GLOBAL_STATE.clicks_by_frame_obj.clear()
             GLOBAL_STATE.boxes_by_frame_obj.clear()
             GLOBAL_STATE.composited_frames.clear()
             try:
                 if GLOBAL_STATE.inference_session is not None:
                     GLOBAL_STATE.inference_session.reset_inference_session()
@@ -188,22 +226,29 @@ def ensure_session_for_current_model() -> None:
                 pass
             GLOBAL_STATE.inference_session = None
             gc.collect()
             GLOBAL_STATE.inference_session = processor.init_video_session(
                 video=GLOBAL_STATE.video_frames,
                 inference_device=device,
-                video_storage_device="cpu",
             )
             GLOBAL_STATE.session_repo_id = desired_repo
-def init_video_session(video: str | dict):
     GLOBAL_STATE.video_frames = []
     GLOBAL_STATE.inference_session = None
     GLOBAL_STATE.masks_by_frame = {}
     GLOBAL_STATE.color_by_obj = {}
-    load_model_if_needed()
     video_path: Optional[str] = None
     if isinstance(video, dict):
         video_path = video.get("name") or video.get("path") or video.get("data")
@@ -211,6 +256,7 @@ def init_video_session(video: str | dict):
         video_path = video
     else:
         video_path = None
     if not video_path:
         raise gr.Error("Invalid video input.")
@@ -219,6 +265,7 @@ def init_video_session(video: str | dict):
         raise gr.Error("No frames could be loaded from the video.")
     GLOBAL_STATE.video_frames = frames
     GLOBAL_STATE.video_fps = None
     if isinstance(info, dict) and info.get("fps"):
         try:
@@ -226,8 +273,7 @@ def init_video_session(video: str | dict):
         except Exception:
             GLOBAL_STATE.video_fps = None
-    processor = GLOBAL_STATE.processor
-    device = GLOBAL_STATE.device
     inference_session = processor.init_video_session(
         video=frames,
         inference_device=device,
@@ -237,7 +283,9 @@ def init_video_session(video: str | dict):
     first_frame = frames[0]
     max_idx = len(frames) - 1
-    status = f"Loaded {len(frames)} frames @ {GLOBAL_STATE.video_fps or 'unknown'} fps. Device: {device}, dtype: {GLOBAL_STATE.dtype}"
     return GLOBAL_STATE, 0, max_idx, first_frame, status
@@ -251,6 +299,7 @@ def compose_frame(state: AppState, frame_idx: int) -> Image.Image:
     if len(masks) != 0:
         out_img = overlay_masks_on_frame(out_img, masks, state.color_by_obj, alpha=0.65)
     clicks_map = state.clicks_by_frame_obj.get(frame_idx)
     if clicks_map:
         draw = ImageDraw.Draw(out_img)
@@ -258,17 +307,11 @@ def compose_frame(state: AppState, frame_idx: int) -> Image.Image:
         for obj_id, pts in clicks_map.items():
             for x, y, lbl in pts:
                 color = (0, 255, 0) if int(lbl) == 1 else (255, 0, 0)
                 draw.line([(x - cross_half, y), (x + cross_half, y)], fill=color, width=2)
                 draw.line([(x, y - cross_half), (x, y + cross_half)], fill=color, width=2)
-    box_map = state.boxes_by_frame_obj.get(frame_idx)
-    if box_map:
-        draw = ImageDraw.Draw(out_img)
-        for obj_id, boxes in box_map.items():
-            color = state.color_by_obj.get(obj_id, (255, 255, 255))
-            for x1, y1, x2, y2 in boxes:
-                draw.rectangle([(x1, y1), (x2, y2)], outline=color, width=2)
     if (
         state.pending_box_start is not None
         and state.pending_box_start_frame_idx == frame_idx
@@ -280,7 +323,15 @@ def compose_frame(state: AppState, frame_idx: int) -> Image.Image:
         color = state.color_by_obj.get(state.pending_box_start_obj_id, (255, 255, 255))
         draw.line([(x - cross_half, y), (x + cross_half, y)], fill=color, width=2)
         draw.line([(x, y - cross_half), (x, y + cross_half)], fill=color, width=2)
     state.composited_frames[frame_idx] = out_img
     return out_img
@@ -289,6 +340,7 @@ def update_frame_display(state: AppState, frame_idx: int) -> Image.Image:
     if state is None or state.video_frames is None or len(state.video_frames) == 0:
         return None
     frame_idx = int(np.clip(frame_idx, 0, len(state.video_frames) - 1))
     cached = state.composited_frames.get(frame_idx)
     if cached is not None:
         return cached
@@ -309,14 +361,17 @@ def on_image_click(
     label: str,
     clear_old: bool,
     evt: gr.SelectData,
-):
     if state is None or state.inference_session is None:
-        return img
     if state.is_switching_model:
         return update_frame_display(state, int(frame_idx))
     x = y = None
     if evt is not None:
         try:
             if hasattr(evt, "index") and isinstance(evt.index, (list, tuple)) and len(evt.index) == 2:
                 x, y = int(evt.index[0]), int(evt.index[1])
@@ -324,16 +379,20 @@ def on_image_click(
                 x, y = int(evt.value["x"]), int(evt.value["y"])
         except Exception:
             x = y = None
     if x is None or y is None:
-        return update_frame_display(state, int(frame_idx))
     _ensure_color_for_obj(int(obj_id))
     processor = GLOBAL_STATE.processor
     model = GLOBAL_STATE.model
     inference_session = GLOBAL_STATE.inference_session
     if state.current_prompt_type == "Boxes":
         if state.pending_box_start is None:
             if bool(clear_old):
                 frame_clicks = state.clicks_by_frame_obj.setdefault(int(frame_idx), {})
                 frame_clicks[int(obj_id)] = []
@@ -341,11 +400,13 @@ def on_image_click(
             state.pending_box_start = (int(x), int(y))
             state.pending_box_start_frame_idx = int(frame_idx)
             state.pending_box_start_obj_id = int(obj_id)
             state.composited_frames.pop(int(frame_idx), None)
             return update_frame_display(state, int(frame_idx))
         else:
             x1, y1 = state.pending_box_start
             x2, y2 = int(x), int(y)
             state.pending_box_start = None
             state.pending_box_start_frame_idx = None
             state.pending_box_start_obj_id = None
@@ -368,7 +429,9 @@ def on_image_click(
             obj_boxes.append((x_min, y_min, x_max, y_max))
             state.composited_frames.pop(int(frame_idx), None)
     else:
         label_int = 1 if str(label).lower().startswith("pos") else 0
         if bool(clear_old):
             frame_boxes = state.boxes_by_frame_obj.setdefault(int(frame_idx), {})
             frame_boxes[int(obj_id)] = []
@@ -381,6 +444,7 @@ def on_image_click(
             input_labels=[[[int(label_int)]]],
             clear_old_inputs=bool(clear_old),
         )
         frame_clicks = state.clicks_by_frame_obj.setdefault(int(frame_idx), {})
         obj_clicks = frame_clicks.setdefault(int(obj_id), [])
         if bool(clear_old):
@@ -388,21 +452,35 @@ def on_image_click(
         obj_clicks.append((int(x), int(y), int(label_int)))
         state.composited_frames.pop(int(frame_idx), None)
-    with torch.inference_mode():
-        outputs = model(inference_session=inference_session, frame_idx=int(frame_idx))
     H = inference_session.video_height
     W = inference_session.video_width
     pred_masks = outputs.pred_masks.detach().cpu()
     video_res_masks = processor.post_process_masks([pred_masks], original_sizes=[[H, W]])[0]
     masks_for_frame: dict[int, np.ndarray] = {}
     obj_ids_order = list(inference_session.obj_ids)
     for i, oid in enumerate(obj_ids_order):
         mask_i = video_res_masks[i]
         mask_2d = mask_i.cpu().numpy().squeeze()
         masks_for_frame[int(oid)] = mask_2d
     GLOBAL_STATE.masks_by_frame[int(frame_idx)] = masks_for_frame
     GLOBAL_STATE.composited_frames.pop(int(frame_idx), None)
     return update_frame_display(GLOBAL_STATE, int(frame_idx))
@@ -411,18 +489,25 @@ def propagate_masks(state: AppState, progress=gr.Progress()):
     if state is None or state.inference_session is None:
         yield "Load a video first."
         return
     processor = GLOBAL_STATE.processor
     model = GLOBAL_STATE.model
     inference_session = GLOBAL_STATE.inference_session
     total = max(1, GLOBAL_STATE.num_frames)
     processed = 0
     yield f"Propagating masks: {processed}/{total}"
-    with torch.inference_mode():
         for sam2_video_output in model.propagate_in_video_iterator(inference_session):
             H = inference_session.video_height
             W = inference_session.video_width
             pred_masks = sam2_video_output.pred_masks.detach().cpu()
             video_res_masks = processor.post_process_masks([pred_masks], original_sizes=[[H, W]])[0]
             frame_idx = int(sam2_video_output.frame_idx)
             masks_for_frame: dict[int, np.ndarray] = {}
             obj_ids_order = list(inference_session.obj_ids)
@@ -430,16 +515,24 @@ def propagate_masks(state: AppState, progress=gr.Progress()):
                 mask_2d = video_res_masks[i].cpu().numpy().squeeze()
                 masks_for_frame[int(oid)] = mask_2d
             GLOBAL_STATE.masks_by_frame[frame_idx] = masks_for_frame
             GLOBAL_STATE.composited_frames.pop(frame_idx, None)
             processed += 1
             progress((processed, total), f"Propagating masks: {processed}/{total}")
             yield f"Propagating masks: {processed}/{total}"
     yield f"Propagated masks across {processed} frames for {len(inference_session.obj_ids)} objects."
-def reset_session():
     if not GLOBAL_STATE.video_frames:
         return GLOBAL_STATE, None, 0, 0, "Session reset. Load a new video."
     GLOBAL_STATE.masks_by_frame.clear()
     GLOBAL_STATE.clicks_by_frame_obj.clear()
     GLOBAL_STATE.boxes_by_frame_obj.clear()
@@ -447,6 +540,8 @@ def reset_session():
     GLOBAL_STATE.pending_box_start = None
     GLOBAL_STATE.pending_box_start_frame_idx = None
     GLOBAL_STATE.pending_box_start_obj_id = None
     try:
         if GLOBAL_STATE.inference_session is not None:
             GLOBAL_STATE.inference_session.reset_inference_session()
@@ -454,7 +549,14 @@ def reset_session():
         pass
     GLOBAL_STATE.inference_session = None
     gc.collect()
     ensure_session_for_current_model()
     current_idx = int(getattr(GLOBAL_STATE, "current_frame_idx", 0))
     current_idx = max(0, min(current_idx, GLOBAL_STATE.num_frames - 1))
     preview_img = update_frame_display(GLOBAL_STATE, current_idx)
@@ -464,14 +566,12 @@ def reset_session():
     return GLOBAL_STATE, preview_img, slider_minmax, slider_value, status
-with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation (CPU)") as demo:
     state = gr.State(GLOBAL_STATE)
-    gr.Markdown(
-        """
-    **SAM2 Video (Transformers)** — CPU-only Space. Upload a video, click to add positive/negative points per object or draw two-click boxes, preview masks, then propagate across the video. Use the slider to scrub frames.
-    """
-    )
     with gr.Row():
         with gr.Column(scale=1):
@@ -485,7 +585,8 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation (CPU)
             load_status = gr.Markdown(visible=True)
             reset_btn = gr.Button("Reset Session", variant="secondary")
             examples_list = [
-                ["./tennis.mp4"],
             ]
         with gr.Column(scale=2):
             preview = gr.Image(label="Preview", interactive=True)
@@ -504,13 +605,23 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation (CPU)
         render_btn = gr.Button("Render MP4 for smooth playback")
     playback_video = gr.Video(label="Rendered Playback", interactive=False)
     def _on_video_change(video):
         s, min_idx, max_idx, first_frame, status = init_video_session(video)
-        return s, gr.update(minimum=min_idx, maximum=max_idx, value=min_idx, interactive=True), first_frame, status
     video_in.change(
-        _on_video_change, inputs=[video_in], outputs=[state, frame_slider, preview, load_status], show_progress=True
     )
     gr.Examples(
         examples=examples_list,
         inputs=[video_in],
@@ -525,21 +636,26 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation (CPU)
         if s is not None and key:
             key = str(key)
             if key != s.model_repo_key:
                 s.is_switching_model = True
                 s.model_repo_key = key
                 s.model_repo_id = None
                 s.model = None
                 s.processor = None
         yield gr.update(visible=True, value=f"Loading checkpoint: {key}...")
         ensure_session_for_current_model()
         if s is not None:
             s.is_switching_model = False
         yield gr.update(visible=False, value="")
     ckpt_radio.change(_on_ckpt_change, inputs=[state, ckpt_radio], outputs=[ckpt_progress])
     def _rebind_session_after_ckpt(s: AppState):
         ensure_session_for_current_model()
         if s is not None:
             s.pending_box_start = None
         return gr.update()
@@ -551,7 +667,11 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation (CPU)
             state_in.current_frame_idx = int(idx)
         return update_frame_display(state_in, int(idx))
-    frame_slider.change(_sync_frame_idx, inputs=[state, frame_slider], outputs=preview)
     def _sync_obj_id(s: AppState, oid):
         if s is not None and oid is not None:
@@ -576,34 +696,54 @@ with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation (CPU)
     prompt_type.change(_sync_prompt_type, inputs=[state, prompt_type], outputs=[label_radio])
     preview.select(on_image_click, [preview, state, frame_slider, obj_id_inp, label_radio, clear_old_chk], preview)
     def _render_video(s: AppState):
         if s is None or s.num_frames == 0:
             raise gr.Error("Load a video first.")
         fps = s.video_fps if s.video_fps and s.video_fps > 0 else 12
         frames_np = []
         for idx in range(s.num_frames):
             img = s.composited_frames.get(idx)
             if img is None:
                 img = compose_frame(s, idx)
-            frames_np.append(np.array(img)[:, :, ::-1])
             if (idx + 1) % 60 == 0:
                 gc.collect()
         out_path = "/tmp/sam2_playback.mp4"
         try:
             import imageio.v3 as iio  # type: ignore
             iio.imwrite(out_path, [fr[:, :, ::-1] for fr in frames_np], plugin="pyav", fps=fps)
             return out_path
         except Exception:
             try:
                 import imageio.v2 as imageio  # type: ignore
                 imageio.mimsave(out_path, [fr[:, :, ::-1] for fr in frames_np], fps=fps)
                 return out_path
-            except Exception as e:
-                raise gr.Error(f"Failed to render video: {e}")
     render_btn.click(_render_video, inputs=[state], outputs=[playback_video])

 import torch
 from PIL import Image, ImageDraw
+# Prefer local transformers in the workspace
 from transformers import Sam2VideoModel, Sam2VideoProcessor
 def pastel_color_for_object(obj_id: int) -> tuple[int, int, int]:
+    """Generate a deterministic pastel RGB color for a given object id.
+    Uses golden ratio to distribute hues; low-medium saturation, high value.
+    """
     golden_ratio_conjugate = 0.61803398875
+    # Map obj_id (1-based) to hue in [0,1)
     hue = (obj_id * golden_ratio_conjugate) % 1.0
     saturation = 0.45
     value = 1.0
 def try_load_video_frames(video_path_or_url: str) -> tuple[list[Image.Image], dict]:
+    """Load video frames as PIL Images using transformers.video_utils if available,
+    otherwise fall back to OpenCV. Returns (frames, info).
+    """
     try:
         from transformers.video_utils import load_video  # type: ignore
         frames, info = load_video(video_path_or_url)
+        # Ensure PIL format
         pil_frames = []
         for fr in frames:
             if isinstance(fr, Image.Image):
             else:
                 pil_frames.append(Image.fromarray(fr).convert("RGB"))
         info = info if info is not None else {}
+        # Ensure fps present when possible (fallback to cv2 probe)
         if "fps" not in info or not info.get("fps"):
             try:
                 import cv2  # type: ignore
                 pass
         return pil_frames, info
     except Exception:
+        # Fallback to OpenCV
         try:
             import cv2  # type: ignore
                     break
                 frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                 frames.append(Image.fromarray(frame_rgb))
+            # Gather fps if available
             fps_val = cap.get(cv2.CAP_PROP_FPS)
             cap.release()
             info = {
     frame: Image.Image,
     masks_per_object: dict[int, np.ndarray],
     color_by_obj: dict[int, tuple[int, int, int]],
+    alpha: float = 0.5,
 ) -> Image.Image:
+    """Overlay per-object soft masks onto the RGB frame.
+    masks_per_object: mapping of obj_id -> (H, W) float mask in [0,1]
+    color_by_obj: mapping of obj_id -> (R, G, B)
+    """
+    base = np.array(frame).astype(np.float32) / 255.0  # H, W, 3 in [0,1]
+    height, width = base.shape[:2]
     overlay = base.copy()
     for obj_id, mask in masks_per_object.items():
         if mask is None:
             continue
         if mask.dtype != np.float32:
             mask = mask.astype(np.float32)
+        # Ensure shape is H x W
         if mask.ndim == 3:
             mask = mask.squeeze()
         mask = np.clip(mask, 0.0, 1.0)
         color = np.array(color_by_obj.get(obj_id, (255, 0, 0)), dtype=np.float32) / 255.0
+        # Blend: overlay = (1 - a*m)*overlay + (a*m)*color
+        a = alpha
         m = mask[..., None]
+        overlay = (1.0 - a * m) * overlay + (a * m) * color
     out = np.clip(overlay * 255.0, 0, 255).astype(np.uint8)
     return Image.fromarray(out)
 def get_device_and_dtype() -> tuple[str, torch.dtype]:
+    device = "cuda" if torch.cuda.is_available() else "cpu"
+    dtype = torch.bfloat16
+    return device, dtype
 class AppState:
         self.model: Optional[Sam2VideoModel] = None
         self.processor: Optional[Sam2VideoProcessor] = None
         self.device: str = "cpu"
+        self.dtype: torch.dtype = torch.bfloat16
         self.video_fps: float | None = None
         self.masks_by_frame: dict[int, dict[int, np.ndarray]] = {}
         self.color_by_obj: dict[int, tuple[int, int, int]] = {}
         self.clicks_by_frame_obj: dict[int, dict[int, list[tuple[int, int, int]]]] = {}
         self.boxes_by_frame_obj: dict[int, dict[int, list[tuple[int, int, int, int]]]] = {}
+        # Cache of composited frames (original + masks + clicks)
         self.composited_frames: dict[int, Image.Image] = {}
+        # UI state for click handler
         self.current_frame_idx: int = 0
         self.current_obj_id: int = 1
         self.current_label: str = "positive"
         self.current_clear_old: bool = True
+        self.current_prompt_type: str = "Points"  # or "Boxes"
         self.pending_box_start: tuple[int, int] | None = None
         self.pending_box_start_frame_idx: int | None = None
         self.pending_box_start_obj_id: int | None = None
         self.is_switching_model: bool = False
+        # Model selection
         self.model_repo_key: str = "tiny"
         self.model_repo_id: str | None = None
         self.session_repo_id: str | None = None
     if GLOBAL_STATE.model is not None and GLOBAL_STATE.processor is not None:
         if GLOBAL_STATE.model_repo_id == desired_repo:
             return GLOBAL_STATE.model, GLOBAL_STATE.processor, GLOBAL_STATE.device, GLOBAL_STATE.dtype
+        # Different repo requested: dispose current and reload
         try:
             del GLOBAL_STATE.model
         except Exception:
             pass
         GLOBAL_STATE.model = None
         GLOBAL_STATE.processor = None
+    print(f"Loading model from {desired_repo}")
     device, dtype = get_device_and_dtype()
     model = Sam2VideoModel.from_pretrained(desired_repo, torch_dtype=dtype)
     processor = Sam2VideoProcessor.from_pretrained(desired_repo)
     model.to(device)
     GLOBAL_STATE.model = model
     GLOBAL_STATE.processor = processor
     GLOBAL_STATE.device = device
     GLOBAL_STATE.dtype = dtype
     GLOBAL_STATE.model_repo_id = desired_repo
     return model, processor, device, dtype
 def ensure_session_for_current_model() -> None:
+    """Ensure the model/processor match the selected repo and inference_session exists.
+    If a video is already loaded, re-initialize the inference session when needed.
+    """
     model, processor, device, dtype = load_model_if_needed()
     desired_repo = _model_repo_from_key(GLOBAL_STATE.model_repo_key)
     if GLOBAL_STATE.inference_session is None or GLOBAL_STATE.session_repo_id != desired_repo:
         if GLOBAL_STATE.video_frames:
+            # Clear session-related UI caches when switching model
             GLOBAL_STATE.masks_by_frame.clear()
             GLOBAL_STATE.clicks_by_frame_obj.clear()
             GLOBAL_STATE.boxes_by_frame_obj.clear()
             GLOBAL_STATE.composited_frames.clear()
+            # Dispose previous session cleanly
             try:
                 if GLOBAL_STATE.inference_session is not None:
                     GLOBAL_STATE.inference_session.reset_inference_session()
                 pass
             GLOBAL_STATE.inference_session = None
             gc.collect()
+            try:
+                if torch.cuda.is_available():
+                    torch.cuda.empty_cache()
+            except Exception:
+                pass
             GLOBAL_STATE.inference_session = processor.init_video_session(
                 video=GLOBAL_STATE.video_frames,
                 inference_device=device,
             )
             GLOBAL_STATE.session_repo_id = desired_repo
+def init_video_session(video: str | dict) -> tuple[AppState, int, int, Image.Image, str]:
+    """Gradio handler: load video, init session, return state, slider bounds, and first frame."""
+    # Reset ONLY video-related fields, keep model loaded
     GLOBAL_STATE.video_frames = []
     GLOBAL_STATE.inference_session = None
     GLOBAL_STATE.masks_by_frame = {}
     GLOBAL_STATE.color_by_obj = {}
+    model, processor, device, dtype = load_model_if_needed()
+    # Gradio Video may provide a dict with 'name' or a direct file path
     video_path: Optional[str] = None
     if isinstance(video, dict):
         video_path = video.get("name") or video.get("path") or video.get("data")
         video_path = video
     else:
         video_path = None
     if not video_path:
         raise gr.Error("Invalid video input.")
         raise gr.Error("No frames could be loaded from the video.")
     GLOBAL_STATE.video_frames = frames
+    # Try to capture original FPS if provided by loader
     GLOBAL_STATE.video_fps = None
     if isinstance(info, dict) and info.get("fps"):
         try:
         except Exception:
             GLOBAL_STATE.video_fps = None
+    # Initialize session
     inference_session = processor.init_video_session(
         video=frames,
         inference_device=device,
     first_frame = frames[0]
     max_idx = len(frames) - 1
+    status = (
+        f"Loaded {len(frames)} frames @ {GLOBAL_STATE.video_fps or 'unknown'} fps. Device: {device}, dtype: bfloat16"
+    )
     return GLOBAL_STATE, 0, max_idx, first_frame, status
     if len(masks) != 0:
         out_img = overlay_masks_on_frame(out_img, masks, state.color_by_obj, alpha=0.65)
+    # Draw crosses for conditioning frames only (frames with recorded clicks)
     clicks_map = state.clicks_by_frame_obj.get(frame_idx)
     if clicks_map:
         draw = ImageDraw.Draw(out_img)
         for obj_id, pts in clicks_map.items():
             for x, y, lbl in pts:
                 color = (0, 255, 0) if int(lbl) == 1 else (255, 0, 0)
+                # horizontal
                 draw.line([(x - cross_half, y), (x + cross_half, y)], fill=color, width=2)
+                # vertical
                 draw.line([(x, y - cross_half), (x, y + cross_half)], fill=color, width=2)
+    # Draw temporary cross for first corner in box mode
     if (
         state.pending_box_start is not None
         and state.pending_box_start_frame_idx == frame_idx
         color = state.color_by_obj.get(state.pending_box_start_obj_id, (255, 255, 255))
         draw.line([(x - cross_half, y), (x + cross_half, y)], fill=color, width=2)
         draw.line([(x, y - cross_half), (x, y + cross_half)], fill=color, width=2)
+    # Draw boxes for conditioning frames
+    box_map = state.boxes_by_frame_obj.get(frame_idx)
+    if box_map:
+        draw = ImageDraw.Draw(out_img)
+        for obj_id, boxes in box_map.items():
+            color = state.color_by_obj.get(obj_id, (255, 255, 255))
+            for x1, y1, x2, y2 in boxes:
+                draw.rectangle([(x1, y1), (x2, y2)], outline=color, width=2)
+    # Save to cache and return
     state.composited_frames[frame_idx] = out_img
     return out_img
     if state is None or state.video_frames is None or len(state.video_frames) == 0:
         return None
     frame_idx = int(np.clip(frame_idx, 0, len(state.video_frames) - 1))
+    # Serve from cache when available
     cached = state.composited_frames.get(frame_idx)
     if cached is not None:
         return cached
     label: str,
     clear_old: bool,
     evt: gr.SelectData,
+) -> Image.Image:
     if state is None or state.inference_session is None:
+        return img  # no-op preview when not ready
     if state.is_switching_model:
+        # Gracefully ignore input during model switch; return current preview unchanged
         return update_frame_display(state, int(frame_idx))
+    # Parse click coordinates from event
     x = y = None
     if evt is not None:
+        # Try different gradio event data shapes for robustness
         try:
             if hasattr(evt, "index") and isinstance(evt.index, (list, tuple)) and len(evt.index) == 2:
                 x, y = int(evt.index[0]), int(evt.index[1])
                 x, y = int(evt.value["x"]), int(evt.value["y"])
         except Exception:
             x = y = None
     if x is None or y is None:
+        raise gr.Error("Could not read click coordinates.")
     _ensure_color_for_obj(int(obj_id))
     processor = GLOBAL_STATE.processor
     model = GLOBAL_STATE.model
     inference_session = GLOBAL_STATE.inference_session
     if state.current_prompt_type == "Boxes":
+        # Two-click box input
         if state.pending_box_start is None:
+            # If clear_old is enabled, clear prior points for this object on this frame
             if bool(clear_old):
                 frame_clicks = state.clicks_by_frame_obj.setdefault(int(frame_idx), {})
                 frame_clicks[int(obj_id)] = []
             state.pending_box_start = (int(x), int(y))
             state.pending_box_start_frame_idx = int(frame_idx)
             state.pending_box_start_obj_id = int(obj_id)
+            # Invalidate cache so temporary cross is drawn
             state.composited_frames.pop(int(frame_idx), None)
             return update_frame_display(state, int(frame_idx))
         else:
             x1, y1 = state.pending_box_start
             x2, y2 = int(x), int(y)
+            # Clear temporary state and invalidate cache
             state.pending_box_start = None
             state.pending_box_start_frame_idx = None
             state.pending_box_start_obj_id = None
             obj_boxes.append((x_min, y_min, x_max, y_max))
             state.composited_frames.pop(int(frame_idx), None)
     else:
+        # Points mode
         label_int = 1 if str(label).lower().startswith("pos") else 0
+        # If clear_old is enabled, clear prior boxes for this object on this frame
         if bool(clear_old):
             frame_boxes = state.boxes_by_frame_obj.setdefault(int(frame_idx), {})
             frame_boxes[int(obj_id)] = []
             input_labels=[[[int(label_int)]]],
             clear_old_inputs=bool(clear_old),
         )
         frame_clicks = state.clicks_by_frame_obj.setdefault(int(frame_idx), {})
         obj_clicks = frame_clicks.setdefault(int(obj_id), [])
         if bool(clear_old):
         obj_clicks.append((int(x), int(y), int(label_int)))
         state.composited_frames.pop(int(frame_idx), None)
+    # Forward on that frame
+    device_type = "cuda" if GLOBAL_STATE.device == "cuda" else "cpu"
+    with torch.inference_mode(), torch.autocast(device_type=device_type, dtype=GLOBAL_STATE.dtype):
+        outputs = model(
+            inference_session=inference_session,
+            frame_idx=int(frame_idx),
+        )
     H = inference_session.video_height
     W = inference_session.video_width
+    # Detach and move off GPU as early as possible to reduce GPU memory pressure
     pred_masks = outputs.pred_masks.detach().cpu()
     video_res_masks = processor.post_process_masks([pred_masks], original_sizes=[[H, W]])[0]
+    # Map returned masks to object ids. For single object forward, it's [1, 1, H, W]
+    # But to be safe, iterate over session.obj_ids order.
     masks_for_frame: dict[int, np.ndarray] = {}
     obj_ids_order = list(inference_session.obj_ids)
     for i, oid in enumerate(obj_ids_order):
         mask_i = video_res_masks[i]
+        # mask_i shape could be (1, H, W) or (H, W); squeeze to 2D
         mask_2d = mask_i.cpu().numpy().squeeze()
         masks_for_frame[int(oid)] = mask_2d
     GLOBAL_STATE.masks_by_frame[int(frame_idx)] = masks_for_frame
+    # Invalidate cache for this frame to force recomposition
     GLOBAL_STATE.composited_frames.pop(int(frame_idx), None)
+    # Return updated preview
     return update_frame_display(GLOBAL_STATE, int(frame_idx))
     if state is None or state.inference_session is None:
         yield "Load a video first."
         return
     processor = GLOBAL_STATE.processor
     model = GLOBAL_STATE.model
     inference_session = GLOBAL_STATE.inference_session
     total = max(1, GLOBAL_STATE.num_frames)
     processed = 0
+    # Initial status for first run visibility
     yield f"Propagating masks: {processed}/{total}"
+    device_type = "cuda" if GLOBAL_STATE.device == "cuda" else "cpu"
+    with torch.inference_mode(), torch.autocast(device_type=device_type, dtype=GLOBAL_STATE.dtype):
         for sam2_video_output in model.propagate_in_video_iterator(inference_session):
             H = inference_session.video_height
             W = inference_session.video_width
             pred_masks = sam2_video_output.pred_masks.detach().cpu()
             video_res_masks = processor.post_process_masks([pred_masks], original_sizes=[[H, W]])[0]
             frame_idx = int(sam2_video_output.frame_idx)
             masks_for_frame: dict[int, np.ndarray] = {}
             obj_ids_order = list(inference_session.obj_ids)
                 mask_2d = video_res_masks[i].cpu().numpy().squeeze()
                 masks_for_frame[int(oid)] = mask_2d
             GLOBAL_STATE.masks_by_frame[frame_idx] = masks_for_frame
+            # Invalidate cache for that frame to force recomposition
             GLOBAL_STATE.composited_frames.pop(frame_idx, None)
             processed += 1
             progress((processed, total), f"Propagating masks: {processed}/{total}")
+            # Stream status updates so users see progress text
             yield f"Propagating masks: {processed}/{total}"
     yield f"Propagated masks across {processed} frames for {len(inference_session.obj_ids)} objects."
+def reset_session() -> tuple[AppState, Image.Image, int, int, str]:
+    # Reset only session-related state, keep uploaded video and model
     if not GLOBAL_STATE.video_frames:
+        # Nothing loaded; keep behavior
         return GLOBAL_STATE, None, 0, 0, "Session reset. Load a new video."
+    # Clear prompts and caches
     GLOBAL_STATE.masks_by_frame.clear()
     GLOBAL_STATE.clicks_by_frame_obj.clear()
     GLOBAL_STATE.boxes_by_frame_obj.clear()
     GLOBAL_STATE.pending_box_start = None
     GLOBAL_STATE.pending_box_start_frame_idx = None
     GLOBAL_STATE.pending_box_start_obj_id = None
+    # Dispose and re-init inference session for current model with existing frames
     try:
         if GLOBAL_STATE.inference_session is not None:
             GLOBAL_STATE.inference_session.reset_inference_session()
         pass
     GLOBAL_STATE.inference_session = None
     gc.collect()
+    try:
+        if torch.cuda.is_available():
+            torch.cuda.empty_cache()
+    except Exception:
+        pass
     ensure_session_for_current_model()
+    # Keep current slider index if possible
     current_idx = int(getattr(GLOBAL_STATE, "current_frame_idx", 0))
     current_idx = max(0, min(current_idx, GLOBAL_STATE.num_frames - 1))
     preview_img = update_frame_display(GLOBAL_STATE, current_idx)
     return GLOBAL_STATE, preview_img, slider_minmax, slider_value, status
+with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation") as demo:
     state = gr.State(GLOBAL_STATE)
+    gr.Markdown("""
+    **SAM2 Video (Transformers)** — Upload a video, click to add positive/negative points per object, preview masks on the clicked frame, then propagate across the video. Use the slider to scrub frames.
+    """)
     with gr.Row():
         with gr.Column(scale=1):
             load_status = gr.Markdown(visible=True)
             reset_btn = gr.Button("Reset Session", variant="secondary")
             examples_list = [
+                ["/home/ubuntu/models_implem/tennis.mp4"],
+                ["/home/ubuntu/models_implem/tennis.mp4"],
             ]
         with gr.Column(scale=2):
             preview = gr.Image(label="Preview", interactive=True)
         render_btn = gr.Button("Render MP4 for smooth playback")
     playback_video = gr.Video(label="Rendered Playback", interactive=False)
+    # Wire events
     def _on_video_change(video):
         """Start a session for the selected video and refresh the dependent widgets.

         Delegates to ``init_video_session`` and repackages its result as the
         four values wired to ``video_in.change``: the app state, a slider
         update spanning the returned index range (positioned on the minimum
         index), the first frame for the preview, and the status text.
         """
         session_state, lo_idx, hi_idx, first_frame, status_text = init_video_session(video)
         # Re-range the frame slider for the new video and jump back to its start.
         slider_update = gr.update(minimum=lo_idx, maximum=hi_idx, value=lo_idx, interactive=True)
         return session_state, slider_update, first_frame, status_text
     video_in.change(
+        _on_video_change,
+        inputs=[video_in],
+        outputs=[state, frame_slider, preview, load_status],
+        show_progress=True,
     )
     gr.Examples(
         examples=examples_list,
         inputs=[video_in],
         if s is not None and key:
             key = str(key)
             if key != s.model_repo_key:
+                # Update and drop current model to reload lazily next time
                 s.is_switching_model = True
                 s.model_repo_key = key
                 s.model_repo_id = None
                 s.model = None
                 s.processor = None
+        # Stream progress text while loading (first yield shows text)
         yield gr.update(visible=True, value=f"Loading checkpoint: {key}...")
         ensure_session_for_current_model()
         if s is not None:
             s.is_switching_model = False
+        # Final yield hides the text
         yield gr.update(visible=False, value="")
     ckpt_radio.change(_on_ckpt_change, inputs=[state, ckpt_radio], outputs=[ckpt_progress])
+    # Also retrigger session re-init if a video already loaded
     def _rebind_session_after_ckpt(s: AppState):
         """Re-initialise the inference session after a checkpoint change.

         Calls ``ensure_session_for_current_model`` and then discards any
         half-entered box (first corner clicked, second corner still pending)
         so the next click starts a fresh box.

         Args:
             s: Current application state; may be ``None``.

         Returns:
             A no-op ``gr.update()`` for the bound output component.
         """
         ensure_session_for_current_model()
         if s is not None:
             # The composite cached for the pending frame still shows the
             # temporary corner cross; drop it so the stale marker is not served.
             pending_frame_idx = getattr(s, "pending_box_start_frame_idx", None)
             if pending_frame_idx is not None:
                 s.composited_frames.pop(int(pending_frame_idx), None)
             # Reset all three pending-box fields together, as the click handler
             # and the session reset do, so they can never disagree.
             s.pending_box_start = None
             s.pending_box_start_frame_idx = None
             s.pending_box_start_obj_id = None
         return gr.update()
             state_in.current_frame_idx = int(idx)
         return update_frame_display(state_in, int(idx))
+    frame_slider.change(
+        _sync_frame_idx,
+        inputs=[state, frame_slider],
+        outputs=preview,
+    )
     def _sync_obj_id(s: AppState, oid):
         if s is not None and oid is not None:
     prompt_type.change(_sync_prompt_type, inputs=[state, prompt_type], outputs=[label_radio])
+    # Image click to add a point and run forward on that frame
     preview.select(on_image_click, [preview, state, frame_slider, obj_id_inp, label_radio, clear_old_chk], preview)
+    # Playback via MP4 rendering only
+    # Render a smooth MP4 using imageio/pyav (falls back to imageio v2 / OpenCV)
     def _render_video(s: AppState):
         """Render all composited frames of the session to an MP4 and return its path.

         Encoders are tried in order: imageio v3 with the PyAV plugin, imageio
         v2 ``mimsave``, then OpenCV's ``VideoWriter`` with the "mp4v" fourcc.

         Args:
             s: Application state providing ``num_frames``, ``video_fps`` and the
                 ``composited_frames`` cache.

         Returns:
             Path of the written MP4 file.

         Raises:
             gr.Error: If no video is loaded, or if every encoder fails.
         """
         if s is None or s.num_frames == 0:
             raise gr.Error("Load a video first.")
         fps = s.video_fps if s.video_fps and s.video_fps > 0 else 12

         # Compose all frames (cache will help if already prepared). Frames are
         # kept in RGB, which is what is handed to imageio; only the OpenCV
         # fallback needs BGR, so the channel flip is done there.
         frames_rgb = []
         for idx in range(s.num_frames):
             img = s.composited_frames.get(idx)
             if img is None:
                 img = compose_frame(s, idx)
             frames_rgb.append(np.array(img))
             # Periodically release CPU mem to reduce pressure
             if (idx + 1) % 60 == 0:
                 gc.collect()

         out_path = "/tmp/sam2_playback.mp4"

         # Prefer imageio with PyAV/ffmpeg to respect exact fps
         try:
             import imageio.v3 as iio  # type: ignore

             iio.imwrite(out_path, frames_rgb, plugin="pyav", fps=fps)
             return out_path
         except Exception:
             # Best-effort: fall through to the next encoder.
             pass

         try:
             import imageio.v2 as imageio  # type: ignore

             imageio.mimsave(out_path, frames_rgb, fps=fps)
             return out_path
         except Exception:
             # Best-effort: fall through to the OpenCV encoder.
             pass

         try:
             import cv2  # type: ignore

             h, w = frames_rgb[0].shape[:2]
             fourcc = cv2.VideoWriter_fourcc(*"mp4v")
             writer = cv2.VideoWriter(out_path, fourcc, fps, (w, h))
             try:
                 # A writer that failed to open would otherwise be written to
                 # anyway and its output path returned as if rendering succeeded.
                 if not writer.isOpened():
                     raise RuntimeError("OpenCV could not open the video writer")
                 for fr_rgb in frames_rgb:
                     # RGB -> BGR for cv2, as a contiguous buffer.
                     writer.write(np.ascontiguousarray(fr_rgb[:, :, ::-1]))
             finally:
                 # Always release so the file handle is closed even on failure.
                 writer.release()
             return out_path
         except Exception as e:
             raise gr.Error(f"Failed to render video: {e}") from e
     render_btn.click(_render_video, inputs=[state], outputs=[playback_video])