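"""Interactive SAM2 video segmentation demo (Gradio + Hugging Face Transformers).

Upload a video, prompt objects with point clicks or two-click boxes on any frame,
preview the predicted masks, and propagate them across the whole clip.
"""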
import colorsys
import gc
from typing import Optional

import gradio as gr
import numpy as np
import spaces
import torch
from PIL import Image, ImageDraw

from transformers import Sam2VideoModel, Sam2VideoProcessor


def pastel_color_for_object(obj_id: int) -> tuple[int, int, int]:
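    """Deterministic pastel RGB for an object id; hues are spaced by the golden-ratio conjugate."""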
    golden_ratio_conjugate = 0.61803398875
    hue = (obj_id * golden_ratio_conjugate) % 1.0
    saturation = 0.45
    value = 1.0
    r_f, g_f, b_f = colorsys.hsv_to_rgb(hue, saturation, value)
    return int(r_f * 255), int(g_f * 255), int(b_f * 255)


def try_load_video_frames(video_path_or_url: str) -> tuple[list[Image.Image], dict]:
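    """Decode a video into RGB PIL frames plus a metadata dict.

    Prefers transformers' load_video helper and falls back to OpenCV; in both paths the
    returned info dict carries an "fps" entry when the source frame rate is recoverable.
    """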
    try:
        from transformers.video_utils import load_video  # type: ignore

        frames, info = load_video(video_path_or_url)
        pil_frames = []
        for fr in frames:
            if isinstance(fr, Image.Image):
                pil_frames.append(fr.convert("RGB"))
            else:
                pil_frames.append(Image.fromarray(fr).convert("RGB"))
        info = info if info is not None else {}
        if "fps" not in info or not info.get("fps"):
            try:
                import cv2  # type: ignore

                cap = cv2.VideoCapture(video_path_or_url)
                fps_val = cap.get(cv2.CAP_PROP_FPS)
                cap.release()
                if fps_val and fps_val > 0:
                    info["fps"] = float(fps_val)
            except Exception:
                pass
        return pil_frames, info
    except Exception:
        try:
            import cv2  # type: ignore

            cap = cv2.VideoCapture(video_path_or_url)
            frames = []
            while cap.isOpened():
                ret, frame = cap.read()
                if not ret:
                    break
                frame_rgb = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
                frames.append(Image.fromarray(frame_rgb))
            fps_val = cap.get(cv2.CAP_PROP_FPS)
            cap.release()
            info = {
                "num_frames": len(frames),
                "fps": float(fps_val) if fps_val and fps_val > 0 else None,
            }
            return frames, info
        except Exception as e:
            raise RuntimeError(f"Failed to load video: {e}") from e


def overlay_masks_on_frame(
    frame: Image.Image,
    masks_per_object: dict[int, np.ndarray],
    color_by_obj: dict[int, tuple[int, int, int]],
    alpha: float = 0.65,
) -> Image.Image:
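    """Alpha-blend each object's mask over the frame in that object's color."""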
    base = np.array(frame).astype(np.float32) / 255.0
    overlay = base.copy()
    for obj_id, mask in masks_per_object.items():
        if mask is None:
            continue
        if mask.dtype != np.float32:
            mask = mask.astype(np.float32)
        if mask.ndim == 3:
            mask = mask.squeeze()
        mask = np.clip(mask, 0.0, 1.0)
        color = np.array(color_by_obj.get(obj_id, (255, 0, 0)), dtype=np.float32) / 255.0
        m = mask[..., None]
        overlay = (1.0 - alpha * m) * overlay + (alpha * m) * color
    out = np.clip(overlay * 255.0, 0, 255).astype(np.uint8)
    return Image.fromarray(out)


def get_device_and_dtype() -> tuple[str, torch.dtype]:
    # Force CPU-only on Spaces with zero GPU
    return "cpu", torch.float32


class AppState:
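    """Global mutable session state: video frames, model handles, prompts, masks, and render caches."""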
    def __init__(self):
        self.reset()

    def reset(self):
        self.video_frames: list[Image.Image] = []
        self.inference_session = None
        self.model: Optional[Sam2VideoModel] = None
        self.processor: Optional[Sam2VideoProcessor] = None
        self.device: str = "cpu"
        self.dtype: torch.dtype = torch.float32
        self.video_fps: float | None = None
        self.masks_by_frame: dict[int, dict[int, np.ndarray]] = {}
        self.color_by_obj: dict[int, tuple[int, int, int]] = {}
        self.clicks_by_frame_obj: dict[int, dict[int, list[tuple[int, int, int]]]] = {}
        self.boxes_by_frame_obj: dict[int, dict[int, list[tuple[int, int, int, int]]]] = {}
        self.composited_frames: dict[int, Image.Image] = {}
        self.current_frame_idx: int = 0
        self.current_obj_id: int = 1
        self.current_label: str = "positive"
        self.current_clear_old: bool = True
        self.current_prompt_type: str = "Points"
        self.pending_box_start: tuple[int, int] | None = None
        self.pending_box_start_frame_idx: int | None = None
        self.pending_box_start_obj_id: int | None = None
        self.is_switching_model: bool = False
        self.model_repo_key: str = "tiny"
        self.model_repo_id: str | None = None
        self.session_repo_id: str | None = None

    @property
    def num_frames(self) -> int:
        return len(self.video_frames)


GLOBAL_STATE = AppState()


def _model_repo_from_key(key: str) -> str:
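    """Map a short checkpoint key to its Hugging Face repo id, defaulting to base_plus."""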
    mapping = {
        "tiny": "yonigozlan/sam2.1_hiera_tiny_hf",
        "small": "yonigozlan/sam2.1_hiera_small_hf",
        "base_plus": "yonigozlan/sam2.1_hiera_base_plus_hf",
        "large": "yonigozlan/sam2.1_hiera_large_hf",
    }
    return mapping.get(key, mapping["base_plus"])


@spaces.GPU()
def load_model_if_needed() -> tuple[Sam2VideoModel, Sam2VideoProcessor, str, torch.dtype]:
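    """Return cached (model, processor, device, dtype), reloading only when the selected checkpoint changed."""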
    desired_repo = _model_repo_from_key(GLOBAL_STATE.model_repo_key)
    if GLOBAL_STATE.model is not None and GLOBAL_STATE.processor is not None:
        if GLOBAL_STATE.model_repo_id == desired_repo:
            return GLOBAL_STATE.model, GLOBAL_STATE.processor, GLOBAL_STATE.device, GLOBAL_STATE.dtype
        try:
            del GLOBAL_STATE.model
        except Exception:
            pass
        try:
            del GLOBAL_STATE.processor
        except Exception:
            pass
        GLOBAL_STATE.model = None
        GLOBAL_STATE.processor = None

    device, dtype = get_device_and_dtype()
    model = Sam2VideoModel.from_pretrained(desired_repo, torch_dtype=dtype)
    processor = Sam2VideoProcessor.from_pretrained(desired_repo)
    model.to(device)
    GLOBAL_STATE.model = model
    GLOBAL_STATE.processor = processor
    GLOBAL_STATE.device = device
    GLOBAL_STATE.dtype = dtype
    GLOBAL_STATE.model_repo_id = desired_repo
    return model, processor, device, dtype


def ensure_session_for_current_model() -> None:
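    """Rebuild the video inference session when the active checkpoint changed; this drops all prompts and masks."""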
    model, processor, device, dtype = load_model_if_needed()
    desired_repo = _model_repo_from_key(GLOBAL_STATE.model_repo_key)
    if GLOBAL_STATE.inference_session is None or GLOBAL_STATE.session_repo_id != desired_repo:
        if GLOBAL_STATE.video_frames:
            GLOBAL_STATE.masks_by_frame.clear()
            GLOBAL_STATE.clicks_by_frame_obj.clear()
            GLOBAL_STATE.boxes_by_frame_obj.clear()
            GLOBAL_STATE.composited_frames.clear()
            try:
                if GLOBAL_STATE.inference_session is not None:
                    GLOBAL_STATE.inference_session.reset_inference_session()
            except Exception:
                pass
            GLOBAL_STATE.inference_session = None
            gc.collect()
            GLOBAL_STATE.inference_session = processor.init_video_session(
                video=GLOBAL_STATE.video_frames,
                inference_device=device,
                video_storage_device="cpu",
            )
            GLOBAL_STATE.session_repo_id = desired_repo


def init_video_session(video: str | dict):
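    """Load the uploaded video, reset per-video state, and open a fresh inference session."""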
    GLOBAL_STATE.video_frames = []
    GLOBAL_STATE.inference_session = None
    GLOBAL_STATE.masks_by_frame = {}
    GLOBAL_STATE.color_by_obj = {}

    load_model_if_needed()

    video_path: Optional[str] = None
    if isinstance(video, dict):
        video_path = video.get("name") or video.get("path") or video.get("data")
    elif isinstance(video, str):
        video_path = video
    else:
        video_path = None
    if not video_path:
        raise gr.Error("Invalid video input.")

    frames, info = try_load_video_frames(video_path)
    if len(frames) == 0:
        raise gr.Error("No frames could be loaded from the video.")

    GLOBAL_STATE.video_frames = frames
    GLOBAL_STATE.video_fps = None
    if isinstance(info, dict) and info.get("fps"):
        try:
            GLOBAL_STATE.video_fps = float(info["fps"]) or None
        except Exception:
            GLOBAL_STATE.video_fps = None

    processor = GLOBAL_STATE.processor
    device = GLOBAL_STATE.device
    inference_session = processor.init_video_session(
        video=frames,
        inference_device=device,
        video_storage_device="cpu",
    )
    GLOBAL_STATE.inference_session = inference_session
    # Record which checkpoint this session was built with so it is not needlessly rebuilt later.
    GLOBAL_STATE.session_repo_id = GLOBAL_STATE.model_repo_id

    first_frame = frames[0]
    max_idx = len(frames) - 1
    status = f"Loaded {len(frames)} frames @ {GLOBAL_STATE.video_fps or 'unknown'} fps. Device: {device}, dtype: {GLOBAL_STATE.dtype}"
    return GLOBAL_STATE, 0, max_idx, first_frame, status


def compose_frame(state: AppState, frame_idx: int) -> Image.Image | None:
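    """Render a frame with mask overlays, point crosses, and box outlines; caches and returns the composite."""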
    if state is None or state.video_frames is None or len(state.video_frames) == 0:
        return None
    frame_idx = int(np.clip(frame_idx, 0, len(state.video_frames) - 1))
    frame = state.video_frames[frame_idx]
    masks = state.masks_by_frame.get(frame_idx, {})
    # Work on a copy so the ImageDraw calls below never mutate the cached original frame.
    out_img = frame.copy()
    if len(masks) != 0:
        out_img = overlay_masks_on_frame(out_img, masks, state.color_by_obj, alpha=0.65)

    clicks_map = state.clicks_by_frame_obj.get(frame_idx)
    if clicks_map:
        draw = ImageDraw.Draw(out_img)
        cross_half = 6
        for obj_id, pts in clicks_map.items():
            for x, y, lbl in pts:
                color = (0, 255, 0) if int(lbl) == 1 else (255, 0, 0)
                draw.line([(x - cross_half, y), (x + cross_half, y)], fill=color, width=2)
                draw.line([(x, y - cross_half), (x, y + cross_half)], fill=color, width=2)

    box_map = state.boxes_by_frame_obj.get(frame_idx)
    if box_map:
        draw = ImageDraw.Draw(out_img)
        for obj_id, boxes in box_map.items():
            color = state.color_by_obj.get(obj_id, (255, 255, 255))
            for x1, y1, x2, y2 in boxes:
                draw.rectangle([(x1, y1), (x2, y2)], outline=color, width=2)

    if (
        state.pending_box_start is not None
        and state.pending_box_start_frame_idx == frame_idx
        and state.pending_box_start_obj_id is not None
    ):
        draw = ImageDraw.Draw(out_img)
        x, y = state.pending_box_start
        cross_half = 6
        color = state.color_by_obj.get(state.pending_box_start_obj_id, (255, 255, 255))
        draw.line([(x - cross_half, y), (x + cross_half, y)], fill=color, width=2)
        draw.line([(x, y - cross_half), (x, y + cross_half)], fill=color, width=2)

    state.composited_frames[frame_idx] = out_img
    return out_img


def update_frame_display(state: AppState, frame_idx: int) -> Image.Image | None:
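    """Return the cached composite for frame_idx, composing it on demand."""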
    if state is None or state.video_frames is None or len(state.video_frames) == 0:
        return None
    frame_idx = int(np.clip(frame_idx, 0, len(state.video_frames) - 1))
    cached = state.composited_frames.get(frame_idx)
    if cached is not None:
        return cached
    return compose_frame(state, frame_idx)


def _ensure_color_for_obj(obj_id: int):
    if obj_id not in GLOBAL_STATE.color_by_obj:
        GLOBAL_STATE.color_by_obj[obj_id] = pastel_color_for_object(obj_id)


@spaces.GPU()
def on_image_click(
    img: Image.Image | np.ndarray,
    state: AppState,
    frame_idx: int,
    obj_id: int,
    label: str,
    clear_old: bool,
    evt: gr.SelectData,
):
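    """Handle a preview click: add a point prompt or one corner of a two-click box,
    run the model on this frame, and return the refreshed overlay.
    """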
    if state is None or state.inference_session is None:
        return img
    if state.is_switching_model:
        return update_frame_display(state, int(frame_idx))

    x = y = None
    if evt is not None:
        try:
            if hasattr(evt, "index") and isinstance(evt.index, (list, tuple)) and len(evt.index) == 2:
                x, y = int(evt.index[0]), int(evt.index[1])
            elif hasattr(evt, "value") and isinstance(evt.value, dict) and "x" in evt.value and "y" in evt.value:
                x, y = int(evt.value["x"]), int(evt.value["y"])
        except Exception:
            x = y = None
    if x is None or y is None:
        return update_frame_display(state, int(frame_idx))

    _ensure_color_for_obj(int(obj_id))
    processor = GLOBAL_STATE.processor
    model = GLOBAL_STATE.model
    inference_session = GLOBAL_STATE.inference_session

    if state.current_prompt_type == "Boxes":
        if state.pending_box_start is None:
            if bool(clear_old):
                frame_clicks = state.clicks_by_frame_obj.setdefault(int(frame_idx), {})
                frame_clicks[int(obj_id)] = []
                state.composited_frames.pop(int(frame_idx), None)
            state.pending_box_start = (int(x), int(y))
            state.pending_box_start_frame_idx = int(frame_idx)
            state.pending_box_start_obj_id = int(obj_id)
            state.composited_frames.pop(int(frame_idx), None)
            return update_frame_display(state, int(frame_idx))
        else:
            x1, y1 = state.pending_box_start
            x2, y2 = int(x), int(y)
            state.pending_box_start = None
            state.pending_box_start_frame_idx = None
            state.pending_box_start_obj_id = None
            state.composited_frames.pop(int(frame_idx), None)
            x_min, y_min = min(x1, x2), min(y1, y2)
            x_max, y_max = max(x1, x2), max(y1, y2)

            processor.add_inputs_to_inference_session(
                inference_session=inference_session,
                frame_idx=int(frame_idx),
                obj_ids=int(obj_id),
                input_boxes=[[[x_min, y_min, x_max, y_max]]],
                clear_old_inputs=bool(clear_old),
            )

            frame_boxes = state.boxes_by_frame_obj.setdefault(int(frame_idx), {})
            obj_boxes = frame_boxes.setdefault(int(obj_id), [])
            if bool(clear_old):
                obj_boxes.clear()
            obj_boxes.append((x_min, y_min, x_max, y_max))
            state.composited_frames.pop(int(frame_idx), None)
    else:
        label_int = 1 if str(label).lower().startswith("pos") else 0
        if bool(clear_old):
            frame_boxes = state.boxes_by_frame_obj.setdefault(int(frame_idx), {})
            frame_boxes[int(obj_id)] = []
            state.composited_frames.pop(int(frame_idx), None)
        processor.add_inputs_to_inference_session(
            inference_session=inference_session,
            frame_idx=int(frame_idx),
            obj_ids=int(obj_id),
            input_points=[[[[int(x), int(y)]]]],
            input_labels=[[[int(label_int)]]],
            clear_old_inputs=bool(clear_old),
        )
        frame_clicks = state.clicks_by_frame_obj.setdefault(int(frame_idx), {})
        obj_clicks = frame_clicks.setdefault(int(obj_id), [])
        if bool(clear_old):
            obj_clicks.clear()
        obj_clicks.append((int(x), int(y), int(label_int)))
        state.composited_frames.pop(int(frame_idx), None)

    with torch.inference_mode():
        outputs = model(inference_session=inference_session, frame_idx=int(frame_idx))

    H = inference_session.video_height
    W = inference_session.video_width
    pred_masks = outputs.pred_masks.detach().cpu()
    video_res_masks = processor.post_process_masks([pred_masks], original_sizes=[[H, W]])[0]
    masks_for_frame: dict[int, np.ndarray] = {}
    obj_ids_order = list(inference_session.obj_ids)
    for i, oid in enumerate(obj_ids_order):
        mask_i = video_res_masks[i]
        mask_2d = mask_i.cpu().numpy().squeeze()
        masks_for_frame[int(oid)] = mask_2d
    GLOBAL_STATE.masks_by_frame[int(frame_idx)] = masks_for_frame
    GLOBAL_STATE.composited_frames.pop(int(frame_idx), None)
    return update_frame_display(GLOBAL_STATE, int(frame_idx))


@spaces.GPU()
def propagate_masks(state: AppState, progress=gr.Progress()):
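    """Propagate the current prompts across the whole video, yielding progress strings for the UI."""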
    if state is None or state.inference_session is None:
        yield "Load a video first."
        return
    processor = GLOBAL_STATE.processor
    model = GLOBAL_STATE.model
    inference_session = GLOBAL_STATE.inference_session
    total = max(1, GLOBAL_STATE.num_frames)
    processed = 0
    yield f"Propagating masks: {processed}/{total}"
    with torch.inference_mode():
        for sam2_video_output in model.propagate_in_video_iterator(inference_session):
            H = inference_session.video_height
            W = inference_session.video_width
            pred_masks = sam2_video_output.pred_masks.detach().cpu()
            video_res_masks = processor.post_process_masks([pred_masks], original_sizes=[[H, W]])[0]
            frame_idx = int(sam2_video_output.frame_idx)
            masks_for_frame: dict[int, np.ndarray] = {}
            obj_ids_order = list(inference_session.obj_ids)
            for i, oid in enumerate(obj_ids_order):
                mask_2d = video_res_masks[i].cpu().numpy().squeeze()
                masks_for_frame[int(oid)] = mask_2d
            GLOBAL_STATE.masks_by_frame[frame_idx] = masks_for_frame
            GLOBAL_STATE.composited_frames.pop(frame_idx, None)
            processed += 1
            progress((processed, total), f"Propagating masks: {processed}/{total}")
            yield f"Propagating masks: {processed}/{total}"
    yield f"Propagated masks across {processed} frames for {len(inference_session.obj_ids)} objects."


def reset_session():
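    """Clear all prompts, masks, and cached composites while keeping the loaded video."""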
    if not GLOBAL_STATE.video_frames:
        return GLOBAL_STATE, None, 0, 0, "Session reset. Load a new video."
    GLOBAL_STATE.masks_by_frame.clear()
    GLOBAL_STATE.clicks_by_frame_obj.clear()
    GLOBAL_STATE.boxes_by_frame_obj.clear()
    GLOBAL_STATE.composited_frames.clear()
    GLOBAL_STATE.pending_box_start = None
    GLOBAL_STATE.pending_box_start_frame_idx = None
    GLOBAL_STATE.pending_box_start_obj_id = None
    try:
        if GLOBAL_STATE.inference_session is not None:
            GLOBAL_STATE.inference_session.reset_inference_session()
    except Exception:
        pass
    GLOBAL_STATE.inference_session = None
    gc.collect()
    ensure_session_for_current_model()
    current_idx = int(getattr(GLOBAL_STATE, "current_frame_idx", 0))
    current_idx = max(0, min(current_idx, GLOBAL_STATE.num_frames - 1))
    preview_img = update_frame_display(GLOBAL_STATE, current_idx)
    slider_minmax = gr.update(minimum=0, maximum=max(GLOBAL_STATE.num_frames - 1, 0), interactive=True)
    slider_value = gr.update(value=current_idx)
    status = "Session reset. Prompts cleared; video preserved."
    return GLOBAL_STATE, preview_img, slider_minmax, slider_value, status


with gr.Blocks(title="SAM2 Video (Transformers) - Interactive Segmentation (CPU)") as demo:
    state = gr.State(GLOBAL_STATE)

    gr.Markdown(
        """
    **SAM2 Video (Transformers)** — CPU-only Space. Upload a video, click to add positive/negative points per object or draw two-click boxes, preview masks, then propagate across the video. Use the slider to scrub frames.
    """
    )

    with gr.Row():
        with gr.Column(scale=1):
            video_in = gr.Video(label="Upload video", sources=["upload", "webcam"], interactive=True)
            ckpt_radio = gr.Radio(
                choices=["tiny", "small", "base_plus", "large"],
                value="tiny",
                label="SAM2 checkpoint",
            )
            ckpt_progress = gr.Markdown(visible=False)
            load_status = gr.Markdown(visible=True)
            reset_btn = gr.Button("Reset Session", variant="secondary")
            examples_list = [
                ["./tennis.mp4"],
            ]
        with gr.Column(scale=2):
            preview = gr.Image(label="Preview", interactive=True)
            frame_slider = gr.Slider(label="Frame", minimum=0, maximum=0, step=1, value=0, interactive=True)

    with gr.Row():
        obj_id_inp = gr.Number(value=1, precision=0, label="Object ID")
        label_radio = gr.Radio(choices=["positive", "negative"], value="positive", label="Point label")
        clear_old_chk = gr.Checkbox(value=True, label="Clear old inputs for this object")
        prompt_type = gr.Radio(choices=["Points", "Boxes"], value="Points", label="Prompt type")
        with gr.Column():
            propagate_btn = gr.Button("Propagate across video", variant="primary")
            propagate_status = gr.Markdown(visible=True)

    with gr.Row():
        render_btn = gr.Button("Render MP4 for smooth playback")
    playback_video = gr.Video(label="Rendered Playback", interactive=False)

    def _on_video_change(video):
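        """Initialize a session for the newly selected video and resize the frame slider."""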
        s, min_idx, max_idx, first_frame, status = init_video_session(video)
        return s, gr.update(minimum=min_idx, maximum=max_idx, value=min_idx, interactive=True), first_frame, status

    video_in.change(
        _on_video_change, inputs=[video_in], outputs=[state, frame_slider, preview, load_status], show_progress=True
    )
    gr.Examples(
        examples=examples_list,
        inputs=[video_in],
        fn=_on_video_change,
        outputs=[state, frame_slider, preview, load_status],
        label="Examples",
        cache_examples=False,
        examples_per_page=5,
    )

    def _on_ckpt_change(s: AppState, key: str):
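        """Switch checkpoints: invalidate the cached model, show a loading notice, and rebuild the session."""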
        if s is not None and key:
            key = str(key)
            if key != s.model_repo_key:
                s.is_switching_model = True
                s.model_repo_key = key
                s.model_repo_id = None
                s.model = None
                s.processor = None
        yield gr.update(visible=True, value=f"Loading checkpoint: {key}...")
        ensure_session_for_current_model()
        if s is not None:
            s.is_switching_model = False
        yield gr.update(visible=False, value="")

    ckpt_radio.change(_on_ckpt_change, inputs=[state, ckpt_radio], outputs=[ckpt_progress])

    def _rebind_session_after_ckpt(s: AppState):
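        """Companion handler to _on_ckpt_change: rebind the session and discard any half-drawn box."""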
        ensure_session_for_current_model()
        if s is not None:
            s.pending_box_start = None
        return gr.update()

    ckpt_radio.change(_rebind_session_after_ckpt, inputs=[state], outputs=[])

    def _sync_frame_idx(state_in: AppState, idx: int):
        if state_in is not None:
            state_in.current_frame_idx = int(idx)
        return update_frame_display(state_in, int(idx))

    frame_slider.change(_sync_frame_idx, inputs=[state, frame_slider], outputs=preview)

    def _sync_obj_id(s: AppState, oid):
        if s is not None and oid is not None:
            s.current_obj_id = int(oid)
        return gr.update()

    obj_id_inp.change(_sync_obj_id, inputs=[state, obj_id_inp], outputs=[])

    def _sync_label(s: AppState, lab: str):
        if s is not None and lab is not None:
            s.current_label = str(lab)
        return gr.update()

    label_radio.change(_sync_label, inputs=[state, label_radio], outputs=[])

    def _sync_prompt_type(s: AppState, val: str):
        if s is not None and val is not None:
            s.current_prompt_type = str(val)
            s.pending_box_start = None
        show_labels = str(val).lower() == "points"
        return gr.update(visible=show_labels)

    prompt_type.change(_sync_prompt_type, inputs=[state, prompt_type], outputs=[label_radio])

    preview.select(on_image_click, [preview, state, frame_slider, obj_id_inp, label_radio, clear_old_chk], preview)

    def _render_video(s: AppState):
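        """Compose every frame and encode an MP4 so playback is smooth in the browser."""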
        if s is None or s.num_frames == 0:
            raise gr.Error("Load a video first.")
        fps = s.video_fps if s.video_fps and s.video_fps > 0 else 12
        frames_np = []
        for idx in range(s.num_frames):
            img = s.composited_frames.get(idx)
            if img is None:
                img = compose_frame(s, idx)
            frames_np.append(np.array(img))  # keep frames in RGB; imageio expects RGB
            if (idx + 1) % 60 == 0:
                gc.collect()
        out_path = "/tmp/sam2_playback.mp4"
        try:
            import imageio.v3 as iio  # type: ignore

            iio.imwrite(out_path, frames_np, plugin="pyav", fps=fps)
            return out_path
        except Exception:
            try:
                import imageio.v2 as imageio  # type: ignore

                imageio.mimsave(out_path, frames_np, fps=fps)
                return out_path
            except Exception as e:
                raise gr.Error(f"Failed to render video: {e}")

    render_btn.click(_render_video, inputs=[state], outputs=[playback_video])

    propagate_btn.click(propagate_masks, inputs=[state], outputs=[propagate_status], show_progress=True)

    reset_btn.click(
        reset_session,
        inputs=None,
        outputs=[state, preview, frame_slider, frame_slider, load_status],
    )


demo.queue(api_open=False).launch()