Spaces:

jiani-huang
/

LASER

Running on Zero

File size: 22,056 Bytes

from pathlib import Path
from collections.abc import Mapping, Sequence
from functools import lru_cache
import inspect
import shutil
import tempfile
import os
import sys

# Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable
current_dir = Path(__file__).resolve().parent
src_dir = current_dir / "src"
if src_dir.is_dir() and str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))

import spaces  # <-- ZeroGPU integration
import gradio as gr
import torch
from transformers import pipeline  # not strictly necessary, but fine


# -----------------------------
# Environment / diagnostics
# -----------------------------
os.environ["GRADIO_TEMP_DIR"] = str(Path(__file__).parent / "gradio_temp")
os.environ["OPENAI_API_KEY"] = "test"
os.environ["OMP_NUM_THREADS"] = "4"

print("All imports finished")
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"cuDNN version: {torch.backends.cudnn.version()}")
print(f"Number of GPUs: {torch.cuda.device_count()}")

if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(
            f"  Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB"
        )

torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
os.environ["TORCH_DTYPE"] = "float32"
torch.set_default_dtype(torch.float32)

current_dir = Path(__file__).resolve().parent
# For Spaces, assume checkpoints live alongside app.py or in a "checkpoints" subdir.
# If you keep them next to app.py locally, this still works.
# NOTE: SAM2 config uses Hydra, so we use just the filename (it searches in sam2/configs/)
sam_config_path = "sam2_hiera_t.yaml"  # Hydra will find this in sam2/configs/
sam_checkpoint_path = str(current_dir / "sam2_hiera_tiny.pt")
gd_config_path = str(current_dir / "GroundingDINO_SwinT_OGC.py")
gd_checkpoint_path = str(current_dir / "groundingdino_swint_ogc.pth")
visualization_dir = str(current_dir / "outputs")
print(
    f"Setting up paths: {sam_config_path}, {sam_checkpoint_path}, {gd_config_path}, {gd_checkpoint_path}"
)


def _split_top_level_commas(s: str):
    """
    Split a string on commas that are NOT inside parentheses.

    Example:
      "behind(person, dog), bite(dog, frisbee)"
      -> ["behind(person, dog)", "bite(dog, frisbee)"]
    """
    parts = []
    buf = []
    depth = 0
    for ch in s:
        if ch == "(":
            depth += 1
            buf.append(ch)
        elif ch == ")":
            if depth > 0:
                depth -= 1
            buf.append(ch)
        elif ch == "," and depth == 0:
            part = "".join(buf).strip()
            if part:
                parts.append(part)
            buf = []
        else:
            buf.append(ch)
    if buf:
        part = "".join(buf).strip()
        if part:
            parts.append(part)
    return parts


def _extract_categories_from_binary(binary_keywords_str: str) -> list[str]:
    """
    Pull candidate category tokens from binary keyword strings, e.g. relation(a, b).
    Only returns tokens when parentheses and two comma-separated entries exist.
    """
    categories: list[str] = []
    for kw in _split_top_level_commas(binary_keywords_str or ""):
        lpar = kw.find("(")
        rpar = kw.rfind(")")
        if lpar == -1 or rpar <= lpar:
            continue
        inside = kw[lpar + 1 : rpar]
        parts = [p.strip() for p in inside.split(",") if p.strip()]
        if len(parts) == 2:
            categories.extend(parts)
    return categories


def _parse_binary_keywords(binary_keywords_str: str, categorical_keywords: list[str]):
    """
    Parse binary keyword string like:
        "behind(person, dog), bite(dog, frisbee)"
    into:
      - binary_keywords_list: list of raw strings (used as CLIP text)
      - batched_binary_predicates: {0: [(rel_text, from_cat, to_cat), ...]} or None
      - warnings: list of warning strings about invalid/mismatched categories
    """
    if not binary_keywords_str:
        return [], None, []

    cat_map = {
        kw.strip().lower(): kw.strip()
        for kw in categorical_keywords
        if isinstance(kw, str) and kw.strip()
    }

    entries = _split_top_level_commas(binary_keywords_str)
    binary_keywords_list: list[str] = []
    predicates: list[tuple[str, str, str]] = []
    warnings: list[str] = []

    for raw in entries:
        kw = raw.strip()
        if not kw:
            continue
        # Always use the full raw keyword as the CLIP text string
        binary_keywords_list.append(kw)

        lpar = kw.find("(")
        rpar = kw.rfind(")")
        if (lpar == -1 and rpar != -1) or (lpar != -1 and rpar == -1) or rpar < lpar:
            msg = (
                f"Binary keyword '{kw}' has mismatched parentheses; expected "
                "relation(from_category, to_category)."
            )
            print(msg)
            warnings.append(msg)
            continue

        if lpar == -1 or rpar <= lpar:
            # No explicit (from,to) part; treat as plain relation (no category filter)
            continue

        inside = kw[lpar + 1 : rpar]
        parts = inside.split(",")
        if len(parts) != 2:
            msg = (
                f"Ignoring '(from,to)' part in binary keyword '{kw}': "
                f"expected exactly two comma-separated items."
            )
            print(msg)
            warnings.append(msg)
            continue

        from_raw = parts[0].strip()
        to_raw = parts[1].strip()
        if not from_raw or not to_raw:
            msg = f"Ignoring binary keyword '{kw}': empty from/to category."
            print(msg)
            warnings.append(msg)
            continue

        canonical_from = cat_map.get(from_raw.lower())
        canonical_to = cat_map.get(to_raw.lower())

        if canonical_from is None:
            msg = (
                f"Binary keyword '{kw}': from-category '{from_raw}' does not "
                f"match any categorical keyword {categorical_keywords}."
            )
            print(msg)
            warnings.append(msg)
        if canonical_to is None:
            msg = (
                f"Binary keyword '{kw}': to-category '{to_raw}' does not "
                f"match any categorical keyword {categorical_keywords}."
            )
            print(msg)
            warnings.append(msg)

        if canonical_from is None or canonical_to is None:
            continue

        # Store (relation_text, from_category, to_category)
        predicates.append((kw, canonical_from, canonical_to))

    if not predicates:
        return binary_keywords_list, None, warnings

    return binary_keywords_list, {0: predicates}, warnings


@lru_cache(maxsize=1)
def _load_vine_pipeline():
    """
    Lazy-load and cache the LASER (VINE HF) pipeline so we don't re-download/rebuild it on every request.
    """
    from vine_hf import VineConfig, VineModel, VinePipeline

    config = VineConfig(
        segmentation_method="grounding_dino_sam2",
        model_name="openai/clip-vit-base-patch32",
        use_hf_repo=True,
        model_repo="KevinX-Penn28/testing",
        box_threshold=0.35,
        text_threshold=0.25,
        target_fps=1,  # default 1 FPS
        topk_cate=5,
        white_alpha=0.3,
        visualization_dir=visualization_dir,
        visualize=True,
        debug_visualizations=False,
        device="cuda",
        categorical_pool="max",
        auto_add_not_unary=False,  # UI will control this per-call
    )
    model = VineModel(config)
    return VinePipeline(
        model=model,
        tokenizer=None,
        sam_config_path=sam_config_path,
        sam_checkpoint_path=sam_checkpoint_path,
        gd_config_path=gd_config_path,
        gd_checkpoint_path=gd_checkpoint_path,
        device="cuda",
        trust_remote_code=True,
    )


@spaces.GPU(duration=120)  # Up to ~5 minutes of H200 ZeroGPU time per call
def process_video(
    video_file,
    categorical_keywords,
    unary_keywords,
    binary_keywords,
    auto_add_not_unary,
    output_fps,
    box_threshold,
    text_threshold,
    binary_confidence_threshold,
):
    vine_pipe = _load_vine_pipeline()

    # Normalize incoming video input to a file path
    if isinstance(video_file, dict):
        video_file = (
            video_file.get("name")
            or video_file.get("filepath")
            or video_file.get("data")
        )
    if not isinstance(video_file, (str, Path)):
        raise ValueError(f"Unsupported video input type: {type(video_file)}")

    video_path = Path(video_file)
    if video_path.suffix.lower() != ".mp4":
        msg = (
            "Please upload an MP4 file. LASER currently supports MP4 inputs for "
            "scene-graph generation."
        )
        print(msg)
        return None, {"error": msg}
    video_file = str(video_path)

    # Keep original strings for parsing
    categorical_keywords_str = categorical_keywords
    unary_keywords_str = unary_keywords
    binary_keywords_str = binary_keywords

    categorical_keywords = (
        [kw.strip() for kw in categorical_keywords_str.split(",")]
        if categorical_keywords_str
        else []
    )
    unary_keywords = (
        [kw.strip() for kw in unary_keywords_str.split(",")]
        if unary_keywords_str
        else []
    )

    # Preprocess: pull categories referenced in binary keywords and add any missing ones
    added_categories: list[str] = []
    extra_cats = _extract_categories_from_binary(binary_keywords_str or "")
    if extra_cats:
        existing_lower = {kw.lower() for kw in categorical_keywords}
        for cat in extra_cats:
            if cat and cat.lower() not in existing_lower:
                categorical_keywords.append(cat)
                existing_lower.add(cat.lower())
                added_categories.append(cat)

    # Parse binary keywords with category info (if provided)
    (
        binary_keywords_list,
        batched_binary_predicates,
        binary_input_warnings,
    ) = _parse_binary_keywords(binary_keywords_str or "", categorical_keywords)
    if added_categories:
        binary_input_warnings.append(
            "Auto-added categorical keywords from binary relations: "
            + ", ".join(added_categories)
        )

    skip_binary = len(binary_keywords_list) == 0

    # Debug: Print what we're sending to the pipeline
    print("\n" + "=" * 80)
    print("INPUT TO LASER PIPELINE:")
    print(f"  categorical_keywords: {categorical_keywords}")
    print(f"  unary_keywords: {unary_keywords}")
    print(f"  binary_keywords (raw parsed): {binary_keywords_list}")
    print(f"  batched_binary_predicates: {batched_binary_predicates}")
    print(f"  auto_add_not_unary: {auto_add_not_unary}")
    print(f"  skip_binary: {skip_binary}")
    print("=" * 80 + "\n")

    # Object pairs is now optional - empty list will auto-generate all pairs in vine_model.py
    object_pairs: list[tuple[int, int]] = []

    extra_forward_kwargs = {}
    if batched_binary_predicates is not None and not skip_binary:
        # Use category-based filtering of binary pairs
        extra_forward_kwargs["batched_binary_predicates"] = batched_binary_predicates
        extra_forward_kwargs["topk_cate"] = 1  # as requested

    extra_forward_kwargs["auto_add_not_unary"] = bool(auto_add_not_unary)
    if skip_binary:
        extra_forward_kwargs["disable_binary"] = True

    results = vine_pipe(
        inputs=video_file,
        categorical_keywords=categorical_keywords,
        unary_keywords=unary_keywords,
        binary_keywords=binary_keywords_list,
        object_pairs=object_pairs,
        segmentation_method="grounding_dino_sam2",
        return_top_k=5,
        include_visualizations=True,
        debug_visualizations=False,
        device="cuda",
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        target_fps=output_fps,
        binary_confidence_threshold=binary_confidence_threshold,
        **extra_forward_kwargs,
    )

    # Debug: Print what the pipeline returned
    print("\n" + "=" * 80)
    print("PIPELINE RESULTS DEBUG:")
    print(f"  results type: {type(results)}")
    if isinstance(results, dict):
        print(f"  results keys: {list(results.keys())}")
    print("=" * 80 + "\n")

    vine_pipe.box_threshold = box_threshold
    vine_pipe.text_threshold = text_threshold
    vine_pipe.target_fps = output_fps

    if isinstance(results, Mapping):
        results_dict = results
    elif isinstance(results, Sequence) and results and isinstance(results[0], Mapping):
        results_dict = results[0]
    else:
        results_dict = {}

    visualizations = results_dict.get("visualizations") or {}
    vine = visualizations.get("vine") or {}
    all_vis = vine.get("all") or {}
    result_video_path = all_vis.get("video_path")
    if not result_video_path:
        candidates = sorted(
            Path(visualization_dir).rglob("*.mp4"),
            key=lambda p: p.stat().st_mtime,
            reverse=True,
        )
        result_video_path = str(candidates[0]) if candidates else None
    summary = results_dict.get("summary") or {}

    # Attach any binary category parsing warnings into the summary JSON
    if binary_input_warnings:
        if "binary_input_warnings" in summary:
            summary["binary_input_warnings"].extend(binary_input_warnings)
        else:
            summary["binary_input_warnings"] = binary_input_warnings

    if result_video_path and os.path.exists(result_video_path):
        gradio_tmp = (
            Path(os.environ.get("GRADIO_TEMP_DIR", tempfile.gettempdir()))
            / "vine_outputs"
        )
        gradio_tmp.mkdir(parents=True, exist_ok=True)
        dest_path = gradio_tmp / Path(result_video_path).name
        try:
            shutil.copyfile(result_video_path, dest_path)
            video_path_for_ui = str(dest_path)
        except Exception as e:
            print(f"Warning: failed to copy video to Gradio temp dir: {e}")
            video_path_for_ui = str(result_video_path)
    else:
        video_path_for_ui = None
        print(
            "Warning: annotated video not found or empty; check visualization settings."
        )

    return video_path_for_ui, summary


def _video_component(label: str, *, is_output: bool = False):
    """
    Build a Gradio Video component that is compatible with older Gradio versions
    (no `type`/`sources`/`format` kwargs) and newer ones when available.
    """
    kwargs = {"label": label}
    sig = inspect.signature(gr.Video.__init__)

    # Only set format for OUTPUT components
    if is_output and "format" in sig.parameters:
        kwargs["format"] = "mp4"

    if not is_output:
        if "type" in sig.parameters:
            kwargs["type"] = "filepath"
        if "sources" in sig.parameters:
            kwargs["sources"] = ["upload"]
        # Restrict to MP4 files only
        if "file_types" in sig.parameters:
            kwargs["file_types"] = [".mp4"]

    if is_output and "autoplay" in sig.parameters:
        kwargs["autoplay"] = True

    return gr.Video(**kwargs)


def _create_blocks():
    """
    Build a Blocks context that works across Gradio versions.
    """
    blocks_kwargs = {"title": "LASER Scene Graph Demo"}
    soft_theme = None

    if hasattr(gr, "themes") and hasattr(gr.themes, "Soft"):
        try:
            soft_theme = gr.themes.Soft()
        except Exception:
            soft_theme = None

    if "theme" in inspect.signature(gr.Blocks).parameters and soft_theme is not None:
        blocks_kwargs["theme"] = soft_theme

    return gr.Blocks(**blocks_kwargs)


# Create Gradio interface with two-column layout
with _create_blocks() as demo:
    gr.Markdown(
        """
        # 🎬 LASER: Spatio-temporal Scene Graphs for Video

        Turn any MP4 into a spatio-temporal scene graph with LASER - our 454-million parameter foundation model for scene-graph generation. LASER trains on 87K+ open-domain videos using a neurosymbolic caption-to-scene alignment pipeline, so it learns fine-grained video semantics without human labels.

        Upload an MP4 and sketch the scene graph you care about: specify the objects, actions, and interactions you want, and LASER will assemble a spatio-temporal scene graph plus an annotated video.
        """
    )

    with gr.Row():
        # Left column: Inputs
        with gr.Column(scale=1):
            gr.Markdown("### Scene Graph Inputs")

            video_input = _video_component("Upload Video (MP4 only)", is_output=False)
            gr.Markdown("*Note: Only MP4 format is currently supported*")

            gr.Markdown("#### Scene Graph Queries")
            categorical_input = gr.Textbox(
                label="Categorical Keywords",
                placeholder="e.g., person, car, dog",
                value="person, car, dog",
                info="Objects to detect in the video (comma-separated)",
            )
            unary_input = gr.Textbox(
                label="Unary Keywords",
                placeholder="e.g., walking, running, standing",
                value="walking, running, standing",
                info="Single-object actions to detect (comma-separated)",
            )
            binary_input = gr.Textbox(
                label="Binary Keywords",
                placeholder="e.g., behind(person, dog), bite(dog, frisbee)",
                info=(
                    "Object-to-object interactions to detect. "
                    "Use format: relation(from_category, to_category). "
                    "Example: 'behind(person, dog), bite(dog, frisbee)'. "
                    "If you omit '(from,to)', the relation will be applied to all object pairs (default behavior). "
                    "Leave blank to skip binary relation search entirely."
                ),
            )

            add_not_unary_checkbox = gr.Checkbox(
                label="Also query 'not <unary>' predicates",
                value=False,
                info="If enabled, for each unary keyword X, also query 'not X'.",
            )

            gr.Markdown("#### Processing Settings")
            fps_input = gr.Number(
                label="Output FPS",
                value=1,
                info="Frames per second for processing (lower = faster)",
            )

            with gr.Accordion("Advanced Settings", open=False):
                box_threshold_input = gr.Slider(
                    label="Box Threshold",
                    minimum=0.1,
                    maximum=0.9,
                    value=0.35,
                    step=0.05,
                    info="Confidence threshold for object detection",
                )
                text_threshold_input = gr.Slider(
                    label="Text Threshold",
                    minimum=0.1,
                    maximum=0.9,
                    value=0.25,
                    step=0.05,
                    info="Confidence threshold for text-based detection",
                )
                binary_confidence_input = gr.Slider(
                    label="Binary Relation Confidence Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    value=.5,
                    step=0.05,
                    info="Minimum confidence to show binary relations and object pairs",
                )

            submit_btn = gr.Button("🚀 Process Video", variant="primary", size="lg")

        # Right column: Outputs
        with gr.Column(scale=1):
            gr.Markdown("### Scene Graph Results")

            video_output = _video_component("Annotated Video Output", is_output=True)

            gr.Markdown("### Scene Graph Summary")
            summary_output = gr.JSON(label="Scene Graph / Detected Events")

    gr.Markdown(
        """
        ---
        ### How to Use LASER
        1. Upload an MP4 (we validate the format for you).  
        2. Describe the **nodes** of your spatio-temporal scene graph with categorical keywords (objects) and unary keywords (single-object actions).  
        3. Wire up **binary** relations:
           - Use the structured form `relation(from_category, to_category)` (e.g., `behind(person, dog), bite(dog, frisbee)`) to limit relations to those category pairs.
           - Or list relation names (`chasing, carrying`) to evaluate all object pairs.
           - Leave the field blank to skip binary relations entirely (no pair search or binary predicates).
           - Categories referenced inside binary relations are auto-added to the categorical list for you.
        4. Optionally enable automatic `'not <unary>'` predicates.  
        5. Adjust processing settings if needed and click **Process Video** to receive an annotated video plus the serialized scene graph.

        More to explore:
        - LASER paper (ICLR'25): https://arxiv.org/abs/2304.07647 | Demo: https://huggingface.co/spaces/jiani-huang/LASER | Code: https://github.com/video-fm/LASER  
        - ESCA paper: https://arxiv.org/abs/2510.15963 | Code: https://github.com/video-fm/ESCA | Model: https://huggingface.co/video-fm/vine_v0 | Dataset: https://huggingface.co/datasets/video-fm/ESCA-video-87K  
        - Meet us at **NeurIPS 2025** (San Diego, Exhibit Hall C/D/E, Booth #4908 - Wed, Dec 3 - 11:00 a.m.-2:00 p.m. PST) for the foundation model demo, code, and full paper.
        """
    )

    submit_btn.click(
        fn=process_video,
        inputs=[
            video_input,
            categorical_input,
            unary_input,
            binary_input,
            add_not_unary_checkbox,
            fps_input,
            box_threshold_input,
            text_threshold_input,
            binary_confidence_input,
        ],
        outputs=[video_output, summary_output],
    )

if __name__ == "__main__":
    print("Got to main")
    demo.launch(share=True, debug=True)