ASethi04 committed · Commit f9a6349 · 0 Parent(s)
.gitattributes ADDED
@@ -0,0 +1,35 @@
1
+ *.7z filter=lfs diff=lfs merge=lfs -text
2
+ *.arrow filter=lfs diff=lfs merge=lfs -text
3
+ *.bin filter=lfs diff=lfs merge=lfs -text
4
+ *.bz2 filter=lfs diff=lfs merge=lfs -text
5
+ *.ckpt filter=lfs diff=lfs merge=lfs -text
6
+ *.ftz filter=lfs diff=lfs merge=lfs -text
7
+ *.gz filter=lfs diff=lfs merge=lfs -text
8
+ *.h5 filter=lfs diff=lfs merge=lfs -text
9
+ *.joblib filter=lfs diff=lfs merge=lfs -text
10
+ *.lfs.* filter=lfs diff=lfs merge=lfs -text
11
+ *.mlmodel filter=lfs diff=lfs merge=lfs -text
12
+ *.model filter=lfs diff=lfs merge=lfs -text
13
+ *.msgpack filter=lfs diff=lfs merge=lfs -text
14
+ *.npy filter=lfs diff=lfs merge=lfs -text
15
+ *.npz filter=lfs diff=lfs merge=lfs -text
16
+ *.onnx filter=lfs diff=lfs merge=lfs -text
17
+ *.ot filter=lfs diff=lfs merge=lfs -text
18
+ *.parquet filter=lfs diff=lfs merge=lfs -text
19
+ *.pb filter=lfs diff=lfs merge=lfs -text
20
+ *.pickle filter=lfs diff=lfs merge=lfs -text
21
+ *.pkl filter=lfs diff=lfs merge=lfs -text
22
+ *.pt filter=lfs diff=lfs merge=lfs -text
23
+ *.pth filter=lfs diff=lfs merge=lfs -text
24
+ *.rar filter=lfs diff=lfs merge=lfs -text
25
+ *.safetensors filter=lfs diff=lfs merge=lfs -text
26
+ saved_model/**/* filter=lfs diff=lfs merge=lfs -text
27
+ *.tar.* filter=lfs diff=lfs merge=lfs -text
28
+ *.tar filter=lfs diff=lfs merge=lfs -text
29
+ *.tflite filter=lfs diff=lfs merge=lfs -text
30
+ *.tgz filter=lfs diff=lfs merge=lfs -text
31
+ *.wasm filter=lfs diff=lfs merge=lfs -text
32
+ *.xz filter=lfs diff=lfs merge=lfs -text
33
+ *.zip filter=lfs diff=lfs merge=lfs -text
34
+ *.zst filter=lfs diff=lfs merge=lfs -text
35
+ *tfevents* filter=lfs diff=lfs merge=lfs -text
GroundingDINO_SwinT_OGC.py ADDED
@@ -0,0 +1,43 @@
1
+ batch_size = 1
2
+ modelname = "groundingdino"
3
+ backbone = "swin_T_224_1k"
4
+ position_embedding = "sine"
5
+ pe_temperatureH = 20
6
+ pe_temperatureW = 20
7
+ return_interm_indices = [1, 2, 3]
8
+ backbone_freeze_keywords = None
9
+ enc_layers = 6
10
+ dec_layers = 6
11
+ pre_norm = False
12
+ dim_feedforward = 2048
13
+ hidden_dim = 256
14
+ dropout = 0.0
15
+ nheads = 8
16
+ num_queries = 900
17
+ query_dim = 4
18
+ num_patterns = 0
19
+ num_feature_levels = 4
20
+ enc_n_points = 4
21
+ dec_n_points = 4
22
+ two_stage_type = "standard"
23
+ two_stage_bbox_embed_share = False
24
+ two_stage_class_embed_share = False
25
+ transformer_activation = "relu"
26
+ dec_pred_bbox_embed_share = True
27
+ dn_box_noise_scale = 1.0
28
+ dn_label_noise_ratio = 0.5
29
+ dn_label_coef = 1.0
30
+ dn_bbox_coef = 1.0
31
+ embed_init_tgt = True
32
+ dn_labelbook_size = 2000
33
+ max_text_len = 256
34
+ text_encoder_type = "bert-base-uncased"
35
+ use_text_enhancer = True
36
+ use_fusion_layer = True
37
+ use_checkpoint = True
38
+ use_transformer_ckpt = True
39
+ use_text_cross_attention = True
40
+ text_dropout = 0.0
41
+ fusion_dropout = 0.0
42
+ fusion_droppath = 0.1
43
+ sub_sentence_present = True
README.md ADDED
@@ -0,0 +1,12 @@
1
+ ---
2
+ title: LASER Demo
3
+ emoji: 🐠
4
+ colorFrom: gray
5
+ colorTo: blue
6
+ sdk: gradio
7
+ sdk_version: 6.0.1
8
+ app_file: app.py
9
+ pinned: false
10
+ ---
11
+
12
+ Check out the configuration reference at https://huggingface.co/docs/hub/spaces-config-reference
app.py ADDED
@@ -0,0 +1,288 @@
1
+ from pathlib import Path
2
+ from collections.abc import Mapping, Sequence
3
+ from functools import lru_cache
4
+ import inspect
5
+ import shutil
6
+ import tempfile
7
+ import os
8
+ import sys
9
+
10
+ import spaces # <-- ZeroGPU integration
11
+ import gradio as gr
12
+ import torch
13
+ from transformers import pipeline  # imported but not used directly in this app
14
+
15
+
16
+ # -----------------------------
17
+ # Environment / diagnostics
18
+ # -----------------------------
19
+ os.environ["GRADIO_TEMP_DIR"] = str(Path(__file__).parent / "gradio_temp")
20
+ os.environ["OPENAI_API_KEY"] = "test"
21
+ os.environ["OMP_NUM_THREADS"] = "4"
22
+
23
+ print("All imports finished")
24
+ print(f"Python version: {sys.version}")
25
+ print(f"PyTorch version: {torch.__version__}")
26
+ print(f"CUDA available: {torch.cuda.is_available()}")
27
+ print(f"CUDA version: {torch.version.cuda}")
28
+ print(f"cuDNN version: {torch.backends.cudnn.version()}")
29
+ print(f"Number of GPUs: {torch.cuda.device_count()}")
30
+
31
+ if torch.cuda.is_available():
32
+ for i in range(torch.cuda.device_count()):
33
+ print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
34
+ print(
35
+ f" Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB"
36
+ )
37
+
38
+ torch.backends.cuda.matmul.allow_tf32 = False
39
+ torch.backends.cudnn.allow_tf32 = False
40
+ os.environ["TORCH_DTYPE"] = "float32"
41
+ torch.set_default_dtype(torch.float32)
42
+
43
+ current_dir = Path(__file__).resolve().parent
44
+ # For Spaces, assume checkpoints live alongside app.py or in a "checkpoints" subdir.
45
+ # If you keep them next to app.py locally, this still works.
46
+ sam_config_path = str(current_dir / "sam2_hiera_t.yaml")
47
+ sam_checkpoint_path = str(current_dir / "sam2_hiera_tiny.pt")
48
+ gd_config_path = str(current_dir / "GroundingDINO_SwinT_OGC.py")
49
+ gd_checkpoint_path = str(current_dir / "groundingdino_swint_ogc.pth")
50
+ visualization_dir = str(current_dir / "outputs")
51
+ print(
52
+ f"Setting up paths: {sam_config_path}, {sam_checkpoint_path}, {gd_config_path}, {gd_checkpoint_path}"
53
+ )
54
+
55
+
56
+ @lru_cache(maxsize=1)
57
+ def _load_vine_pipeline():
58
+ """
59
+ Lazy-load and cache the Vine pipeline so we don't re-download/rebuild it on every request.
60
+ """
61
+ from vine_hf import VineConfig, VineModel, VinePipeline
62
+
63
+ config = VineConfig(
64
+ segmentation_method="grounding_dino_sam2",
65
+ model_name="openai/clip-vit-base-patch32",
66
+ use_hf_repo=True,
67
+ model_repo="KevinX-Penn28/testing",
68
+ box_threshold=0.35,
69
+ text_threshold=0.25,
70
+ target_fps=1, # default 1 FPS
71
+ topk_cate=5,
72
+ white_alpha=0.3,
73
+ visualization_dir=visualization_dir,
74
+ visualize=True,
75
+ debug_visualizations=False,
76
+ device="cuda",
77
+ categorical_pool="max",
78
+ )
79
+ model = VineModel(config)
80
+ return VinePipeline(
81
+ model=model,
82
+ tokenizer=None,
83
+ sam_config_path=sam_config_path,
84
+ sam_checkpoint_path=sam_checkpoint_path,
85
+ gd_config_path=gd_config_path,
86
+ gd_checkpoint_path=gd_checkpoint_path,
87
+ device="cuda",
88
+ trust_remote_code=True,
89
+ )
90
+
91
+
92
+ @spaces.GPU(duration=300) # Up to ~5 minutes of H200 ZeroGPU time per call
93
+ def process_video(
94
+ video_file,
95
+ categorical_keywords,
96
+ unary_keywords,
97
+ binary_keywords,
98
+ object_pairs,
99
+ output_fps,
100
+ box_threshold,
101
+ text_threshold,
102
+ ):
103
+ vine_pipe = _load_vine_pipeline()
104
+
105
+ # Normalize incoming video input to a file path
106
+ if isinstance(video_file, dict):
107
+ video_file = (
108
+ video_file.get("name")
109
+ or video_file.get("filepath")
110
+ or video_file.get("data")
111
+ )
112
+ if not isinstance(video_file, (str, Path)):
113
+ raise ValueError(f"Unsupported video input type: {type(video_file)}")
114
+
115
+ categorical_keywords = (
116
+ [kw.strip() for kw in categorical_keywords.split(",")]
117
+ if categorical_keywords
118
+ else []
119
+ )
120
+ unary_keywords = (
121
+ [kw.strip() for kw in unary_keywords.split(",")] if unary_keywords else []
122
+ )
123
+ binary_keywords = (
124
+ [kw.strip() for kw in binary_keywords.split(",")] if binary_keywords else []
125
+ )
126
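+ # e.g. the UI string "0-1,0-2" becomes [(0, 1), (0, 2)]; indices refer to detected objects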
+ object_pairs = (
127
+ [tuple(map(int, pair.split("-"))) for pair in object_pairs.split(",")]
128
+ if object_pairs
129
+ else []
130
+ )
131
+
132
+ results = vine_pipe(
133
+ inputs=video_file,
134
+ categorical_keywords=categorical_keywords,
135
+ unary_keywords=unary_keywords,
136
+ binary_keywords=binary_keywords,
137
+ object_pairs=object_pairs,
138
+ segmentation_method="grounding_dino_sam2",
139
+ return_top_k=5,
140
+ include_visualizations=True,
141
+ debug_visualizations=False,
142
+ device="cuda",
143
+ box_threshold=box_threshold,
144
+ text_threshold=text_threshold,
145
+ target_fps=output_fps,
146
+ )
147
+
148
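+ # Persist the latest UI settings on the pipeline object (this call already received them as kwargs above)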
+ vine_pipe.box_threshold = box_threshold
149
+ vine_pipe.text_threshold = text_threshold
150
+ vine_pipe.target_fps = output_fps
151
+
152
+ if isinstance(results, Mapping):
153
+ results_dict = results
154
+ elif isinstance(results, Sequence) and results and isinstance(results[0], Mapping):
155
+ results_dict = results[0]
156
+ else:
157
+ results_dict = {}
158
+
159
+ visualizations = results_dict.get("visualizations") or {}
160
+ vine = visualizations.get("vine") or {}
161
+ all_vis = vine.get("all") or {}
162
+ result_video_path = all_vis.get("video_path")
163
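+ # Fall back to the newest .mp4 under visualization_dir if the pipeline did not report a video path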
+ if not result_video_path:
164
+ candidates = sorted(
165
+ Path(visualization_dir).rglob("*.mp4"),
166
+ key=lambda p: p.stat().st_mtime,
167
+ reverse=True,
168
+ )
169
+ result_video_path = str(candidates[0]) if candidates else None
170
+ summary = results_dict.get("summary") or {}
171
+
172
+ if result_video_path and os.path.exists(result_video_path):
173
+ gradio_tmp = Path(
174
+ os.environ.get("GRADIO_TEMP_DIR", tempfile.gettempdir())
175
+ ) / "vine_outputs"
176
+ gradio_tmp.mkdir(parents=True, exist_ok=True)
177
+ dest_path = gradio_tmp / Path(result_video_path).name
178
+ try:
179
+ shutil.copyfile(result_video_path, dest_path)
180
+ video_path_for_ui = str(dest_path)
181
+ except Exception as e:
182
+ print(f"Warning: failed to copy video to Gradio temp dir: {e}")
183
+ video_path_for_ui = str(result_video_path)
184
+ else:
185
+ video_path_for_ui = None
186
+ print(
187
+ "Warning: annotated video not found or empty; check visualization settings."
188
+ )
189
+
190
+ return video_path_for_ui, summary
191
+
192
+
193
+ def _video_component(label: str, *, is_output: bool = False):
194
+ """
195
+ Build a Gradio Video component that is compatible with older Gradio versions
196
+ (no `type`/`sources`/`format` kwargs) and newer ones when available.
197
+ """
198
+ kwargs = {"label": label}
199
+ sig = inspect.signature(gr.Video.__init__)
200
+
201
+ # Only set format for OUTPUT components
202
+ if is_output and "format" in sig.parameters:
203
+ kwargs["format"] = "mp4"
204
+
205
+ if not is_output:
206
+ if "type" in sig.parameters:
207
+ kwargs["type"] = "filepath"
208
+ if "sources" in sig.parameters:
209
+ kwargs["sources"] = ["upload"]
210
+
211
+ if is_output and "autoplay" in sig.parameters:
212
+ kwargs["autoplay"] = True
213
+
214
+ return gr.Video(**kwargs)
215
+
216
+
217
+ def _create_blocks():
218
+ """
219
+ Build a Blocks context that works across Gradio versions.
220
+ """
221
+ blocks_kwargs = {"title": "VINE Demo"}
222
+ soft_theme = None
223
+
224
+ if hasattr(gr, "themes") and hasattr(gr.themes, "Soft"):
225
+ try:
226
+ soft_theme = gr.themes.Soft()
227
+ except Exception:
228
+ soft_theme = None
229
+
230
+ if "theme" in inspect.signature(gr.Blocks).parameters and soft_theme is not None:
231
+ blocks_kwargs["theme"] = soft_theme
232
+
233
+ return gr.Blocks(**blocks_kwargs)
234
+
235
+
236
+ # Create Gradio interface
237
+ with _create_blocks() as demo:
238
+ video_input = _video_component("Upload Video", is_output=False)
239
+ categorical_input = gr.Textbox(
240
+ label="Categorical Keywords (comma-separated)",
241
+ value="person, car, tree, background",
242
+ )
243
+ unary_input = gr.Textbox(
244
+ label="Unary Keywords (comma-separated)", value="walking, running, standing"
245
+ )
246
+ binary_input = gr.Textbox(
247
+ label="Binary Keywords (comma-separated)",
248
+ placeholder="e.g., chasing, carrying",
249
+ )
250
+ pairs_input = gr.Textbox(
251
+ label="Object Pairs (comma-separated indices)",
252
+ placeholder="e.g., 0-1,0-2 for pairs of objects",
253
+ )
254
+ fps_input = gr.Number(
255
+ label="Output FPS (affects processing speed)", value=1 # default 1 FPS
256
+ )
257
+
258
+ with gr.Accordion("Advanced Settings", open=False):
259
+ box_threshold_input = gr.Slider(
260
+ label="Box Threshold", minimum=0.1, maximum=0.9, value=0.35, step=0.05
261
+ )
262
+ text_threshold_input = gr.Slider(
263
+ label="Text Threshold", minimum=0.1, maximum=0.9, value=0.25, step=0.05
264
+ )
265
+
266
+ submit_btn = gr.Button("Process Video", variant="primary")
267
+
268
+ video_output = _video_component("Output Video with Annotations", is_output=True)
269
+ json_output = gr.JSON(label="Summary of Detected Events")
270
+
271
+ submit_btn.click(
272
+ fn=process_video,
273
+ inputs=[
274
+ video_input,
275
+ categorical_input,
276
+ unary_input,
277
+ binary_input,
278
+ pairs_input,
279
+ fps_input,
280
+ box_threshold_input,
281
+ text_threshold_input,
282
+ ],
283
+ outputs=[video_output, json_output],
284
+ )
285
+
286
+ if __name__ == "__main__":
287
+ print("Got to main")
288
+ demo.launch(share=True, debug=True)
groundingdino_swint_ogc.pth ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:3b3ca2563c77c69f651d7bd133e97139c186df06231157a64c507099c52bc799
3
+ size 693997677
requirements.txt ADDED
@@ -0,0 +1,24 @@
1
+ gradio>=4.0.0
2
+ spaces>=0.24.0
3
+
4
+ transformers>=4.40.0
5
+ huggingface-hub>=0.23.0
6
+ safetensors>=0.4.2
7
+ accelerate>=0.30.0
8
+
9
+ --extra-index-url https://download.pytorch.org/whl/cu121
10
+ torch==2.2.1
11
+ torchvision==0.17.1
12
+
13
+ numpy
14
+ opencv-python
15
+ pillow
16
+ matplotlib
17
+ seaborn
18
+ pandas
19
+ tqdm
20
+ scikit-learn
21
+
22
+ -e git+https://github.com/video-fm/video-sam2.git#egg=video_sam2
23
+ -e git+https://github.com/IDEA-Research/GroundingDINO.git#egg=GroundingDINO
24
+ -e git+https://github.com/kevinxuez/LASER.git#egg=laser
sam2_hiera_t.yaml ADDED
@@ -0,0 +1,118 @@
1
+ # @package _global_
2
+
3
+ # Model
4
+ model:
5
+ _target_: sam2.modeling.sam2_base.SAM2Base
6
+ image_encoder:
7
+ _target_: sam2.modeling.backbones.image_encoder.ImageEncoder
8
+ scalp: 1
9
+ trunk:
10
+ _target_: sam2.modeling.backbones.hieradet.Hiera
11
+ embed_dim: 96
12
+ num_heads: 1
13
+ stages: [1, 2, 7, 2]
14
+ global_att_blocks: [5, 7, 9]
15
+ window_pos_embed_bkg_spatial_size: [7, 7]
16
+ neck:
17
+ _target_: sam2.modeling.backbones.image_encoder.FpnNeck
18
+ position_encoding:
19
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
20
+ num_pos_feats: 256
21
+ normalize: true
22
+ scale: null
23
+ temperature: 10000
24
+ d_model: 256
25
+ backbone_channel_list: [768, 384, 192, 96]
26
+ fpn_top_down_levels: [2, 3] # output level 0 and 1 directly use the backbone features
27
+ fpn_interp_model: nearest
28
+
29
+ memory_attention:
30
+ _target_: sam2.modeling.memory_attention.MemoryAttention
31
+ d_model: 256
32
+ pos_enc_at_input: true
33
+ layer:
34
+ _target_: sam2.modeling.memory_attention.MemoryAttentionLayer
35
+ activation: relu
36
+ dim_feedforward: 2048
37
+ dropout: 0.1
38
+ pos_enc_at_attn: false
39
+ self_attention:
40
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
41
+ rope_theta: 10000.0
42
+ feat_sizes: [64, 64]
43
+ embedding_dim: 256
44
+ num_heads: 1
45
+ downsample_rate: 1
46
+ dropout: 0.1
47
+ d_model: 256
48
+ pos_enc_at_cross_attn_keys: true
49
+ pos_enc_at_cross_attn_queries: false
50
+ cross_attention:
51
+ _target_: sam2.modeling.sam.transformer.RoPEAttention
52
+ rope_theta: 10000.0
53
+ feat_sizes: [64, 64]
54
+ rope_k_repeat: True
55
+ embedding_dim: 256
56
+ num_heads: 1
57
+ downsample_rate: 1
58
+ dropout: 0.1
59
+ kv_in_dim: 64
60
+ num_layers: 4
61
+
62
+ memory_encoder:
63
+ _target_: sam2.modeling.memory_encoder.MemoryEncoder
64
+ out_dim: 64
65
+ position_encoding:
66
+ _target_: sam2.modeling.position_encoding.PositionEmbeddingSine
67
+ num_pos_feats: 64
68
+ normalize: true
69
+ scale: null
70
+ temperature: 10000
71
+ mask_downsampler:
72
+ _target_: sam2.modeling.memory_encoder.MaskDownSampler
73
+ kernel_size: 3
74
+ stride: 2
75
+ padding: 1
76
+ fuser:
77
+ _target_: sam2.modeling.memory_encoder.Fuser
78
+ layer:
79
+ _target_: sam2.modeling.memory_encoder.CXBlock
80
+ dim: 256
81
+ kernel_size: 7
82
+ padding: 3
83
+ layer_scale_init_value: 1e-6
84
+ use_dwconv: True # depth-wise convs
85
+ num_layers: 2
86
+
87
+ num_maskmem: 7
88
+ image_size: 1024
89
+ # apply scaled sigmoid on mask logits for memory encoder, and directly feed input mask as output mask
90
+ # SAM decoder
91
+ sigmoid_scale_for_mem_enc: 20.0
92
+ sigmoid_bias_for_mem_enc: -10.0
93
+ use_mask_input_as_output_without_sam: true
94
+ # Memory
95
+ directly_add_no_mem_embed: true
96
+ # use high-resolution feature map in the SAM mask decoder
97
+ use_high_res_features_in_sam: true
98
+ # output 3 masks on the first click on initial conditioning frames
99
+ multimask_output_in_sam: true
100
+ # SAM heads
101
+ iou_prediction_use_sigmoid: True
102
+ # cross-attend to object pointers from other frames (based on SAM output tokens) in the encoder
103
+ use_obj_ptrs_in_encoder: true
104
+ add_tpos_enc_to_obj_ptrs: false
105
+ only_obj_ptrs_in_the_past_for_eval: true
106
+ # object occlusion prediction
107
+ pred_obj_scores: true
108
+ pred_obj_scores_mlp: true
109
+ fixed_no_obj_ptr: true
110
+ # multimask tracking settings
111
+ multimask_output_for_tracking: true
112
+ use_multimask_token_for_obj_ptr: true
113
+ multimask_min_pt_num: 0
114
+ multimask_max_pt_num: 1
115
+ use_mlp_for_obj_ptr_proj: true
116
+ # Compilation flag
117
+ # HieraT does not currently support compilation, should always be set to False
118
+ compile_image_encoder: False
sam2_hiera_tiny.pt ADDED
@@ -0,0 +1,3 @@
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:65b50056e05bcb13694174f51bb6da89c894b57b75ccdf0ba6352c597c5d1125
3
+ size 155906050
vine_hf/OVERVIEW.md ADDED
@@ -0,0 +1,218 @@
1
+ # VINE HuggingFace Interface - Complete Overview
2
+
3
+ This directory contains a complete HuggingFace-compatible interface for the VINE (Video Understanding with Natural Language) model. The interface allows you to easily use, share, and deploy your VINE model through the HuggingFace ecosystem.
4
+
5
+ ## 📁 Directory Structure
6
+
7
+ ```
8
+ vine_hf/
9
+ ├── __init__.py # Package initialization and exports
10
+ ├── vine_config.py # VineConfig class (PretrainedConfig)
11
+ ├── vine_model.py # VineModel class (PreTrainedModel)
12
+ ├── vine_pipeline.py # VinePipeline class (Pipeline)
13
+ ├── example_usage.py # Comprehensive usage examples
14
+ ├── convert_inference.py # Migration guide from inference.py
15
+ ├── push_to_hub.py # Script to push model to HF Hub
16
+ ├── setup.py # Package setup configuration
17
+ ├── README.md # Detailed documentation
18
+ └── OVERVIEW.md # This file
19
+ ```
20
+
21
+ ## 🏗️ Architecture Components
22
+
23
+ ### 1. VineConfig (`vine_config.py`)
24
+ - Inherits from `PretrainedConfig`
25
+ - Configures model parameters, segmentation methods, and processing options
26
+ - Compatible with HuggingFace configuration system
27
+
28
+ ### 2. VineModel (`vine_model.py`)
29
+ - Inherits from `PreTrainedModel`
30
+ - Implements the core VINE model with three CLIP backbones
31
+ - Supports categorical, unary, and binary predictions
32
+ - Provides both `forward()` and `predict()` methods
33
+
34
+ ### 3. VinePipeline (`vine_pipeline.py`)
35
+ - Inherits from `Pipeline`
36
+ - Handles end-to-end video processing workflow
37
+ - Integrates segmentation (SAM2, Grounding DINO + SAM2)
38
+ - Provides a user-friendly interface for video understanding (see the composition sketch below)
39
+
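+ The three classes compose as config -> model -> pipeline. A minimal sketch (the checkpoint paths are placeholders, not files shipped with this package):
+
+ ```python
+ from vine_hf import VineConfig, VineModel, VinePipeline
+
+ # Build the configuration, wrap it in the model, then hand the model to the pipeline.
+ config = VineConfig(segmentation_method="grounding_dino_sam2")
+ model = VineModel(config)
+ vine_pipeline = VinePipeline(
+     model=model,
+     tokenizer=None,
+     sam_config_path="/path/to/sam2_config.yaml",
+     sam_checkpoint_path="/path/to/sam2_checkpoint.pt",
+     gd_config_path="/path/to/grounding_dino_config.py",
+     gd_checkpoint_path="/path/to/grounding_dino_checkpoint.pth",
+ )
+ ```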
40
+ ## 🚀 Key Features
41
+
42
+ ✅ **Full HuggingFace Compatibility**
43
+ - Compatible with `transformers` library
44
+ - Supports `AutoModel` and `pipeline` interfaces
45
+ - Can be pushed to and loaded from HuggingFace Hub
46
+
47
+ ✅ **Flexible Segmentation**
48
+ - Support for SAM2 automatic segmentation
49
+ - Support for Grounding DINO + SAM2 text-guided segmentation
50
+ - Configurable thresholds and parameters
51
+
52
+ ✅ **Multi-Modal Understanding**
53
+ - Categorical classification (object types)
54
+ - Unary predicates (single object actions)
55
+ - Binary relations (object-object relationships)
56
+
57
+ ✅ **Easy Integration**
58
+ - Simple pipeline interface for end users
59
+ - Direct model access for researchers
60
+ - Comprehensive configuration options
61
+
62
+ ## 📖 Usage Examples
63
+
64
+ ### Quick Start with Pipeline
65
+ ```python
66
+ from transformers import pipeline
67
+ from vine_hf import VineModel, VinePipeline
68
+
69
+ # Create pipeline
70
+ vine_pipeline = pipeline(
71
+ "vine-video-understanding",
72
+ model="your-username/vine-model",
73
+ trust_remote_code=True
74
+ )
75
+
76
+ # Process video
77
+ results = vine_pipeline(
78
+ "video.mp4",
79
+ categorical_keywords=['human', 'dog', 'frisbee'],
80
+ unary_keywords=['running', 'jumping'],
81
+ binary_keywords=['chasing', 'behind']
82
+ )
83
+ ```
84
+
85
+ ### Direct Model Usage
86
+ ```python
87
+ from vine_hf import VineConfig, VineModel
88
+
89
+ config = VineConfig(segmentation_method="grounding_dino_sam2")
90
+ model = VineModel(config)
91
+
92
+ results = model.predict(
93
+ video_frames=video_tensor,
94
+ masks=masks_dict,
95
+ bboxes=bboxes_dict,
96
+ categorical_keywords=['human', 'dog'],
97
+ unary_keywords=['running', 'sitting'],
98
+ binary_keywords=['chasing', 'near']
99
+ )
100
+ ```
101
+
102
+ ## 🔧 Migration from Original Code
103
+
104
+ The `convert_inference.py` script shows how to migrate from the original `inference.py` workflow:
105
+
106
+ **Original Approach:**
107
+ - Manual model loading and configuration
108
+ - Direct handling of segmentation pipeline
109
+ - Custom result processing
110
+ - Complex setup requirements
111
+
112
+ **New HuggingFace Interface:**
113
+ - Standardized model configuration
114
+ - Automatic preprocessing/postprocessing
115
+ - Simple pipeline interface
116
+ - Easy sharing via HuggingFace Hub
117
+
118
+ ## 📤 Sharing Your Model
119
+
120
+ Use the `push_to_hub.py` script to share your trained model:
121
+
122
+ ```bash
123
+ python vine_hf/push_to_hub.py \
124
+ --weights path/to/your/model.pth \
125
+ --repo your-username/vine-model \
126
+ --login
127
+ ```
128
+
129
+ ## 🛠️ Installation & Setup
130
+
131
+ 1. **Install Dependencies:**
132
+ ```bash
133
+ pip install transformers torch torchvision opencv-python pillow numpy
134
+ ```
135
+
136
+ 2. **Install Segmentation Models (Optional):**
137
+ - SAM2: https://github.com/facebookresearch/sam2
138
+ - Grounding DINO: https://github.com/IDEA-Research/GroundingDINO
139
+
140
+ 3. **Install VINE HF Interface:**
141
+ ```bash
142
+ cd vine_hf
143
+ pip install -e .
144
+ ```
145
+
146
+ ## 🎯 Configuration Options
147
+
148
+ The `VineConfig` class supports extensive configuration:
149
+
150
+ - **Model Settings:** CLIP backbone, hidden dimensions
151
+ - **Segmentation:** Method, thresholds, target FPS
152
+ - **Processing:** Alpha values, top-k results, video length limits
153
+ - **Performance:** Multi-class mode, output format options
154
+
155
+ ## 📊 Output Format
156
+
157
+ The interface returns structured predictions:
158
+
159
+ ```python
160
+ {
161
+ "categorical_predictions": {obj_id: [(prob, category), ...]},
162
+ "unary_predictions": {(frame, obj): [(prob, action), ...]},
163
+ "binary_predictions": {(frame, pair): [(prob, relation), ...]},
164
+ "confidence_scores": {"categorical": float, "unary": float, "binary": float},
165
+ "summary": {
166
+ "num_objects_detected": int,
167
+ "top_categories": [(category, prob), ...],
168
+ "top_actions": [(action, prob), ...],
169
+ "top_relations": [(relation, prob), ...]
170
+ }
171
+ }
172
+ ```
173
+
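+ As a quick sketch of consuming this structure (assuming `results` is the dictionary above and each per-object list is sorted by probability, as the `summary` fields suggest):
+
+ ```python
+ # Print the top-ranked category for each detected object.
+ for obj_id, ranked in results["categorical_predictions"].items():
+     top_prob, top_category = ranked[0]
+     print(f"object {obj_id}: {top_category} ({top_prob:.2f})")
+ ```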
174
+ ## 🔍 Testing & Validation
175
+
176
+ Run the example scripts to test your setup:
177
+
178
+ ```bash
179
+ # Test basic functionality
180
+ python vine_hf/example_usage.py
181
+
182
+ # Test migration from original code
183
+ python vine_hf/convert_inference.py
184
+ ```
185
+
186
+ ## 🤝 Contributing
187
+
188
+ To contribute or customize:
189
+
190
+ 1. **Modify Configuration:** Edit `vine_config.py` for new parameters
191
+ 2. **Extend Model:** Add functionality to `vine_model.py`
192
+ 3. **Enhance Pipeline:** Improve preprocessing/postprocessing in `vine_pipeline.py`
193
+ 4. **Add Features:** Create additional utility scripts
194
+
195
+ ## 📝 Next Steps
196
+
197
+ 1. **Load Your Weights:** Use your trained VINE model weights
198
+ 2. **Test Segmentation:** Set up Grounding DINO and SAM2 models
199
+ 3. **Validate Results:** Compare with original inference.py output
200
+ 4. **Share Model:** Push to HuggingFace Hub for community use
201
+ 5. **Deploy:** Use in applications, demos, or research projects
202
+
203
+ ## 🐛 Troubleshooting
204
+
205
+ **Common Issues:**
206
+ - **Import Errors:** Check PYTHONPATH and package installation
207
+ - **Segmentation Failures:** Verify Grounding DINO/SAM2 setup
208
+ - **Weight Loading:** Adjust weight loading logic in `convert_inference.py`
209
+ - **CUDA Issues:** Check GPU availability and PyTorch installation
210
+
211
+ **Support:**
212
+ - Check the README.md for detailed documentation
213
+ - Review example_usage.py for working code examples
214
+ - Examine convert_inference.py for migration guidance
215
+
216
+ ---
217
+
218
+ This HuggingFace interface makes VINE accessible to the broader ML community while maintaining all the powerful video understanding capabilities of the original model. The standardized interface enables easy sharing, deployment, and integration with existing HuggingFace workflows.
vine_hf/README.md ADDED
@@ -0,0 +1,355 @@
1
+ # VINE HuggingFace Interface
2
+
3
+ VINE (Video Understanding with Natural Language) is a model that processes videos along with categorical, unary, and binary keywords to return probability distributions over those keywords for detected objects and their relationships.
4
+
5
+ This package provides a HuggingFace-compatible interface for the VINE model, making it easy to use for video understanding tasks.
6
+
7
+ ## Features
8
+
9
+ - **Categorical Classification**: Classify objects in videos (e.g., "human", "dog", "frisbee")
10
+ - **Unary Predicates**: Detect actions on single objects (e.g., "running", "jumping", "sitting")
11
+ - **Binary Relations**: Detect relationships between object pairs (e.g., "behind", "in front of", "chasing")
12
+ - **Multiple Segmentation Methods**: Support for SAM2 and Grounding DINO + SAM2
13
+ - **HuggingFace Integration**: Full compatibility with HuggingFace transformers and pipelines
14
+ - **Visualization Hooks**: Optional high-level visualizations plus lightweight debug mask dumps for quick sanity checks
15
+
16
+ ## Installation
17
+
18
+ ```bash
19
+ # Install the package (assuming it's in your Python path)
20
+ pip install transformers torch torchvision
21
+ pip install opencv-python pillow numpy
22
+
23
+ # For segmentation functionality, you'll also need:
24
+ # - SAM2: https://github.com/facebookresearch/sam2
25
+ # - Grounding DINO: https://github.com/IDEA-Research/GroundingDINO
26
+ ```
27
+
28
+ ## Segmentation Model Configuration
29
+
30
+ `VinePipeline` lazily initializes the segmentation stack the first time a call needs masks. Thresholds, FPS, visualization toggles, and device selection live in `VineConfig`; the pipeline constructor supplies the paths for the SAM2 / GroundingDINO weights, or lets you inject already-instantiated modules.
31
+
32
+ ### Provide file paths at construction (most common)
33
+
34
+ ```python
35
+ from vine_hf import VineConfig, VineModel, VinePipeline
36
+
37
+ vine_config = VineConfig(
38
+ segmentation_method="grounding_dino_sam2", # or "sam2"
39
+ box_threshold=0.35,
40
+ text_threshold=0.25,
41
+ target_fps=5,
42
+ visualization_dir="output/visualizations", # where to write visualizations (and debug visualizations if enabled)
43
+ debug_visualizations=True, # write videos of intermediate GroundingDINO/SAM2/unary/binary outputs
44
+ pretrained_vine_path="/abs/path/to/laser_model_v1.pkl",
45
+ device="cuda:0", # accepts int, str, or torch.device
46
+ )
47
+
48
+ vine_model = VineModel(vine_config)
49
+
50
+ vine_pipeline = VinePipeline(
51
+ model=vine_model,
52
+ tokenizer=None,
53
+ sam_config_path="/abs/path/to/sam2/sam2.1_hiera_t.yaml",
54
+ sam_checkpoint_path="/abs/path/to/sam2/sam2_hiera_tiny.pt",
55
+ gd_config_path="/abs/path/to/groundingdino/config/GroundingDINO_SwinT_OGC.py",
56
+ gd_checkpoint_path="/abs/path/to/groundingdino/weights/groundingdino_swint_ogc.pth",
57
+ device=vine_config._device,
58
+ )
59
+ ```
60
+
61
+ When `segmentation_method="grounding_dino_sam2"`, both SAM2 and GroundingDINO must be reachable. The pipeline validates the paths; missing files raise a `ValueError`. If you pick `"sam2"`, only the SAM2 config and checkpoint are required.
62
+
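+ For the `"sam2"`-only method, a minimal sketch (assuming the GroundingDINO arguments can simply be left unset, as the lazy-initialization note below implies):
+
+ ```python
+ from vine_hf import VineConfig, VineModel, VinePipeline
+
+ sam2_only_config = VineConfig(
+     segmentation_method="sam2",
+     target_fps=5,
+     visualization_dir="output/visualizations",
+     device="cuda:0",
+ )
+
+ sam2_only_pipeline = VinePipeline(
+     model=VineModel(sam2_only_config),
+     tokenizer=None,
+     sam_config_path="/abs/path/to/sam2/sam2.1_hiera_t.yaml",
+     sam_checkpoint_path="/abs/path/to/sam2/sam2_hiera_tiny.pt",
+     device=sam2_only_config._device,
+ )
+ ```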
63
+ ### Reuse pre-initialized segmentation modules
64
+
65
+ If you build the segmentation stack elsewhere, inject the components with `set_segmentation_models` before running the pipeline:
66
+
67
+ ```python
68
+ from sam2.build_sam import build_sam2_video_predictor, build_sam2
69
+ from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
70
+ from groundingdino.util.inference import Model as GroundingDINOModel
71
+
72
+ sam_predictor = build_sam2_video_predictor(..., device=vine_config._device)
73
+ mask_generator = SAM2AutomaticMaskGenerator(build_sam2(..., device=vine_config._device))
74
+ grounding_model = GroundingDINOModel(..., device=vine_config._device)
75
+
76
+ vine_pipeline.set_segmentation_models(
77
+ sam_predictor=sam_predictor,
78
+ mask_generator=mask_generator,
79
+ grounding_model=grounding_model,
80
+ )
81
+ ```
82
+
83
+ Any argument left as `None` is initialized lazily from the file paths when the pipeline first needs that backend.
84
+
85
+ ## Quick Start
86
+
87
+ ## Requirements
+ - torch
+ - torchvision
+ - transformers
+ - opencv-python
+ - matplotlib
+ - seaborn
+ - pandas
+ - numpy
+ - ipywidgets
+ - tqdm
+ - scikit-learn
+ - sam2 (from Facebook Research): https://github.com/video-fm/video-sam2
+ - sam2 weights (downloaded separately, e.g. https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_tiny.pt)
+ - groundingdino (from IDEA Research)
+ - groundingdino weights (downloaded separately, e.g. https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth)
+ - spacy-fastlang
+ - en-core-web-sm (for spacy-fastlang)
+ - ffmpeg (for video processing)
+ - (optional) LASER weights / full model checkpoint (downloaded separately, e.g. https://huggingface.co/video-fm/vine_v0)
107
+
108
+ Most of these dependencies are installed by creating a conda environment from `laser/environments/laser_env.yml` in the LASER repo; sam2 and groundingdino still need to be installed manually, following their own instructions.
109
+
110
+ ### Using the Pipeline (Recommended)
111
+ ```python
112
+ from transformers.pipelines import PIPELINE_REGISTRY
113
+ from vine_hf import VineConfig, VineModel, VinePipeline
114
+
115
+ PIPELINE_REGISTRY.register_pipeline(
116
+ "vine-video-understanding",
117
+ pipeline_class=VinePipeline,
118
+ pt_model=VineModel,
119
+ type="multimodal",
120
+ )
121
+
122
+ config = VineConfig(
123
+ segmentation_method="grounding_dino_sam2",
124
+ pretrained_vine_path="/abs/path/to/laser_model_v1.pkl",
125
+ visualization_dir="output",
126
+ visualize=True,
127
+ device="cuda:0",
128
+ )
129
+
130
+ model = VineModel(config)
131
+
132
+ vine_pipeline = VinePipeline(
133
+ model=model,
134
+ tokenizer=None,
135
+ sam_config_path="/abs/path/to/sam2/sam2.1_hiera_t.yaml",
136
+ sam_checkpoint_path="/abs/path/to/sam2/sam2_hiera_tiny.pt",
137
+ gd_config_path="/abs/path/to/groundingdino/config/GroundingDINO_SwinT_OGC.py",
138
+ gd_checkpoint_path="/abs/path/to/groundingdino/weights/groundingdino_swint_ogc.pth",
139
+ device=config._device,
140
+ )
141
+
142
+ results = vine_pipeline(
143
+ "/path/to/video.mp4",
144
+ categorical_keywords=["dog", "human"],
145
+ unary_keywords=["running"],
146
+ binary_keywords=["chasing"],
147
+ object_pairs=[(0, 1)],
148
+ return_top_k=3,
149
+ include_visualizations=True,
150
+ )
151
+ print(results["summary"])
152
+ ```
153
+
154
+ ### Using the Model Directly (Advanced)
155
+
156
+ For advanced users who want to provide their own segmentation:
157
+
158
+ ```python
159
+ from vine_hf import VineConfig, VineModel
160
+ import torch
161
+
162
+ # Create configuration
163
+ config = VineConfig(
164
+ pretrained_vine_path="/path/to/your/vine/weights" # Optional: your fine-tuned weights
165
+ )
166
+
167
+ # Initialize model
168
+ model = VineModel(config)
169
+
170
+ # If you have your own video frames, masks, and bboxes from external segmentation
171
+ video_frames = torch.randn(3, 224, 224, 3) * 255 # Your video frames
172
+ masks = {0: {1: torch.ones(224, 224, 1)}} # Your segmentation masks
173
+ bboxes = {0: {1: [50, 50, 150, 150]}} # Your bounding boxes
174
+
175
+ # Run prediction
176
+ results = model.predict(
177
+ video_frames=video_frames,
178
+ masks=masks,
179
+ bboxes=bboxes,
180
+ categorical_keywords=['human', 'dog', 'frisbee'],
181
+ unary_keywords=['running', 'jumping'],
182
+ binary_keywords=['chasing', 'following'],
183
+ object_pairs=[(1, 2)],
184
+ return_top_k=3
185
+ )
186
+ ```
187
+
188
+ **Note**: For most users, the pipeline approach above is recommended as it handles video loading and segmentation automatically.
189
+
190
+ ## Configuration Options
191
+
192
+ The `VineConfig` class supports the following parameters (non-exhaustive); a combined example follows the list:
193
+
194
+ - `model_name`: CLIP model backbone (default: `"openai/clip-vit-large-patch14-336"`)
195
+ - `pretrained_vine_path`: Optional path or Hugging Face repo with pretrained VINE weights
196
+ - `segmentation_method`: `"sam2"` or `"grounding_dino_sam2"` (default: `"grounding_dino_sam2"`)
197
+ - `box_threshold` / `text_threshold`: Grounding DINO thresholds
198
+ - `target_fps`: Target FPS for video processing (default: `1`)
199
+ - `alpha`, `white_alpha`: Rendering parameters used when extracting masked crops
200
+ - `topk_cate`: Top-k categories to return per object (default: `3`)
201
+ - `max_video_length`: Maximum frames to process (default: `100`)
202
+ - `visualize`: When `True`, pipeline post-processing attempts to create stitched visualizations
203
+ - `visualization_dir`: Optional base directory where visualization assets are written
204
+ - `debug_visualizations`: When `True`, the model saves a single first-frame mask composite for quick inspection
205
+ - `debug_visualization_path`: Target filepath for the debug mask composite (must point to a writable file)
206
+ - `return_flattened_segments`, `return_valid_pairs`, `interested_object_pairs`: Advanced geometry outputs for downstream consumers
207
+
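+ A short combined example using only the parameters documented above (values are illustrative):
+
+ ```python
+ from vine_hf import VineConfig
+
+ config = VineConfig(
+     model_name="openai/clip-vit-large-patch14-336",
+     segmentation_method="grounding_dino_sam2",
+     box_threshold=0.35,
+     text_threshold=0.25,
+     target_fps=1,
+     topk_cate=3,
+     max_video_length=100,
+     visualize=True,
+     visualization_dir="output/visualizations",
+     debug_visualizations=False,
+ )
+ ```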
208
+ ## Output Format
209
+
210
+ The model returns a dictionary with the following structure:
211
+
212
+ ```python
213
+ {
214
+ "masks" : {},
215
+
216
+ "boxes" : {},
217
+
218
+ "categorical_predictions": {
219
+ object_id: [(probability, category), ...]
220
+ },
221
+ "unary_predictions": {
222
+ (frame_id, object_id): [(probability, action), ...]
223
+ },
224
+ "binary_predictions": {
225
+ (frame_id, (obj1_id, obj2_id)): [(probability, relation), ...]
226
+ },
227
+ "confidence_scores": {
228
+ "categorical": max_categorical_confidence,
229
+ "unary": max_unary_confidence,
230
+ "binary": max_binary_confidence
231
+ },
232
+ "summary": {
233
+ "num_objects_detected": int,
234
+ "top_categories": [(category, probability), ...],
235
+ "top_actions": [(action, probability), ...],
236
+ "top_relations": [(relation, probability), ...]
237
+ }
238
+ }
239
+ ```
240
+
241
+ ## Visualization & Debugging
242
+
243
+ There are two complementary visualization layers:
244
+
245
+ - **Post-process visualizations** (`include_visualizations=True` in the pipeline call) produces a high-level stitched video summarizing detections, actions, and relations over time.
246
+
247
+ - **Debug visualizations** (`debug_visualizations=True` in `VineConfig`) dumps videos of intermediate outputs (GroundingDINO detections, SAM2 masks, unary/binary predictions) for quick sanity checks.
248
+
249
+ If you plan to enable either option, ensure the relevant output directories exist before running the pipeline.
250
+
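+ A small pre-flight sketch (the directory name is illustrative and should match your `visualization_dir`):
+
+ ```python
+ from pathlib import Path
+
+ # Create the visualization output directory before running the pipeline.
+ Path("output/visualizations").mkdir(parents=True, exist_ok=True)
+ ```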
251
+ ## Segmentation Methods
252
+
253
+ ### Grounding DINO + SAM2 (Recommended)
254
+
255
+ Uses Grounding DINO for object detection based on text prompts, then SAM2 for precise segmentation.
256
+
257
+ Requirements:
258
+ - Grounding DINO model and weights
259
+ - SAM2 model and weights
260
+ - Properly configured paths to model checkpoints
261
+
262
+ ### SAM2 Only
263
+
264
+ Uses SAM2's automatic mask generation without text-based object detection.
265
+
266
+ Requirements:
267
+ - SAM2 model and weights
268
+
269
+ ## Model Architecture
270
+
271
+ VINE is built on top of CLIP and uses three separate CLIP models for different tasks:
272
+ - **Categorical Model**: For object classification
273
+ - **Unary Model**: For single-object action recognition
274
+ - **Binary Model**: For relationship detection between object pairs
275
+
276
+ Each model processes both visual and textual features to compute similarity scores and probability distributions.
277
+
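+ To make the scoring idea concrete, here is an illustrative sketch of the underlying image-text similarity pattern using the plain `transformers` CLIP classes; it is not VINE's exact code, just the computation each head builds on:
+
+ ```python
+ import torch
+ from PIL import Image
+ from transformers import CLIPModel, CLIPProcessor
+
+ model = CLIPModel.from_pretrained("openai/clip-vit-base-patch32")
+ processor = CLIPProcessor.from_pretrained("openai/clip-vit-base-patch32")
+
+ crop = Image.new("RGB", (224, 224))  # stand-in for a masked object crop
+ keywords = ["human", "dog", "frisbee"]
+
+ inputs = processor(text=keywords, images=crop, return_tensors="pt", padding=True)
+ with torch.no_grad():
+     logits = model(**inputs).logits_per_image  # shape (1, num_keywords)
+ probs = logits.softmax(dim=-1)[0]
+ print(dict(zip(keywords, probs.tolist())))
+ ```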
278
+ ## Pushing to HuggingFace Hub
279
+
280
+ ```python
281
+ from vine_hf import VineConfig, VineModel
282
+
283
+ # Create and configure your model
284
+ config = VineConfig()
285
+ model = VineModel(config)
286
+
287
+ # Load your pretrained weights
288
+ # model.load_state_dict(torch.load('path/to/your/weights.pth'))
289
+
290
+ # Register for auto classes
291
+ config.register_for_auto_class()
292
+ model.register_for_auto_class("AutoModel")
293
+
294
+ # Push to Hub
295
+ config.push_to_hub('your-username/vine-model')
296
+ model.push_to_hub('your-username/vine-model')
297
+ ```
298
+
299
+ ## Loading from HuggingFace Hub
300
+
301
+ ```python
302
+ from transformers import AutoModel, pipeline
303
+
304
+ # Load model
305
+ model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)
306
+
307
+ # Or use with pipeline
308
+ vine_pipeline = pipeline(
309
+ 'vine-video-understanding',
310
+ model='your-username/vine-model',
311
+ trust_remote_code=True
312
+ )
313
+ ```
314
+
315
+ ## Examples
316
+
317
+ See `example_usage.py` for comprehensive examples including:
318
+ - Direct model usage
319
+ - Pipeline usage
320
+ - HuggingFace Hub integration
321
+ - Real video processing
322
+
323
+ ## Requirements
324
+
325
+ - Python 3.7+
326
+ - PyTorch 1.9+
327
+ - transformers 4.20+
328
+ - OpenCV
329
+ - PIL/Pillow
330
+ - NumPy
331
+
332
+ For segmentation:
333
+ - SAM2 (Facebook Research)
334
+ - Grounding DINO (IDEA Research)
335
+
336
+ ## Citation
337
+
338
+ If you use VINE in your research, please cite:
339
+
340
+ ```bibtex
341
+ @article{vine2024,
342
+ title={VINE: Video Understanding with Natural Language},
343
+ author={Your Authors},
344
+ journal={Your Journal},
345
+ year={2024}
346
+ }
347
+ ```
348
+
349
+ ## License
350
+
351
+ [Your License Here]
352
+
353
+ ## Contact
354
+
355
+ [Your Contact Information Here]
vine_hf/README_HF.md ADDED
@@ -0,0 +1,345 @@
1
+ # VINE: Video Understanding with Natural Language
2
+
3
+ [![HuggingFace](https://img.shields.io/badge/%F0%9F%A4%97%20Hugging%20Face-video--fm%2Fvine-blue)](https://huggingface.co/video-fm/vine)
4
+ [![GitHub](https://img.shields.io/badge/GitHub-LASER-green)](https://github.com/kevinxuez/LASER)
5
+
6
+ VINE is a video understanding model that processes videos along with categorical, unary, and binary keywords to return probability distributions over those keywords for detected objects and their relationships.
7
+
8
+ ## Quick Start
9
+
10
+ ```python
11
+ from transformers import AutoModel
12
+ from vine_hf import VineConfig, VineModel, VinePipeline
13
+
14
+ # Load VINE model from HuggingFace
15
+ model = AutoModel.from_pretrained('video-fm/vine', trust_remote_code=True)
16
+
17
+ # Create pipeline with your checkpoint paths
18
+ vine_pipeline = VinePipeline(
19
+ model=model,
20
+ tokenizer=None,
21
+ sam_config_path="/path/to/sam2_config.yaml",
22
+ sam_checkpoint_path="/path/to/sam2_checkpoint.pt",
23
+ gd_config_path="/path/to/grounding_dino_config.py",
24
+ gd_checkpoint_path="/path/to/grounding_dino_checkpoint.pth",
25
+ device="cuda",
26
+ trust_remote_code=True
27
+ )
28
+
29
+ # Process a video
30
+ results = vine_pipeline(
31
+ 'path/to/video.mp4',
32
+ categorical_keywords=['human', 'dog', 'frisbee'],
33
+ unary_keywords=['running', 'jumping'],
34
+ binary_keywords=['chasing', 'behind'],
35
+ return_top_k=3
36
+ )
37
+ ```
38
+
39
+ ## Installation
40
+
41
+ ### Option 1: Automated Setup (Recommended)
42
+
43
+ ```bash
44
+ # Download the setup script
45
+ wget https://raw.githubusercontent.com/kevinxuez/vine_hf/main/setup_vine_demo.sh
46
+
47
+ # Run the setup
48
+ bash setup_vine_demo.sh
49
+
50
+ # Activate environment
51
+ conda activate vine_demo
52
+ ```
53
+
54
+ ### Option 2: Manual Installation
55
+
56
+ ```bash
57
+ # 1. Create conda environment
58
+ conda create -n vine_demo python=3.10 -y
59
+ conda activate vine_demo
60
+
61
+ # 2. Install PyTorch with CUDA support
62
+ pip install torch==2.7.1 torchvision==0.22.1 --index-url https://download.pytorch.org/whl/cu126
63
+
64
+ # 3. Install core dependencies
65
+ pip install transformers huggingface-hub safetensors
66
+
67
+ # 4. Clone and install required repositories
68
+ git clone https://github.com/video-fm/video-sam2.git
69
+ git clone https://github.com/video-fm/GroundingDINO.git
70
+ git clone https://github.com/kevinxuez/LASER.git
71
+ git clone https://github.com/kevinxuez/vine_hf.git
72
+
73
+ # Install in editable mode
74
+ pip install -e ./video-sam2
75
+ pip install -e ./GroundingDINO
76
+ pip install -e ./LASER
77
+ pip install -e ./vine_hf
78
+
79
+ # Build GroundingDINO extensions
80
+ cd GroundingDINO && python setup.py build_ext --force --inplace && cd ..
81
+ ```
82
+
83
+ ## Required Checkpoints
84
+
85
+ VINE requires SAM2 and GroundingDINO checkpoints for segmentation. Download these separately:
86
+
87
+ ### SAM2 Checkpoint
88
+ ```bash
89
+ wget https://dl.fbaipublicfiles.com/segment_anything_2/072824/sam2_hiera_tiny.pt
90
+ wget https://raw.githubusercontent.com/facebookresearch/sam2/main/sam2/configs/sam2.1/sam2.1_hiera_t.yaml
91
+ ```
92
+
93
+ ### GroundingDINO Checkpoint
94
+ ```bash
95
+ wget https://github.com/IDEA-Research/GroundingDINO/releases/download/v0.1.0-alpha/groundingdino_swint_ogc.pth
96
+ wget https://raw.githubusercontent.com/IDEA-Research/GroundingDINO/main/groundingdino/config/GroundingDINO_SwinT_OGC.py
97
+ ```
98
+
99
+ ## Architecture
100
+
101
+ ```
102
+ video-fm/vine (HuggingFace Hub)
103
+ ├── VINE Model Weights (~1.8GB)
104
+ │ ├── Categorical CLIP model (fine-tuned)
105
+ │ ├── Unary CLIP model (fine-tuned)
106
+ │ └── Binary CLIP model (fine-tuned)
107
+ └── Architecture Files
108
+ ├── vine_config.py
109
+ ├── vine_model.py
110
+ ├── vine_pipeline.py
111
+ └── utilities
112
+
113
+ User Provides:
114
+ ├── Dependencies (via pip/conda)
115
+ │ ├── laser (video processing utilities)
116
+ │ ├── sam2 (segmentation)
117
+ │ └── groundingdino (object detection)
118
+ └── Checkpoints (downloaded separately)
119
+ ├── SAM2 model files
120
+ └── GroundingDINO model files
121
+ ```
122
+
123
+ ## Why This Architecture?
124
+
125
+ This separation of concerns provides several benefits:
126
+
127
+ 1. **Lightweight Distribution**: Only VINE-specific weights (~1.8GB) are on HuggingFace
128
+ 2. **Version Control**: Users can choose their preferred SAM2/GroundingDINO versions
129
+ 3. **Licensing**: Keeps different model licenses separate
130
+ 4. **Flexibility**: Easy to swap segmentation backends
131
+ 5. **Standard Practice**: Similar to models like LLaVA, BLIP-2, etc.
132
+
133
+ ## Full Usage Example
134
+
135
+ ```python
136
+ import os
137
+ from pathlib import Path
138
+ from transformers import AutoModel
139
+ from vine_hf import VinePipeline
140
+
141
+ # Set up paths
142
+ checkpoint_dir = Path("/path/to/checkpoints")
143
+ sam_config = checkpoint_dir / "sam2_hiera_t.yaml"
144
+ sam_checkpoint = checkpoint_dir / "sam2_hiera_tiny.pt"
145
+ gd_config = checkpoint_dir / "GroundingDINO_SwinT_OGC.py"
146
+ gd_checkpoint = checkpoint_dir / "groundingdino_swint_ogc.pth"
147
+
148
+ # Load VINE from HuggingFace
149
+ model = AutoModel.from_pretrained('video-fm/vine', trust_remote_code=True)
150
+
151
+ # Create pipeline
152
+ vine_pipeline = VinePipeline(
153
+ model=model,
154
+ tokenizer=None,
155
+ sam_config_path=str(sam_config),
156
+ sam_checkpoint_path=str(sam_checkpoint),
157
+ gd_config_path=str(gd_config),
158
+ gd_checkpoint_path=str(gd_checkpoint),
159
+ device="cuda:0",
160
+ trust_remote_code=True
161
+ )
162
+
163
+ # Process video
164
+ results = vine_pipeline(
165
+ "path/to/video.mp4",
166
+ categorical_keywords=['person', 'dog', 'ball'],
167
+ unary_keywords=['running', 'jumping', 'sitting'],
168
+ binary_keywords=['chasing', 'next to', 'holding'],
169
+ object_pairs=[(0, 1), (0, 2)], # person-dog, person-ball
170
+ return_top_k=5,
171
+ include_visualizations=True
172
+ )
173
+
174
+ # Access results
175
+ print(f"Detected {results['summary']['num_objects_detected']} objects")
176
+ print(f"Top categories: {results['summary']['top_categories']}")
177
+ print(f"Top actions: {results['summary']['top_actions']}")
178
+ print(f"Top relations: {results['summary']['top_relations']}")
179
+
180
+ # Access detailed predictions
181
+ for obj_id, predictions in results['categorical_predictions'].items():
182
+ print(f"\nObject {obj_id}:")
183
+ for prob, category in predictions:
184
+ print(f" {category}: {prob:.3f}")
185
+ ```
186
+
187
+ ## Output Format
188
+
189
+ ```python
190
+ {
191
+ "categorical_predictions": {
192
+ object_id: [(probability, category), ...]
193
+ },
194
+ "unary_predictions": {
195
+ (frame_id, object_id): [(probability, action), ...]
196
+ },
197
+ "binary_predictions": {
198
+ (frame_id, (obj1_id, obj2_id)): [(probability, relation), ...]
199
+ },
200
+ "confidence_scores": {
201
+ "categorical": float,
202
+ "unary": float,
203
+ "binary": float
204
+ },
205
+ "summary": {
206
+ "num_objects_detected": int,
207
+ "top_categories": [(category, probability), ...],
208
+ "top_actions": [(action, probability), ...],
209
+ "top_relations": [(relation, probability), ...]
210
+ },
211
+ "visualizations": { # if include_visualizations=True
212
+ "vine": {
213
+ "all": {"frames": [...], "video_path": "..."},
214
+ ...
215
+ }
216
+ }
217
+ }
218
+ ```
219
+
220
+ ## Configuration Options
221
+
222
+ ```python
223
+ from vine_hf import VineConfig
224
+
225
+ config = VineConfig(
226
+ model_name="openai/clip-vit-base-patch32", # CLIP backbone
227
+ segmentation_method="grounding_dino_sam2", # or "sam2"
228
+ box_threshold=0.35, # GroundingDINO threshold
229
+ text_threshold=0.25, # GroundingDINO threshold
230
+ target_fps=5, # Video sampling rate
231
+ visualize=True, # Enable visualizations
232
+ visualization_dir="outputs/", # Output directory
233
+ debug_visualizations=False, # Debug mode
234
+ device="cuda:0" # Device
235
+ )
236
+ ```
237
+
238
+ ## Deployment Examples
239
+
240
+ ### Local Script
241
+ ```python
242
+ # test_vine.py
243
+ from transformers import AutoModel
244
+ from vine_hf import VinePipeline
245
+
246
+ model = AutoModel.from_pretrained('video-fm/vine', trust_remote_code=True)
247
+ pipeline = VinePipeline(model=model, ...)
248
+ results = pipeline("video.mp4", ...)
249
+ ```
250
+
251
+ ### HuggingFace Spaces
252
+ ```python
253
+ # app.py for Gradio Space
254
+ import gradio as gr
255
+ from transformers import AutoModel
256
+ from vine_hf import VinePipeline
257
+
258
+ model = AutoModel.from_pretrained('video-fm/vine', trust_remote_code=True)
259
+ # ... set up pipeline and Gradio interface
260
+ ```
261
+
262
+ ### API Server
263
+ ```python
264
+ # FastAPI server
265
+ from fastapi import FastAPI
266
+ from transformers import AutoModel
267
+ from vine_hf import VinePipeline
268
+
269
+ app = FastAPI()
270
+ model = AutoModel.from_pretrained('video-fm/vine', trust_remote_code=True)
271
+ pipeline = VinePipeline(model=model, ...)
272
+
273
+ @app.post("/process")
274
+ async def process_video(video_path: str):
275
+ return pipeline(video_path, ...)
276
+ ```
277
+
278
+ ## Troubleshooting
279
+
280
+ ### Import Errors
281
+ ```bash
282
+ # Make sure all dependencies are installed
283
+ pip list | grep -E "laser|sam2|groundingdino"
284
+
285
+ # Reinstall if needed
286
+ pip install -e ./LASER
287
+ pip install -e ./video-sam2
288
+ pip install -e ./GroundingDINO
289
+ ```
290
+
291
+ ### CUDA Errors
292
+ ```python
293
+ # Check CUDA availability
294
+ import torch
295
+ print(torch.cuda.is_available())
296
+ print(torch.version.cuda)
297
+
298
+ # Use CPU if needed
299
+ pipeline = VinePipeline(model=model, device="cpu", ...)
300
+ ```
301
+
302
+ ### Checkpoint Not Found
303
+ ```bash
304
+ # Verify checkpoint paths
305
+ ls -lh /path/to/sam2_hiera_tiny.pt
306
+ ls -lh /path/to/groundingdino_swint_ogc.pth
307
+ ```
308
+
309
+ ## System Requirements
310
+
311
+ - **Python**: 3.10+
312
+ - **CUDA**: 11.8+ (for GPU)
313
+ - **GPU**: 8GB+ VRAM recommended (T4, V100, A100, etc.)
314
+ - **RAM**: 16GB+ recommended
315
+ - **Storage**: ~3GB for checkpoints
316
+
317
+ ## Citation
318
+
319
+ ```bibtex
320
+ @article{laser2024,
321
+ title={LASER: Language-guided Object Grounding and Relation Understanding in Videos},
322
+ author={Your Authors},
323
+ journal={Your Conference/Journal},
324
+ year={2024}
325
+ }
326
+ ```
327
+
328
+ ## License
329
+
330
+ This model and code are released under the MIT License. Note that SAM2 and GroundingDINO have their own respective licenses.
331
+
332
+ ## Links
333
+
334
+ - **Model**: https://huggingface.co/video-fm/vine
335
+ - **Code**: https://github.com/kevinxuez/LASER
336
+ - **vine_hf Package**: https://github.com/kevinxuez/vine_hf
337
+ - **SAM2**: https://github.com/facebookresearch/sam2
338
+ - **GroundingDINO**: https://github.com/IDEA-Research/GroundingDINO
339
+
340
+ ## Support
341
+
342
+ For issues or questions:
343
+ - **Model/Architecture**: [HuggingFace Discussions](https://huggingface.co/video-fm/vine/discussions)
344
+ - **LASER Framework**: [GitHub Issues](https://github.com/kevinxuez/LASER/issues)
345
+ - **vine_hf Package**: [GitHub Issues](https://github.com/kevinxuez/vine_hf/issues)
vine_hf/__init__.py ADDED
@@ -0,0 +1,23 @@
1
+ """
2
+ VINE HuggingFace Interface
3
+
4
+ VINE (Video Understanding with Natural Language) is a model that processes videos
5
+ along with categorical, unary, and binary keywords to return probability
6
+ distributions over those keywords for detected objects and their relationships.
7
+
8
+ This package provides a HuggingFace-compatible interface for the VINE model,
9
+ including configuration, model, and pipeline classes.
10
+ """
11
+
12
+ from .vine_config import VineConfig
13
+ from .vine_model import VineModel
14
+ from .vine_pipeline import VinePipeline
15
+
16
+ __version__ = "1.0.0"
17
+ __author__ = "LASER Team"
18
+
19
+ __all__ = [
20
+ "VineConfig",
21
+ "VineModel",
22
+ "VinePipeline"
23
+ ]
vine_hf/convert_inference.py ADDED
@@ -0,0 +1,288 @@
1
+ """
2
+ Script to convert existing inference.py workflow to use VINE HuggingFace interface
3
+
4
+ This script demonstrates how to migrate from the original inference.py approach
5
+ to the new HuggingFace-compatible interface.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import torch
11
+ import numpy as np
12
+ from typing import Dict, List, Tuple, Any
13
+
14
+ # Add paths for imports
15
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
+
17
+ from vine_hf import VineConfig, VineModel, VinePipeline
18
+ from laser.loading import load_video
19
+
20
+
21
+ def load_pretrained_vine_model(model_dir: str, model_name: str, epoch: int = 0) -> VineModel:
22
+ """
23
+ Load a pretrained VINE model from the original format into HuggingFace format.
24
+
25
+ Args:
26
+ model_dir: Directory containing the model
27
+ model_name: Name of the model file (without .{epoch}.model extension)
28
+ epoch: Epoch number to load
29
+
30
+ Returns:
31
+ VineModel instance with loaded weights
32
+ """
33
+ print(f"Loading pretrained VINE model from {model_dir}")
34
+
35
+ # Create configuration (adjust parameters as needed)
36
+ # We expect local ensemble weights in `model_dir`, so configure
37
+ # VineConfig to load from local directory/filename.
38
+ model_file = f"{model_name}.{epoch}.model"
39
+ config = VineConfig(
40
+ model_name="openai/clip-vit-base-patch32",
41
+ segmentation_method="grounding_dino_sam2",
42
+ target_fps=1,
43
+ box_threshold=0.35,
44
+ text_threshold=0.25,
45
+ use_hf_repo=False,
46
+ local_dir=model_dir,
47
+ local_filename=model_file,
48
+ )
49
+
50
+ # Initialize model (VineModel will consult the config when loading)
51
+ vine_model = VineModel(config)
52
+
53
+ # Load original weights
54
+ model_file = f"{model_name}.{epoch}.model"
55
+ model_path = os.path.join(model_dir, model_file)
56
+
57
+ if os.path.exists(model_path):
58
+ print(f"Loading weights from: {model_path}")
59
+ try:
60
+ # Add safe globals for PyTorch 2.6+
61
+ import torch.serialization
62
+ from laser.models.llava_clip_model_v3 import PredicateModel
63
+ torch.serialization.add_safe_globals([PredicateModel])
64
+
65
+ # Load the original model
66
+ original_model = torch.load(model_path, map_location='cpu', weights_only=False)
67
+
68
+ # Transfer weights to HuggingFace model
69
+ # This assumes the original model has the same structure
70
+ # You may need to adjust this based on your specific model structure
71
+
72
+ if hasattr(original_model, 'clip_cate_model'):
73
+ vine_model.clip_cate_model.load_state_dict(original_model.clip_cate_model.state_dict())
74
+ if hasattr(original_model, 'clip_unary_model'):
75
+ vine_model.clip_unary_model.load_state_dict(original_model.clip_unary_model.state_dict())
76
+ if hasattr(original_model, 'clip_binary_model'):
77
+ vine_model.clip_binary_model.load_state_dict(original_model.clip_binary_model.state_dict())
78
+ if hasattr(original_model, 'clip_tokenizer'):
79
+ vine_model.clip_tokenizer = original_model.clip_tokenizer
80
+ if hasattr(original_model, 'clip_processor'):
81
+ vine_model.clip_processor = original_model.clip_processor
82
+
83
+ print("✓ Weights transferred successfully")
84
+
85
+ except Exception as e:
86
+ print(f"✗ Error loading weights: {e}")
87
+ print("You may need to adjust the weight loading logic for your specific model")
88
+
89
+ else:
90
+ print(f"✗ Model file not found: {model_path}")
91
+
92
+ return vine_model
93
+
94
+
95
+ def convert_inference_workflow():
96
+ """
97
+ Convert the original inference.py workflow to use HuggingFace interface.
98
+
99
+ This function demonstrates how to replicate the original inference workflow
100
+ using the new HuggingFace-compatible components.
101
+ """
102
+ print("=== Converting Inference Workflow ===")
103
+
104
+ # Original parameters from inference.py
105
+ video_id = 'v1'
106
+ target_fps = 1
107
+ classes = ['human', 'dog', 'frisbee']
108
+ unary_keywords = ['running', 'jumping', 'sitting', 'standing']
109
+ binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left']
110
+
111
+ # Paths (adjust these to match your setup)
112
+ demo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../demo"))
113
+ video_dir = os.path.join(demo_dir, "videos")
114
+ video_path = os.path.join(video_dir, f"{video_id}.mp4")
115
+
116
+ # Model paths (adjust these to match your setup)
117
+ data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
118
+ model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
119
+ model_name = "ensemble-2025-02-10-14-57-22"
120
+
121
+ # Segmentation model paths (adjust these to your actual paths)
122
+ sam_config_path = "/path/to/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml"
123
+ sam_checkpoint_path = "/path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt"
124
+ gd_config_path = "/path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py"
125
+ gd_checkpoint_path = "/path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth"
126
+
127
+ print(f"Video path: {video_path}")
128
+ print(f"Model dir: {model_dir}")
129
+ print(f"SAM2 config: {sam_config_path}")
130
+ print(f"GroundingDINO config: {gd_config_path}")
131
+
132
+ # Check if video exists
133
+ if not os.path.exists(video_path):
134
+ print(f"✗ Video not found: {video_path}")
135
+ print("Please adjust the video path or use your own video file")
136
+ return
137
+
138
+ # 1. Load video (same as original)
139
+ print(f"Loading video: {video_id}")
140
+ video_tensor = load_video(video_path, target_fps=target_fps)
141
+ print(f"Video shape: {video_tensor.shape}")
142
+
143
+ # 2. Load VINE model with HuggingFace interface
144
+ print("Loading VINE model...")
145
+ if os.path.exists(model_dir):
146
+ vine_model = load_pretrained_vine_model(model_dir, model_name, epoch=0)
147
+ else:
148
+ print(f"Model directory not found: {model_dir}")
149
+ print("Creating new model with random weights for demonstration")
150
+ config = VineConfig()
151
+ vine_model = VineModel(config)
152
+
153
+ # 3. Create pipeline for easier use
154
+ print("Creating VINE pipeline...")
155
+ from transformers.pipelines import PIPELINE_REGISTRY
156
+
157
+ # Register pipeline if not already registered
158
+ try:
159
+ PIPELINE_REGISTRY.register_pipeline(
160
+ "vine-video-understanding",
161
+ pipeline_class=VinePipeline,
162
+ pt_model=VineModel,
163
+ type="multimodal",
164
+ )
165
+ except Exception:
166
+ pass # Already registered
167
+
168
+ # Create pipeline instance with segmentation model paths
169
+ vine_pipeline = VinePipeline(
170
+ model=vine_model,
171
+ tokenizer=None,
172
+ # SAM2 configuration
173
+ sam_config_path=sam_config_path,
174
+ sam_checkpoint_path=sam_checkpoint_path,
175
+ # GroundingDINO configuration
176
+ gd_config_path=gd_config_path,
177
+ gd_checkpoint_path=gd_checkpoint_path
178
+ )
179
+
180
+ # 4. Process video with new interface
181
+ print("Processing video with VINE HuggingFace interface...")
182
+
183
+ try:
184
+ # Use the pipeline to process the video
185
+ results = vine_pipeline(
186
+ video_path,
187
+ categorical_keywords=classes,
188
+ unary_keywords=unary_keywords,
189
+ binary_keywords=binary_keywords,
190
+ object_pairs=[(1, 2), (2, 3)], # Example object pairs
191
+ segmentation_method='grounding_dino_sam2',
192
+ target_fps=target_fps,
193
+ return_top_k=3,
194
+ include_visualizations=False
195
+ )
196
+
197
+ # 5. Display results (similar to original format)
198
+ print("\n=== VINE Results (HuggingFace Interface) ===")
199
+
200
+ # Categorical predictions
201
+ print("\nCategorical Predictions:")
202
+ for obj_id, predictions in results['categorical_predictions'].items():
203
+ print(f" Object {obj_id}:")
204
+ for prob, category in predictions:
205
+ print(f" {prob:.3f}: {category}")
206
+
207
+ # Unary predictions
208
+ print("\nUnary Predictions:")
209
+ for (frame_id, obj_id), predictions in results['unary_predictions'].items():
210
+ print(f" Frame {frame_id}, Object {obj_id}:")
211
+ for prob, action in predictions:
212
+ print(f" {prob:.3f}: {action}")
213
+
214
+ # Binary predictions
215
+ print("\nBinary Predictions:")
216
+ for (frame_id, obj_pair), predictions in results['binary_predictions'].items():
217
+ print(f" Frame {frame_id}, Objects {obj_pair}:")
218
+ for prob, relation in predictions:
219
+ print(f" {prob:.3f}: {relation}")
220
+
221
+ # Summary
222
+ print(f"\nSummary:")
223
+ print(f" Objects detected: {results['summary']['num_objects_detected']}")
224
+ print(f" Top categories: {results['summary']['top_categories']}")
225
+ print(f" Top actions: {results['summary']['top_actions']}")
226
+ print(f" Top relations: {results['summary']['top_relations']}")
227
+
228
+ print("\n✓ Successfully processed video with VINE HuggingFace interface!")
229
+
230
+ except Exception as e:
231
+ print(f"✗ Error processing video: {e}")
232
+ print("This may be due to missing segmentation models or other dependencies")
233
+ print("The interface is set up correctly, but full functionality requires:")
234
+ print(" 1. Properly installed Grounding DINO and SAM2")
235
+ print(" 2. Correct model weights")
236
+ print(" 3. Proper configuration paths")
237
+
238
+
239
+ def compare_interfaces():
240
+ """
241
+ Compare the original inference.py approach with the new HuggingFace interface.
242
+ """
243
+ print("\n=== Interface Comparison ===")
244
+
245
+ print("\nOriginal inference.py approach:")
246
+ print("✓ Direct access to model internals")
247
+ print("✓ Full control over segmentation pipeline")
248
+ print("✗ Complex setup and configuration")
249
+ print("✗ Not compatible with HuggingFace ecosystem")
250
+ print("✗ Requires manual handling of all components")
251
+
252
+ print("\nNew HuggingFace interface:")
253
+ print("✓ Easy to use pipeline interface")
254
+ print("✓ Compatible with HuggingFace Hub")
255
+ print("✓ Standardized configuration")
256
+ print("✓ Automatic handling of preprocessing/postprocessing")
257
+ print("✓ Easy sharing and distribution")
258
+ print("✓ Configurable segmentation model paths")
259
+ print("✗ Slightly less direct control (can still access model directly)")
260
+
261
+ print("\nMigration benefits:")
262
+ print("• Share your model easily on HuggingFace Hub")
263
+ print("• Users can load your model with a single line")
264
+ print("• Standardized interface for video understanding")
265
+ print("• Better integration with other HuggingFace tools")
266
+ print("• Simplified deployment and inference")
267
+ print("• Flexible segmentation model configuration")
268
+
269
+
270
+ if __name__ == "__main__":
271
+ print("VINE HuggingFace Interface Conversion")
272
+ print("=" * 50)
273
+
274
+ # Run conversion demonstration
275
+ convert_inference_workflow()
276
+
277
+ # Show comparison
278
+ compare_interfaces()
279
+
280
+ print("\n" + "=" * 50)
281
+ print("Next steps:")
282
+ print("1. Install SAM2 and GroundingDINO dependencies")
283
+ print("2. Download the required model checkpoints")
284
+ print("3. Update the paths in this script to point to your models")
285
+ print("4. Test the interface with your specific model weights")
286
+ print("5. Adjust configuration parameters as needed")
287
+ print("6. Push your model to HuggingFace Hub using push_to_hub.py")
288
+ print("7. Share with the community!")
vine_hf/example_ensemble_weights.py ADDED
@@ -0,0 +1,333 @@
1
+ """
2
+ Example demonstrating how to load and use VINE ensemble weights
3
+
4
+ This script shows the correct way to load your pretrained VINE ensemble weights
5
+ and use them with the HuggingFace interface, based on the actual inference.py workflow.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import torch
11
+ import numpy as np
12
+ from transformers.pipelines import PIPELINE_REGISTRY
13
+
14
+ #os.environ["OPENAI_API_KEY"]="dummy-key" # Set your OpenAI API key here or via environment variable
15
+
16
+ # Add the parent directory to the path to import vine_hf
17
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
18
+
19
+ from vine_hf import VineConfig, VineModel, VinePipeline
20
+ from laser.loading import load_video
21
+
22
+
23
+ def example_load_ensemble_weights():
24
+ """Example of loading ensemble weights correctly."""
25
+ print("=== Loading Ensemble VINE Weights ===")
26
+
27
+ # Path to your ensemble model (adjust this to your actual path)
28
+ data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
29
+ model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
30
+
31
+ print(f"Looking for ensemble weights in: {model_dir}")
32
+
33
+ if os.path.exists(model_dir):
34
+ print("✓ Model directory found")
35
+
36
+ # List available model files
37
+ model_files = [f for f in os.listdir(model_dir) if f.endswith('.model')]
38
+ print(f"Available model files: {model_files}")
39
+
40
+ if model_files:
41
+ # Create configuration with ensemble path (local directory with .model files)
42
+ config = VineConfig(
43
+ segmentation_method="grounding_dino_sam2",
44
+ use_hf_repo=False,
45
+ local_dir=model_dir,
46
+ local_filename=None,
47
+ )
48
+
49
+ print("Creating VINE model with ensemble weights...")
50
+ vine_model = VineModel(config)
51
+
52
+ print("✓ VINE model created with ensemble weights!")
53
+ return vine_model
54
+ else:
55
+ print("✗ No .model files found in directory")
56
+ return None
57
+ else:
58
+ print(f"✗ Model directory not found: {model_dir}")
59
+ print("Please adjust the path to point to your ensemble weights")
60
+ return None
61
+
62
+
63
+ def example_direct_ensemble_loading():
64
+ """Example of loading ensemble weights using from_pretrained_vine."""
65
+ print("\n=== Direct Ensemble Loading ===")
66
+
67
+ # Path to specific ensemble file
68
+ data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
69
+ model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
70
+
71
+ if os.path.exists(model_dir):
72
+ try:
73
+ # Use the class method for direct loading
74
+ vine_model = VineModel.from_pretrained_vine(
75
+ model_path=model_dir,
76
+ epoch=0 # Load epoch 0
77
+ )
78
+
79
+ print("✓ Model loaded using from_pretrained_vine!")
80
+ return vine_model
81
+
82
+ except Exception as e:
83
+ print(f"✗ Error loading with from_pretrained_vine: {e}")
84
+ return None
85
+ else:
86
+ print(f"✗ Model directory not found: {model_dir}")
87
+ return None
88
+
89
+
90
+ def example_compare_original_vs_hf():
91
+ """Compare the original inference.py approach with HuggingFace interface."""
92
+ print("\n=== Comparing Original vs HuggingFace Interface ===")
93
+
94
+ data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
95
+ model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
96
+ model_name = "ensemble-2025-02-10-14-57-22"
97
+ epoch = 0
98
+
99
+ if not os.path.exists(model_dir):
100
+ print(f"Model directory not found: {model_dir}")
101
+ return
102
+
103
+ print("Original approach (from inference.py):")
104
+ print("```python")
105
+ print("def load_model(model_dir, model_name, epoch, device):")
106
+ print(" model_name = model_name + f'.{epoch}.model'")
107
+ print(" predicate_model = torch.load(os.path.join(model_dir, model_name), map_location=device, weights_only=False)")
108
+ print(" return predicate_model")
109
+ print("")
110
+ print("predicate_model = load_model(model_dir, model_name, epoch, device)")
111
+ print("```")
112
+
113
+ print("\nNew HuggingFace approach:")
114
+ print("```python")
115
+ print("config = VineConfig(pretrained_vine_path=model_dir)")
116
+ print("vine_model = VineModel(config)")
117
+ print("# or")
118
+ print("vine_model = VineModel.from_pretrained_vine(model_dir, epoch=0)")
119
+ print("```")
120
+
121
+ # Try to load with both approaches if possible
122
+ try:
123
+ # Original approach
124
+ def load_model(model_dir, model_name, epoch, device):
125
+ model_name = model_name + f'.{epoch}.model'
126
+ model_path = os.path.join(model_dir, model_name)
127
+ if os.path.exists(model_path):
128
+ return torch.load(model_path, map_location=device, weights_only=False)
129
+ else:
130
+ print(f"Model file not found: {model_path}")
131
+ return None
132
+
133
+ device = "cuda" if torch.cuda.is_available() else "cpu"
134
+ original_model = load_model(model_dir, model_name, epoch, device)
135
+
136
+ if original_model:
137
+ print(f"✓ Original model loaded: {type(original_model)}")
138
+ print(f" Has clip_cate_model: {hasattr(original_model, 'clip_cate_model')}")
139
+ print(f" Has clip_unary_model: {hasattr(original_model, 'clip_unary_model')}")
140
+ print(f" Has clip_binary_model: {hasattr(original_model, 'clip_binary_model')}")
141
+
142
+ # HuggingFace approach
143
+ vine_model = VineModel.from_pretrained_vine(model_dir, epoch=epoch)
144
+
145
+ if vine_model:
146
+ print(f"✓ HuggingFace model loaded: {type(vine_model)}")
147
+ print(f" Has clip_cate_model: {hasattr(vine_model, 'clip_cate_model')}")
148
+ print(f" Has clip_unary_model: {hasattr(vine_model, 'clip_unary_model')}")
149
+ print(f" Has clip_binary_model: {hasattr(vine_model, 'clip_binary_model')}")
150
+
151
+ print("\n✓ Both approaches work! HuggingFace interface successfully loads ensemble weights.")
152
+
153
+ except Exception as e:
154
+ print(f"Error in comparison: {e}")
155
+
156
+
157
+ def example_ensemble_with_pipeline():
158
+ """Example using ensemble weights with the pipeline."""
159
+ print("\n=== Using Ensemble Weights with Pipeline ===")
160
+
161
+ data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
162
+ model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
163
+
164
+ if not os.path.exists(model_dir):
165
+ print(f"Model directory not found: {model_dir}")
166
+ return
167
+
168
+ # Register pipeline
169
+ PIPELINE_REGISTRY.register_pipeline(
170
+ "vine-video-understanding",
171
+ pipeline_class=VinePipeline,
172
+ pt_model=VineModel,
173
+ type="multimodal",
174
+ )
175
+
176
+ # Create model with ensemble weights (local directory)
177
+ config = VineConfig(
178
+ segmentation_method="grounding_dino_sam2",
179
+ use_hf_repo=False,
180
+ local_dir=model_dir,
181
+ local_filename=None,
182
+ )
183
+
184
+ vine_model = VineModel(config)
185
+ # Create pipeline with segmentation model paths
186
+ vine_pipeline = VinePipeline(
187
+ model=vine_model,
188
+ tokenizer=None,
189
+ # SAM2 configuration
190
+ sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
191
+ sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
192
+ # GroundingDINO configuration
193
+ gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
194
+ gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
195
+ device="cuda" if torch.cuda.is_available() else "cpu",
196
+ )
197
+
198
+ print("✓ Pipeline created with ensemble VINE weights")
199
+
200
+ # Check for demo video
201
+ demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")
202
+
203
+ if os.path.exists(demo_video):
204
+ print(f"Found demo video: {demo_video}")
205
+
206
+ # Use the same keywords as in the original inference.py
207
+ categorical_keywords = ['human', 'dog', 'frisbee']
208
+ unary_keywords = ['running', 'jumping', 'catching', 'throwing']
209
+ binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left']
210
+
211
+ print("Example pipeline usage:")
212
+ print("```python")
213
+ print("results = vine_pipeline(")
214
+ print(f" '{demo_video}',")
215
+ print(f" categorical_keywords={categorical_keywords},")
216
+ print(f" unary_keywords={unary_keywords},")
217
+ print(f" binary_keywords={binary_keywords},")
218
+ print(" segmentation_method='grounding_dino_sam2'")
219
+ print(")")
220
+ print("```")
221
+
222
+ # Uncomment to actually run (requires segmentation models)
223
+ # try:
224
+ # results = vine_pipeline(
225
+ # demo_video,
226
+ # categorical_keywords=categorical_keywords,
227
+ # unary_keywords=unary_keywords,
228
+ # binary_keywords=binary_keywords,
229
+ # segmentation_method='grounding_dino_sam2'
230
+ # )
231
+ # print("Results:", results['summary'])
232
+ # except Exception as e:
233
+ # print(f"Pipeline execution failed: {e}")
234
+ # print("This is expected if segmentation models are not set up")
235
+
236
+ return vine_pipeline
237
+
238
+
239
+
240
+ def demonstrate_weight_transfer():
241
+ """Demonstrate how weights are transferred from ensemble to HuggingFace format."""
242
+ print("\n=== Weight Transfer Demonstration ===")
243
+
244
+ print("The ensemble model structure (PredicateModel):")
245
+ print("- clip_cate_model: CLIP model for categorical classification")
246
+ print("- clip_unary_model: CLIP model for unary predicates")
247
+ print("- clip_binary_model: CLIP model for binary relations")
248
+ print("- clip_tokenizer: Tokenizer for text processing")
249
+ print("- clip_processor: Processor for image processing")
250
+
251
+ print("\nWeight transfer process:")
252
+ print("1. Load ensemble model with torch.load()")
253
+ print("2. Initialize base CLIP models in HuggingFace format")
254
+ print("3. Transfer state_dict from ensemble to HuggingFace models:")
255
+ print(" - ensemble.clip_cate_model → hf.clip_cate_model")
256
+ print(" - ensemble.clip_unary_model → hf.clip_unary_model")
257
+ print(" - ensemble.clip_binary_model → hf.clip_binary_model")
258
+ print("4. Transfer tokenizer and processor")
259
+
260
+ print("\nThis preserves all your fine-tuned weights while making them HuggingFace compatible!")
261
+
262
+
263
+ def troubleshooting_guide():
264
+ """Provide troubleshooting guide for common issues."""
265
+ print("\n=== Troubleshooting Guide ===")
266
+
267
+ print("Common Issues:")
268
+ print("1. 'No model file found for epoch X'")
269
+ print(" → Check that .model files exist in the directory")
270
+ print(" → Verify the epoch number is correct")
271
+ print(" → List files: ls /path/to/model/dir/*.model")
272
+
273
+ print("\n2. 'Error loading VINE weights'")
274
+ print(" → Check file permissions")
275
+ print(" → Verify the model file is not corrupted")
276
+ print(" → Try loading with torch.load() directly first")
277
+
278
+ print("\n3. 'CLIP model mismatch'")
279
+ print(" → Ensure config.model_name matches the base model used in training")
280
+
281
+ print("\n4. 'Device mismatch errors'")
282
+ print(" → Models are loaded to CPU first, then moved to device")
283
+ print(" → Check CUDA availability with torch.cuda.is_available()")
284
+
285
+ print("\nDebugging steps:")
286
+ print("1. Test loading ensemble model directly:")
287
+ print(" model = torch.load('path/to/model.0.model', map_location='cpu')")
288
+ print("2. Check model attributes:")
289
+ print(" print(dir(model))")
290
+ print("3. Verify state_dict keys:")
291
+ print(" print(model.clip_cate_model.state_dict().keys())")
292
+
293
+
294
+ if __name__ == "__main__":
295
+ print("VINE Ensemble Weights Loading Examples")
296
+ print("=" * 50)
297
+
298
+ # Test ensemble weight loading
299
+ try:
300
+ model1 = example_load_ensemble_weights()
301
+ except Exception as e:
302
+ print(f"Ensemble loading example failed: {e}")
303
+
304
+ try:
305
+ model2 = example_direct_ensemble_loading()
306
+ except Exception as e:
307
+ print(f"Direct loading example failed: {e}")
308
+
309
+ # Compare approaches
310
+ try:
311
+ example_compare_original_vs_hf()
312
+ except Exception as e:
313
+ print(f"Comparison example failed: {e}")
314
+
315
+ # Test pipeline with ensemble weights
316
+ try:
317
+ pipeline = example_ensemble_with_pipeline()
318
+ except Exception as e:
319
+ print(f"Pipeline example failed: {e}")
320
+
321
+ # Educational content
322
+ demonstrate_weight_transfer()
323
+ troubleshooting_guide()
324
+
325
+ print("\n" + "=" * 50)
326
+ print("Key Points:")
327
+ print("1. AutoModel.from_pretrained() won't work with .pt ensemble weights")
328
+ print("2. Use torch.load() to load the ensemble, then transfer weights")
329
+ print("3. The HuggingFace interface preserves your fine-tuned weights")
330
+ print("4. Specify pretrained_vine_path in VineConfig to auto-load weights")
331
+ print("5. Use VineModel.from_pretrained_vine() for direct loading")
332
+
333
+
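The "Key Points" above compress the same weight-transfer logic that convert_inference.py applies. A minimal sketch of that transfer, assuming the ensemble checkpoint is a pickled PredicateModel exposing clip_cate_model, clip_unary_model, clip_binary_model plus its tokenizer and processor (the checkpoint path is a placeholder):

import torch
from vine_hf import VineConfig, VineModel

# 1. Load the ensemble checkpoint (a full pickled model, not a plain state_dict).
ensemble = torch.load("/path/to/ensemble-name.0.model", map_location="cpu", weights_only=False)

# 2. Initialize the HuggingFace-format model with base CLIP weights.
hf_model = VineModel(VineConfig())

# 3. Copy the fine-tuned sub-model weights across.
hf_model.clip_cate_model.load_state_dict(ensemble.clip_cate_model.state_dict())
hf_model.clip_unary_model.load_state_dict(ensemble.clip_unary_model.state_dict())
hf_model.clip_binary_model.load_state_dict(ensemble.clip_binary_model.state_dict())

# 4. Reuse the tokenizer and processor from the ensemble.
hf_model.clip_tokenizer = ensemble.clip_tokenizer
hf_model.clip_processor = ensemble.clip_processor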
vine_hf/example_sam2_masks.py ADDED
@@ -0,0 +1,331 @@
1
+ """
2
+ Example demonstrating SAM2 mask generation in VINE HuggingFace interface
3
+
4
+ This script shows how to use both SAM2-only and Grounding DINO + SAM2
5
+ segmentation methods with the VINE model.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import torch
11
+ import numpy as np
12
+ from transformers.pipelines import PIPELINE_REGISTRY
13
+
14
+ # Add the parent directory to the path to import vine_hf
15
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
+
17
+
18
+ # Either uncomment the line below or set the OPENAI_API_KEY environment variable yourself; it isn't required to run this example.
19
+ #os.environ['OPENAI_API_KEY'] = 'dummy-key'
20
+
21
+ from vine_hf import VineConfig, VineModel, VinePipeline
22
+ from laser.loading import load_video
23
+
24
+
25
+ def example_sam2_only_segmentation():
26
+ """Example using SAM2 automatic mask generation only."""
27
+ print("=== SAM2-Only Segmentation Example ===")
28
+
29
+ # Create configuration for SAM2-only
30
+ config = VineConfig(
31
+ use_hf_repo=True,
32
+ model_repo="video-fm/vine_v0",
33
+ segmentation_method="sam2", # Use SAM2 only
34
+ target_fps=1,
35
+ debug_visualizations=True,
36
+ )
37
+
38
+ # Register pipeline
39
+ PIPELINE_REGISTRY.register_pipeline(
40
+ "vine-video-understanding",
41
+ pipeline_class=VinePipeline,
42
+ pt_model=VineModel,
43
+ type="multimodal",
44
+ )
45
+
46
+ # Create model and pipeline with SAM2 paths
47
+ vine_model = VineModel(config)
48
+ vine_pipeline = VinePipeline(
49
+ model=vine_model,
50
+ tokenizer=None,
51
+ sam_config_path="path/to/your/sam2/sam_config.yaml",
52
+ sam_checkpoint_path="path/to/your/sam2/sam_checkpoint.pth",
53
+ gd_config_path="path/to/your/groundingdino/config.py",
54
+ gd_checkpoint_path="path/to/your/groundingdino/checkpoint.pth",
55
+ )
56
+
57
+ # Check for demo video
58
+ demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/output.mp4")
59
+
60
+ if os.path.exists(demo_video):
61
+ print(f"Processing video: {demo_video}")
62
+
63
+ # Define keywords (SAM2 will find all objects, then classify them)
64
+ categorical_keywords = ['human', 'dog', 'frisbee', 'object', 'person', 'animal']
65
+ unary_keywords = ['running', 'jumping', 'sitting', 'standing', 'moving', 'static']
66
+ binary_keywords = ['behind', 'in front of', 'next to', 'chasing', 'following']
67
+ object_pairs = [(0,1), (0, 2), (1, 2), (1, 3), (2, 3), (0,4)]
68
+
69
+
70
+ print("Using SAM2 automatic mask generation...")
71
+ print("This will find all objects in the video automatically")
72
+
73
+ try:
74
+ # Process with SAM2 only
75
+ results = vine_pipeline(
76
+ demo_video,
77
+ categorical_keywords=categorical_keywords,
78
+ unary_keywords=unary_keywords,
79
+ binary_keywords=binary_keywords,
80
+ object_pairs=object_pairs,
81
+ segmentation_method="sam2",
82
+ return_top_k=3,
83
+ debug_visualizations=True,
84
+ debug_visualization_path=os.path.join(os.getcwd(), "sam2_debug_masks.png"),
85
+ )
86
+
87
+ print("\n✓ SAM2 segmentation completed!")
88
+ print("Results summary:")
89
+ print(f" Objects detected: {results['summary']['num_objects_detected']}")
90
+ print(f" Top categories: {results['summary']['top_categories']}")
91
+ print(f" Top actions: {results['summary']['top_actions']}")
92
+
93
+ return results
94
+
95
+ except Exception as e:
96
+ print(f"SAM2 segmentation failed: {e}")
97
+ print("Make sure SAM2 models are properly installed")
98
+ return None
99
+ else:
100
+ print(f"Demo video not found: {demo_video}")
101
+ return None
102
+
103
+ def example_grounding_dino_sam2_segmentation():
104
+ """Example using Grounding DINO + SAM2 text-guided segmentation."""
105
+ print("\n=== Grounding DINO + SAM2 Segmentation Example ===")
106
+
107
+ # Create configuration for Grounding DINO + SAM2
108
+ config = VineConfig(
109
+ use_hf_repo=True,
110
+ model_repo="video-fm/vine_v0",
111
+ segmentation_method="grounding_dino_sam2", # Use text-guided segmentation
112
+ box_threshold=0.35,
113
+ text_threshold=0.25,
114
+ target_fps=1,
115
+ debug_visualizations=True,
116
+ )
117
+
118
+ # Create model and pipeline with both SAM2 and GroundingDINO paths
119
+ vine_model = VineModel(config)
120
+ vine_pipeline = VinePipeline(
121
+ model=vine_model,
122
+ tokenizer=None,
123
+ # SAM2 configuration
124
+ sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
125
+ sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
126
+ gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
127
+ gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
128
+ device=0,
129
+ )
130
+
131
+ # Check for demo video
132
+ demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/output.mp4")
133
+
134
+ if os.path.exists(demo_video):
135
+ print(f"Processing video: {demo_video}")
136
+
137
+ # Define keywords (Grounding DINO will look specifically for these)
138
+ categorical_keywords = ['human', 'dog', 'frisbee'] # Specific objects to find
139
+ unary_keywords = ['running', 'jumping', 'catching', 'throwing']
140
+ binary_keywords = ['behind', 'chasing', 'next to', 'throwing to']
141
+ object_pairs = [(0,1), (0, 2), (1, 2), (1, 3), (2, 3), (0,4)]
142
+ print("Using Grounding DINO + SAM2 text-guided segmentation...")
143
+ print(f"Looking specifically for: {categorical_keywords}")
144
+
145
+ try:
146
+ # Process with Grounding DINO + SAM2
147
+ results = vine_pipeline(
148
+ demo_video,
149
+ categorical_keywords=categorical_keywords,
150
+ unary_keywords=unary_keywords,
151
+ binary_keywords=binary_keywords,
152
+ object_pairs=object_pairs,
153
+ segmentation_method="grounding_dino_sam2",
154
+ box_threshold=0.35,
155
+ text_threshold=0.25,
156
+ return_top_k=3,
157
+ debug_visualizations=True,
158
+ )
159
+
160
+ print("\n✓ Grounding DINO + SAM2 segmentation completed!")
161
+ print("Results summary:")
162
+ print(f" Objects detected: {results['summary']['num_objects_detected']}")
163
+ print(f" Top categories: {results['summary']['top_categories']}")
164
+ print(f" Top actions: {results['summary']['top_actions']}")
165
+ print(f" Top relations: {results['summary']['top_relations']}")
166
+
167
+ return results
168
+
169
+ except Exception as e:
170
+ print(f"Grounding DINO + SAM2 segmentation failed: {e}")
171
+ print("Make sure both Grounding DINO and SAM2 models are properly installed")
172
+ return None
173
+ else:
174
+ print(f"Demo video not found: {demo_video}")
175
+ return None
176
+
177
+
178
+ def compare_segmentation_methods():
179
+ """Compare SAM2-only vs Grounding DINO + SAM2 approaches."""
180
+ print("\n=== Comparing Segmentation Methods ===")
181
+
182
+ print("\nSAM2-Only Approach:")
183
+ print("✓ Finds all objects automatically")
184
+ print("✓ No need to specify what to look for")
185
+ print("✓ Good for exploratory analysis")
186
+ print("✗ May find too many irrelevant objects")
187
+ print("✗ Less precise for specific object types")
188
+
189
+ print("\nGrounding DINO + SAM2 Approach:")
190
+ print("✓ Finds specific objects based on text prompts")
191
+ print("✓ More precise and targeted")
192
+ print("✓ Better for known object categories")
193
+ print("✓ Integrates object detection with segmentation")
194
+ print("✗ Limited to specified categories")
195
+ print("✗ Requires knowing what objects to look for")
196
+
197
+
198
+ def demonstrate_mask_processing():
199
+ """Demonstrate how masks are processed internally."""
200
+ print("\n=== Mask Processing Demonstration ===")
201
+
202
+ # Load a video to show the processing pipeline
203
+ demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/output.mp4")
204
+
205
+ if os.path.exists(demo_video):
206
+ print("Loading video for mask processing demo...")
207
+
208
+ # Load video tensor
209
+ video_tensor = np.asarray(load_video(demo_video, target_fps=1))
210
+ print(f"Video shape: {video_tensor.shape}")
211
+
212
+ # Create pipeline with segmentation model paths
213
+ config = VineConfig(segmentation_method="sam2")
214
+ vine_model = VineModel(config)
215
+ vine_pipeline = VinePipeline(
216
+ model=vine_model,
217
+ tokenizer=None,
218
+ # SAM2 configuration
219
+ sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
220
+ sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
221
+ gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
222
+ gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
223
+ )
224
+
225
+ try:
226
+ # Process just the first few frames to show the pipeline
227
+ print("\nProcessing first 2 frames with SAM2...")
228
+
229
+ # Manually call the preprocessing to show the steps
230
+ processed_data = vine_pipeline.preprocess(
231
+ video_tensor[:2], # Just first 2 frames
232
+ segmentation_method="sam2",
233
+ categorical_keywords=['object']
234
+ )
235
+
236
+ print("Mask processing results:")
237
+ print(f" Number of frames processed: {processed_data['num_frames']}")
238
+ print(f" Frames with masks: {list(processed_data['masks'].keys())}")
239
+
240
+ # Show mask details
241
+ for frame_id, frame_masks in processed_data['masks'].items():
242
+ print(f" Frame {frame_id}: {len(frame_masks)} objects detected")
243
+ for obj_id, mask in frame_masks.items():
244
+ print(f" Object {obj_id}: mask shape {mask.shape}")
245
+
246
+ print("\nBounding box extraction:")
247
+ for frame_id, frame_bboxes in processed_data['bboxes'].items():
248
+ print(f" Frame {frame_id}: {len(frame_bboxes)} bounding boxes")
249
+ for obj_id, bbox in frame_bboxes.items():
250
+ print(f" Object {obj_id}: bbox {bbox}")
251
+
252
+ except Exception as e:
253
+ print(f"Mask processing failed: {e}")
254
+ print("This is expected if SAM2 models are not properly set up")
255
+ else:
256
+ print(f"Demo video not found: {demo_video}")
257
+
258
+
259
+ def test_mask_formats():
260
+ """Test different mask input formats."""
261
+ print("\n=== Testing Mask Formats ===")
262
+
263
+ # Create dummy data to test mask processing
264
+ height, width = 224, 224
265
+
266
+ # Test different mask formats
267
+ print("Testing mask format conversions...")
268
+
269
+ # Format 1: NumPy boolean array
270
+ mask_np = np.random.rand(height, width) > 0.5
271
+ print(f"NumPy mask: {mask_np.shape}, dtype: {mask_np.dtype}")
272
+
273
+ # Format 2: PyTorch tensor
274
+ mask_torch = torch.from_numpy(mask_np)
275
+ print(f"PyTorch mask: {mask_torch.shape}, dtype: {mask_torch.dtype}")
276
+
277
+ # Format 3: 3D mask with singleton dimension
278
+ mask_3d = mask_torch.unsqueeze(-1)
279
+ print(f"3D mask: {mask_3d.shape}")
280
+
281
+ # Test bounding box extraction
282
+ from laser.preprocess.mask_generation_grounding_dino import mask_to_bbox
283
+
284
+ try:
285
+ bbox = mask_to_bbox(mask_torch)
286
+ print(f"Extracted bbox: {bbox}")
287
+ print("✓ Mask format testing successful")
288
+ except Exception as e:
289
+ print(f"Mask format testing failed: {e}")
290
+
291
+
292
+ if __name__ == "__main__":
293
+ print("VINE SAM2 Mask Generation Examples")
294
+ print("=" * 50)
295
+
296
+ # Test SAM2-only approach
297
+ try:
298
+ sam2_results = example_sam2_only_segmentation()
299
+ except Exception as e:
300
+ print(f"SAM2-only example failed: {e}")
301
+
302
+ # Test Grounding DINO + SAM2 approach
303
+ try:
304
+ gd_sam2_results = example_grounding_dino_sam2_segmentation()
305
+ except Exception as e:
306
+ print(f"Grounding DINO + SAM2 example failed: {e}")
307
+
308
+ # Compare approaches
309
+ compare_segmentation_methods()
310
+
311
+ # Demonstrate mask processing
312
+ try:
313
+ demonstrate_mask_processing()
314
+ except Exception as e:
315
+ print(f"Mask processing demo failed: {e}")
316
+
317
+ # Test mask formats
318
+ try:
319
+ test_mask_formats()
320
+ except Exception as e:
321
+ print(f"Mask format testing failed: {e}")
322
+
323
+ print("\n" + "=" * 50)
324
+ print("Examples completed!")
325
+ print("\nKey takeaways:")
326
+ print("1. SAM2-only: Automatic object detection and segmentation")
327
+ print("2. Grounding DINO + SAM2: Text-guided object detection and segmentation")
328
+ print("3. Both methods provide masks and bounding boxes for VINE model")
329
+ print("4. Choose method based on whether you know what objects to look for")
330
+
331
+
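The trade-off compared above comes down to a single argument at call time: segmentation_method selects between automatic and text-guided mask generation on the same pipeline object. A short sketch, reusing the vine_pipeline built in the examples in this file (the video path is a placeholder):

# Automatic: SAM2 proposes masks for everything it finds; VINE then classifies them.
auto_results = vine_pipeline(
    "demo/videos/output.mp4",
    categorical_keywords=["human", "dog", "frisbee", "object"],
    unary_keywords=["running", "jumping", "sitting"],
    binary_keywords=["behind", "next to"],
    segmentation_method="sam2",
)

# Text-guided: Grounding DINO detects only the named categories; SAM2 segments those boxes.
guided_results = vine_pipeline(
    "demo/videos/output.mp4",
    categorical_keywords=["human", "dog", "frisbee"],
    unary_keywords=["running", "jumping", "catching"],
    binary_keywords=["behind", "chasing"],
    segmentation_method="grounding_dino_sam2",
    box_threshold=0.35,
    text_threshold=0.25,
)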
vine_hf/example_usage.ipynb ADDED
@@ -0,0 +1,310 @@
1
+ {
2
+ "cells": [
3
+ {
4
+ "cell_type": "code",
5
+ "execution_count": null,
6
+ "id": "44d53281",
7
+ "metadata": {},
8
+ "outputs": [
9
+ {
10
+ "name": "stderr",
11
+ "output_type": "stream",
12
+ "text": [
13
+ "/home/kevinx/miniconda3/envs/laser_env/lib/python3.10/site-packages/pydantic/_internal/_config.py:383: UserWarning: Valid config keys have changed in V2:\n",
14
+ "* 'schema_extra' has been renamed to 'json_schema_extra'\n",
15
+ " warnings.warn(message, UserWarning)\n",
16
+ "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
17
+ "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
18
+ ]
19
+ }
20
+ ],
21
+ "source": [
22
+ "import os\n",
23
+ "import sys\n",
24
+ "import torch\n",
25
+ "from transformers import pipeline, AutoModel\n",
26
+ "from transformers.pipelines import PIPELINE_REGISTRY\n",
27
+ "\n",
28
+ "# Uncomment or set your own\n",
29
+ "#os.environ['OPENAI_API_KEY'] = 'dummy-key'\n",
30
+ "from vine_hf import VineConfig, VineModel, VinePipeline"
31
+ ]
32
+ },
33
+ {
34
+ "cell_type": "code",
35
+ "execution_count": 2,
36
+ "id": "174e479f",
37
+ "metadata": {},
38
+ "outputs": [],
39
+ "source": [
40
+ "PIPELINE_REGISTRY.register_pipeline(\n",
41
+ " \"vine-video-understanding\",\n",
42
+ " pipeline_class=VinePipeline,\n",
43
+ " pt_model=VineModel,\n",
44
+ " type=\"multimodal\",\n",
45
+ ")"
46
+ ]
47
+ },
48
+ {
49
+ "cell_type": "code",
50
+ "execution_count": null,
51
+ "id": "a9af2770",
52
+ "metadata": {},
53
+ "outputs": [],
54
+ "source": [
55
+ "vine_config = VineConfig(\n",
56
+ " model_name=\"openai/clip-vit-base-patch32\",\n",
57
+ " # Local file example: set use_hf_repo=False and provide local_dir/local_filename\n",
58
+ " use_hf_repo=False,\n",
59
+ " local_dir=os.path.dirname('/path/to/your/pretrained/model.pt'),\n",
60
+ " local_filename=os.path.basename('/path/to/your/pretrained/model.pt'), # Local file path\n",
61
+ " segmentation_method=\"grounding_dino_sam2\",\n",
62
+ " visualize=True,\n",
63
+ " visualization_dir=\"path/to/visualization/dir\",\n",
64
+ " debug_visualizations=True,\n",
65
+ " device=0, # Change to your desired device\n",
66
+ ")"
67
+ ]
68
+ },
69
+ {
70
+ "cell_type": "code",
71
+ "execution_count": null,
72
+ "id": "274e6515",
73
+ "metadata": {},
74
+ "outputs": [
75
+ {
76
+ "name": "stdout",
77
+ "output_type": "stream",
78
+ "text": [
79
+ "Loaded state type: <class 'collections.OrderedDict'>\n"
80
+ ]
81
+ }
82
+ ],
83
+ "source": [
84
+ "vine_pipeline = VinePipeline(\n",
85
+ " model=VineModel(vine_config), \n",
86
+ " tokenizer=None,\n",
87
+ " sam_config_path=\"path/to/sam2/configs/sam2_hiera_base_plus.yaml\",\n",
88
+ " sam_checkpoint_path=\"path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt\",\n",
89
+ " gd_config_path=\"path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py\",\n",
90
+ " gd_checkpoint_path=\"path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth\",\n",
91
+ ")"
92
+ ]
93
+ },
94
+ {
95
+ "cell_type": "code",
96
+ "execution_count": 6,
97
+ "id": "123a090d",
98
+ "metadata": {},
99
+ "outputs": [],
100
+ "source": [
101
+ "categorical_keywords = ['human', 'dog', 'frisbee']\n",
102
+ "unary_keywords = ['running', 'jumping', 'catching', 'throwing']\n",
103
+ "binary_keywords = ['behind', 'in front of', 'next to', 'chasing']\n",
104
+ "object_pairs = [(0, 1), (0, 2), (1, 2)] # human-dog, dog-frisbee relationships "
105
+ ]
106
+ },
107
+ {
108
+ "cell_type": "code",
109
+ "execution_count": 7,
110
+ "id": "0b42f032",
111
+ "metadata": {},
112
+ "outputs": [],
113
+ "source": [
114
+ "demo_video_path = \"/home/kevinx/LASER/LASER/demo/videos/v1.mp4\" # Replace with your video file path"
115
+ ]
116
+ },
117
+ {
118
+ "cell_type": "code",
119
+ "execution_count": 8,
120
+ "id": "8202c654",
121
+ "metadata": {},
122
+ "outputs": [
123
+ {
124
+ "name": "stdout",
125
+ "output_type": "stream",
126
+ "text": [
127
+ "Segmentation method: grounding_dino_sam2\n",
128
+ "Generating Grounding DINO + SAM2 masks...\n",
129
+ "<class 'int'>\n",
130
+ "✓ SAM2 models initialized successfully\n",
131
+ "<class 'int'>\n"
132
+ ]
133
+ },
134
+ {
135
+ "name": "stderr",
136
+ "output_type": "stream",
137
+ "text": [
138
+ "UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at /pytorch/aten/src/ATen/native/TensorShape.cpp:4314.)\n"
139
+ ]
140
+ },
141
+ {
142
+ "name": "stdout",
143
+ "output_type": "stream",
144
+ "text": [
145
+ "final text_encoder_type: bert-base-uncased\n",
146
+ "✓ GroundingDINO model initialized successfully\n",
147
+ "Start detecting objects at time 05:08:58.178592\n"
148
+ ]
149
+ },
150
+ {
151
+ "name": "stderr",
152
+ "output_type": "stream",
153
+ "text": [
154
+ "Detecting objects: 0%| | 0/3 [00:00<?, ?it/s]FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
155
+ "UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
156
+ "UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
157
+ "FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
158
+ "Detecting objects: 100%|██████████| 3/3 [00:01<00:00, 2.82it/s]\n"
159
+ ]
160
+ },
161
+ {
162
+ "name": "stdout",
163
+ "output_type": "stream",
164
+ "text": [
165
+ "Finished detecting objects at time 05:08:59.250419\n",
166
+ "Loading inference state at time 05:08:59.544425\n",
167
+ "Number of frames: 3\n",
168
+ "None\n"
169
+ ]
170
+ },
171
+ {
172
+ "name": "stderr",
173
+ "output_type": "stream",
174
+ "text": [
175
+ "Processing frames: 100%|██████████| 3/3 [00:00<00:00, 11.77it/s]\n"
176
+ ]
177
+ },
178
+ {
179
+ "name": "stdout",
180
+ "output_type": "stream",
181
+ "text": [
182
+ "Annotated frames: []\n",
183
+ "Find the most dense prompt at time 05:09:01.413703\n",
184
+ "Most dense frame: 0\n",
185
+ "\n",
186
+ "\n",
187
+ "Start propagating objects at time 05:09:01.416367\n",
188
+ "Pass count: 0\n"
189
+ ]
190
+ },
191
+ {
192
+ "name": "stderr",
193
+ "output_type": "stream",
194
+ "text": [
195
+ "propagate in video: 100%|██████████| 3/3 [00:00<00:00, 20.20it/s]\n",
196
+ "propagate in video: 0it [00:00, ?it/s]\n"
197
+ ]
198
+ },
199
+ {
200
+ "name": "stdout",
201
+ "output_type": "stream",
202
+ "text": [
203
+ "Most dense frame: 1\n",
204
+ "\n",
205
+ "\n",
206
+ "Pass count: 1\n"
207
+ ]
208
+ },
209
+ {
210
+ "name": "stderr",
211
+ "output_type": "stream",
212
+ "text": [
213
+ "propagate in video: 100%|██████████| 3/3 [00:00<00:00, 19.25it/s]\n",
214
+ "propagate in video: 0it [00:00, ?it/s]\n"
215
+ ]
216
+ },
217
+ {
218
+ "name": "stdout",
219
+ "output_type": "stream",
220
+ "text": [
221
+ "Most dense frame: 2\n",
222
+ "\n",
223
+ "\n",
224
+ "Pass count: 2\n"
225
+ ]
226
+ },
227
+ {
228
+ "name": "stderr",
229
+ "output_type": "stream",
230
+ "text": [
231
+ "propagate in video: 100%|██████████| 3/3 [00:00<00:00, 25.92it/s]\n",
232
+ "propagate in video: 0it [00:00, ?it/s]\n"
233
+ ]
234
+ },
235
+ {
236
+ "name": "stdout",
237
+ "output_type": "stream",
238
+ "text": [
239
+ "Most dense frame: -1\n",
240
+ "\n",
241
+ "\n",
242
+ "\n",
243
+ "Results:\n",
244
+ "Summary: {'num_objects_detected': 4, 'num_unary_predictions': 10, 'num_binary_predictions': 3, 'top_categories': [('frisbee', 0.9989640712738037), ('dog', 0.957672655582428), ('dog', 0.957672655582428)], 'top_actions': [('running', 0.8483631610870361), ('running', 0.832377016544342), ('running', 0.8178836107254028)], 'top_relations': [('chasing', 0.9616015553474426), ('chasing', 0.9478002786636353), ('chasing', 0.6380977630615234)]}\n"
245
+ ]
246
+ }
247
+ ],
248
+ "source": [
249
+ "try:\n",
250
+ " results = vine_pipeline(\n",
251
+ " demo_video_path,\n",
252
+ " categorical_keywords=categorical_keywords,\n",
253
+ " unary_keywords=unary_keywords,\n",
254
+ " binary_keywords=binary_keywords,\n",
255
+ " object_pairs=object_pairs,\n",
256
+ " segmentation_method='grounding_dino_sam2',\n",
257
+ " return_top_k=3,\n",
258
+ " include_visualizations=False,\n",
259
+ " debug_visualizations=False,\n",
260
+ " )\n",
261
+ " \n",
262
+ " print(\"\\nResults:\")\n",
263
+ " print(f\"Summary: {results['summary']}\")\n",
264
+ " \n",
265
+ "except Exception as e:\n",
266
+ " print(f\"Note: Full execution requires segmentation models to be properly set up.\")\n",
267
+ " print(f\"Error: {e}\")"
268
+ ]
269
+ },
270
+ {
271
+ "cell_type": "code",
272
+ "execution_count": 9,
273
+ "id": "414ede9b",
274
+ "metadata": {},
275
+ "outputs": [
276
+ {
277
+ "name": "stdout",
278
+ "output_type": "stream",
279
+ "text": [
280
+ "Summary: {'num_objects_detected': 4, 'num_unary_predictions': 10, 'num_binary_predictions': 3, 'top_categories': [('frisbee', 0.9989640712738037), ('dog', 0.957672655582428), ('dog', 0.957672655582428)], 'top_actions': [('running', 0.8483631610870361), ('running', 0.832377016544342), ('running', 0.8178836107254028)], 'top_relations': [('chasing', 0.9616015553474426), ('chasing', 0.9478002786636353), ('chasing', 0.6380977630615234)]}\n"
281
+ ]
282
+ }
283
+ ],
284
+ "source": [
285
+ "print(f\"Summary: {results['summary']}\")"
286
+ ]
287
+ }
288
+ ],
289
+ "metadata": {
290
+ "kernelspec": {
291
+ "display_name": "laser_env",
292
+ "language": "python",
293
+ "name": "python3"
294
+ },
295
+ "language_info": {
296
+ "codemirror_mode": {
297
+ "name": "ipython",
298
+ "version": 3
299
+ },
300
+ "file_extension": ".py",
301
+ "mimetype": "text/x-python",
302
+ "name": "python",
303
+ "nbconvert_exporter": "python",
304
+ "pygments_lexer": "ipython3",
305
+ "version": "3.10.0"
306
+ }
307
+ },
308
+ "nbformat": 4,
309
+ "nbformat_minor": 5
310
+ }
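The notebook only prints the condensed summary, but the returned dictionary also carries the per-object and per-frame distributions, keyed the same way convert_inference.py reads them. A short sketch of traversing a result, assuming that layout:

# `results` is the dictionary returned by vine_pipeline(...) in the cells above.
for obj_id, predictions in results["categorical_predictions"].items():
    print(f"Object {obj_id}: {predictions[:3]}")                     # top (probability, category) pairs

for (frame_id, obj_id), predictions in results["unary_predictions"].items():
    print(f"Frame {frame_id}, object {obj_id}: {predictions[:3]}")

for (frame_id, obj_pair), predictions in results["binary_predictions"].items():
    print(f"Frame {frame_id}, objects {obj_pair}: {predictions[:3]}")

print(results["summary"]["num_objects_detected"])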
vine_hf/example_usage.py ADDED
@@ -0,0 +1,283 @@
1
+ """
2
+ Example usage of VINE HuggingFace interface
3
+
4
+ This script demonstrates how to use the VINE model through the HuggingFace interface
5
+ for video understanding with categorical, unary, and binary keyword predictions.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import torch
11
+ from transformers import pipeline, AutoModel
12
+ from transformers.pipelines import PIPELINE_REGISTRY
13
+
14
+ # Add the parent directory to the path to import vine_hf
15
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
16
+
17
+ # Uncomment or set your own
18
+ #os.environ['OPENAI_API_KEY'] = 'dummy-key'
19
+ from vine_hf import VineConfig, VineModel, VinePipeline
20
+
21
+ def example_direct_model_usage():
22
+ """Example of using the VINE model directly."""
23
+ print("=== Direct Model Usage ===")
24
+
25
+ # Create configuration
26
+ config = VineConfig(
27
+ model_name="openai/clip-vit-base-patch32",
28
+ segmentation_method="grounding_dino_sam2",
29
+ use_hf_repo=True,
30
+ model_repo="video-fm/vine_v0", # Your HF Hub model
31
+ debug_visualizations=True,
32
+ debug_visualization_path=os.path.join(os.getcwd(), "debug_masks.png"),
33
+ target_fps=30,
34
+ box_threshold=0.35,
35
+ text_threshold=0.25
36
+ )
37
+
38
+ # Initialize model
39
+ model = VineModel(config)
40
+
41
+ print(f"Model initialized with CLIP backbone: {config.model_name}")
42
+ print(f"Segmentation method: {config.segmentation_method}")
43
+ print(f"Device: {model.device}")
44
+
45
+ # Example video data (placeholder - in real usage, load from video file)
46
+ num_frames, height, width = 3, 224, 224
47
+ video_frames = torch.randn(num_frames, height, width, 3) * 255
48
+ video_frames = video_frames.clamp(0, 255).byte()
49
+
50
+ # Example masks and bboxes (placeholder - in real usage, generated by segmentation)
51
+ masks = {
52
+ 0: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
53
+ 1: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
54
+ 2: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)}
55
+ }
56
+
57
+ bboxes = {
58
+ 0: {1: [50, 50, 150, 150], 2: [100, 100, 200, 200]},
59
+ 1: {1: [52, 52, 152, 152], 2: [102, 102, 202, 202]},
60
+ 2: {1: [54, 54, 154, 154], 2: [104, 104, 204, 204]}
61
+ }
62
+
63
+ # Define keywords
64
+ categorical_keywords = ["human", "dog", "frisbee"]
65
+ unary_keywords = ["running", "jumping", "sitting", "standing"]
66
+ binary_keywords = ["behind", "in front of", "next to", "throwing to", "catching from"]
67
+ object_pairs = [(1, 2)] # Object 1 relates to Object 2
68
+
69
+ # Run prediction
70
+ print("\nRunning prediction...")
71
+ results = model.predict(
72
+ video_frames=video_frames,
73
+ masks=masks,
74
+ bboxes=bboxes,
75
+ categorical_keywords=categorical_keywords,
76
+ unary_keywords=unary_keywords,
77
+ binary_keywords=binary_keywords,
78
+ object_pairs=object_pairs,
79
+ return_top_k=3
80
+ )
81
+
82
+ print("\nResults:")
83
+ print(f"Categorical predictions: {len(results['categorical_predictions'])} objects")
84
+ print(f"Unary predictions: {len(results['unary_predictions'])} actions")
85
+ print(f"Binary predictions: {len(results['binary_predictions'])} relations")
86
+ print(f"Confidence scores: {results['confidence_scores']}")
87
+
88
+
89
+ def example_pipeline_usage():
90
+ """Example of using the VINE pipeline."""
91
+ print("\n=== Pipeline Usage ===")
92
+
93
+ # Register the pipeline
94
+ PIPELINE_REGISTRY.register_pipeline(
95
+ "vine-video-understanding",
96
+ pipeline_class=VinePipeline,
97
+ pt_model=VineModel,
98
+ type="multimodal",
99
+ )
100
+ vine_config = VineConfig(
101
+ model_name="openai/clip-vit-base-patch32",
102
+ use_hf_repo=True,
103
+ model_repo="video-fm/vine_v0", # Your HF Hub model
104
+ segmentation_method="grounding_dino_sam2",
105
+ debug_visualizations=True,
106
+ )
107
+
108
+ vine_pipe = VinePipeline(
109
+ model=VineModel(vine_config),
110
+ tokenizer=None,
111
+ trust_remote_code=True,
112
+ # SAM2 configuration
113
+ sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
114
+ sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
115
+ gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
116
+ gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
117
+ device=0,
118
+ )
119
+
120
+
121
+ print("Pipeline created successfully!")
122
+
123
+ # Example usage with video path
124
+ video_path = "path/to/your/video.mp4" # Replace with actual video path
125
+
126
+ # For demonstration, we'll show the expected usage format
127
+ print(f"\nExample pipeline call (replace with actual video path):")
128
+ print(f"results = vine_pipeline(")
129
+ print(f" '{video_path}',")
130
+ print(f" categorical_keywords=['human', 'dog', 'frisbee'],")
131
+ print(f" unary_keywords=['running', 'jumping', 'sitting'],")
132
+ print(f" binary_keywords=['behind', 'in front of', 'next to'],")
133
+ print(f" object_pairs=[(1, 2)],")
134
+ print(f" segmentation_method='grounding_dino_sam2',")
135
+ print(f" return_top_k=3,")
136
+ print(f" return_flattened_segments=True,")
137
+ print(f" return_valid_pairs=True,")
138
+ print(f" include_visualizations=True,")
139
+ print(f" debug_visualizations=True")
140
+ print(f")")
141
+
142
+ # Note: Actual execution would require proper video file and segmentation models
143
+
144
+
145
+ def example_huggingface_hub_usage():
146
+ """Example of how to push and load from HuggingFace Hub."""
147
+ print("\n=== HuggingFace Hub Usage ===")
148
+
149
+ # Example of preparing model for Hub
150
+ config = VineConfig()
151
+ model = VineModel(config)
152
+
153
+ # Register for auto classes
154
+ config.register_for_auto_class()
155
+ model.register_for_auto_class("AutoModel")
156
+
157
+ print("Model registered for auto classes")
158
+
159
+ # Example push to hub (commented out - requires actual model weights and credentials)
160
+ # config.push_to_hub('your-username/vine-model')
161
+ # model.push_to_hub('your-username/vine-model')
162
+
163
+ # Example load from hub (commented out - requires actual model on hub)
164
+ # model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)
165
+ # pipeline = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)
166
+
167
+ print("To push to Hub:")
168
+ print("1. config.push_to_hub('your-username/vine-model')")
169
+ print("2. model.push_to_hub('your-username/vine-model')")
170
+ print("\nTo load from Hub:")
171
+ print("model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)")
172
+ print("pipe = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)")
173
+
174
+
175
+ def example_with_real_video():
176
+ """Example showing how to use with a real video file."""
177
+ print("\n=== Real Video Usage Example ===")
178
+
179
+ # Check if demo video exists
180
+ demo_video_path = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")
181
+
182
+ if os.path.exists(demo_video_path):
183
+ print(f"Found demo video: {demo_video_path}")
184
+
185
+ # Create pipeline with segmentation model paths
186
+ PIPELINE_REGISTRY.register_pipeline(
187
+ "vine-video-understanding",
188
+ pipeline_class=VinePipeline,
189
+ pt_model=VineModel,
190
+ type="multimodal",
191
+ )
192
+
193
+ vine_config = VineConfig(
194
+ model_name="openai/clip-vit-base-patch32",
195
+ use_hf_repo=True,
196
+ model_repo="video-fm/vine_v0", # Your HF Hub model
197
+ segmentation_method="grounding_dino_sam2",
198
+ debug_visualizations=True,
199
+ debug_visualization_path=os.path.join(os.getcwd(), "real_video_debug_masks.png"),
200
+ )
201
+
202
+ vine_pipeline = VinePipeline(
203
+ model=VineModel(vine_config),
204
+ tokenizer=None,
205
+ trust_remote_code=True,
206
+ # SAM2 configuration
207
+ sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
208
+ sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
209
+ gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
210
+ gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
211
+ )
212
+
213
+ # Define keywords based on the demo
214
+ categorical_keywords = ['human', 'dog', 'frisbee']
215
+ unary_keywords = ['running', 'jumping', 'catching', 'throwing']
216
+ binary_keywords = ['behind', 'in front of', 'next to', 'chasing']
217
+ object_pairs = [(0, 1), (0, 2), (1, 2)] # human-dog, dog-frisbee relationships
218
+
219
+ print("\nProcessing video with VINE...")
220
+ print("Keywords:")
221
+ print(f" Categorical: {categorical_keywords}")
222
+ print(f" Unary: {unary_keywords}")
223
+ print(f" Binary: {binary_keywords}")
224
+ print(f" Object pairs: {object_pairs}")
225
+
226
+ # Note: This would require proper segmentation models to be set up
227
+ try:
228
+ results = vine_pipeline(
229
+ demo_video_path,
230
+ categorical_keywords=categorical_keywords,
231
+ unary_keywords=unary_keywords,
232
+ binary_keywords=binary_keywords,
233
+ object_pairs=object_pairs,
234
+ segmentation_method='grounding_dino_sam2',
235
+ return_top_k=3,
236
+ include_visualizations=False,
237
+ debug_visualizations=True,
238
+ )
239
+
240
+ print("\nResults:")
241
+ print(f"Summary: {results['summary']}")
242
+
243
+ except Exception as e:
244
+ print(f"Note: Full execution requires segmentation models to be properly set up.")
245
+ print(f"Error: {e}")
246
+
247
+ else:
248
+ print(f"Demo video not found at: {demo_video_path}")
249
+ print("To use with a real video, provide the path to your video file.")
250
+
251
+
252
+ if __name__ == "__main__":
253
+ print("VINE HuggingFace Interface Examples")
254
+ print("=" * 50)
255
+
256
+ # Run examples
257
+ try:
258
+ example_direct_model_usage()
259
+ except Exception as e:
260
+ print(f"Direct model usage failed: {e}")
261
+
262
+ try:
263
+ example_pipeline_usage()
264
+ except Exception as e:
265
+ print(f"Pipeline usage failed: {e}")
266
+
267
+ try:
268
+ example_huggingface_hub_usage()
269
+ except Exception as e:
270
+ print(f"Hub usage example failed: {e}")
271
+
272
+ try:
273
+ example_with_real_video()
274
+ except Exception as e:
275
+ print(f"Real video example failed: {e}")
276
+
277
+ print("\n" + "=" * 50)
278
+ print("Examples completed!")
279
+ print("\nNext steps:")
280
+ print("1. Set up Grounding DINO and SAM2 models for segmentation")
281
+ print("2. Load your pretrained VINE model weights")
282
+ print("3. Test with your own videos")
283
+ print("4. Push to HuggingFace Hub for sharing")
vine_hf/example_visualization.py ADDED
@@ -0,0 +1,146 @@
1
+ # Example visualization runner for VINE
2
+ # - Loads a video (path, demo, or random)
3
+ # - Runs the VINE pipeline
4
+ # - Saves annotated frames and an MP4 if available
5
+
6
+ import os
7
+ import sys
8
+ import argparse
9
+ import cv2
10
+ import numpy as np
11
+ from collections.abc import Mapping, Sequence
12
+
13
+ from transformers.pipelines import PIPELINE_REGISTRY
14
+ from transformers import pipeline
15
+
16
+ # Set your OpenAI API key here or via environment variable
17
+ os.environ['OPENAI_API_KEY'] = "dummy-key"
18
+
19
+ # Local imports (workspace)
20
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))  # add the repo root so the vine_hf package is importable
21
+
22
+ from vine_hf.vine_pipeline import VinePipeline  # local workspace import (repo root added to sys.path above)
23
+ from vine_hf.vine_model import VineModel
24
+ from vine_hf.vine_config import VineConfig
25
+ from laser.loading import load_video
26
+
27
+
28
+ def build_pipeline(args) -> VinePipeline:
29
+ # Register pipeline type
30
+ PIPELINE_REGISTRY.register_pipeline(
31
+ "vine-video-understanding",
32
+ pipeline_class=VinePipeline,
33
+ pt_model=VineModel,
34
+ type="multimodal",
35
+ )
36
+
37
+ config = VineConfig(
38
+ segmentation_method="grounding_dino_sam2",
39
+ model_name="openai/clip-vit-base-patch32",
40
+ # Example: load from HF repo
41
+ use_hf_repo=True,
42
+ model_repo="video-fm/vine_v0",
43
+ # Alternatively use a local path by setting use_hf_repo=False and local_dir/local_filename
44
+ box_threshold=args.box_threshold,
45
+ text_threshold=args.text_threshold,
46
+ target_fps=args.fps,
47
+ topk_cate=args.topk_cate,
48
+ visualization_dir=args.out_dir,
49
+ visualize=True,
50
+ debug_visualizations=True,
51
+ device=args.device,
52
+ )
53
+
54
+ model = VineModel(config)
55
+
56
+ # Create pipeline instance with segmentation model paths (if provided)
57
+ vine_pipe = VinePipeline(
58
+ model=model,
59
+ tokenizer=None,
60
+ sam_config_path="//home/kevinx/LASER/video-sam2/sam2/sam2_hiera_t.yaml",
61
+ sam_checkpoint_path="//home/kevinx/LASER/video-sam2/sam2_hiera_tiny.pt",
62
+ gd_config_path="//home/kevinx/LASER/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
63
+ gd_checkpoint_path="//home/kevinx/LASER/GroundingDINO/weights/groundingdino_swint_ogc.pth",
64
+ device=args.device,
65
+ trust_remote_code=True,
66
+ )
67
+ return vine_pipe
68
+
69
+
70
+ def resolve_video(args) -> np.ndarray | str:
71
+ # Priority: user --video -> demo video -> random frames
72
+ if args.video and os.path.exists(args.video):
73
+ return args.video
74
+
75
+ demo_video = "//home/kevinx/LASER/LASER/demo/videos/v1.mp4"
76
+ demo_alt = "//home/kevinx/LASER/LASER/demo/videos/v2.mp4"
77
+ if os.path.exists(demo_video):
78
+ return demo_video
79
+ if os.path.exists(demo_alt):
80
+ return demo_alt
81
+
82
+ # Fallback to random frames (uint8 HxWx3) shaped as T x H x W x 3
83
+ print("No video found; using random frames.")
84
+ rng = np.random.default_rng(0)
85
+ frames = rng.integers(0, 255, size=(args.rand_frames, args.height, args.width, 3), dtype=np.uint8)
86
+ return frames
87
+
88
+
89
+
90
+ def main():
91
+ parser = argparse.ArgumentParser(description="VINE visualization example")
92
+ parser.add_argument("--video", type=str, default=None, help="Path to a video file")
93
+ parser.add_argument("--out_dir", type=str, default="output", help="Output directory")
94
+ parser.add_argument("--method", type=str, default="grounding_dino_sam2", choices=["sam2", "grounding_dino_sam2"], help="Segmentation method")
95
+ parser.add_argument("--fps", type=int, default=5, help="Target FPS for processing")
96
+ parser.add_argument("--box_threshold", type=float, default=0.3, help="GroundingDINO box threshold")
97
+ parser.add_argument("--text_threshold", type=float, default=0.3, help="GroundingDINO text threshold")
98
+ parser.add_argument("--topk_cate", type=int, default=5, help="Top-K categories to display")
99
+ parser.add_argument("--device", type=int, default=0, help="CUDA device index or -1 for CPU")
100
+ parser.add_argument("--debug_visualizations", action="store_true", help="Enable debug visualizations")
101
+
102
+
103
+ args = parser.parse_args()
104
+
105
+ vine_pipe = build_pipeline(args)
106
+ video = resolve_video(args)
107
+
108
+ # Keywords similar to examples/tests
109
+ categorical_keywords = ["dog", "frisbee", "cat"]
110
+ unary_keywords = ["running", "jumping", "sitting", "flying"]
111
+ binary_keywords = ["behind", "next to", "chasing","biting"]
112
+ object_pairs = [(0,1), (0, 2), (1, 2), (1, 3), (2, 3)]
113
+
114
+ print("Running VINE pipeline...")
115
+ call_kwargs = dict(
116
+ categorical_keywords=categorical_keywords,
117
+ unary_keywords=unary_keywords,
118
+ binary_keywords=binary_keywords,
119
+ object_pairs=object_pairs,
120
+ segmentation_method=args.method,
121
+ return_top_k=args.topk_cate,
122
+ include_visualizations=True,
123
+ debug_visualizations=args.debug_visualizations,
124
+ )
125
+
126
+
127
+ results = vine_pipe(
128
+ video,
129
+ **call_kwargs,
130
+ )
131
+
132
+ # Normalize pipeline output to a dict (can be dict or list[dict])
133
+ if isinstance(results, Mapping):
134
+ result = results
135
+ elif isinstance(results, Sequence) and results and isinstance(results[0], Mapping):
136
+ result = results[0]
137
+ else:
138
+ result = {}
139
+
140
+ # Print brief summary
141
+ summary = result.get("summary", {}) if isinstance(result, dict) else {}
142
+ print("Summary:", summary)
143
+
144
+
145
+ if __name__ == "__main__":
146
+ main()
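A minimal sketch of driving this runner programmatically instead of through the CLI. The Namespace fields mirror the argparse defaults above (including the random-frame fallback arguments added to the parser); the import path is an assumption, and a real run still needs the SAM2/GroundingDINO checkpoints hard-coded in build_pipeline.

from argparse import Namespace

from vine_hf.example_visualization import build_pipeline, resolve_video  # assumed import path

args = Namespace(
    video=None, out_dir="output", method="grounding_dino_sam2",
    fps=5, box_threshold=0.3, text_threshold=0.3, topk_cate=5,
    device=0, debug_visualizations=False,
    rand_frames=8, height=240, width=320,  # consumed by the random-frame fallback
)
vine_pipe = build_pipeline(args)  # heavy: loads CLIP and the segmentation models
video = resolve_video(args)       # a file path, or a T x H x W x 3 uint8 array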
vine_hf/example_with_pretrained_vine.py ADDED
@@ -0,0 +1,287 @@
1
+ """
2
+ Example usage of VINE HuggingFace interface with pretrained VINE weights
3
+
4
+ This script demonstrates how to use the VINE model with your pretrained weights
5
+ from the ensemble format or from video-fm/vine_v0.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import torch
11
+ from transformers import pipeline
12
+ from transformers.pipelines import PIPELINE_REGISTRY
13
+
14
+ # Set your OpenAI API key here or via environment variable
15
+ #os.environ['OPENAI_API_KEY'] = "dummy-key"
16
+
17
+ # Add the parent directory to the path to import vine_hf
18
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
19
+
20
+ from vine_hf import VineConfig, VineModel, VinePipeline
21
+
22
+
23
+ def example_with_local_pretrained_weights():
24
+ print("=== Using Local Pretrained VINE Weights ===")
25
+
26
+
27
+ # Download https://huggingface.co/video-fm/vine_v0/tree/main/laser_model_v1.pt
28
+ pretrained_vine_file = "/path/to/your/local/laser_model_v1.pt" # Replace with your local path
29
+
30
+
31
+ # Create configuration with your pretrained path (local file)
32
+ config = VineConfig(
33
+ model_name="openai/clip-vit-base-patch32",
34
+ segmentation_method="grounding_dino_sam2",
35
+ target_fps=1,
36
+ visualize=True,
37
+ visualization_dir="path/to/visualization/dir",
38
+ debug_visualizations=True,
39
+ use_hf_repo=False,
40
+ local_dir=os.path.dirname(pretrained_vine_file),
41
+ local_filename=os.path.basename(pretrained_vine_file),
42
+ )
43
+
44
+ # Method 1: Initialize model directly
45
+ print("Method 1: Direct model initialization")
46
+ vine_model = VineModel(config)
47
+ print(f"✓ Model initialized with pretrained weights from: {pretrained_vine_file}")
48
+
49
+ # Method 2: Use the from_pretrained_vine class method
50
+ print("\nMethod 2: Using from_pretrained_vine class method")
51
+ vine_model_2 = VineModel.from_pretrained_vine(
52
+ model_path=pretrained_vine_file,
53
+ config=config,
54
+ epoch=0 # Specify epoch number
55
+ )
56
+ print("✓ Model loaded using from_pretrained_vine method")
57
+
58
+ return vine_model
59
+
60
+
61
+ def example_with_huggingface_hub():
62
+ """Example using VINE weights from HuggingFace Hub."""
63
+ print("\n=== Using HuggingFace Hub Weights ===")
64
+
65
+ # Create configuration to use HuggingFace Hub weights
66
+ config = VineConfig(
67
+ model_name="openai/clip-vit-base-patch32",
68
+ use_hf_repo=True,
69
+ model_repo="video-fm/vine_v0", # Your HF Hub model
70
+ segmentation_method="grounding_dino_sam2",
71
+ visualize=True,
72
+ visualization_dir="path/to/visualization/dir",
73
+ debug_visualizations=True,
74
+ )
75
+
76
+ try:
77
+ # Initialize model (will try to load from HF Hub)
78
+ vine_model = VineModel(config)
79
+ print("✓ Model loaded from HuggingFace Hub: video-fm/vine_v0")
80
+ return vine_model
81
+ except Exception as e:
82
+ print(f"✗ Could not load from HuggingFace Hub: {e}")
83
+ print("Make sure your model is pushed to video-fm/vine_v0")
84
+ return None
85
+
86
+
87
+ def example_pipeline_with_pretrained():
88
+ """Example using pipeline with pretrained VINE weights."""
89
+ print("\n=== Pipeline with Pretrained VINE ===")
90
+
91
+ # Register the pipeline
92
+ PIPELINE_REGISTRY.register_pipeline(
93
+ "vine-video-understanding",
94
+ pipeline_class=VinePipeline,
95
+ pt_model=VineModel,
96
+ type="multimodal",
97
+ )
98
+
99
+ # Create configuration with your weights
100
+ pretrained_vine_file = "/path/to/your/local/laser_model_v1.pt" # Replace with your local path
101
+ config = VineConfig(
102
+ model_name="openai/clip-vit-base-patch32",
103
+ segmentation_method="grounding_dino_sam2",
104
+ visualize=True,
105
+ visualization_dir="path/to/visualization/dir",
106
+ debug_visualizations=True,
107
+ use_hf_repo=False,
108
+ local_dir=os.path.dirname(pretrained_vine_file),
109
+ local_filename=os.path.basename(pretrained_vine_file),
110
+ )
111
+
112
+ # Create model with pretrained weights
113
+ vine_model = VineModel(config)
114
+
115
+ # Create pipeline with segmentation model paths
116
+ vine_pipeline = VinePipeline(
117
+ model=vine_model,
118
+ tokenizer=None,
119
+ sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
120
+ sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
121
+ gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
122
+ gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
123
+ device=0
124
+ )
125
+
126
+ print("✓ Pipeline created with pretrained VINE weights")
127
+
128
+ # Example usage (would require actual video file)
129
+ demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")
130
+
131
+ if os.path.exists(demo_video):
132
+ print(f"Found demo video: {demo_video}")
133
+ print("Example pipeline call:")
134
+ print(f"results = vine_pipeline(")
135
+ print(f" '{demo_video}',")
136
+ print(f" categorical_keywords=['human', 'dog', 'frisbee'],")
137
+ print(f" unary_keywords=['running', 'jumping', 'sitting'],")
138
+ print(f" binary_keywords=['behind', 'chasing', 'next to']")
139
+ print(f" debug_visualizations=True")
140
+ print(f")")
141
+
142
+ # Uncomment to actually run (requires segmentation models)
143
+ # results = vine_pipeline(
144
+ # demo_video,
145
+ # categorical_keywords=['human', 'dog', 'frisbee'],
146
+ # unary_keywords=['running', 'jumping', 'sitting'],
147
+ # binary_keywords=['behind', 'chasing', 'next to'],
148
+ # debug_visualizations=True,
149
+ # )
150
+ # print("Results:", results['summary'])
151
+
152
+ return vine_pipeline
153
+
154
+
155
+
156
+ def example_manual_weight_loading():
157
+ """Example of manually loading weights after model creation."""
158
+ print("\n=== Manual Weight Loading ===")
159
+
160
+ # Create model with base CLIP weights
161
+ # Base config: disable both the HF repo and local checkpoint so no fine-tuned weights are loaded
162
+ config = VineConfig(use_hf_repo=False, local_dir=None, local_filename=None)
163
+ vine_model = VineModel(config)
164
+ print("✓ Model created with base CLIP weights")
165
+ model_dir = "/path/to/your/local/ensemble/model_dir.pt" # Replace with your model directory
166
+
167
+ if os.path.exists(model_dir):
168
+ success = vine_model.load_pretrained_vine_weights(model_dir, epoch=0)
169
+ if success:
170
+ print("✓ Successfully loaded pretrained VINE weights manually")
171
+ else:
172
+ print("✗ Failed to load pretrained weights")
173
+ else:
174
+ print(f"✗ Model directory not found: {model_dir}")
175
+
176
+ return vine_model
177
+
178
+
179
+ def compare_model_outputs():
180
+ """Compare outputs between base CLIP and pretrained VINE."""
181
+ print("\n=== Comparing Model Outputs ===")
182
+
183
+ # Create dummy data for testing
184
+ video_frames = torch.randn(3, 224, 224, 3) * 255 # 3 frames
185
+ video_frames = video_frames.clamp(0, 255).byte()
186
+
187
+ masks = {
188
+ 0: {1: torch.ones(224, 224, 1)},
189
+ 1: {1: torch.ones(224, 224, 1)},
190
+ 2: {1: torch.ones(224, 224, 1)}
191
+ }
192
+
193
+ bboxes = {
194
+ 0: {1: [50, 50, 150, 150]},
195
+ 1: {1: [52, 52, 152, 152]},
196
+ 2: {1: [54, 54, 154, 154]}
197
+ }
198
+
199
+ keywords = ['human', 'dog', 'frisbee']
200
+
201
+ # Model 1: Base CLIP
202
+ print("Creating model with base CLIP weights...")
203
+ config_base = VineConfig(use_hf_repo=False, local_dir=None, local_filename=None)  # plain CLIP, no fine-tuned weights
204
+ model_base = VineModel(config_base)
205
+
206
+ # Model 2: Pretrained VINE (if available)
207
+ data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
208
+ model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
209
+
210
+ if os.path.exists(model_dir):
211
+ print("Creating model with pretrained VINE weights...")
212
+ config_vine = VineConfig(
213
+ use_hf_repo=False,
214
+ local_dir=model_dir,
215
+ local_filename=None,
216
+ )
217
+ model_vine = VineModel(config_vine)
218
+
219
+ print("\nComparing predictions...")
220
+
221
+ # Get predictions from both models
222
+ with torch.no_grad():
223
+ results_base = model_base.predict(
224
+ video_frames=video_frames,
225
+ masks=masks,
226
+ bboxes=bboxes,
227
+ categorical_keywords=keywords,
228
+ return_top_k=3
229
+ )
230
+
231
+ results_vine = model_vine.predict(
232
+ video_frames=video_frames,
233
+ masks=masks,
234
+ bboxes=bboxes,
235
+ categorical_keywords=keywords,
236
+ return_top_k=3
237
+ )
238
+
239
+ print("Base CLIP confidence scores:", results_base['confidence_scores'])
240
+ print("Pretrained VINE confidence scores:", results_vine['confidence_scores'])
241
+
242
+ print("✓ Successfully compared both models")
243
+ else:
244
+ print(f"Pretrained model not found at: {model_dir}")
245
+ print("Skipping comparison")
246
+
247
+
248
+ if __name__ == "__main__":
249
+ print("VINE HuggingFace Interface - Pretrained Weights Examples")
250
+ print("=" * 60)
251
+
252
+ try:
253
+ # Test local pretrained weights
254
+ model1 = example_with_local_pretrained_weights()
255
+ except Exception as e:
256
+ print(f"Local weights example failed: {e}")
257
+
258
+ try:
259
+ # Test HuggingFace Hub weights
260
+ model2 = example_with_huggingface_hub()
261
+ except Exception as e:
262
+ print(f"HuggingFace Hub example failed: {e}")
263
+
264
+ try:
265
+ # Test pipeline with pretrained weights
266
+ pipeline = example_pipeline_with_pretrained()
267
+ except Exception as e:
268
+ print(f"Pipeline example failed: {e}")
269
+
270
+ # try:
271
+ # # Test manual weight loading
272
+ # #model3 = example_manual_weight_loading()
273
+ # except Exception as e:
274
+ # print(f"Manual loading example failed: {e}")
275
+
276
+ # try:
277
+ # # Compare model outputs
278
+ # #compare_model_outputs()
279
+ # except Exception as e:
280
+ # print(f"Comparison example failed: {e}")
281
+
282
+ print("\n" + "=" * 60)
283
+ print("Examples completed!")
284
+ print("\nUsage Summary:")
285
+ print("1. Configure VineConfig with `use_hf_repo` + `model_repo` for Hub models, or `use_hf_repo=False` + `local_dir`/`local_filename` for local weights")
286
+ print("2. Use VineModel.from_pretrained_vine() for direct loading")
287
+
vine_hf/flattening.py ADDED
@@ -0,0 +1,124 @@
1
+ from __future__ import annotations
2
+
3
+ from collections import defaultdict
4
+ from typing import Dict, Iterable, List, Optional, Sequence, Tuple, Union
5
+
6
+ import numpy as np
7
+ import torch
8
+
9
+
10
+ MaskType = Union[np.ndarray, torch.Tensor]
11
+
12
+
13
+ def _to_numpy_mask(mask: MaskType) -> np.ndarray:
14
+ """
15
+ Convert assorted mask formats to a 2D numpy boolean array.
16
+ """
17
+ if isinstance(mask, torch.Tensor):
18
+ mask_np = mask.detach().cpu().numpy()
19
+ else:
20
+ mask_np = np.asarray(mask)
21
+
22
+ # Remove singleton dimensions at the front/back
23
+ while mask_np.ndim > 2 and mask_np.shape[0] == 1:
24
+ mask_np = np.squeeze(mask_np, axis=0)
25
+ if mask_np.ndim > 2 and mask_np.shape[-1] == 1:
26
+ mask_np = np.squeeze(mask_np, axis=-1)
27
+
28
+ if mask_np.ndim != 2:
29
+ raise ValueError(f"Expected mask to be 2D after squeezing, got shape {mask_np.shape}")
30
+
31
+ return mask_np.astype(bool)
32
+
33
+
34
+ def _mask_to_bbox(mask: np.ndarray) -> Optional[Tuple[int, int, int, int]]:
35
+ """
36
+ Compute a bounding box for a 2D boolean mask.
37
+ """
38
+ if not mask.any():
39
+ return None
40
+ rows, cols = np.nonzero(mask)
41
+ y_min, y_max = rows.min(), rows.max()
42
+ x_min, x_max = cols.min(), cols.max()
43
+ return x_min, y_min, x_max, y_max
44
+
45
+
46
+ def flatten_segments_for_batch(
47
+ video_id: int,
48
+ segments: Dict[int, Dict[int, MaskType]],
49
+ bbox_min_dim: int = 5,
50
+ ) -> Dict[str, List]:
51
+ """
52
+ Flatten nested segmentation data into batched lists suitable for predicate
53
+ models or downstream visualizations. Mirrors the notebook helper but is
54
+ robust to differing mask dtypes/shapes.
55
+ """
56
+ batched_object_ids: List[Tuple[int, int, int]] = []
57
+ batched_masks: List[np.ndarray] = []
58
+ batched_bboxes: List[Tuple[int, int, int, int]] = []
59
+ frame_pairs: List[Tuple[int, int, Tuple[int, int]]] = []
60
+
61
+ for frame_id, frame_objects in segments.items():
62
+ valid_objects: List[int] = []
63
+ for object_id, raw_mask in frame_objects.items():
64
+ mask = _to_numpy_mask(raw_mask)
65
+ bbox = _mask_to_bbox(mask)
66
+ if bbox is None:
67
+ continue
68
+
69
+ x_min, y_min, x_max, y_max = bbox
70
+ if abs(y_max - y_min) < bbox_min_dim or abs(x_max - x_min) < bbox_min_dim:
71
+ continue
72
+
73
+ valid_objects.append(object_id)
74
+ batched_object_ids.append((video_id, frame_id, object_id))
75
+ batched_masks.append(mask)
76
+ batched_bboxes.append(bbox)
77
+
78
+ for i in valid_objects:
79
+ for j in valid_objects:
80
+ if i == j:
81
+ continue
82
+ frame_pairs.append((video_id, frame_id, (i, j)))
83
+
84
+ return {
85
+ "object_ids": batched_object_ids,
86
+ "masks": batched_masks,
87
+ "bboxes": batched_bboxes,
88
+ "pairs": frame_pairs,
89
+ }
90
+
91
+
92
+ def extract_valid_object_pairs(
93
+ batched_object_ids: Sequence[Tuple[int, int, int]],
94
+ interested_object_pairs: Optional[Iterable[Tuple[int, int]]] = None,
95
+ ) -> List[Tuple[int, int, Tuple[int, int]]]:
96
+ """
97
+ Filter object pairs per frame. If `interested_object_pairs` is provided, only
98
+ emit those combinations when both objects are present; otherwise emit all
99
+ permutations (i, j) with i != j for each frame.
100
+ """
101
+ frame_to_objects: Dict[Tuple[int, int], set] = defaultdict(set)
102
+ for vid, fid, oid in batched_object_ids:
103
+ frame_to_objects[(vid, fid)].add(oid)
104
+
105
+ interested = (
106
+ list(interested_object_pairs)
107
+ if interested_object_pairs is not None
108
+ else None
109
+ )
110
+
111
+ valid_pairs: List[Tuple[int, int, Tuple[int, int]]] = []
112
+ for (vid, fid), object_ids in frame_to_objects.items():
113
+ if interested:
114
+ for src, dst in interested:
115
+ if src in object_ids and dst in object_ids:
116
+ valid_pairs.append((vid, fid, (src, dst)))
117
+ else:
118
+ for src in object_ids:
119
+ for dst in object_ids:
120
+ if src == dst:
121
+ continue
122
+ valid_pairs.append((vid, fid, (src, dst)))
123
+
124
+ return valid_pairs
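A toy usage sketch for the two helpers above, with synthetic masks (the import path is an assumption; the values shown in comments follow directly from the code):

import numpy as np

from vine_hf.flattening import flatten_segments_for_batch, extract_valid_object_pairs

# Frame 0 has objects 1 and 2, frame 1 only object 1; each mask is an H x W boolean array
mask = np.zeros((64, 64), dtype=bool)
mask[10:40, 10:40] = True
segments = {0: {1: mask, 2: mask}, 1: {1: mask}}

flat = flatten_segments_for_batch(video_id=0, segments=segments)
print(flat["object_ids"])  # [(0, 0, 1), (0, 0, 2), (0, 1, 1)]
print(flat["bboxes"][0])   # (x_min, y_min, x_max, y_max) of the first surviving mask

# Emit the (1, 2) relation only for frames where both objects are present
pairs = extract_valid_object_pairs(flat["object_ids"], interested_object_pairs=[(1, 2)])
print(pairs)               # [(0, 0, (1, 2))]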
vine_hf/push_to_hub.py ADDED
@@ -0,0 +1,232 @@
1
+ """
2
+ Script to push VINE model to HuggingFace Hub
3
+
4
+ This script helps you push your trained VINE model to the HuggingFace Hub
5
+ for easy sharing and distribution.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import torch
11
+ import argparse
12
+ from huggingface_hub import login
13
+ from transformers.pipelines import PIPELINE_REGISTRY
14
+
15
+ # Add the parent directory to the path to import vine_hf
16
+ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
17
+
18
+ os.environ['OPENAI_API_KEY'] = "dummy-key"
19
+ from vine_hf import VineConfig, VineModel, VinePipeline
20
+
21
+
22
+ def push_vine_to_hub(
23
+ model_weights_path: str,
24
+ repo_name: str,
25
+ model_name: str = "openai/clip-vit-base-patch32",
26
+ segmentation_method: str = "grounding_dino_sam2",
27
+ commit_message: str = "Upload VINE model",
28
+ private: bool = False
29
+ ):
30
+ """
31
+ Push VINE model to HuggingFace Hub.
32
+
33
+ Args:
34
+ model_weights_path: Path to the trained model weights (.pth file)
35
+ repo_name: Name for the repository (e.g., "username/vine-model")
36
+ model_name: CLIP model backbone name
37
+ segmentation_method: Segmentation method used
38
+ commit_message: Commit message for the push
39
+ private: Whether to create a private repository
40
+ """
41
+
42
+ print("=== Pushing VINE Model to HuggingFace Hub ===")
43
+
44
+ # 1. Create configuration
45
+ print(f"Creating configuration with backbone: {model_name}")
46
+ config = VineConfig(
47
+ model_name=model_name,
48
+ segmentation_method=segmentation_method
49
+ )
50
+
51
+ # 2. Initialize model
52
+ print("Initializing model...")
53
+ model = VineModel(config)
54
+
55
+ # 3. Load trained weights
56
+ if os.path.exists(model_weights_path):
57
+ print(f"Loading weights from: {model_weights_path}")
58
+ try:
59
+ # Try loading with weights_only=False for compatibility
60
+ weights = torch.load(model_weights_path, map_location='cpu', weights_only=False)
61
+
62
+ # Handle different weight formats
63
+ if isinstance(weights, dict):
64
+ if 'state_dict' in weights:
65
+ model.load_state_dict(weights['state_dict'])
66
+ elif 'model' in weights:
67
+ model.load_state_dict(weights['model'])
68
+ else:
69
+ model.load_state_dict(weights)
70
+ else:
71
+ # Assume it's the model directly
72
+ model = weights
73
+
74
+ print("✓ Weights loaded successfully")
75
+ except Exception as e:
76
+ print(f"✗ Error loading weights: {e}")
77
+ print("Please check your weights file format")
78
+ return False
79
+ else:
80
+ print(f"✗ Weights file not found: {model_weights_path}")
81
+ return False
82
+
83
+ # 4. Register for auto classes
84
+ print("Registering for auto classes...")
85
+ config.register_for_auto_class()
86
+ model.register_for_auto_class("AutoModel")
87
+
88
+ # 5. Register pipeline
89
+ print("Registering pipeline...")
90
+ PIPELINE_REGISTRY.register_pipeline(
91
+ "vine-video-understanding",
92
+ pipeline_class=VinePipeline,
93
+ pt_model=VineModel,
94
+ type="multimodal",
95
+ )
96
+
97
+ # 6. Create pipeline instance
98
+ print("Creating pipeline...")
99
+ vine_pipeline = VinePipeline(model=model, tokenizer=None)
100
+
101
+ try:
102
+ # 7. Push configuration to hub
103
+ print(f"Pushing configuration to {repo_name}...")
104
+ config.push_to_hub(
105
+ repo_name,
106
+ commit_message=f"{commit_message} - config",
107
+ private=private
108
+ )
109
+ print("✓ Configuration pushed successfully")
110
+
111
+ # 8. Push model to hub
112
+ print(f"Pushing model to {repo_name}...")
113
+ model.push_to_hub(
114
+ repo_name,
115
+ commit_message=f"{commit_message} - model",
116
+ private=private
117
+ )
118
+ print("✓ Model pushed successfully")
119
+
120
+ # 9. Push pipeline to hub
121
+ print(f"Pushing pipeline to {repo_name}...")
122
+ vine_pipeline.push_to_hub(
123
+ repo_name,
124
+ commit_message=f"{commit_message} - pipeline",
125
+ private=private
126
+ )
127
+ print("✓ Pipeline pushed successfully")
128
+
129
+ print(f"\n🎉 Successfully pushed VINE model to: https://huggingface.co/{repo_name}")
130
+ print(f"\nTo use your model:")
131
+ print(f"```python")
132
+ print(f"from transformers import pipeline")
133
+ print(f"")
134
+ print(f"vine_pipeline = pipeline(")
135
+ print(f" 'vine-video-understanding',")
136
+ print(f" model='{repo_name}',")
137
+ print(f" trust_remote_code=True")
138
+ print(f")")
139
+ print(f"")
140
+ print(f"results = vine_pipeline(")
141
+ print(f" 'path/to/video.mp4',")
142
+ print(f" categorical_keywords=['human', 'dog', 'frisbee'],")
143
+ print(f" unary_keywords=['running', 'jumping'],")
144
+ print(f" binary_keywords=['chasing', 'behind']")
145
+ print(f")")
146
+ print(f"```")
147
+
148
+ return True
149
+
150
+ except Exception as e:
151
+ print(f"✗ Error pushing to hub: {e}")
152
+ print("Please check your HuggingFace credentials and repository permissions")
153
+ return False
154
+
155
+
156
+ def main():
157
+ parser = argparse.ArgumentParser(description="Push VINE model to HuggingFace Hub")
158
+
159
+ parser.add_argument(
160
+ "--weights",
161
+ type=str,
162
+ required=True,
163
+ help="Path to the trained model weights (.pth file)"
164
+ )
165
+
166
+ parser.add_argument(
167
+ "--repo",
168
+ type=str,
169
+ required=True,
170
+ help="Repository name (e.g., 'username/vine-model')"
171
+ )
172
+
173
+ parser.add_argument(
174
+ "--model-name",
175
+ type=str,
176
+ default="openai/clip-vit-base-patch32",
177
+ help="CLIP model backbone name"
178
+ )
179
+
180
+ parser.add_argument(
181
+ "--segmentation",
182
+ type=str,
183
+ default="grounding_dino_sam2",
184
+ choices=["sam2", "grounding_dino_sam2"],
185
+ help="Segmentation method"
186
+ )
187
+
188
+ parser.add_argument(
189
+ "--message",
190
+ type=str,
191
+ default="Upload VINE model",
192
+ help="Commit message"
193
+ )
194
+
195
+ parser.add_argument(
196
+ "--private",
197
+ action="store_true",
198
+ help="Create private repository"
199
+ )
200
+
201
+ parser.add_argument(
202
+ "--login",
203
+ action="store_true",
204
+ help="Login to HuggingFace Hub first"
205
+ )
206
+
207
+ args = parser.parse_args()
208
+
209
+ # Login if requested
210
+ if args.login:
211
+ print("Logging in to HuggingFace Hub...")
212
+ login()
213
+
214
+ # Push model
215
+ success = push_vine_to_hub(
216
+ model_weights_path=args.weights,
217
+ repo_name=args.repo,
218
+ model_name=args.model_name,
219
+ segmentation_method=args.segmentation,
220
+ commit_message=args.message,
221
+ private=args.private
222
+ )
223
+
224
+ if success:
225
+ print("\n✅ Model successfully pushed to HuggingFace Hub!")
226
+ else:
227
+ print("\n❌ Failed to push model to HuggingFace Hub")
228
+ sys.exit(1)
229
+
230
+
231
+ if __name__ == "__main__":
232
+ main()
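A hypothetical programmatic equivalent of the CLI above (the weight path and repo name are placeholders, and valid HuggingFace credentials are required):

from vine_hf.push_to_hub import push_vine_to_hub  # assumed import path

ok = push_vine_to_hub(
    model_weights_path="/path/to/vine_weights.pth",
    repo_name="your-username/vine-model",
    commit_message="Upload VINE model",
    private=True,
)
print("push succeeded" if ok else "push failed")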
vine_hf/push_to_video_fm.py ADDED
@@ -0,0 +1,274 @@
1
+ """
2
+ Script to push VINE model to video-fm organization on HuggingFace Hub
3
+
4
+ This script pushes the VINE architecture (config, model, pipeline) and model weights
5
+ to the video-fm organization for easy sharing and distribution.
6
+ """
7
+
8
+ import os
9
+ import sys
10
+ import torch
11
+ import argparse
12
+ from pathlib import Path
13
+ from huggingface_hub import HfApi, login
14
+ from transformers.pipelines import PIPELINE_REGISTRY
15
+ from transformers import AutoModel
16
+ from safetensors.torch import save_file
17
+
18
+ # Add the parent directory to path to enable vine_hf imports
19
+ current_dir = Path(__file__).parent
20
+ parent_dir = current_dir.parent
21
+ sys.path.insert(0, str(parent_dir))
22
+
23
+ os.environ['OPENAI_API_KEY'] = "dummy-key"
24
+
25
+ # Import from vine_hf package
26
+ from vine_hf import VineConfig, VineModel, VinePipeline
27
+
28
+
29
+ def push_vine_to_video_fm(
30
+ source_repo_or_path: str = "KevinX-Penn28/testing",
31
+ target_repo: str = "video-fm/vine",
32
+ model_name: str = "openai/clip-vit-base-patch32",
33
+ commit_message: str = "Upload VINE model architecture and weights",
34
+ private: bool = False,
35
+ use_local_weights: bool = False,
36
+ ):
37
+ """
38
+ Push VINE model to video-fm organization on HuggingFace Hub.
39
+
40
+ Args:
41
+ source_repo_or_path: Source HF repo or local path with model weights
42
+ target_repo: Target repository (e.g., "video-fm/vine")
43
+ model_name: CLIP model backbone name
44
+ commit_message: Commit message for the push
45
+ private: Whether to create a private repository
46
+ use_local_weights: If True, source_repo_or_path is a local file path
47
+ """
48
+
49
+ print("=" * 70)
50
+ print("🚀 Pushing VINE Model to HuggingFace Hub - video-fm Organization")
51
+ print("=" * 70)
52
+
53
+ # 1. Create configuration
54
+ print(f"\n📝 Creating configuration with backbone: {model_name}")
55
+ config = VineConfig(
56
+ model_name=model_name,
57
+ segmentation_method="grounding_dino_sam2",
58
+ use_hf_repo=not use_local_weights,
59
+ model_repo=source_repo_or_path if not use_local_weights else None,
60
+ local_dir=str(Path(source_repo_or_path).parent) if use_local_weights else None,
61
+ local_filename=Path(source_repo_or_path).name if use_local_weights else None,
62
+ )
63
+
64
+ # 2. Initialize model (will automatically load weights from source)
65
+ print(f"\n🔧 Initializing model and loading weights from: {source_repo_or_path}")
66
+ model = VineModel(config)
67
+ print("✓ Model initialized with weights loaded")
68
+
69
+ # 3. Register for auto classes
70
+ print("\n📋 Registering for auto classes...")
71
+ config.register_for_auto_class()
72
+ model.register_for_auto_class("AutoModel")
73
+ print("✓ Registered for AutoModel and AutoConfig")
74
+
75
+ # 4. Register pipeline
76
+ print("\n🔌 Registering custom pipeline...")
77
+ try:
78
+ PIPELINE_REGISTRY.register_pipeline(
79
+ "vine-video-understanding",
80
+ pipeline_class=VinePipeline,
81
+ pt_model=VineModel,
82
+ type="multimodal",
83
+ )
84
+ print("✓ Pipeline registered")
85
+ except Exception as e:
86
+ print(f"⚠ Pipeline registration: {e} (may already be registered)")
87
+
88
+ try:
89
+ # 5. Push configuration to hub
90
+ print(f"\n⬆️ Pushing configuration to {target_repo}...")
91
+ config.push_to_hub(
92
+ target_repo,
93
+ commit_message=f"{commit_message} - config",
94
+ private=private
95
+ )
96
+ print("✓ Configuration pushed successfully")
97
+
98
+ # 6. Push model to hub
99
+ print(f"\n⬆️ Pushing model to {target_repo}...")
100
+ model.push_to_hub(
101
+ target_repo,
102
+ commit_message=f"{commit_message} - model and weights",
103
+ private=private
104
+ )
105
+ print("✓ Model and weights pushed successfully")
106
+
107
+ # 7. Copy additional necessary files to the repo
108
+ print(f"\n📦 Uploading additional architecture files...")
109
+ api = HfApi()
110
+
111
+ # Upload flattening.py and vis_utils.py as they're imported by the model
112
+ current_dir = Path(__file__).parent
113
+ additional_files = [
114
+ "flattening.py",
115
+ "vis_utils.py",
116
+ ]
117
+
118
+ for filename in additional_files:
119
+ file_path = current_dir / filename
120
+ if file_path.exists():
121
+ api.upload_file(
122
+ path_or_fileobj=str(file_path),
123
+ path_in_repo=filename,
124
+ repo_id=target_repo,
125
+ commit_message=f"Add {filename}",
126
+ )
127
+ print(f"✓ Uploaded {filename}")
128
+ else:
129
+ print(f"⚠ Warning: {filename} not found at {file_path}")
130
+
131
+ # 8. Upload README if it exists
132
+ readme_path = current_dir / "README.md"
133
+ if readme_path.exists():
134
+ api.upload_file(
135
+ path_or_fileobj=str(readme_path),
136
+ path_in_repo="README.md",
137
+ repo_id=target_repo,
138
+ commit_message="Add README documentation",
139
+ )
140
+ print("✓ Uploaded README.md")
141
+
142
+ print("\n" + "=" * 70)
143
+ print("🎉 Successfully pushed VINE model to HuggingFace Hub!")
144
+ print("=" * 70)
145
+ print(f"\n📍 Model URL: https://huggingface.co/{target_repo}")
146
+ print(f"\n📚 To use your model:")
147
+ print(f"""
148
+ ```python
149
+ from transformers import AutoModel, AutoConfig
150
+ from vine_hf import VineConfig, VineModel, VinePipeline
151
+
152
+ # Option 1: Load with AutoModel
153
+ model = AutoModel.from_pretrained('{target_repo}', trust_remote_code=True)
154
+
155
+ # Option 2: Load with VineModel directly
156
+ config = VineConfig.from_pretrained('{target_repo}')
157
+ model = VineModel.from_pretrained('{target_repo}')
158
+
159
+ # Option 3: Use with pipeline
160
+ from transformers import pipeline
161
+
162
+ vine_pipeline = pipeline(
163
+ 'vine-video-understanding',
164
+ model='{target_repo}',
165
+ trust_remote_code=True
166
+ )
167
+
168
+ results = vine_pipeline(
169
+ 'path/to/video.mp4',
170
+ categorical_keywords=['human', 'dog', 'frisbee'],
171
+ unary_keywords=['running', 'jumping'],
172
+ binary_keywords=['chasing', 'behind']
173
+ )
174
+ ```
175
+ """)
176
+
177
+ return True
178
+
179
+ except Exception as e:
180
+ print(f"\n❌ Error pushing to hub: {e}")
181
+ import traceback
182
+ traceback.print_exc()
183
+ print("\nPlease check:")
184
+ print(" - HuggingFace credentials (run: huggingface-cli login)")
185
+ print(" - Repository permissions for video-fm organization")
186
+ print(" - Network connectivity")
187
+ return False
188
+
189
+
190
+ def main():
191
+ parser = argparse.ArgumentParser(
192
+ description="Push VINE model to video-fm organization on HuggingFace Hub"
193
+ )
194
+
195
+ parser.add_argument(
196
+ "--source",
197
+ type=str,
198
+ default="KevinX-Penn28/testing",
199
+ help="Source HF repo or local path with model weights (default: KevinX-Penn28/testing)"
200
+ )
201
+
202
+ parser.add_argument(
203
+ "--target",
204
+ type=str,
205
+ default="video-fm/vine",
206
+ help="Target repository in video-fm org (default: video-fm/vine)"
207
+ )
208
+
209
+ parser.add_argument(
210
+ "--model-name",
211
+ type=str,
212
+ default="openai/clip-vit-base-patch32",
213
+ help="CLIP model backbone name"
214
+ )
215
+
216
+ parser.add_argument(
217
+ "--message",
218
+ type=str,
219
+ default="Upload VINE model architecture and weights",
220
+ help="Commit message"
221
+ )
222
+
223
+ parser.add_argument(
224
+ "--private",
225
+ action="store_true",
226
+ help="Create private repository"
227
+ )
228
+
229
+ parser.add_argument(
230
+ "--local-weights",
231
+ action="store_true",
232
+ help="Use local weights file instead of HF repo"
233
+ )
234
+
235
+ args = parser.parse_args()
236
+
237
+ # Check login status
238
+ try:
239
+ api = HfApi()
240
+ user_info = api.whoami()
241
+ print(f"✓ Logged in as: {user_info['name']}")
242
+
243
+ # Check if user has access to video-fm org
244
+ orgs = [org['name'] for org in user_info.get('orgs', [])]
245
+ if 'video-fm' in orgs:
246
+ print(f"✓ Confirmed access to video-fm organization")
247
+ else:
248
+ print(f"⚠ Warning: You may not have access to video-fm organization")
249
+ print(f" Your organizations: {orgs}")
250
+ except Exception as e:
251
+ print(f"❌ Not logged in to HuggingFace. Please run: huggingface-cli login")
252
+ print(f" Or use: python -c 'from huggingface_hub import login; login()'")
253
+ sys.exit(1)
254
+
255
+ # Push model
256
+ success = push_vine_to_video_fm(
257
+ source_repo_or_path=args.source,
258
+ target_repo=args.target,
259
+ model_name=args.model_name,
260
+ commit_message=args.message,
261
+ private=args.private,
262
+ use_local_weights=args.local_weights,
263
+ )
264
+
265
+ if success:
266
+ print("\n✅ Successfully completed!")
267
+ sys.exit(0)
268
+ else:
269
+ print("\n❌ Push failed!")
270
+ sys.exit(1)
271
+
272
+
273
+ if __name__ == "__main__":
274
+ main()
vine_hf/setup.py ADDED
@@ -0,0 +1,73 @@
1
+ """
2
+ Setup script for VINE HuggingFace Interface
3
+ """
4
+
5
+ from setuptools import setup
6
+
7
+ with open("README.md", "r", encoding="utf-8") as fh:
8
+ long_description = fh.read()
9
+
10
+ setup(
11
+ name="vine-hf",
12
+ version="1.0.0",
13
+ author="LASER Team",
14
+ author_email="your-email@example.com",
15
+ description="HuggingFace interface for VINE (Video Understanding with Natural Language)",
16
+ long_description=long_description,
17
+ long_description_content_type="text/markdown",
18
+ url="https://github.com/your-username/vine-hf",
19
+ # Modules live in the repo root, so they are exposed both as top-level modules and via the vine_hf package below
20
+ py_modules=[
21
+ "vine_config",
22
+ "vine_model",
23
+ "vine_pipeline",
24
+ "vis_utils",
25
+ "flattening",
26
+ "convert_inference",
27
+ ],
28
+ # Also include __init__.py to make it a package
29
+ packages=["vine_hf"],
30
+ package_dir={"vine_hf": "."},
31
+ classifiers=[
32
+ "Development Status :: 4 - Beta",
33
+ "Intended Audience :: Developers",
34
+ "Intended Audience :: Science/Research",
35
+ "License :: OSI Approved :: MIT License",
36
+ "Operating System :: OS Independent",
37
+ "Programming Language :: Python :: 3",
38
+ "Programming Language :: Python :: 3.7",
39
+ "Programming Language :: Python :: 3.8",
40
+ "Programming Language :: Python :: 3.9",
41
+ "Programming Language :: Python :: 3.10",
42
+ "Topic :: Scientific/Engineering :: Artificial Intelligence",
43
+ "Topic :: Multimedia :: Video",
44
+ ],
45
+ python_requires=">=3.7",
46
+ install_requires=[
47
+ "torch>=1.9.0",
48
+ "torchvision>=0.10.0",
49
+ "transformers>=4.20.0",
50
+ "opencv-python>=4.5.0",
51
+ "pillow>=8.0.0",
52
+ "numpy>=1.20.0",
53
+ "huggingface-hub>=0.10.0",
54
+ "tqdm>=4.60.0",
55
+ ],
56
+ extras_require={
57
+ "dev": [
58
+ "pytest>=6.0",
59
+ "black>=22.0",
60
+ "flake8>=4.0",
61
+ "isort>=5.0",
62
+ ],
63
+ "segmentation": [
64
+ # Note: SAM2 and Grounding DINO need to be installed separately
65
+ # as they're not available on PyPI
66
+ ],
67
+ },
68
+ entry_points={
69
+ "console_scripts": [
70
+ "vine-push-to-hub=vine_hf.push_to_hub:main",
71
+ ],
72
+ },
73
+ )
vine_hf/vine_config.py ADDED
@@ -0,0 +1,86 @@
1
+ import torch
2
+ from transformers import PretrainedConfig
3
+ from typing import List, Optional, Dict, Any, Tuple, Union
4
+ from pathlib import Path
5
+
6
+
7
+ class VineConfig(PretrainedConfig):
8
+ """
9
+ Configuration class for VINE (Video Understanding with Natural Language) model.
10
+ """
11
+
12
+ model_type = "vine"
13
+
14
+ def __init__(
15
+ self,
16
+ model_name: str = "openai/clip-vit-base-patch32",
17
+ hidden_dim: int = 768,
18
+ use_hf_repo: bool = True,
19
+ model_repo: Optional[str] = "KevinX-Penn28/testing",
20
+ model_file: Optional[str] = None,
21
+ local_dir: Optional[str] = str(Path(__file__).resolve().parent),
22
+ local_filename: Optional[str] = "laser_model_v1.pkl",
23
+ num_top_pairs: int = 18,
24
+ segmentation_method: str = "grounding_dino_sam2",
25
+ box_threshold: float = 0.35,
26
+ text_threshold: float = 0.25,
27
+ target_fps: int = 1,
28
+ alpha: float = 0.5,
29
+ white_alpha: float = 0.8,
30
+ topk_cate: int = 3,
31
+ multi_class: bool = False,
32
+ output_logit: bool = False,
33
+ use_pretrained_cate_weights: bool = False,
34
+ categorical_pool: str = "mean", # "mean" or "max"
35
+ max_video_length: int = 100,
36
+ bbox_min_dim: int = 1,
37
+ visualize: bool = False,
38
+ visualization_dir: Optional[str] = None,
39
+ return_flattened_segments: bool = False,
40
+ return_valid_pairs: bool = False,
41
+ interested_object_pairs: Optional[List[Tuple[int, int]]] = None,
42
+ debug_visualizations: bool = False,
43
+ device: Optional[Union[str, int]] = None,
44
+ **kwargs: Any,
45
+ ):
46
+ self.model_name = model_name
47
+ self.use_hf_repo = use_hf_repo
48
+ if use_hf_repo:
49
+ self.model_repo = model_repo
50
+ self.model_file = model_file
51
+ self.local_dir = None
52
+ self.local_filename = None
53
+ else:
54
+ self.model_repo = None
55
+ self.model_file = None
56
+ self.local_dir = local_dir
57
+ self.local_filename = local_filename
58
+
59
+ self.hidden_dim = hidden_dim
60
+ self.num_top_pairs = num_top_pairs
61
+ self.segmentation_method = segmentation_method
62
+ self.box_threshold = box_threshold
63
+ self.text_threshold = text_threshold
64
+ self.target_fps = target_fps
65
+ self.alpha = alpha
66
+ self.white_alpha = white_alpha
67
+ self.topk_cate = topk_cate
68
+ self.multi_class = multi_class
69
+ self.output_logit = output_logit
70
+ self.use_pretrained_cate_weights = use_pretrained_cate_weights
71
+ self.categorical_pool = categorical_pool
72
+ self.max_video_length = max_video_length
73
+ self.bbox_min_dim = bbox_min_dim
74
+ self.visualize = visualize
75
+ self.visualization_dir = visualization_dir
76
+ self.return_flattened_segments = return_flattened_segments
77
+ self.return_valid_pairs = return_valid_pairs
78
+ self.interested_object_pairs = interested_object_pairs or []
79
+ self.debug_visualizations = debug_visualizations
80
+
81
+ if isinstance(device, int):
82
+ self._device = f"cuda:{device}" if torch.cuda.is_available() else "cpu"
83
+ else:
84
+ self._device = device or ("cuda" if torch.cuda.is_available() else "cpu")
85
+
86
+ super().__init__(**kwargs)
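A minimal sketch of the two weight-source modes the constructor above normalizes (the local path is a placeholder; the import path is an assumption):

from vine_hf.vine_config import VineConfig

# Hub mode: the local fields are cleared
hub_cfg = VineConfig(use_hf_repo=True, model_repo="video-fm/vine_v0")
assert hub_cfg.local_dir is None and hub_cfg.local_filename is None

# Local mode: the repo fields are cleared
local_cfg = VineConfig(
    use_hf_repo=False,
    local_dir="/path/to/checkpoints",
    local_filename="laser_model_v1.pt",
)
assert local_cfg.model_repo is None and local_cfg.model_file is None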
vine_hf/vine_model.py ADDED
@@ -0,0 +1,1001 @@
1
+ import os
2
+ import sys
3
+ from typing import Dict, List, Tuple, Optional, Any, Union
4
+
5
+ import cv2
6
+ import numpy as np
7
+ import torch
8
+ from safetensors.torch import load_file
9
+ from torch import nn
10
+ import torch.nn.functional as F
11
+ import torch.utils.checkpoint as cp
12
+ from transformers import PreTrainedModel, AutoTokenizer, AutoModel, AutoProcessor
13
+ from huggingface_hub import snapshot_download
14
+
15
+ from .vine_config import VineConfig
16
+ from laser.models import llava_clip_model_v3
17
+ sys.modules["llava_clip_model_v3"] = llava_clip_model_v3
18
+ from laser.models.model_utils import (
19
+ extract_single_object,
20
+ extract_object_subject,
21
+ crop_image_contain_bboxes,
22
+ segment_list,
23
+ )
24
+ from .flattening import (
25
+ extract_valid_object_pairs,
26
+ flatten_segments_for_batch,
27
+ )
28
+ from .vis_utils import save_mask_one_image
29
+
30
+
31
+ class VineModel(PreTrainedModel):
32
+ """
33
+ VINE (Video Understanding with Natural Language) Model.
34
+
35
+ Internally, the core CLIP/text/image/pair logic mirrors
36
+ llava_clip_model_v3.PredicateModel as closely as possible for a single video,
37
+ with a small extension to re-normalize categorical probs after pooling.
38
+ """
39
+
40
+ config_class = VineConfig
41
+
42
+ def __init__(self, config: VineConfig):
43
+ super().__init__(config)
44
+ self.config = config
45
+ self.visualize = getattr(config, "visualize", False)
46
+ self.visualization_dir = getattr(config, "visualization_dir", None)
47
+ self.debug_visualizations = getattr(config, "debug_visualizations", False)
48
+ self._device = getattr(config, "_device")
49
+
50
+ # CLIP components
51
+ self.clip_tokenizer = AutoTokenizer.from_pretrained(config.model_name)
52
+ if self.clip_tokenizer.pad_token is None:
53
+ self.clip_tokenizer.pad_token = (
54
+ self.clip_tokenizer.unk_token
55
+ if self.clip_tokenizer.unk_token
56
+ else self.clip_tokenizer.eos_token
57
+ )
58
+
59
+ self.clip_processor = AutoProcessor.from_pretrained(config.model_name)
60
+ self.clip_cate_model = AutoModel.from_pretrained(config.model_name)
61
+ self.clip_unary_model = AutoModel.from_pretrained(config.model_name)
62
+ self.clip_binary_model = AutoModel.from_pretrained(config.model_name)
63
+
64
+ # Load fine-tuned weights if available
65
+ if config.use_hf_repo:
66
+ self._load_huggingface_vine_weights(config.model_repo, config.model_file)
67
+ else:
68
+ self._load_local_pretrained_vine_weights(
69
+ config.local_dir, config.local_filename
70
+ )
71
+
72
+ # Optionally reset categorical model to base CLIP (ignore fine-tune)
73
+ if not getattr(config, "use_pretrained_cate_weights", True):
74
+ self.clip_cate_model = AutoModel.from_pretrained(config.model_name)
75
+ self.clip_cate_model.to(self._device)
76
+
77
+ self.to(self._device)
78
+
79
+ # ------------------------------------------------------------------ #
80
+ # Weight loading
81
+ # ------------------------------------------------------------------ #
82
+ def _load_huggingface_vine_weights(
83
+ self, model_repo: str, model_file: Optional[str] = None
84
+ ):
85
+ try:
86
+ print(f"Loading VINE weights from HuggingFace repo: {model_repo}")
87
+ repo_path = snapshot_download(model_repo, revision=model_file or "main")
88
+ weights = load_file(os.path.join(repo_path, "model.safetensors"))
89
+ self.load_state_dict(weights, strict=False)
90
+ print("✓ Successfully loaded VINE weights from HuggingFace Hub")
91
+ return True
92
+ except Exception as e:
93
+ print(f"✗ Error loading VINE weights from HuggingFace Hub: {e}")
94
+ print("Using base CLIP models instead")
95
+ return False
96
+
97
+ def _load_local_pretrained_vine_weights(
98
+ self, local_dir: str, local_filename: Optional[str] = None, epoch: int = 0
99
+ ):
100
+ if local_dir is None and local_filename is None:
101
+ return False
102
+
103
+ full_path = (
104
+ os.path.join(local_dir, local_filename) if local_filename else local_dir
105
+ )
106
+
107
+ # .pkl – usually pickled PredicateModel
108
+ if isinstance(full_path, str) and full_path.endswith(".pkl"):
109
+ print(f"Loading VINE weights from: {full_path}")
110
+ loaded_vine_model = torch.load(
111
+ full_path, map_location=self._device, weights_only=False
112
+ )
113
+ print(f"Loaded state type: {type(loaded_vine_model)}")
114
+
115
+ if not isinstance(loaded_vine_model, dict):
116
+ if hasattr(loaded_vine_model, "clip_tokenizer"):
117
+ self.clip_tokenizer = loaded_vine_model.clip_tokenizer
118
+ if hasattr(loaded_vine_model, "clip_processor"):
119
+ self.clip_processor = loaded_vine_model.clip_processor
120
+
121
+ if hasattr(loaded_vine_model, "clip_cate_model"):
122
+ self.clip_cate_model.load_state_dict(
123
+ loaded_vine_model.clip_cate_model.state_dict()
124
+ )
125
+ if hasattr(loaded_vine_model, "clip_unary_model"):
126
+ self.clip_unary_model.load_state_dict(
127
+ loaded_vine_model.clip_unary_model.state_dict()
128
+ )
129
+ if hasattr(loaded_vine_model, "clip_binary_model"):
130
+ self.clip_binary_model.load_state_dict(
131
+ loaded_vine_model.clip_binary_model.state_dict()
132
+ )
133
+ print("✓ Loaded VINE weights from .pkl PredicateModel checkpoint")
134
+ return True
135
+
136
+ # .pt / .pth – plain state_dict
137
+ elif isinstance(full_path, str) and (
138
+ full_path.endswith(".pt") or full_path.endswith(".pth")
139
+ ):
140
+ print(f"Loading VINE weights from: {full_path}")
141
+ state = torch.load(full_path, map_location=self._device, weights_only=True)
142
+ print(f"Loaded state type: {type(state)}")
143
+ self.load_state_dict(state, strict=False)
144
+ print("✓ Loaded VINE weights from state_dict")
145
+ return True
146
+
147
+ # .model – full PredicateModel object
148
+ elif isinstance(full_path, str) and full_path.endswith(".model"):
149
+ print(f"Loading VINE weights from: {full_path}")
150
+ pretrained_model = torch.load(
151
+ full_path, map_location="cpu", weights_only=False
152
+ )
153
+
154
+ if hasattr(pretrained_model, "clip_tokenizer"):
155
+ self.clip_tokenizer = pretrained_model.clip_tokenizer
156
+ if hasattr(pretrained_model, "clip_processor"):
157
+ self.clip_processor = pretrained_model.clip_processor
158
+
159
+ if hasattr(pretrained_model, "clip_cate_model"):
160
+ self.clip_cate_model.load_state_dict(
161
+ pretrained_model.clip_cate_model.state_dict()
162
+ )
163
+ if hasattr(pretrained_model, "clip_unary_model"):
164
+ self.clip_unary_model.load_state_dict(
165
+ pretrained_model.clip_unary_model.state_dict()
166
+ )
167
+ if hasattr(pretrained_model, "clip_binary_model"):
168
+ self.clip_binary_model.load_state_dict(
169
+ pretrained_model.clip_binary_model.state_dict()
170
+ )
171
+ print("✓ Loaded all sub-model weights from .model file")
172
+ return True
173
+
174
+ # directory of .model files
175
+ if isinstance(full_path, str) and os.path.isdir(full_path):
176
+ model_files = [
177
+ f for f in os.listdir(full_path) if f.endswith(f".{epoch}.model")
178
+ ]
179
+ if model_files:
180
+ model_file = os.path.join(full_path, model_files[0])
181
+ print(f"Loading VINE weights from: {model_file}")
182
+ pretrained_model = torch.load(model_file, map_location="cpu")
183
+
184
+ if hasattr(pretrained_model, "clip_tokenizer"):
185
+ self.clip_tokenizer = pretrained_model.clip_tokenizer
186
+ if hasattr(pretrained_model, "clip_processor"):
187
+ self.clip_processor = pretrained_model.clip_processor
188
+
189
+ if hasattr(pretrained_model, "clip_cate_model"):
190
+ self.clip_cate_model.load_state_dict(
191
+ pretrained_model.clip_cate_model.state_dict()
192
+ )
193
+ if hasattr(pretrained_model, "clip_unary_model"):
194
+ self.clip_unary_model.load_state_dict(
195
+ pretrained_model.clip_unary_model.state_dict()
196
+ )
197
+ if hasattr(pretrained_model, "clip_binary_model"):
198
+ self.clip_binary_model.load_state_dict(
199
+ pretrained_model.clip_binary_model.state_dict()
200
+ )
201
+ print("✓ Loaded all sub-model weights from ensemble format")
202
+ return True
203
+ else:
204
+ print(f"No model file found for epoch {epoch} in {full_path}")
205
+ return False
206
+
207
+ print("Unsupported format for pretrained VINE path:", full_path)
208
+ return False
209
+
210
+ @classmethod
211
+ def from_pretrained_vine(
212
+ cls,
213
+ model_path: str,
214
+ config: Optional[VineConfig] = None,
215
+ epoch: int = 0,
216
+ **kwargs: Any,
217
+ ):
218
+ if config is None:
219
+ if model_path and ("/" in model_path and not os.path.exists(model_path)):
220
+ config = VineConfig(use_hf_repo=True, model_repo=model_path)
221
+ else:
222
+ if os.path.isdir(model_path):
223
+ config = VineConfig(use_hf_repo=False, local_dir=model_path)
224
+ else:
225
+ config = VineConfig(
226
+ use_hf_repo=False,
227
+ local_dir=os.path.dirname(model_path) or None,
228
+ local_filename=os.path.basename(model_path) or None,
229
+ )
230
+ else:
231
+ if model_path and ("/" in model_path and not os.path.exists(model_path)):
232
+ config.use_hf_repo = True
233
+ config.model_repo = model_path
234
+ config.model_file = None
235
+ config.local_dir = None
236
+ config.local_filename = None
237
+ else:
238
+ config.use_hf_repo = False
239
+ if os.path.isdir(model_path):
240
+ config.local_dir = model_path
241
+ config.local_filename = None
242
+ else:
243
+ config.local_dir = os.path.dirname(model_path) or None
244
+ config.local_filename = os.path.basename(model_path) or None
245
+
246
+ model = cls(config, **kwargs)
247
+ return model
248
+
249
+ # ------------------------------------------------------------------ #
250
+ # Gradient checkpoint helpers
251
+ # ------------------------------------------------------------------ #
252
+ def _text_features_checkpoint(self, model, token_dict):
253
+ input_ids = token_dict["input_ids"]
254
+ attention_mask = token_dict["attention_mask"]
255
+ token_type_ids = token_dict.get("token_type_ids", None)
256
+
257
+ if token_type_ids is not None:
258
+
259
+ def forward_pass(input_ids, attention_mask, token_type_ids):
260
+ return model.get_text_features(
261
+ input_ids=input_ids,
262
+ attention_mask=attention_mask,
263
+ token_type_ids=token_type_ids,
264
+ )
265
+
266
+ return cp.checkpoint(
267
+ forward_pass,
268
+ input_ids,
269
+ attention_mask,
270
+ token_type_ids,
271
+ use_reentrant=False,
272
+ )
273
+ else:
274
+
275
+ def forward_pass(input_ids, attention_mask):
276
+ return model.get_text_features(
277
+ input_ids=input_ids,
278
+ attention_mask=attention_mask,
279
+ )
280
+
281
+ return cp.checkpoint(
282
+ forward_pass, input_ids, attention_mask, use_reentrant=False
283
+ )
284
+
285
+ def _image_features_checkpoint(self, model, pixel_values):
286
+ def forward_pass(pixel_values):
287
+ return model.get_image_features(pixel_values=pixel_values)
288
+
289
+ return cp.checkpoint(forward_pass, pixel_values, use_reentrant=False)
290
+
291
+ # ------------------------------------------------------------------ #
292
+ # CLIP similarity
293
+ # ------------------------------------------------------------------ #
294
+ def clip_sim(self, model, nl_feat, img_feat):
295
+ img_feat = img_feat / img_feat.norm(p=2, dim=-1, keepdim=True)
296
+ nl_feat = nl_feat / nl_feat.norm(p=2, dim=-1, keepdim=True)
297
+
298
+ logit_scale = getattr(model, "logit_scale", None)
299
+ logits_per_text = torch.matmul(nl_feat, img_feat.t())
300
+ if logit_scale is not None:
301
+ logits_per_text = logits_per_text * logit_scale.exp()
302
+ return logits_per_text
303
+
304
+ # ------------------------------------------------------------------ #
305
+ # Forward: single-video PredicateModel-style logic
306
+ # ------------------------------------------------------------------ #
307
+ def forward(
308
+ self,
309
+ video_frames: torch.Tensor,
310
+ masks: Dict[int, Dict[int, torch.Tensor]],
311
+ bboxes: Dict[int, Dict[int, List]],
312
+ categorical_keywords: List[str],
313
+ unary_keywords: Optional[List[str]] = None,
314
+ binary_keywords: Optional[List[str]] = None,
315
+ object_pairs: Optional[List[Tuple[int, int]]] = None,
316
+ return_flattened_segments: Optional[bool] = None,
317
+ return_valid_pairs: Optional[bool] = None,
318
+ interested_object_pairs: Optional[List[Tuple[int, int]]] = None,
319
+ debug_visualizations: Optional[bool] = None,
320
+ **kwargs: Any,
321
+ ) -> Dict[str, Any]:
322
+ if unary_keywords is None:
323
+ unary_keywords = []
324
+ if binary_keywords is None:
325
+ binary_keywords = []
326
+ if object_pairs is None:
327
+ object_pairs = []
328
+
329
+ if return_flattened_segments is None:
330
+ return_flattened_segments = getattr(
331
+ self.config, "return_flattened_segments", False
332
+ )
333
+ if return_valid_pairs is None:
334
+ return_valid_pairs = getattr(self.config, "return_valid_pairs", False)
335
+ if interested_object_pairs is None or len(interested_object_pairs) == 0:
336
+ interested_object_pairs = (
337
+ getattr(self.config, "interested_object_pairs", []) or []
338
+ )
339
+ if debug_visualizations is None:
340
+ debug_visualizations = self.debug_visualizations
341
+
342
+ alpha = getattr(self.config, "alpha", 0.5)
343
+ white_alpha = getattr(self.config, "white_alpha", 0.8)
344
+ topk_cate = kwargs.pop("topk_cate", getattr(self.config, "topk_cate", 3))
345
+ dummy_str = kwargs.pop("dummy_str", getattr(self.config, "dummy_str", "$$$"))
346
+ multi_class = kwargs.pop("multi_class", getattr(self.config, "multi_class", False))
347
+ output_logit = kwargs.pop("output_logit", getattr(self.config, "output_logit", False))
348
+ output_embeddings = kwargs.pop("output_embeddings", False)
349
+
350
+ batched_video_ids = [0]
351
+
352
+ if torch.is_tensor(video_frames):
353
+ num_frames = video_frames.shape[0]
354
+ batched_videos = [
355
+ self._frame_to_numpy(video_frames[fid]) for fid in range(num_frames)
356
+ ]
357
+ else:
358
+ num_frames = len(video_frames)
359
+ batched_videos = [
360
+ self._frame_to_numpy(video_frames[fid]) for fid in range(num_frames)
361
+ ]
362
+
363
+ batched_masks: List[np.ndarray] = []
364
+ batched_bboxes: List[List[float]] = []
365
+ batched_object_ids: List[Tuple[int, int, int]] = []
366
+
367
+ for frame_id, frame_masks in masks.items():
368
+ if frame_id >= num_frames:
369
+ continue
370
+ frame_boxes = bboxes.get(frame_id, {})
371
+ for obj_id, mask in frame_masks.items():
372
+ if obj_id not in frame_boxes:
373
+ continue
374
+ bbox = frame_boxes[obj_id]
375
+ batched_object_ids.append((0, frame_id, obj_id))
376
+ batched_masks.append(self._mask_to_numpy(mask))
377
+ batched_bboxes.append(bbox)
378
+
379
+ batched_names = [list(categorical_keywords)]
380
+ batched_unary_kws = [list(unary_keywords)]
381
+ batched_binary_kws = [list(binary_keywords)]
382
+
383
+ batched_obj_pairs: List[Tuple[int, int, Tuple[int, int]]] = []
384
+ if object_pairs:
385
+ for frame_id, frame_masks in masks.items():
386
+ if frame_id >= num_frames:
387
+ continue
388
+ present_ids = set(frame_masks.keys())
389
+ for (from_oid, to_oid) in object_pairs:
390
+ if from_oid in present_ids and to_oid in present_ids:
391
+ batched_obj_pairs.append((0, frame_id, (from_oid, to_oid)))
392
+
393
+ batched_video_splits = [0]
394
+ batched_binary_predicates = [None]
395
+
396
+ def fill_empty(batched_kw):
397
+ new_batched = []
398
+ for kw_ls in batched_kw:
399
+ if len(kw_ls) == 0:
400
+ new_batched.append([dummy_str])
401
+ else:
402
+ new_batched.append(list(kw_ls))
403
+ return new_batched
404
+
405
+ batched_names = fill_empty(batched_names)
406
+ batched_unary_kws = fill_empty(batched_unary_kws)
407
+ batched_binary_kws = fill_empty(batched_binary_kws)
408
+
409
+ dummy_prob = torch.tensor(0.0, device=self._device)
410
+
411
+ batched_obj_name_features = []
412
+ batched_unary_nl_features = []
413
+ batched_binary_nl_features = []
414
+
415
+ batched_object_ids_lookup: Dict[int, List[Tuple[int, int]]] = {0: []}
416
+ batch_size = len(batched_video_ids)
417
+
418
+ # Step 1: text features
419
+ for object_names, unary_kws, binary_kws in zip(
420
+ batched_names, batched_unary_kws, batched_binary_kws
421
+ ):
422
+ if len(object_names) == 0:
423
+ batched_obj_name_features.append([])
424
+ else:
425
+ obj_tokens = self.clip_tokenizer(
426
+ object_names,
427
+ return_tensors="pt",
428
+ max_length=75,
429
+ truncation=True,
430
+ padding="max_length",
431
+ ).to(self._device)
432
+ obj_feats = self._text_features_checkpoint(
433
+ self.clip_cate_model, obj_tokens
434
+ )
435
+ batched_obj_name_features.append(obj_feats)
436
+
437
+ if len(unary_kws) == 0:
438
+ batched_unary_nl_features.append([])
439
+ else:
440
+ unary_tokens = self.clip_tokenizer(
441
+ list(unary_kws),
442
+ return_tensors="pt",
443
+ max_length=75,
444
+ truncation=True,
445
+ padding="max_length",
446
+ ).to(self._device)
447
+ unary_feats = self._text_features_checkpoint(
448
+ self.clip_unary_model, unary_tokens
449
+ )
450
+ batched_unary_nl_features.append(unary_feats)
451
+
452
+ if len(binary_kws) == 0:
453
+ batched_binary_nl_features.append([])
454
+ else:
455
+ binary_tokens = self.clip_tokenizer(
456
+ list(binary_kws),
457
+ return_tensors="pt",
458
+ max_length=75,
459
+ truncation=True,
460
+ padding="max_length",
461
+ ).to(self._device)
462
+ binary_feats = self._text_features_checkpoint(
463
+ self.clip_binary_model, binary_tokens
464
+ )
465
+ batched_binary_nl_features.append(binary_feats)
466
+
467
+ # Step 2: crop objects
468
+ batched_frame_masks: Dict[Tuple[int, int, int], np.ndarray] = {}
469
+ batched_frame_bboxes: Dict[Tuple[int, int, int], List[float]] = {}
470
+ batched_cropped_objs: Dict[int, List[np.ndarray]] = {
471
+ vid: [] for vid in range(batch_size)
472
+ }
473
+
474
+ assert len(batched_object_ids) > 0, f"No object bbox: {batched_video_ids}"
475
+
476
+ batched_video_splits = [0] + batched_video_splits
477
+
478
+ for (video_id, frame_id, obj_id), mask, bbox in zip(
479
+ batched_object_ids, batched_masks, batched_bboxes
480
+ ):
481
+ overall_frame_id = batched_video_splits[video_id] + frame_id
482
+ object_img = extract_single_object(
483
+ batched_videos[overall_frame_id], mask, white_alpha
484
+ )
485
+ cropped_object_img = crop_image_contain_bboxes(
486
+ object_img, [bbox], batched_video_ids
487
+ )
488
+
489
+ if self.visualization_dir:
490
+ debug_crop_dir = os.path.join(self.visualization_dir, "debug_crops")
491
+ os.makedirs(debug_crop_dir, exist_ok=True)
492
+ cv2.imwrite(
493
+ os.path.join(debug_crop_dir, f"frame_{frame_id}_obj_{obj_id}.jpg"),
494
+ cv2.cvtColor(cropped_object_img, cv2.COLOR_RGB2BGR),
495
+ )
496
+
497
+ batched_frame_masks[(video_id, frame_id, obj_id)] = mask
498
+ batched_frame_bboxes[(video_id, frame_id, obj_id)] = bbox
499
+ batched_object_ids_lookup[video_id].append((frame_id, obj_id))
500
+ batched_cropped_objs[video_id].append(cropped_object_img)
501
+
502
+ # Step 3: categorical + unary
503
+ batched_image_unary_probs: Dict[int, Dict] = {}
504
+ batched_image_cate_probs: Dict[int, Dict] = {}
505
+ batched_obj_cate_features: Dict[int, Any] = {}
506
+ batched_obj_unary_features: Dict[int, Any] = {}
507
+ batched_obj_per_cate: Dict[int, Dict[str, List[Tuple[torch.Tensor, int]]]] = {}
508
+
509
+ for vid in range(batch_size):
510
+ batched_image_unary_probs[vid] = {}
511
+ batched_image_cate_probs[vid] = {}
512
+ batched_obj_cate_features[vid] = {}
513
+ batched_obj_unary_features[vid] = {}
514
+ batched_obj_per_cate[vid] = {}
515
+
516
+ for vid_id, (
517
+ unary_nl_feats,
518
+ object_name_feats,
519
+ cate,
520
+ unary_pred,
521
+ binary_predicates,
522
+ ) in enumerate(
523
+ zip(
524
+ batched_unary_nl_features,
525
+ batched_obj_name_features,
526
+ batched_names,
527
+ batched_unary_kws,
528
+ batched_binary_predicates,
529
+ )
530
+ ):
531
+ cropped_objs = batched_cropped_objs[vid_id]
532
+
533
+ if len(cropped_objs) != 0:
534
+ inputs = self.clip_processor(
535
+ images=cropped_objs, return_tensors="pt"
536
+ ).to(self._device)
537
+ cate_obj_clip_features = self._image_features_checkpoint(
538
+ self.clip_cate_model, inputs["pixel_values"]
539
+ )
540
+ unary_obj_clip_features = self._image_features_checkpoint(
541
+ self.clip_unary_model, inputs["pixel_values"]
542
+ )
543
+ batched_obj_unary_features[vid_id] = unary_obj_clip_features
544
+ batched_obj_cate_features[vid_id] = cate_obj_clip_features
545
+ else:
546
+ batched_obj_cate_features[vid_id] = torch.tensor([])
547
+ batched_obj_unary_features[vid_id] = torch.tensor([])
548
+
549
+ object_ids = batched_object_ids_lookup[vid_id]
550
+
551
+ # Categorical logits
552
+ if (
553
+ len(object_name_feats) == 0
554
+ or len(object_ids) == 0
555
+ or len(cropped_objs) == 0
556
+ ):
557
+ cate_logits_per_text = torch.tensor([])
558
+ else:
559
+ cate_logits_per_text = self.clip_sim(
560
+ self.clip_cate_model, object_name_feats, cate_obj_clip_features
561
+ )
562
+ if not output_logit:
563
+ cate_logits_per_text = cate_logits_per_text.softmax(dim=0)
564
+
565
+ if not (
566
+ len(object_ids) == 0
567
+ or (
568
+ cate_logits_per_text.ndim == 2
569
+ and cate_logits_per_text.shape[1] == len(object_ids)
570
+ )
571
+ or len(object_name_feats) == 0
572
+ ):
573
+ print("Object cate shape mismatch here")
574
+
575
+ assert (
576
+ len(object_name_feats) == 0
577
+ or len(object_ids) == 0
578
+ or (
579
+ cate_logits_per_text.ndim == 2
580
+ and cate_logits_per_text.shape[1] == len(object_ids)
581
+ )
582
+ ), f"Mismatched object id and cate logic: {batched_video_ids}"
583
+
584
+ # Aggregate per object id across frames
585
+ cate_prob_per_obj: Dict[int, Dict[str, List[torch.Tensor]]] = {}
586
+ for cate_name, probs in zip(cate, cate_logits_per_text):
587
+ if cate_name == dummy_str:
588
+ dummy_prob += probs.sum()
589
+ else:
590
+ for prob, (fid, oid) in zip(probs, object_ids):
591
+ cate_prob_per_obj.setdefault(oid, {})
592
+ cate_prob_per_obj[oid].setdefault(cate_name, []).append(prob)
593
+
594
+ new_cate_prob_per_obj: Dict[Tuple[int, str], torch.Tensor] = {}
595
+ obj_per_cate: Dict[str, List[Tuple[torch.Tensor, int]]] = {}
596
+
597
+ for oid, object_cate_info in cate_prob_per_obj.items():
598
+ # Pool across frames per category
599
+ pooled: Dict[str, torch.Tensor] = {}
600
+ for cate_name, prob_list in object_cate_info.items():
601
+ stacked = torch.stack(prob_list)
602
+ if getattr(self.config, "categorical_pool", "mean") == "mean":
603
+ pooled_prob = stacked.mean()
604
+ else:
605
+ pooled_prob = stacked.max()
606
+ pooled[cate_name] = pooled_prob
607
+
608
+ if not pooled:
609
+ continue
610
+
611
+ # Renormalize across categories so they sum to 1 per object
612
+ probs_tensor = torch.stack(list(pooled.values()))
613
+ denom = probs_tensor.sum()
614
+ if denom.item() <= 0:
615
+ norm_tensor = torch.ones_like(probs_tensor) / len(pooled)
616
+ else:
617
+ norm_tensor = probs_tensor / denom
618
+
619
+ for (cate_name, _), norm_prob in zip(pooled.items(), norm_tensor):
620
+ obj_per_cate.setdefault(cate_name, []).append((norm_prob, oid))
621
+ new_cate_prob_per_obj[(oid, cate_name)] = norm_prob
622
+
623
+ for cate_name in obj_per_cate:
624
+ obj_per_cate[cate_name] = sorted(
625
+ obj_per_cate[cate_name], key=lambda x: x[0], reverse=True
626
+ )
627
+
628
+ # Unary
629
+ if len(unary_nl_feats) == 0 or len(cropped_objs) == 0:
630
+ unary_logits_per_text = torch.tensor([])
631
+ else:
632
+ unary_logits_per_text = self.clip_sim(
633
+ self.clip_unary_model, unary_nl_feats, unary_obj_clip_features
634
+ )
635
+ if not output_logit:
636
+ unary_logits_per_text = unary_logits_per_text.softmax(dim=0)
637
+
638
+ unary_prob_per_obj: Dict[Tuple[int, int, str], torch.Tensor] = {}
639
+ for unary_name, probs in zip(unary_pred, unary_logits_per_text):
640
+ if unary_name == dummy_str:
641
+ dummy_prob += probs.sum()
642
+ else:
643
+ for prob, (fid, oid) in zip(probs, object_ids):
644
+ unary_prob_per_obj[(fid, oid, unary_name)] = prob
645
+
646
+ batched_image_cate_probs[vid_id] = new_cate_prob_per_obj
647
+ batched_image_unary_probs[vid_id] = unary_prob_per_obj
648
+ batched_obj_per_cate[vid_id] = obj_per_cate
649
+
650
+ # Step 4: binary pairs
651
+ batched_cropped_obj_pairs: Dict[int, List[np.ndarray]] = {}
652
+ frame_splits: Dict[Tuple[int, int], Dict[str, int]] = {}
653
+ current_info = (0, 0)
654
+ frame_splits[current_info] = {"start": 0}
655
+
656
+ batched_topk_cate_candidates: Dict[int, Dict[str, List[int]]] = {
657
+ video_id: {} for video_id in range(batch_size)
658
+ }
659
+ for video_id, obj_per_cate in batched_obj_per_cate.items():
660
+ topk_cate_candidates: Dict[str, List[int]] = {}
661
+ for cate_name, pred_oid_ls in obj_per_cate.items():
662
+ for _, oid in pred_oid_ls[:topk_cate]:
663
+ topk_cate_candidates.setdefault(cate_name, []).append(oid)
664
+ batched_topk_cate_candidates[video_id] = topk_cate_candidates
665
+
666
+ obj_pair_lookup: Dict[int, Dict[Tuple[int, int], List[int]]] = {
667
+ video_id: {} for video_id in range(len(batched_video_ids))
668
+ }
669
+ for (vid, fid, (from_oid, to_oid)) in batched_obj_pairs:
670
+ if (from_oid, to_oid) not in obj_pair_lookup[vid]:
671
+ obj_pair_lookup[vid][(from_oid, to_oid)] = []
672
+ obj_pair_lookup[vid][(from_oid, to_oid)].append(fid)
673
+
674
+ selected_pairs = set()
675
+ if batched_binary_predicates[0] is None:
676
+ selected_pairs = set(batched_obj_pairs)
677
+ else:
678
+ for bp_vid, binary_predicates in enumerate(batched_binary_predicates):
679
+ topk_cate_candidates = batched_topk_cate_candidates[bp_vid]
680
+ for (rel_name, from_obj_name, to_obj_name) in binary_predicates:
681
+ if (
682
+ from_obj_name in topk_cate_candidates
683
+ and to_obj_name in topk_cate_candidates
684
+ ):
685
+ from_oids = topk_cate_candidates[from_obj_name]
686
+ to_oids = topk_cate_candidates[to_obj_name]
687
+ for from_oid in from_oids:
688
+ for to_oid in to_oids:
689
+ if (
690
+ bp_vid in obj_pair_lookup
691
+ and (from_oid, to_oid) in obj_pair_lookup[bp_vid]
692
+ ):
693
+ for fid in obj_pair_lookup[bp_vid][
694
+ (from_oid, to_oid)
695
+ ]:
696
+ selected_pairs.add(
697
+ (bp_vid, fid, (from_oid, to_oid))
698
+ )
699
+
700
+ selected_pairs = list(selected_pairs)
701
+
702
+ new_select_pairs: Dict[int, List[Tuple[int, int, Tuple[int, int]]]] = {
703
+ video_id: [] for video_id in range(len(batched_video_ids))
704
+ }
705
+ for (vid, fid, (from_oid, to_oid)) in selected_pairs:
706
+ new_select_pairs[vid].append((vid, fid, (from_oid, to_oid)))
707
+
708
+ for vid in range(len(batched_video_ids)):
709
+ batched_cropped_obj_pairs[vid] = []
710
+
711
+ for (vid, fid, (from_id, to_id)) in selected_pairs:
712
+ if (vid, fid, from_id) not in batched_frame_masks or (
713
+ vid,
714
+ fid,
715
+ to_id,
716
+ ) not in batched_frame_masks:
717
+ continue
718
+ if (vid, fid, from_id) not in batched_frame_bboxes or (
719
+ vid,
720
+ fid,
721
+ to_id,
722
+ ) not in batched_frame_bboxes:
723
+ continue
724
+
725
+ overall_frame_id = batched_video_splits[vid] + fid
726
+ mask1 = batched_frame_masks[(vid, fid, from_id)]
727
+ mask2 = batched_frame_masks[(vid, fid, to_id)]
728
+ bbox1 = batched_frame_bboxes[(vid, fid, from_id)]
729
+ bbox2 = batched_frame_bboxes[(vid, fid, to_id)]
730
+ bb_pop_image = extract_object_subject(
731
+ batched_videos[overall_frame_id],
732
+ mask1,
733
+ mask2,
734
+ alpha=alpha,
735
+ white_alpha=white_alpha,
736
+ )
737
+ cropped_bb_pop_image = crop_image_contain_bboxes(
738
+ img=bb_pop_image,
739
+ bbox_ls=[bbox1, bbox2],
740
+ data_id=batched_video_ids,
741
+ )
742
+ batched_cropped_obj_pairs[vid].append(cropped_bb_pop_image)
743
+
744
+ if len(selected_pairs) == 0:
745
+ selected_pairs.append((0, -1, (-1, -1)))
746
+ new_select_pairs[0] = [(0, -1, (-1, -1))]
747
+ dummy_img = batched_videos[0]
748
+ batched_cropped_obj_pairs[0] = [dummy_img]
749
+
750
+ batched_image_binary_probs: List[
751
+ Dict[Tuple[int, Tuple[int, int], str], torch.Tensor]
752
+ ] = []
753
+ batched_obj_pair_features: Dict[int, torch.Tensor] = {
754
+ vid: torch.tensor([]) for vid in range(batch_size)
755
+ }
756
+
757
+ if len(batched_cropped_obj_pairs) == 0:
758
+ batched_image_binary_probs.append({})
759
+ else:
760
+ for vid, binary_nl_features in enumerate(batched_binary_nl_features):
761
+ if len(binary_nl_features) == 0:
762
+ batched_image_binary_probs.append({})
763
+ continue
764
+
765
+ binary_kws = batched_binary_kws[vid]
766
+ cropped_obj_pairs = batched_cropped_obj_pairs[vid]
767
+ if len(cropped_obj_pairs) == 0:
768
+ batched_image_binary_probs.append({})
769
+ continue
770
+
771
+ inputs = self.clip_processor(
772
+ images=cropped_obj_pairs, return_tensors="pt"
773
+ ).to(self._device)
774
+ obj_features = self._image_features_checkpoint(
775
+ self.clip_binary_model, inputs["pixel_values"]
776
+ )
777
+ batched_obj_pair_features[vid] = obj_features
778
+
779
+ obj_clip_features = obj_features / obj_features.norm(
780
+ p=2, dim=-1, keepdim=True
781
+ )
782
+ binary_nl_features = binary_nl_features / binary_nl_features.norm(
783
+ p=2, dim=-1, keepdim=True
784
+ )
785
+
786
+ logit_scale = self.clip_binary_model.logit_scale
787
+ binary_logits_per_text = torch.matmul(
788
+ binary_nl_features, obj_clip_features.t()
789
+ ) * logit_scale.exp()
790
+
791
+ if not output_logit:
792
+ if not multi_class:
793
+ binary_logits_per_text = binary_logits_per_text.softmax(dim=0)
794
+ else:
795
+ binary_logits_per_text = binary_logits_per_text.sigmoid()
796
+
797
+ binary_prob_per_obj: Dict[
798
+ Tuple[int, Tuple[int, int], str], torch.Tensor
799
+ ] = {}
800
+ for binary_name, probs in zip(binary_kws, binary_logits_per_text):
801
+ if binary_name == dummy_str:
802
+ dummy_prob += probs.sum()
803
+ else:
804
+ for prob, (vid_, fid, obj_pair) in zip(
805
+ probs, new_select_pairs[vid]
806
+ ):
807
+ if fid == -1:
808
+ dummy_prob += prob
809
+ else:
810
+ binary_prob_per_obj[(fid, obj_pair, binary_name)] = prob
811
+ batched_image_binary_probs.append(binary_prob_per_obj)
812
+
813
+ result: Dict[str, Any] = {
814
+ "categorical_probs": batched_image_cate_probs,
815
+ "unary_probs": batched_image_unary_probs,
816
+ "binary_probs": batched_image_binary_probs,
817
+ "dummy_prob": dummy_prob,
818
+ }
819
+
820
+ if output_embeddings:
821
+ embeddings_dict = {
822
+ "cate_obj_clip_features": batched_obj_cate_features,
823
+ "cate_object_ids": batched_object_ids_lookup,
824
+ "unary_obj_clip_features": batched_obj_unary_features,
825
+ "unary_object_ids": batched_object_ids_lookup,
826
+ "binary_obj_pair_features": batched_obj_pair_features,
827
+ "binary_object_pairs": new_select_pairs,
828
+ }
829
+ result["embeddings"] = embeddings_dict
830
+
831
+ if return_flattened_segments or return_valid_pairs:
832
+ flattened = flatten_segments_for_batch(
833
+ video_id=0,
834
+ segments=masks,
835
+ bbox_min_dim=self.config.bbox_min_dim,
836
+ )
837
+ if return_flattened_segments:
838
+ result["flattened_segments"] = flattened
839
+ if return_valid_pairs:
840
+ interested_pairs = (
841
+ interested_object_pairs if interested_object_pairs else None
842
+ )
843
+ result["valid_pairs"] = extract_valid_object_pairs(
844
+ flattened["object_ids"],
845
+ interested_pairs,
846
+ )
847
+ if interested_pairs is None:
848
+ result["valid_pairs_metadata"] = {"pair_source": "all_pairs"}
849
+ else:
850
+ result["valid_pairs_metadata"] = {
851
+ "pair_source": "filtered",
852
+ "requested_pairs": interested_object_pairs,
853
+ }
854
+
855
+ return result
856
+
857
+ # ------------------------------------------------------------------ #
858
+ # Helpers
859
+ # ------------------------------------------------------------------ #
860
+ def _frame_to_numpy(self, frame: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
861
+ if torch.is_tensor(frame):
862
+ frame_np = frame.detach().cpu().numpy()
863
+ else:
864
+ frame_np = np.asarray(frame)
865
+ return np.ascontiguousarray(frame_np)
866
+
867
+ def _mask_to_numpy(self, mask: Union[torch.Tensor, np.ndarray]) -> np.ndarray:
868
+ if torch.is_tensor(mask):
869
+ mask_np = mask.detach().cpu().numpy()
870
+ else:
871
+ mask_np = np.asarray(mask)
872
+
873
+ if mask_np.ndim == 3:
874
+ if mask_np.shape[0] == 1:
875
+ mask_np = mask_np.squeeze(0)
876
+ elif mask_np.shape[2] == 1:
877
+ mask_np = mask_np.squeeze(2)
878
+
879
+ if mask_np.ndim != 2:
880
+ raise ValueError(f"Mask must be 2D after squeezing, got shape {mask_np.shape}")
881
+
882
+ return mask_np.astype(bool, copy=False)
883
+
884
+ def _extract_text_features(self, model, keywords: List[str]):
885
+ tokens = self.clip_tokenizer(
886
+ keywords,
887
+ return_tensors="pt",
888
+ max_length=75,
889
+ truncation=True,
890
+ padding="max_length",
891
+ ).to(self._device)
892
+ return self._text_features_checkpoint(model, tokens)
893
+
894
+ def _extract_image_features(self, model, image):
895
+ if torch.is_tensor(image):
896
+ image = image.detach().cpu().numpy()
897
+ # numpy arrays and PIL images are passed to the CLIP processor unchanged
899
+
899
+
900
+ inputs = self.clip_processor(images=image, return_tensors="pt").to(self._device)
901
+ return self._image_features_checkpoint(model, inputs["pixel_values"])
902
+
903
+ # ------------------------------------------------------------------ #
904
+ # High-level predict API
905
+ # ------------------------------------------------------------------ #
906
+ def predict(
907
+ self,
908
+ video_frames: torch.Tensor,
909
+ masks: Dict[int, Dict[int, torch.Tensor]],
910
+ bboxes: Dict[int, Dict[int, List]],
911
+ categorical_keywords: List[str],
912
+ unary_keywords: Optional[List[str]] = None,
913
+ binary_keywords: Optional[List[str]] = None,
914
+ object_pairs: Optional[List[Tuple[int, int]]] = None,
915
+ return_top_k: int = 3,
916
+ return_flattened_segments: Optional[bool] = None,
917
+ return_valid_pairs: Optional[bool] = None,
918
+ interested_object_pairs: Optional[List[Tuple[int, int]]] = None,
919
+ debug_visualizations: Optional[bool] = None,
920
+ ) -> Dict[str, Any]:
921
+ with torch.no_grad():
922
+ outputs = self.forward(
923
+ video_frames=video_frames,
924
+ masks=masks,
925
+ bboxes=bboxes,
926
+ categorical_keywords=categorical_keywords,
927
+ unary_keywords=unary_keywords,
928
+ binary_keywords=binary_keywords,
929
+ object_pairs=object_pairs,
930
+ return_flattened_segments=return_flattened_segments,
931
+ return_valid_pairs=return_valid_pairs,
932
+ interested_object_pairs=interested_object_pairs,
933
+ debug_visualizations=debug_visualizations,
934
+ )
935
+
936
+ formatted_categorical: Dict[int, List[Tuple[float, str]]] = {}
937
+ for (obj_id, category), prob in outputs["categorical_probs"][0].items():
938
+ if obj_id not in formatted_categorical:
939
+ formatted_categorical[obj_id] = []
940
+ prob_val = float(prob.detach().cpu()) if torch.is_tensor(prob) else float(prob)
941
+ formatted_categorical[obj_id].append((prob_val, category))
942
+
943
+ for obj_id in formatted_categorical:
944
+ formatted_categorical[obj_id] = sorted(
945
+ formatted_categorical[obj_id], reverse=True
946
+ )[:return_top_k]
947
+
948
+ formatted_unary: Dict[Tuple[int, int], List[Tuple[float, str]]] = {}
949
+ for (frame_id, obj_id, predicate), prob in outputs["unary_probs"][0].items():
950
+ key = (frame_id, obj_id)
951
+ if key not in formatted_unary:
952
+ formatted_unary[key] = []
953
+ prob_val = float(prob.detach().cpu()) if torch.is_tensor(prob) else float(prob)
954
+ formatted_unary[key].append((prob_val, predicate))
955
+
956
+ for key in formatted_unary:
957
+ formatted_unary[key] = sorted(
958
+ formatted_unary[key], reverse=True
959
+ )[:return_top_k]
960
+
961
+ formatted_binary: Dict[Tuple[int, Tuple[int, int]], List[Tuple[float, str]]] = {}
962
+ if len(outputs["binary_probs"]) > 0:
963
+ for (frame_id, obj_pair, predicate), prob in outputs["binary_probs"][0].items():
964
+ key = (frame_id, obj_pair)
965
+ if key not in formatted_binary:
966
+ formatted_binary[key] = []
967
+ prob_val = float(prob.detach().cpu()) if torch.is_tensor(prob) else float(prob)
968
+ formatted_binary[key].append((prob_val, predicate))
969
+
970
+ for key in formatted_binary:
971
+ formatted_binary[key] = sorted(
972
+ formatted_binary[key], reverse=True
973
+ )[:return_top_k]
974
+
975
+ def max_conf(d: Dict[Any, List[Tuple[float, str]]]) -> float:
976
+ if not d:
977
+ return 0.0
978
+ return max(
979
+ (max((p for p, _ in preds), default=0.0) for preds in d.values()),
980
+ default=0.0,
981
+ )
982
+
983
+ result: Dict[str, Any] = {
984
+ "categorical_predictions": formatted_categorical,
985
+ "unary_predictions": formatted_unary,
986
+ "binary_predictions": formatted_binary,
987
+ "confidence_scores": {
988
+ "categorical": max_conf(formatted_categorical),
989
+ "unary": max_conf(formatted_unary),
990
+ "binary": max_conf(formatted_binary),
991
+ },
992
+ }
993
+
994
+ if "flattened_segments" in outputs:
995
+ result["flattened_segments"] = outputs["flattened_segments"]
996
+ if "valid_pairs" in outputs:
997
+ result["valid_pairs"] = outputs["valid_pairs"]
998
+ if "valid_pairs_metadata" in outputs:
999
+ result["valid_pairs_metadata"] = outputs["valid_pairs_metadata"]
1000
+
1001
+ return result
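
The predict API above runs forward under torch.no_grad and reformats the raw probability dictionaries into top-k lists per object, per (frame, object), and per (frame, object-pair). A minimal usage sketch follows; it is an assumption-laden illustration only — the import path, checkpoint location, frame size, and keyword lists are hypothetical, while the call signature and output keys are taken from the code above.

# Sketch only: the checkpoint path and keyword lists are placeholders, not part of the repository.
import torch
from vine_hf.vine_model import VineModel  # assumed import path for the file above

model = VineModel.from_pretrained_vine("path/to/vine_checkpoint")  # hypothetical checkpoint

# A tiny 4-frame RGB clip with one segmented object in frame 0.
video = torch.randint(0, 256, (4, 240, 320, 3), dtype=torch.uint8)
masks = {0: {1: torch.ones(240, 320, dtype=torch.bool)}}   # {frame_id: {obj_id: mask}}
bboxes = {0: {1: [50, 60, 180, 200]}}                      # {frame_id: {obj_id: [x1, y1, x2, y2]}}

out = model.predict(
    video_frames=video,
    masks=masks,
    bboxes=bboxes,
    categorical_keywords=["person", "dog"],
    unary_keywords=["running", "sitting"],
    return_top_k=3,
)
print(out["categorical_predictions"])  # {obj_id: [(prob, label), ...]}
print(out["unary_predictions"])        # {(frame_id, obj_id): [(prob, predicate), ...]}
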
vine_hf/vine_pipeline.py ADDED
@@ -0,0 +1,923 @@
1
+ import os
2
+ import uuid
3
+ import hashlib
4
+ import tempfile
5
+ from pathlib import Path
6
+ from typing import Dict, List, Tuple, Optional, Any, Union
7
+
8
+ import cv2
9
+ import numpy as np
10
+ import torch
11
+ from transformers import Pipeline
12
+
13
+ from .vine_config import VineConfig
14
+ from .vine_model import VineModel
15
+ from .vis_utils import render_dino_frames, render_sam_frames, render_vine_frame_sets
16
+ from laser.loading import load_video
17
+ from laser.preprocess.mask_generation_grounding_dino import generate_masks_grounding_dino
18
+
19
+
20
+ class VinePipeline(Pipeline):
21
+ """
22
+ Pipeline for VINE model that handles end-to-end video understanding.
23
+ """
24
+
25
+ def __init__(
26
+ self,
27
+ sam_config_path: Optional[str] = None,
28
+ sam_checkpoint_path: Optional[str] = None,
29
+ gd_config_path: Optional[str] = None,
30
+ gd_checkpoint_path: Optional[str] = None,
31
+ **kwargs: Any,
32
+ ):
33
+ self.grounding_model = None
34
+ self.sam_predictor = None
35
+ self.mask_generator = None
36
+
37
+ self.sam_config_path = sam_config_path
38
+ self.sam_checkpoint_path = sam_checkpoint_path
39
+ self.gd_config_path = gd_config_path
40
+ self.gd_checkpoint_path = gd_checkpoint_path
41
+
42
+ super().__init__(**kwargs)
43
+
44
+ self.segmentation_method = getattr(
45
+ self.model.config, "segmentation_method", "grounding_dino_sam2"
46
+ )
47
+ self.box_threshold = getattr(self.model.config, "box_threshold", 0.35)
48
+ self.text_threshold = getattr(self.model.config, "text_threshold", 0.25)
49
+ self.target_fps = getattr(self.model.config, "target_fps", 1)
50
+ self.visualize = getattr(self.model.config, "visualize", False)
51
+ self.visualization_dir = getattr(self.model.config, "visualization_dir", None)
52
+ self.debug_visualizations = getattr(
53
+ self.model.config, "debug_visualizations", False
54
+ )
55
+ self._device = getattr(self.model.config, "_device")
56
+ if kwargs.get("device") is not None:
57
+ self._device = kwargs.get("device")
58
+
59
+ # ------------------------------------------------------------------ #
60
+ # Segmentation model injection
61
+ # ------------------------------------------------------------------ #
62
+ def set_segmentation_models(
63
+ self,
64
+ *,
65
+ sam_predictor=None,
66
+ mask_generator=None,
67
+ grounding_model=None,
68
+ ):
69
+ if sam_predictor is not None:
70
+ self.sam_predictor = sam_predictor
71
+ if mask_generator is not None:
72
+ self.mask_generator = mask_generator
73
+ if grounding_model is not None:
74
+ self.grounding_model = grounding_model
75
+
76
+ # ------------------------------------------------------------------ #
77
+ # Pipeline protocol
78
+ # ------------------------------------------------------------------ #
79
+ def _sanitize_parameters(self, **kwargs: Any):
80
+ preprocess_kwargs: Dict[str, Any] = {}
81
+ forward_kwargs: Dict[str, Any] = {}
82
+ postprocess_kwargs: Dict[str, Any] = {}
83
+
84
+ if "segmentation_method" in kwargs:
85
+ preprocess_kwargs["segmentation_method"] = kwargs["segmentation_method"]
86
+ if "target_fps" in kwargs:
87
+ preprocess_kwargs["target_fps"] = kwargs["target_fps"]
88
+ if "box_threshold" in kwargs:
89
+ preprocess_kwargs["box_threshold"] = kwargs["box_threshold"]
90
+ if "text_threshold" in kwargs:
91
+ preprocess_kwargs["text_threshold"] = kwargs["text_threshold"]
92
+ if "categorical_keywords" in kwargs:
93
+ preprocess_kwargs["categorical_keywords"] = kwargs["categorical_keywords"]
94
+
95
+ if "categorical_keywords" in kwargs:
96
+ forward_kwargs["categorical_keywords"] = kwargs["categorical_keywords"]
97
+ if "unary_keywords" in kwargs:
98
+ forward_kwargs["unary_keywords"] = kwargs["unary_keywords"]
99
+ if "binary_keywords" in kwargs:
100
+ forward_kwargs["binary_keywords"] = kwargs["binary_keywords"]
101
+ if "object_pairs" in kwargs:
102
+ forward_kwargs["object_pairs"] = kwargs["object_pairs"]
103
+ if "return_flattened_segments" in kwargs:
104
+ forward_kwargs["return_flattened_segments"] = kwargs[
105
+ "return_flattened_segments"
106
+ ]
107
+ if "return_valid_pairs" in kwargs:
108
+ forward_kwargs["return_valid_pairs"] = kwargs["return_valid_pairs"]
109
+ if "interested_object_pairs" in kwargs:
110
+ forward_kwargs["interested_object_pairs"] = kwargs[
111
+ "interested_object_pairs"
112
+ ]
113
+ if "debug_visualizations" in kwargs:
114
+ forward_kwargs["debug_visualizations"] = kwargs["debug_visualizations"]
115
+ postprocess_kwargs["debug_visualizations"] = kwargs["debug_visualizations"]
116
+
117
+ if "return_top_k" in kwargs:
118
+ postprocess_kwargs["return_top_k"] = kwargs["return_top_k"]
119
+ if "visualize" in kwargs:
120
+ postprocess_kwargs["visualize"] = kwargs["visualize"]
121
+
122
+ return preprocess_kwargs, forward_kwargs, postprocess_kwargs
123
+
124
+ # ------------------------------------------------------------------ #
125
+ # Preprocess: video + segmentation
126
+ # ------------------------------------------------------------------ #
127
+ def preprocess(
128
+ self,
129
+ video_input: Union[str, np.ndarray, torch.Tensor],
130
+ segmentation_method: Optional[str] = None,
131
+ target_fps: Optional[int] = None,
132
+ box_threshold: Optional[float] = None,
133
+ text_threshold: Optional[float] = None,
134
+ categorical_keywords: Optional[List[str]] = None,
135
+ **kwargs: Any,
136
+ ) -> Dict[str, Any]:
137
+ if segmentation_method is None:
138
+ segmentation_method = self.segmentation_method
139
+ if target_fps is None:
140
+ target_fps = self.target_fps
141
+ else:
142
+ self.target_fps = target_fps
143
+ if box_threshold is None:
144
+ box_threshold = self.box_threshold
145
+ else:
146
+ self.box_threshold = box_threshold
147
+ if text_threshold is None:
148
+ text_threshold = self.text_threshold
149
+ else:
150
+ self.text_threshold = text_threshold
151
+ if categorical_keywords is None:
152
+ categorical_keywords = ["object"]
153
+
154
+ if isinstance(video_input, str):
155
+ video_tensor = load_video(video_input, target_fps=target_fps)
156
+ if isinstance(video_tensor, list):
157
+ video_tensor = np.array(video_tensor)
158
+ elif isinstance(video_tensor, torch.Tensor):
159
+ video_tensor = video_tensor.cpu().numpy()
160
+ elif isinstance(video_input, (np.ndarray, torch.Tensor)):
161
+ if isinstance(video_input, torch.Tensor):
162
+ video_tensor = video_input.detach().cpu().numpy()
163
+ else:
164
+ video_tensor = video_input
165
+ else:
166
+ raise ValueError(f"Unsupported video input type: {type(video_input)}")
167
+
168
+ if not isinstance(video_tensor, np.ndarray):
169
+ video_tensor = np.array(video_tensor)
170
+
171
+ if len(video_tensor.shape) != 4:
172
+ raise ValueError(
173
+ f"Expected video tensor shape (frames, height, width, channels), got {video_tensor.shape}"
174
+ )
175
+
176
+ visualization_data: Dict[str, Any] = {}
177
+ print(f"Segmentation method: {segmentation_method}")
178
+ if segmentation_method == "sam2":
179
+ masks, bboxes, vis_data = self._generate_sam2_masks(video_tensor)
180
+ elif segmentation_method == "grounding_dino_sam2":
181
+ masks, bboxes, vis_data = self._generate_grounding_dino_sam2_masks(
182
+ video_tensor,
183
+ categorical_keywords,
184
+ box_threshold,
185
+ text_threshold,
186
+ video_input,
187
+ )
188
+ else:
189
+ raise ValueError(f"Unsupported segmentation method: {segmentation_method}")
190
+ if vis_data:
191
+ visualization_data.update(vis_data)
192
+ visualization_data.setdefault("sam_masks", masks)
193
+
194
+ return {
195
+ "video_frames": torch.tensor(video_tensor),
196
+ "masks": masks,
197
+ "bboxes": bboxes,
198
+ "num_frames": len(video_tensor),
199
+ "visualization_data": visualization_data,
200
+ }
201
+
202
+ # ------------------------------------------------------------------ #
203
+ # Segmentation helpers
204
+ # ------------------------------------------------------------------ #
205
+ def _generate_sam2_masks(
206
+ self, video_tensor: np.ndarray
207
+ ) -> Tuple[Dict[int, Dict[int, torch.Tensor]], Dict[int, Dict[int, List[int]]], Dict[str, Any]]:
208
+ print("Generating SAM2 masks...")
209
+ if self.mask_generator is None:
210
+ self._initialize_segmentation_models()
211
+ if self.mask_generator is None:
212
+ raise ValueError("SAM2 mask generator not available")
213
+
214
+ masks: Dict[int, Dict[int, torch.Tensor]] = {}
215
+ bboxes: Dict[int, Dict[int, List[int]]] = {}
216
+
217
+ for frame_id, frame in enumerate(video_tensor):
218
+ if isinstance(frame, np.ndarray) and frame.dtype != np.uint8:
219
+ frame = (
220
+ (frame * 255).astype(np.uint8)
221
+ if frame.max() <= 1
222
+ else frame.astype(np.uint8)
223
+ )
224
+
225
+ frame_masks = self.mask_generator.generate(frame)
226
+
227
+ masks[frame_id] = {}
228
+ bboxes[frame_id] = {}
229
+
230
+ for obj_id, mask_data in enumerate(frame_masks):
231
+ mask = mask_data["segmentation"]
232
+ if isinstance(mask, np.ndarray):
233
+ mask = torch.from_numpy(mask)
234
+
235
+ if len(mask.shape) == 2:
236
+ mask = mask.unsqueeze(-1)
237
+ elif len(mask.shape) == 3 and mask.shape[0] == 1:
238
+ mask = mask.permute(1, 2, 0)
239
+
240
+ wrapped_id = obj_id + 1
241
+ masks[frame_id][wrapped_id] = mask
242
+
243
+ mask_np = (
244
+ mask.squeeze().numpy()
245
+ if isinstance(mask, torch.Tensor)
246
+ else mask.squeeze()
247
+ )
248
+
249
+ coords = np.where(mask_np > 0)
250
+ if len(coords[0]) > 0:
251
+ y1, y2 = coords[0].min(), coords[0].max()
252
+ x1, x2 = coords[1].min(), coords[1].max()
253
+ bboxes[frame_id][wrapped_id] = [x1, y1, x2, y2]
254
+
255
+ tracked_masks, tracked_bboxes = self._track_ids_across_frames(masks, bboxes)
256
+ return tracked_masks, tracked_bboxes, {"sam_masks": tracked_masks}
257
+
258
+ def _generate_grounding_dino_sam2_masks(
259
+ self,
260
+ video_tensor: np.ndarray,
261
+ categorical_keywords: List[str],
262
+ box_threshold: float,
263
+ text_threshold: float,
264
+ video_path: Union[str, None],
265
+ ) -> Tuple[Dict[int, Dict[int, torch.Tensor]], Dict[int, Dict[int, List[int]]], Dict[str, Any]]:
266
+ print("Generating Grounding DINO + SAM2 masks...")
267
+ if self.grounding_model is None or self.sam_predictor is None:
268
+ self._initialize_segmentation_models()
269
+ if self.grounding_model is None or self.sam_predictor is None:
270
+ raise ValueError("GroundingDINO or SAM2 models not available")
271
+
272
+ temp_video_path = None
273
+ if video_path is None or not isinstance(video_path, str):
274
+ temp_video_path = self._create_temp_video(video_tensor)
275
+ video_path = temp_video_path
276
+
277
+ CHUNK = 5
278
+ classes_ls = [
279
+ categorical_keywords[i : i + CHUNK]
280
+ for i in range(0, len(categorical_keywords), CHUNK)
281
+ ]
282
+
283
+ base_name = Path(video_path).stem
284
+ fps_tag = f"fps{int(self.target_fps)}"
285
+ path_hash = hashlib.md5(video_path.encode("utf-8")).hexdigest()[:8]
286
+ video_cache_name = f"{base_name}_{fps_tag}_{path_hash}"
287
+
288
+ video_segments, oid_class_pred, _ = generate_masks_grounding_dino(
289
+ self.grounding_model,
290
+ box_threshold,
291
+ text_threshold,
292
+ self.sam_predictor,
293
+ self.mask_generator,
294
+ video_tensor,
295
+ video_path,
296
+ video_cache_name,
297
+ out_dir=tempfile.gettempdir(),
298
+ classes_ls=classes_ls,
299
+ target_fps=self.target_fps,
300
+ visualize=self.debug_visualizations,
301
+ frames=None,
302
+ max_prop_time=2,
303
+ )
304
+
305
+ masks: Dict[int, Dict[int, torch.Tensor]] = {}
306
+ bboxes: Dict[int, Dict[int, List[int]]] = {}
307
+
308
+ for frame_id, frame_masks in video_segments.items():
309
+ masks[frame_id] = {}
310
+ bboxes[frame_id] = {}
311
+
312
+ for obj_id, mask in frame_masks.items():
313
+ if not isinstance(mask, torch.Tensor):
314
+ mask = torch.tensor(mask)
315
+ masks[frame_id][obj_id] = mask
316
+ mask_np = mask.detach().cpu().numpy()
317
+ if mask_np.ndim == 3 and mask_np.shape[0] == 1:
318
+ mask_np = np.squeeze(mask_np, axis=0)
319
+
320
+ coords = np.where(mask_np > 0)
321
+ if len(coords[0]) > 0:
322
+ y1, y2 = coords[0].min(), coords[0].max()
323
+ x1, x2 = coords[1].min(), coords[1].max()
324
+ bboxes[frame_id][obj_id] = [x1, y1, x2, y2]
325
+
326
+ if temp_video_path and os.path.exists(temp_video_path):
327
+ os.remove(temp_video_path)
328
+
329
+ tracked_masks, tracked_bboxes = self._track_ids_across_frames(masks, bboxes)
330
+
331
+ vis_data: Dict[str, Any] = {
332
+ "sam_masks": tracked_masks,
333
+ "dino_labels": oid_class_pred,
334
+ }
335
+ return tracked_masks, tracked_bboxes, vis_data
336
+
337
+ # ------------------------------------------------------------------ #
338
+ # ID tracking across frames
339
+ # ------------------------------------------------------------------ #
340
+ def _bbox_iou(self, box1: List[int], box2: List[int]) -> float:
341
+ x1, y1, x2, y2 = box1
342
+ x1b, y1b, x2b, y2b = box2
343
+ ix1 = max(x1, x1b)
344
+ iy1 = max(y1, y1b)
345
+ ix2 = min(x2, x2b)
346
+ iy2 = min(y2, y2b)
347
+ iw = max(0, ix2 - ix1)
348
+ ih = max(0, iy2 - iy1)
349
+ inter = iw * ih
350
+ if inter <= 0:
351
+ return 0.0
352
+ area1 = max(0, x2 - x1) * max(0, y2 - y1)
353
+ area2 = max(0, x2b - x1b) * max(0, y2b - y1b)
354
+ union = area1 + area2 - inter
355
+ if union <= 0:
356
+ return 0.0
357
+ return inter / union
358
+
359
+ def _track_ids_across_frames(
360
+ self,
361
+ masks: Dict[int, Dict[int, torch.Tensor]],
362
+ bboxes: Dict[int, Dict[int, List[int]]],
363
+ iou_threshold: float = 0.3,
364
+ ) -> Tuple[Dict[int, Dict[int, torch.Tensor]], Dict[int, Dict[int, List[int]]]]:
365
+ frame_ids = sorted(masks.keys())
366
+ tracked_masks: Dict[int, Dict[int, torch.Tensor]] = {}
367
+ tracked_bboxes: Dict[int, Dict[int, List[int]]] = {}
368
+ next_track_id = 0
369
+ prev_tracks: Dict[int, List[int]] = {}
370
+
371
+ for frame_id in frame_ids:
372
+ frame_masks = masks.get(frame_id, {})
373
+ frame_boxes = bboxes.get(frame_id, {})
374
+ tracked_masks[frame_id] = {}
375
+ tracked_bboxes[frame_id] = {}
376
+
377
+ if not frame_boxes:
378
+ prev_tracks = {}
379
+ continue
380
+
381
+ det_ids = list(frame_boxes.keys())
382
+ prev_ids = list(prev_tracks.keys())
383
+
384
+ candidates: List[Tuple[float, int, int]] = []
385
+ for tid in prev_ids:
386
+ prev_box = prev_tracks[tid]
387
+ for det_id in det_ids:
388
+ iou = self._bbox_iou(prev_box, frame_boxes[det_id])
389
+ if iou > iou_threshold:
390
+ candidates.append((iou, tid, det_id))
391
+ candidates.sort(reverse=True)
392
+
393
+ matched_prev = set()
394
+ matched_det = set()
395
+
396
+ for iou, tid, det_id in candidates:
397
+ if tid in matched_prev or det_id in matched_det:
398
+ continue
399
+ matched_prev.add(tid)
400
+ matched_det.add(det_id)
401
+ tracked_masks[frame_id][tid] = frame_masks[det_id]
402
+ tracked_bboxes[frame_id][tid] = frame_boxes[det_id]
403
+
404
+ for det_id in det_ids:
405
+ if det_id in matched_det:
406
+ continue
407
+ tid = next_track_id
408
+ next_track_id += 1
409
+ tracked_masks[frame_id][tid] = frame_masks[det_id]
410
+ tracked_bboxes[frame_id][tid] = frame_boxes[det_id]
411
+
412
+ prev_tracks = {
413
+ tid: tracked_bboxes[frame_id][tid]
414
+ for tid in tracked_bboxes[frame_id].keys()
415
+ }
416
+
417
+ return tracked_masks, tracked_bboxes
418
+
419
+ # ------------------------------------------------------------------ #
420
+ # Segmentation model initialization
421
+ # ------------------------------------------------------------------ #
422
+ def _initialize_segmentation_models(self):
423
+ if self.sam_predictor is None or self.mask_generator is None:
424
+ self._initialize_sam2_models()
425
+ if self.grounding_model is None:
426
+ self._initialize_grounding_dino_model()
427
+
428
+ def _initialize_sam2_models(self):
429
+ try:
430
+ from sam2.build_sam import build_sam2_video_predictor, build_sam2
431
+ from sam2.automatic_mask_generator import SAM2AutomaticMaskGenerator
432
+ except ImportError as e:
433
+ print(f"Warning: Could not import SAM2: {e}")
434
+ return
435
+
436
+ config_path, checkpoint_path = self._resolve_sam2_paths()
437
+
438
+ if self.sam_config_path is not None and not os.path.exists(config_path):
439
+ raise ValueError(f"SAM2 config path not found: {config_path}")
440
+ if self.sam_checkpoint_path is not None and not os.path.exists(checkpoint_path):
441
+ raise ValueError(f"SAM2 checkpoint path not found: {checkpoint_path}")
442
+
443
+ if not (config_path and checkpoint_path and os.path.exists(checkpoint_path)):
444
+ print(f"Warning: SAM2 checkpoint not found at {checkpoint_path}")
445
+ print("SAM2 functionality will be unavailable")
446
+ return
447
+
448
+ try:
449
+ device = self._device
450
+ self.sam_predictor = build_sam2_video_predictor(
451
+ config_path, checkpoint_path, device=device
452
+ )
453
+
454
+ sam2_model = build_sam2(
455
+ config_path,
456
+ checkpoint_path,
457
+ device=device,
458
+ apply_postprocessing=False,
459
+ )
460
+ self.mask_generator = SAM2AutomaticMaskGenerator(
461
+ model=sam2_model,
462
+ points_per_side=32,
463
+ points_per_batch=32,
464
+ pred_iou_thresh=0.7,
465
+ stability_score_thresh=0.8,
466
+ crop_n_layers=2,
467
+ box_nms_thresh=0.6,
468
+ crop_n_points_downscale_factor=2,
469
+ min_mask_region_area=100,
470
+ use_m2m=True,
471
+ )
472
+ print("✓ SAM2 models initialized successfully")
473
+
474
+ except Exception as e:
475
+ raise ValueError(f"Failed to initialize SAM2 with custom paths: {e}")
476
+
477
+ def _initialize_grounding_dino_model(self):
478
+ try:
479
+ from groundingdino.util.inference import Model as gd_Model
480
+ except ImportError as e:
481
+ print(f"Warning: Could not import GroundingDINO: {e}")
482
+ return
483
+
484
+ config_path, checkpoint_path = self._resolve_grounding_dino_paths()
485
+
486
+ if self.gd_config_path is not None and not os.path.exists(config_path):
487
+ raise ValueError(f"GroundingDINO config path not found: {config_path}")
488
+ if self.gd_checkpoint_path is not None and not os.path.exists(checkpoint_path):
489
+ raise ValueError(
490
+ f"GroundingDINO checkpoint path not found: {checkpoint_path}"
491
+ )
492
+
493
+ if not (config_path and checkpoint_path and os.path.exists(config_path) and os.path.exists(checkpoint_path)):
494
+ print(
495
+ f"Warning: GroundingDINO models not found at {config_path} / {checkpoint_path}"
496
+ )
497
+ print("GroundingDINO functionality will be unavailable")
498
+ return
499
+
500
+ try:
501
+ device = self._device
502
+ self.grounding_model = gd_Model(
503
+ model_config_path=config_path,
504
+ model_checkpoint_path=checkpoint_path,
505
+ device=device,
506
+ )
507
+ print("✓ GroundingDINO model initialized successfully")
508
+
509
+ except Exception as e:
510
+ raise ValueError(f"Failed to initialize GroundingDINO with custom paths: {e}")
511
+
512
+ def _resolve_sam2_paths(self):
513
+ # May return None values when no custom SAM2 paths were provided; callers check before use.
514
+ return self.sam_config_path, self.sam_checkpoint_path
515
+
516
+ def _resolve_grounding_dino_paths(self):
517
+ # May return None values when no custom GroundingDINO paths were provided; callers check before use.
518
+ return self.gd_config_path, self.gd_checkpoint_path
519
+
520
+ # ------------------------------------------------------------------ #
521
+ # Video writing helpers
522
+ # ------------------------------------------------------------------ #
523
+ def _prepare_visualization_dir(self, name: str, enabled: bool) -> Optional[str]:
524
+ if not enabled:
525
+ return None
526
+
527
+ if self.visualization_dir:
528
+ target_dir = (
529
+ os.path.join(self.visualization_dir, name)
530
+ if name
531
+ else self.visualization_dir
532
+ )
533
+ os.makedirs(target_dir, exist_ok=True)
534
+ return target_dir
535
+
536
+ return tempfile.mkdtemp(prefix=f"vine_{name}_")
537
+
538
+ def _create_temp_video(
539
+ self,
540
+ video_tensor: np.ndarray,
541
+ base_dir: Optional[str] = None,
542
+ prefix: str = "temp_video",
543
+ ) -> str:
544
+ import subprocess
545
+
546
+ if base_dir is None:
547
+ base_dir = tempfile.mkdtemp(prefix=f"vine_{prefix}_")
548
+ else:
549
+ os.makedirs(base_dir, exist_ok=True)
550
+ file_name = f"{prefix}_{uuid.uuid4().hex}.mp4"
551
+ temp_path = os.path.join(base_dir, file_name)
552
+
553
+ height, width = video_tensor.shape[1:3]
554
+ processing_fps = max(1, self.target_fps)
555
+ output_fps = processing_fps
556
+ video_tensor_for_output = video_tensor
557
+
558
+ ffmpeg_success = False
559
+ try:
560
+ ffmpeg_success = self._create_video_with_ffmpeg(
561
+ video_tensor_for_output, temp_path, output_fps, width, height
562
+ )
563
+ except Exception as e:
564
+ print(f"FFmpeg method failed: {e}")
565
+
566
+ if not ffmpeg_success:
567
+ print("Using OpenCV fallback")
568
+ self._create_temp_video_opencv(
569
+ video_tensor_for_output, temp_path, output_fps, width, height
570
+ )
571
+
572
+ return temp_path
573
+
574
+ def _create_video_with_ffmpeg(
575
+ self, video_tensor: np.ndarray, output_path: str, fps: int, width: int, height: int
576
+ ) -> bool:
577
+ import subprocess
578
+
579
+ try:
580
+ ffmpeg_cmd = [
581
+ "ffmpeg",
582
+ "-y",
583
+ "-f",
584
+ "rawvideo",
585
+ "-vcodec",
586
+ "rawvideo",
587
+ "-s",
588
+ f"{width}x{height}",
589
+ "-pix_fmt",
590
+ "rgb24",
591
+ "-r",
592
+ str(fps),
593
+ "-i",
594
+ "pipe:0",
595
+ "-c:v",
596
+ "libx264",
597
+ "-preset",
598
+ "fast",
599
+ "-crf",
600
+ "23",
601
+ "-pix_fmt",
602
+ "yuv420p",
603
+ "-movflags",
604
+ "+faststart",
605
+ "-loglevel",
606
+ "error",
607
+ output_path,
608
+ ]
609
+
610
+ process = subprocess.Popen(
611
+ ffmpeg_cmd,
612
+ stdin=subprocess.PIPE,
613
+ stdout=subprocess.PIPE,
614
+ stderr=subprocess.PIPE,
615
+ )
616
+
617
+ frame_data = b""
618
+ for frame in video_tensor:
619
+ if frame.dtype != np.uint8:
620
+ frame = (
621
+ (frame * 255).astype(np.uint8)
622
+ if frame.max() <= 1
623
+ else frame.astype(np.uint8)
624
+ )
625
+ frame_data += frame.tobytes()
626
+
627
+ stdout, stderr = process.communicate(input=frame_data, timeout=60)
628
+
629
+ if process.returncode == 0:
630
+ print(f"Video created with FFmpeg (H.264) at {fps} FPS")
631
+ return True
632
+ else:
633
+ error_msg = stderr.decode() if stderr else "Unknown error"
634
+ print(f"FFmpeg error: {error_msg}")
635
+ return False
636
+
637
+ except FileNotFoundError:
638
+ print("FFmpeg not found in PATH")
639
+ return False
640
+ except Exception as e:
641
+ print(f"FFmpeg exception: {e}")
642
+ return False
643
+
644
+ def _create_temp_video_opencv(
645
+ self, video_tensor: np.ndarray, temp_path: str, fps: int, width: int, height: int
646
+ ) -> str:
647
+ codecs_to_try = ["avc1", "X264", "mp4v"]
648
+ out = None
649
+ used_codec = None
650
+
651
+ for codec in codecs_to_try:
652
+ try:
653
+ fourcc = cv2.VideoWriter_fourcc(*codec)
654
+ temp_out = cv2.VideoWriter(temp_path, fourcc, fps, (width, height))
655
+
656
+ if temp_out.isOpened():
657
+ out = temp_out
658
+ used_codec = codec
659
+ break
660
+ else:
661
+ temp_out.release()
662
+ except Exception as e:
663
+ print(f"Warning: Codec {codec} not available: {e}")
664
+ continue
665
+
666
+ if out is None or not out.isOpened():
667
+ raise RuntimeError(
668
+ f"Failed to initialize VideoWriter with any codec. Tried: {codecs_to_try}"
669
+ )
670
+
671
+ print(f"Using OpenCV with codec: {used_codec}")
672
+
673
+ for frame in video_tensor:
674
+ if len(frame.shape) == 3 and frame.shape[2] == 3:
675
+ frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
676
+ else:
677
+ frame_bgr = frame
678
+ if frame_bgr.dtype != np.uint8:
679
+ frame_bgr = (
680
+ (frame_bgr * 255).astype(np.uint8)
681
+ if frame_bgr.max() <= 1
682
+ else frame_bgr.astype(np.uint8)
683
+ )
684
+ out.write(frame_bgr)
685
+
686
+ out.release()
687
+ return temp_path
688
+
689
+ # ------------------------------------------------------------------ #
690
+ # Forward + postprocess
691
+ # ------------------------------------------------------------------ #
692
+ def _forward(self, model_inputs: Dict[str, Any], **forward_kwargs: Any) -> Dict[str, Any]:
693
+ outputs = self.model.predict(
694
+ video_frames=model_inputs["video_frames"],
695
+ masks=model_inputs["masks"],
696
+ bboxes=model_inputs["bboxes"],
697
+ **forward_kwargs,
698
+ )
699
+ outputs.setdefault("video_frames", model_inputs.get("video_frames"))
700
+ outputs.setdefault("bboxes", model_inputs.get("bboxes"))
701
+ outputs.setdefault("masks", model_inputs.get("masks"))
702
+ outputs.setdefault("visualization_data", model_inputs.get("visualization_data"))
703
+ return outputs
704
+
705
+ def postprocess(
706
+ self,
707
+ model_outputs: Dict[str, Any],
708
+ return_top_k: int = 3,
709
+ visualize: Optional[bool] = None,
710
+ **kwargs: Any,
711
+ ) -> Dict[str, Any]:
712
+ results: Dict[str, Any] = {
713
+ "categorical_predictions": model_outputs.get("categorical_predictions", {}),
714
+ "unary_predictions": model_outputs.get("unary_predictions", {}),
715
+ "binary_predictions": model_outputs.get("binary_predictions", {}),
716
+ "confidence_scores": model_outputs.get("confidence_scores", {}),
717
+ "summary": self._generate_summary(model_outputs),
718
+ }
719
+
720
+ print("\n" + "=" * 50)
721
+ print("DEBUG: Raw Model Outputs - Categorical Predictions")
722
+ cat_preds = model_outputs.get("categorical_predictions", {})
723
+ for obj_id, preds in cat_preds.items():
724
+ print(f"Object {obj_id}: {preds}")
725
+ print("=" * 50 + "\n")
726
+
727
+ if "flattened_segments" in model_outputs:
728
+ results["flattened_segments"] = model_outputs["flattened_segments"]
729
+ if "valid_pairs" in model_outputs:
730
+ results["valid_pairs"] = model_outputs["valid_pairs"]
731
+ if "valid_pairs_metadata" in model_outputs:
732
+ results["valid_pairs_metadata"] = model_outputs["valid_pairs_metadata"]
733
+ if "visualization_data" in model_outputs:
734
+ results["visualization_data"] = model_outputs["visualization_data"]
735
+
736
+ if (self.visualize if visualize is None else visualize) and "video_frames" in model_outputs and "bboxes" in model_outputs:
737
+ frames_tensor = model_outputs["video_frames"]
738
+ if isinstance(frames_tensor, torch.Tensor):
739
+ frames_np = frames_tensor.detach().cpu().numpy()
740
+ else:
741
+ frames_np = np.asarray(frames_tensor)
742
+ if frames_np.dtype != np.uint8:
743
+ if np.issubdtype(frames_np.dtype, np.floating):
744
+ max_val = frames_np.max() if frames_np.size else 0.0
745
+ scale = 255.0 if max_val <= 1.0 else 1.0
746
+ frames_np = (frames_np * scale).clip(0, 255).astype(np.uint8)
747
+ else:
748
+ frames_np = frames_np.clip(0, 255).astype(np.uint8)
749
+
750
+ cat_label_lookup: Dict[int, Tuple[str, float]] = {}
751
+ for obj_id, preds in model_outputs.get("categorical_predictions", {}).items():
752
+ if preds:
753
+ prob, label = preds[0]
754
+ cat_label_lookup[obj_id] = (label, prob)
755
+
756
+ unary_preds = model_outputs.get("unary_predictions", {})
757
+ unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]] = {}
758
+ for (frame_id, obj_id), preds in unary_preds.items():
759
+ if preds:
760
+ unary_lookup.setdefault(frame_id, {})[obj_id] = preds[:1]
761
+
762
+ binary_preds = model_outputs.get("binary_predictions", {})
763
+ binary_lookup: Dict[
764
+ int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]
765
+ ] = {}
766
+ for (frame_id, obj_pair), preds in binary_preds.items():
767
+ if preds:
768
+ binary_lookup.setdefault(frame_id, []).append((obj_pair, preds[:1]))
769
+
770
+ bboxes = model_outputs["bboxes"]
771
+ visualization_data = model_outputs.get("visualization_data", {})
772
+ visualizations: Dict[str, Dict[str, Any]] = {}
773
+ debug_visualizations = kwargs.get("debug_visualizations")
774
+ if debug_visualizations is None:
775
+ debug_visualizations = self.debug_visualizations
776
+
777
+ vine_frame_sets = render_vine_frame_sets(
778
+ frames_np,
779
+ bboxes,
780
+ cat_label_lookup,
781
+ unary_lookup,
782
+ binary_lookup,
783
+ visualization_data.get("sam_masks"),
784
+ )
785
+
786
+ vine_visuals: Dict[str, Dict[str, Any]] = {}
787
+ final_frames = vine_frame_sets.get("all", [])
788
+ if final_frames:
789
+ final_entry: Dict[str, Any] = {"frames": final_frames, "video_path": None}
790
+ final_dir = self._prepare_visualization_dir(
791
+ "all", enabled=self.visualize
792
+ )
793
+ final_entry["video_path"] = self._create_temp_video(
794
+ np.stack(final_frames, axis=0),
795
+ base_dir=final_dir,
796
+ prefix="all_visualization",
797
+ )
798
+ vine_visuals["all"] = final_entry
799
+
800
+ if debug_visualizations:
801
+ sam_masks = visualization_data.get("sam_masks")
802
+ if sam_masks:
803
+ sam_frames = render_sam_frames(
804
+ frames_np, sam_masks, visualization_data.get("dino_labels")
805
+ )
806
+ sam_entry = {"frames": sam_frames, "video_path": None}
807
+ if sam_frames:
808
+ sam_dir = self._prepare_visualization_dir(
809
+ "sam", enabled=self.visualize
810
+ )
811
+ sam_entry["video_path"] = self._create_temp_video(
812
+ np.stack(sam_frames, axis=0),
813
+ base_dir=sam_dir,
814
+ prefix="sam_visualization",
815
+ )
816
+ visualizations["sam"] = sam_entry
817
+
818
+ dino_labels = visualization_data.get("dino_labels")
819
+ if dino_labels:
820
+ dino_frames = render_dino_frames(frames_np, bboxes, dino_labels)
821
+ dino_entry = {"frames": dino_frames, "video_path": None}
822
+ if dino_frames:
823
+ dino_dir = self._prepare_visualization_dir(
824
+ "dino", enabled=self.visualize
825
+ )
826
+ dino_entry["video_path"] = self._create_temp_video(
827
+ np.stack(dino_frames, axis=0),
828
+ base_dir=dino_dir,
829
+ prefix="dino_visualization",
830
+ )
831
+ visualizations["dino"] = dino_entry
832
+
833
+ for name in ("object", "unary", "binary"):
834
+ frames_list = vine_frame_sets.get(name, [])
835
+ entry: Dict[str, Any] = {"frames": frames_list, "video_path": None}
836
+ if frames_list:
837
+ vine_dir = self._prepare_visualization_dir(
838
+ name, enabled=self.visualize
839
+ )
840
+ entry["video_path"] = self._create_temp_video(
841
+ np.stack(frames_list, axis=0),
842
+ base_dir=vine_dir,
843
+ prefix=f"{name}_visualization",
844
+ )
845
+ vine_visuals[name] = entry
846
+
847
+ if vine_visuals:
848
+ visualizations["vine"] = vine_visuals
849
+
850
+ if visualizations:
851
+ results["visualizations"] = visualizations
852
+
853
+ return results
854
+
855
+ # ------------------------------------------------------------------ #
856
+ # Summary JSON
857
+ # ------------------------------------------------------------------ #
858
+ def _generate_summary(self, model_outputs: Dict[str, Any]) -> Dict[str, Any]:
859
+ """
860
+ Per-object summary:
861
+ {
862
+ "num_objects_detected": N,
863
+ "objects": {
864
+ "<obj_id>": {
865
+ "top_categories": [{"label": str, "probability": float}, ...],
866
+ "top_unary": [{"frame_id": int, "predicate": str, "probability": float}, ...],
867
+ }
868
+ }
869
+ }
870
+ """
871
+ categorical_preds = model_outputs.get("categorical_predictions", {})
872
+ unary_preds = model_outputs.get("unary_predictions", {})
873
+
874
+ unary_by_obj: Dict[int, List[Tuple[float, str, int]]] = {}
875
+ for (frame_id, obj_id), preds in unary_preds.items():
876
+ for prob, predicate in preds:
877
+ prob_val = (
878
+ float(prob.detach().cpu()) if torch.is_tensor(prob) else float(prob)
879
+ )
880
+ unary_by_obj.setdefault(obj_id, []).append((prob_val, predicate, frame_id))
881
+
882
+ objects_summary: Dict[str, Dict[str, Any]] = {}
883
+ all_obj_ids = set(categorical_preds.keys()) | set(unary_by_obj.keys())
884
+
885
+ for obj_id in sorted(all_obj_ids):
886
+ cat_list = categorical_preds.get(obj_id, [])
887
+ cat_sorted = sorted(
888
+ [
889
+ (
890
+ float(p.detach().cpu()) if torch.is_tensor(p) else float(p),
891
+ label,
892
+ )
893
+ for p, label in cat_list
894
+ ],
895
+ key=lambda x: x[0],
896
+ reverse=True,
897
+ )[:3]
898
+
899
+ top_categories = [
900
+ {"label": label, "probability": prob} for prob, label in cat_sorted
901
+ ]
902
+
903
+ unary_list = unary_by_obj.get(obj_id, [])
904
+ unary_sorted = sorted(unary_list, key=lambda x: x[0], reverse=True)[:3]
905
+ top_unary = [
906
+ {
907
+ "frame_id": int(frame_id),
908
+ "predicate": predicate,
909
+ "probability": prob,
910
+ }
911
+ for (prob, predicate, frame_id) in unary_sorted
912
+ ]
913
+
914
+ objects_summary[str(obj_id)] = {
915
+ "top_categories": top_categories,
916
+ "top_unary": top_unary,
917
+ }
918
+
919
+ summary = {
920
+ "num_objects_detected": len(objects_summary),
921
+ "objects": objects_summary,
922
+ }
923
+ return summary
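For reference, here is a minimal sketch of how the summary returned above might be consumed downstream; the payload is a hypothetical example that only mirrors the schema documented in the docstring, not real model output.

# Hypothetical example mirroring the documented summary schema.
summary = {
    "num_objects_detected": 1,
    "objects": {
        "0": {
            "top_categories": [{"label": "person", "probability": 0.92}],
            "top_unary": [{"frame_id": 3, "predicate": "standing", "probability": 0.81}],
        }
    },
}
for obj_id, info in summary["objects"].items():
    if info["top_categories"]:
        best = info["top_categories"][0]
        print(f"object {obj_id}: {best['label']} ({best['probability']:.2f})")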
vine_hf/vis_utils.py ADDED
@@ -0,0 +1,941 @@
1
+ import os
2
+ import cv2
3
+ import numpy as np
4
+ import matplotlib.pyplot as plt
5
+ import torch
6
+ import random
7
+ import math
8
+ from matplotlib.patches import Rectangle
9
+ import itertools
10
+ from typing import Any, Dict, List, Tuple, Optional, Union
11
+
12
+ from laser.preprocess.mask_generation_grounding_dino import mask_to_bbox
13
+
14
+ ########################################################################################
15
+ ########## Visualization Library ########
16
+ ########################################################################################
17
+ # This module renders SAM masks, GroundingDINO boxes, and VINE predictions.
18
+ #
19
+ # Conventions (RGB frames, pixel coords):
20
+ # - Frames: list[np.ndarray] with shape (H, W, 3) in RGB, or np.ndarray with shape (T, H, W, 3).
21
+ # - Masks: 2D boolean arrays (H, W) or tensors convertible to that; (H, W, 1) is also accepted.
22
+ # - BBoxes: (x1, y1, x2, y2) integer pixel coordinates with x2 > x1 and y2 > y1.
23
+ #
24
+ # Per-frame stores use one of:
25
+ # - Dict[int(frame_id) -> Dict[int(obj_id) -> value]]
26
+ # - List indexed by frame_id (each item may be a dict of obj_id->value or a list in order)
27
+ #
28
+ # Renderer inputs/outputs:
29
+ # 1) render_sam_frames(frames, sam_masks, dino_labels=None) -> List[np.ndarray]
30
+ # - sam_masks: Dict[frame_id, Dict[obj_id, Mask]] or a list; Mask can be np.ndarray or torch.Tensor.
31
+ # - dino_labels: Optional Dict[obj_id, str] to annotate boxes derived from masks.
32
+ #
33
+ # 2) render_dino_frames(frames, bboxes, dino_labels=None) -> List[np.ndarray]
34
+ # - bboxes: Dict[frame_id, Dict[obj_id, Sequence[float]]] or a list; each bbox as [x1, y1, x2, y2].
35
+ #
36
+ # 3) render_vine_frames(frames, bboxes, cat_label_lookup, unary_lookup, binary_lookup, masks=None)
37
+ # -> List[np.ndarray] (the "all" view)
38
+ # - cat_label_lookup: Dict[obj_id, (label: str, prob: float)]
39
+ # - unary_lookup: Dict[frame_id, Dict[obj_id, List[(prob: float, label: str)]]]
40
+ # - binary_lookup: Dict[frame_id, List[((sub_id: int, obj_id: int), List[(prob: float, relation: str)])]]
41
+ # - masks: Optional; same structure as sam_masks, used for translucent overlays when unary labels exist.
42
+ #
43
+ # Ground-truth helpers used by plotting utilities:
44
+ # - For a single frame, gt_relations is represented as List[(subject_label, object_label, relation_label)].
45
+ #
46
+ # All rendered frames returned by functions are RGB np.ndarray images suitable for saving or video writing.
47
+ ########################################################################################
48
+
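As a reading aid, a minimal usage sketch of the renderers documented above; the frames, boxes, and prediction lookups are tiny synthetic placeholders rather than real pipeline outputs.

import numpy as np

frames = [np.zeros((64, 64, 3), dtype=np.uint8) for _ in range(2)]   # RGB placeholders
bboxes = {0: {0: [8, 8, 40, 40]}, 1: {0: [10, 10, 42, 42]}}          # frame -> obj -> (x1, y1, x2, y2)
cat_label_lookup = {0: ("cat", 0.90)}
unary_lookup = {0: {0: [(0.80, "sitting")]}}
binary_lookup = {}                                                   # no relations in this toy example

frame_sets = render_vine_frame_sets(frames, bboxes, cat_label_lookup, unary_lookup, binary_lookup)
dino_frames = render_dino_frames(frames, bboxes, dino_labels={0: "cat"})
# Every rendered frame is an RGB np.ndarray with the same shape as its input frame.
assert frame_sets["all"][0].shape == frames[0].shape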
49
+ def clean_label(label):
50
+ """Replace underscores and slashes with spaces for uniformity."""
51
+ return label.replace("_", " ").replace("/", " ")
52
+
53
+ # NOTE: this formatting step arguably belongs in the model's post-processing rather than in the visualization utilities.
54
+ def format_cate_preds(cate_preds):
55
+ # Group object predictions from the model output.
56
+ obj_pred_dict = {}
57
+ for (oid, label), prob in cate_preds.items():
58
+ # Clean the predicted label as well.
59
+ clean_pred = clean_label(label)
60
+ if oid not in obj_pred_dict:
61
+ obj_pred_dict[oid] = []
62
+ obj_pred_dict[oid].append((clean_pred, prob))
63
+ for oid in obj_pred_dict:
64
+ obj_pred_dict[oid].sort(key=lambda x: x[1], reverse=True)
65
+ return obj_pred_dict
66
+
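A small illustration of the grouping this helper performs, using made-up object ids and labels:

cate_preds = {(0, "dog"): 0.7, (0, "cat"): 0.2, (1, "person"): 0.9}   # (obj_id, label) -> prob
grouped = format_cate_preds(cate_preds)
# -> {0: [("dog", 0.7), ("cat", 0.2)], 1: [("person", 0.9)]}, each list sorted by probability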
67
+ def format_binary_cate_preds(binary_preds):
68
+ frame_binary_preds = []
69
+ for key, score in binary_preds.items():
70
+ # Expect key format: (frame_id, (subject, object), predicted_relation)
71
+ try:
72
+ f_id, (subj, obj), pred_rel = key
73
+ frame_binary_preds.append((f_id, subj, obj, pred_rel, score))
74
+ except Exception:  # key did not match the expected shape
75
+ print("Skipping key with unexpected format:", key)
76
+ continue
77
+ frame_binary_preds.sort(key=lambda x: x[4], reverse=True)  # sort by confidence score, highest first
78
+ return frame_binary_preds
79
+
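And the key format this helper expects, per the comment above, with illustrative ids, relations, and scores:

binary_preds = {          # (frame_id, (subject, object), relation) -> score
    (0, (1, 2), "holding"): 0.74,
    (0, (2, 3), "next_to"): 0.31,
}
rows = format_binary_cate_preds(binary_preds)
# -> [(0, 1, 2, "holding", 0.74), (0, 2, 3, "next_to", 0.31)], highest score first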
80
+ _FONT = cv2.FONT_HERSHEY_SIMPLEX
81
+
82
+
83
+ def _to_numpy_mask(mask: Union[np.ndarray, torch.Tensor, None]) -> Optional[np.ndarray]:
84
+ if mask is None:
85
+ return None
86
+ if isinstance(mask, torch.Tensor):
87
+ mask_np = mask.detach().cpu().numpy()
88
+ else:
89
+ mask_np = np.asarray(mask)
90
+ if mask_np.ndim == 0:
91
+ return None
92
+ if mask_np.ndim == 3:
93
+ mask_np = np.squeeze(mask_np)
94
+ if mask_np.ndim != 2:
95
+ return None
96
+ if mask_np.dtype == bool:
97
+ return mask_np
98
+ return mask_np > 0
99
+
100
+
101
+ def _sanitize_bbox(bbox: Union[List[float], Tuple[float, ...], None], width: int, height: int) -> Optional[Tuple[int, int, int, int]]:
102
+ if bbox is None:
103
+ return None
104
+ if isinstance(bbox, (list, tuple)) and len(bbox) >= 4:
105
+ x1, y1, x2, y2 = [float(b) for b in bbox[:4]]
106
+ elif isinstance(bbox, np.ndarray) and bbox.size >= 4:
107
+ x1, y1, x2, y2 = [float(b) for b in bbox.flat[:4]]
108
+ else:
109
+ return None
110
+ x1 = int(np.clip(round(x1), 0, width - 1))
111
+ y1 = int(np.clip(round(y1), 0, height - 1))
112
+ x2 = int(np.clip(round(x2), 0, width - 1))
113
+ y2 = int(np.clip(round(y2), 0, height - 1))
114
+ if x2 <= x1 or y2 <= y1:
115
+ return None
116
+ return (x1, y1, x2, y2)
117
+
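For clarity, the clipping and rejection behaviour of the helper above (illustrative values):

# Out-of-frame coordinates are clipped; degenerate boxes are rejected.
assert _sanitize_bbox([-5, 2, 700, 50], width=640, height=480) == (0, 2, 639, 50)
assert _sanitize_bbox([10, 10, 10, 40], width=640, height=480) is None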
118
+
119
+ def _object_color_bgr(obj_id: int) -> Tuple[int, int, int]:
120
+ color = get_color(obj_id)
121
+ rgb = [int(np.clip(c, 0.0, 1.0) * 255) for c in color[:3]]
122
+ return (rgb[2], rgb[1], rgb[0])
123
+
124
+
125
+ def _background_color(color: Tuple[int, int, int]) -> Tuple[int, int, int]:
126
+ return tuple(int(0.25 * 255 + 0.75 * channel) for channel in color)
127
+
128
+
129
+ def _draw_label_block(
130
+ image: np.ndarray,
131
+ lines: List[str],
132
+ anchor: Tuple[int, int],
133
+ color: Tuple[int, int, int],
134
+ font_scale: float = 0.5,
135
+ thickness: int = 1,
136
+ direction: str = "up",
137
+ ) -> None:
138
+ if not lines:
139
+ return
140
+ img_h, img_w = image.shape[:2]
141
+ x, y = anchor
142
+ x = int(np.clip(x, 0, img_w - 1))
143
+ y_cursor = int(np.clip(y, 0, img_h - 1))
144
+ bg_color = _background_color(color)
145
+
146
+ if direction == "down":
147
+ for text in lines:
148
+ text = str(text)
149
+ (tw, th), baseline = cv2.getTextSize(text, _FONT, font_scale, thickness)
150
+ left_x = x
151
+ right_x = min(left_x + tw + 8, img_w - 1)
152
+ top_y = int(np.clip(y_cursor + 6, 0, img_h - 1))
153
+ bottom_y = int(np.clip(top_y + th + baseline + 6, 0, img_h - 1))
154
+ if bottom_y <= top_y:
155
+ break
156
+ cv2.rectangle(image, (left_x, top_y), (right_x, bottom_y), bg_color, -1)
157
+ text_x = left_x + 4
158
+ text_y = min(bottom_y - baseline - 2, img_h - 1)
159
+ cv2.putText(image, text, (text_x, text_y), _FONT, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)
160
+ y_cursor = bottom_y
161
+ else:
162
+ for text in lines:
163
+ text = str(text)
164
+ (tw, th), baseline = cv2.getTextSize(text, _FONT, font_scale, thickness)
165
+ top_y = max(y_cursor - th - baseline - 6, 0)
166
+ left_x = x
167
+ right_x = min(left_x + tw + 8, img_w - 1)
168
+ bottom_y = min(top_y + th + baseline + 6, img_h - 1)
169
+ cv2.rectangle(image, (left_x, top_y), (right_x, bottom_y), bg_color, -1)
170
+ text_x = left_x + 4
171
+ text_y = min(bottom_y - baseline - 2, img_h - 1)
172
+ cv2.putText(image, text, (text_x, text_y), _FONT, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)
173
+ y_cursor = top_y
174
+
175
+
176
+ def _draw_centered_label(
177
+ image: np.ndarray,
178
+ text: str,
179
+ center: Tuple[int, int],
180
+ color: Tuple[int, int, int],
181
+ font_scale: float = 0.5,
182
+ thickness: int = 1,
183
+ ) -> None:
184
+ text = str(text)
185
+ img_h, img_w = image.shape[:2]
186
+ (tw, th), baseline = cv2.getTextSize(text, _FONT, font_scale, thickness)
187
+ cx = int(np.clip(center[0], 0, img_w - 1))
188
+ cy = int(np.clip(center[1], 0, img_h - 1))
189
+ left_x = int(np.clip(cx - tw // 2 - 4, 0, img_w - 1))
190
+ top_y = int(np.clip(cy - th // 2 - baseline - 4, 0, img_h - 1))
191
+ right_x = int(np.clip(left_x + tw + 8, 0, img_w - 1))
192
+ bottom_y = int(np.clip(top_y + th + baseline + 6, 0, img_h - 1))
193
+ cv2.rectangle(image, (left_x, top_y), (right_x, bottom_y), _background_color(color), -1)
194
+ text_x = left_x + 4
195
+ text_y = min(bottom_y - baseline - 2, img_h - 1)
196
+ cv2.putText(image, text, (text_x, text_y), _FONT, font_scale, (0, 0, 0), thickness, cv2.LINE_AA)
197
+
198
+
199
+ def _extract_frame_entities(store: Union[Dict[int, Dict[int, Any]], List, None], frame_idx: int) -> Dict[int, Any]:
200
+ if isinstance(store, dict):
201
+ frame_entry = store.get(frame_idx, {})
202
+ elif isinstance(store, list) and 0 <= frame_idx < len(store):
203
+ frame_entry = store[frame_idx]
204
+ else:
205
+ frame_entry = {}
206
+ if isinstance(frame_entry, dict):
207
+ return frame_entry
208
+ if isinstance(frame_entry, list):
209
+ return {i: value for i, value in enumerate(frame_entry)}
210
+ return {}
211
+
212
+
213
+ def _label_anchor_and_direction(
214
+ bbox: Tuple[int, int, int, int],
215
+ position: str,
216
+ ) -> Tuple[Tuple[int, int], str]:
217
+ x1, y1, x2, y2 = bbox
218
+ if position == "bottom":
219
+ return (x1, y2), "down"
220
+ return (x1, y1), "up"
221
+
222
+
223
+ def _draw_bbox_with_label(
224
+ image: np.ndarray,
225
+ bbox: Tuple[int, int, int, int],
226
+ obj_id: int,
227
+ title: Optional[str] = None,
228
+ sub_lines: Optional[List[str]] = None,
229
+ label_position: str = "top",
230
+ ) -> None:
231
+ color = _object_color_bgr(obj_id)
232
+ cv2.rectangle(image, (bbox[0], bbox[1]), (bbox[2], bbox[3]), color, 2)
233
+ head = title if title else f"#{obj_id}"
234
+ if not head.startswith("#"):
235
+ head = f"#{obj_id} {head}"
236
+ lines = [head]
237
+ if sub_lines:
238
+ lines.extend(sub_lines)
239
+ anchor, direction = _label_anchor_and_direction(bbox, label_position)
240
+ _draw_label_block(image, lines, anchor, color, direction=direction)
241
+
242
+
243
+ def render_sam_frames(
244
+ frames: Union[np.ndarray, List[np.ndarray]],
245
+ sam_masks: Union[Dict[int, Dict[int, Union[np.ndarray, torch.Tensor]]], List, None],
246
+ dino_labels: Optional[Dict[int, str]] = None,
247
+ ) -> List[np.ndarray]:
248
+ results: List[np.ndarray] = []
249
+ frames_iterable = frames if isinstance(frames, list) else list(frames)
250
+ dino_labels = dino_labels or {}
251
+
252
+ for frame_idx, frame in enumerate(frames_iterable):
253
+ if frame is None:
254
+ continue
255
+ frame_rgb = np.asarray(frame)
256
+ frame_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
257
+ overlay = frame_bgr.astype(np.float32)
258
+ masks_for_frame = _extract_frame_entities(sam_masks, frame_idx)
259
+
260
+ for obj_id, mask in masks_for_frame.items():
261
+ mask_np = _to_numpy_mask(mask)
262
+ if mask_np is None or not np.any(mask_np):
263
+ continue
264
+ color = _object_color_bgr(obj_id)
265
+ alpha = 0.45
266
+ overlay[mask_np] = (1.0 - alpha) * overlay[mask_np] + alpha * np.array(color, dtype=np.float32)
267
+
268
+ annotated = np.clip(overlay, 0, 255).astype(np.uint8)
269
+ frame_h, frame_w = annotated.shape[:2]
270
+
271
+ for obj_id, mask in masks_for_frame.items():
272
+ mask_np = _to_numpy_mask(mask)
273
+ if mask_np is None or not np.any(mask_np):
274
+ continue
275
+ bbox = mask_to_bbox(mask_np)
276
+ bbox = _sanitize_bbox(bbox, frame_w, frame_h)
277
+ if not bbox:
278
+ continue
279
+ label = dino_labels.get(obj_id)
280
+ title = f"{label}" if label else None
281
+ _draw_bbox_with_label(annotated, bbox, obj_id, title=title)
282
+
283
+ results.append(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
284
+
285
+ return results
286
+
287
+
288
+ def render_dino_frames(
289
+ frames: Union[np.ndarray, List[np.ndarray]],
290
+ bboxes: Union[Dict[int, Dict[int, Union[List[float], np.ndarray]]], List, None],
291
+ dino_labels: Optional[Dict[int, str]] = None,
292
+ ) -> List[np.ndarray]:
293
+ results: List[np.ndarray] = []
294
+ frames_iterable = frames if isinstance(frames, list) else list(frames)
295
+ dino_labels = dino_labels or {}
296
+
297
+ for frame_idx, frame in enumerate(frames_iterable):
298
+ if frame is None:
299
+ continue
300
+ frame_rgb = np.asarray(frame)
301
+ annotated = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
302
+ frame_h, frame_w = annotated.shape[:2]
303
+ frame_bboxes = _extract_frame_entities(bboxes, frame_idx)
304
+
305
+ for obj_id, bbox_values in frame_bboxes.items():
306
+ bbox = _sanitize_bbox(bbox_values, frame_w, frame_h)
307
+ if not bbox:
308
+ continue
309
+ label = dino_labels.get(obj_id)
310
+ title = f"{label}" if label else None
311
+ _draw_bbox_with_label(annotated, bbox, obj_id, title=title)
312
+
313
+ results.append(cv2.cvtColor(annotated, cv2.COLOR_BGR2RGB))
314
+
315
+ return results
316
+
317
+
318
+ def render_vine_frame_sets(
319
+ frames: Union[np.ndarray, List[np.ndarray]],
320
+ bboxes: Union[Dict[int, Dict[int, Union[List[float], np.ndarray]]], List, None],
321
+ cat_label_lookup: Dict[int, Tuple[str, float]],
322
+ unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]],
323
+ binary_lookup: Dict[int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]],
324
+ masks: Union[Dict[int, Dict[int, Union[np.ndarray, torch.Tensor]]], List, None] = None,
325
+ ) -> Dict[str, List[np.ndarray]]:
326
+ frame_groups: Dict[str, List[np.ndarray]] = {
327
+ "object": [],
328
+ "unary": [],
329
+ "binary": [],
330
+ "all": [],
331
+ }
332
+ frames_iterable = frames if isinstance(frames, list) else list(frames)
333
+
334
+ for frame_idx, frame in enumerate(frames_iterable):
335
+ if frame is None:
336
+ continue
337
+ frame_rgb = np.asarray(frame)
338
+ base_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
339
+ frame_h, frame_w = base_bgr.shape[:2]
340
+ frame_bboxes = _extract_frame_entities(bboxes, frame_idx)
341
+ frame_masks = _extract_frame_entities(masks, frame_idx) if masks is not None else {}
342
+
343
+ objects_bgr = base_bgr.copy()
344
+ unary_bgr = base_bgr.copy()
345
+ binary_bgr = base_bgr.copy()
346
+ all_bgr = base_bgr.copy()
347
+
348
+ bbox_lookup: Dict[int, Tuple[int, int, int, int]] = {}
349
+ unary_lines_lookup: Dict[int, List[str]] = {}
350
+ titles_lookup: Dict[int, Optional[str]] = {}
351
+
352
+ for obj_id, bbox_values in frame_bboxes.items():
353
+ bbox = _sanitize_bbox(bbox_values, frame_w, frame_h)
354
+ if not bbox:
355
+ continue
356
+ bbox_lookup[obj_id] = bbox
357
+ cat_label, cat_prob = cat_label_lookup.get(obj_id, (None, None))
358
+ title_parts = []
359
+ if cat_label:
360
+ if cat_prob is not None:
361
+ title_parts.append(f"{cat_label} {cat_prob:.2f}")
362
+ else:
363
+ title_parts.append(cat_label)
364
+ titles_lookup[obj_id] = " ".join(title_parts) if title_parts else None
365
+ unary_preds = unary_lookup.get(frame_idx, {}).get(obj_id, [])
366
+ unary_lines = [f"{label} {prob:.2f}" for prob, label in unary_preds]
367
+ unary_lines_lookup[obj_id] = unary_lines
368
+
369
+ for obj_id, bbox in bbox_lookup.items():
370
+ unary_lines = unary_lines_lookup.get(obj_id, [])
371
+ if not unary_lines:
372
+ continue
373
+ mask_raw = frame_masks.get(obj_id)
374
+ mask_np = _to_numpy_mask(mask_raw)
375
+ if mask_np is None or not np.any(mask_np):
376
+ continue
377
+ color = np.array(_object_color_bgr(obj_id), dtype=np.float32)
378
+ alpha = 0.45
379
+ for target in (unary_bgr, all_bgr):
380
+ target_vals = target[mask_np].astype(np.float32)
381
+ blended = (1.0 - alpha) * target_vals + alpha * color
382
+ target[mask_np] = np.clip(blended, 0, 255).astype(np.uint8)
383
+
384
+ for obj_id, bbox in bbox_lookup.items():
385
+ title = titles_lookup.get(obj_id)
386
+ unary_lines = unary_lines_lookup.get(obj_id, [])
387
+ _draw_bbox_with_label(objects_bgr, bbox, obj_id, title=title, label_position="top")
388
+ _draw_bbox_with_label(unary_bgr, bbox, obj_id, title=title, label_position="top")
389
+ if unary_lines:
390
+ anchor, direction = _label_anchor_and_direction(bbox, "bottom")
391
+ _draw_label_block(unary_bgr, unary_lines, anchor, _object_color_bgr(obj_id), direction=direction)
392
+ _draw_bbox_with_label(binary_bgr, bbox, obj_id, title=title, label_position="top")
393
+ _draw_bbox_with_label(all_bgr, bbox, obj_id, title=title, label_position="top")
394
+ if unary_lines:
395
+ anchor, direction = _label_anchor_and_direction(bbox, "bottom")
396
+ _draw_label_block(all_bgr, unary_lines, anchor, _object_color_bgr(obj_id), direction=direction)
397
+
398
+ for obj_pair, relation_preds in binary_lookup.get(frame_idx, []):
399
+ if len(obj_pair) != 2 or not relation_preds:
400
+ continue
401
+ subj_id, obj_id = obj_pair
402
+ subj_bbox = bbox_lookup.get(subj_id)
403
+ obj_bbox = bbox_lookup.get(obj_id)
404
+ if not subj_bbox or not obj_bbox:
405
+ continue
406
+ start, end = relation_line(subj_bbox, obj_bbox)
407
+ color = tuple(int(c) for c in np.clip(
408
+ (np.array(_object_color_bgr(subj_id), dtype=np.float32) +
409
+ np.array(_object_color_bgr(obj_id), dtype=np.float32)) / 2.0,
410
+ 0, 255
411
+ ))
412
+ prob, relation = relation_preds[0]
413
+ label_text = f"{relation} {prob:.2f}"
414
+ mid_point = (int((start[0] + end[0]) / 2), int((start[1] + end[1]) / 2))
415
+ cv2.line(binary_bgr, start, end, color, 6, cv2.LINE_AA)
416
+ cv2.line(all_bgr, start, end, color, 6, cv2.LINE_AA)
417
+ _draw_centered_label(binary_bgr, label_text, mid_point, color)
418
+ _draw_centered_label(all_bgr, label_text, mid_point, color)
419
+
420
+ frame_groups["object"].append(cv2.cvtColor(objects_bgr, cv2.COLOR_BGR2RGB))
421
+ frame_groups["unary"].append(cv2.cvtColor(unary_bgr, cv2.COLOR_BGR2RGB))
422
+ frame_groups["binary"].append(cv2.cvtColor(binary_bgr, cv2.COLOR_BGR2RGB))
423
+ frame_groups["all"].append(cv2.cvtColor(all_bgr, cv2.COLOR_BGR2RGB))
424
+
425
+ return frame_groups
426
+
427
+
428
+ def render_vine_frames(
429
+ frames: Union[np.ndarray, List[np.ndarray]],
430
+ bboxes: Union[Dict[int, Dict[int, Union[List[float], np.ndarray]]], List, None],
431
+ cat_label_lookup: Dict[int, Tuple[str, float]],
432
+ unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]],
433
+ binary_lookup: Dict[int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]],
434
+ masks: Union[Dict[int, Dict[int, Union[np.ndarray, torch.Tensor]]], List, None] = None,
435
+ ) -> List[np.ndarray]:
436
+ return render_vine_frame_sets(
437
+ frames,
438
+ bboxes,
439
+ cat_label_lookup,
440
+ unary_lookup,
441
+ binary_lookup,
442
+ masks,
443
+ ).get("all", [])
444
+
445
+ def color_for_cate_correctness(obj_pred_dict, gt_labels, topk_object):
446
+ all_colors = []
447
+ all_texts = []
448
+ for (obj_id, bbox, gt_label) in gt_labels:
449
+ preds = obj_pred_dict.get(obj_id, [])
450
+ if len(preds) == 0:
451
+ top1 = "N/A"
452
+ box_color = (0, 0, 255) # bright red if no prediction
453
+ else:
454
+ top1, prob1 = preds[0]
455
+ topk_labels = [p[0] for p in preds[:topk_object]]
456
+ # Compare cleaned labels.
457
+ if top1.lower() == gt_label.lower():
458
+ box_color = (0, 255, 0) # bright green for correct
459
+ elif gt_label.lower() in [p.lower() for p in topk_labels]:
460
+ box_color = (0, 165, 255) # bright orange for partial match
461
+ else:
462
+ box_color = (0, 0, 255) # bright red for incorrect
463
+
464
+ label_text = f"ID:{obj_id}/P:{top1}/GT:{gt_label}"
465
+ all_colors.append(box_color)
466
+ all_texts.append(label_text)
467
+ return all_colors, all_texts
468
+
469
+ def plot_unary(frame_img, gt_labels, all_colors, all_texts):
470
+
471
+ for (obj_id, bbox, gt_label), box_color, label_text in zip(gt_labels, all_colors, all_texts):
472
+ x1, y1, x2, y2 = map(int, bbox)
473
+ cv2.rectangle(frame_img, (x1, y1), (x2, y2), color=box_color, thickness=2)
474
+ (tw, th), baseline = cv2.getTextSize(label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1)
475
+ cv2.rectangle(frame_img, (x1, y1 - th - baseline - 4), (x1 + tw, y1), box_color, -1)
476
+ cv2.putText(frame_img, label_text, (x1, y1 - 2), cv2.FONT_HERSHEY_SIMPLEX,
477
+ 0.5, (0, 0, 0), 1, cv2.LINE_AA)
478
+
479
+ return frame_img
480
+
481
+ def get_white_pane(pane_height,
482
+ pane_width=600,
483
+ header_height = 50,
484
+ header_font = cv2.FONT_HERSHEY_SIMPLEX,
485
+ header_font_scale = 0.7,
486
+ header_thickness = 2,
487
+ header_color = (0, 0, 0)):
488
+ # Create an expanded white pane to display text info.
489
+ white_pane = 255 * np.ones((pane_height, pane_width, 3), dtype=np.uint8)
490
+
491
+ # --- Adjust pane split: make predictions column wider (60% vs. 40%) ---
492
+ left_width = int(pane_width * 0.6)
493
+ right_width = pane_width - left_width
494
+ # Draw the headers directly onto white_pane so they survive the return value
495
+ # (drawing on .copy() slices would discard the text).
496
+
497
+ cv2.putText(white_pane, "Binary Predictions", (10, header_height - 30),
498
+ header_font, header_font_scale, header_color, header_thickness, cv2.LINE_AA)
499
+ cv2.putText(white_pane, "Ground Truth", (left_width + 10, header_height - 30),
500
+ header_font, header_font_scale, header_color, header_thickness, cv2.LINE_AA)
501
+
502
+ return white_pane
503
+
504
+ # This is for plotting binary prediction results with frame-based scene graphs
505
+ def plot_binary_sg(frame_img,
506
+ white_pane,
507
+ bin_preds,
508
+ gt_relations,
509
+ topk_binary,
510
+ header_height=50,
511
+ indicator_size=20,
512
+ pane_width=600):
513
+ # Leave vertical space for the headers.
514
+ line_height = 30 # vertical spacing per line
515
+ x_text = 10 # left margin for text
516
+ y_text_left = header_height + 10 # starting y for left pane text
517
+ y_text_right = header_height + 10 # starting y for right pane text
518
+
519
+ # Left section: top-k binary predictions.
520
+ left_width = int(pane_width * 0.6)
521
+ right_width = pane_width - left_width
522
+ left_pane = white_pane[:, :left_width, :].copy()
523
+ right_pane = white_pane[:, left_width:, :].copy()
524
+
525
+ for (_frame_id, subj, obj, pred_rel, score) in bin_preds[:topk_binary]:  # tuples from format_binary_cate_preds
526
+ correct = any((subj == gt[0] and pred_rel.lower() == gt[2].lower() and obj == gt[1])
527
+ for gt in gt_relations)
528
+ indicator_color = (0, 255, 0) if correct else (0, 0, 255)
529
+ cv2.rectangle(left_pane, (x_text, y_text_left - indicator_size + 5),
530
+ (x_text + indicator_size, y_text_left + 5), indicator_color, -1)
531
+ text = f"{subj} - {pred_rel} - {obj} :: {score:.2f}"
532
+ cv2.putText(left_pane, text, (x_text + indicator_size + 5, y_text_left + 5),
533
+ cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 1, cv2.LINE_AA)
534
+ y_text_left += line_height
535
+
536
+ # Right section: ground truth binary relations.
537
+ for gt in gt_relations:
538
+ if len(gt) != 3:
539
+ continue
540
+ text = f"{gt[0]} - {gt[2]} - {gt[1]}"
541
+ cv2.putText(right_pane, text, (x_text, y_text_right + 5),
542
+ cv2.FONT_HERSHEY_SIMPLEX, 0.6, (0, 0, 0), 1, cv2.LINE_AA)
543
+ y_text_right += line_height
544
+
545
+ # Combine the two text panes and then with the frame image.
546
+ combined_pane = np.hstack((left_pane, right_pane))
547
+ combined_image = np.hstack((frame_img, combined_pane))
548
+ return combined_image
549
+
550
+ def visualized_frame(frame_img,
551
+ bboxes,
552
+ object_ids,
553
+ gt_labels,
554
+ cate_preds,
555
+ binary_preds,
556
+ gt_relations,
557
+ topk_object,
558
+ topk_binary,
559
+ phase="unary"):
560
+
561
+ """Return the combined annotated frame for frame index i as an image (in BGR)."""
562
+ # Get the frame image (assuming batched_data['batched_reshaped_raw_videos'] is a list of frames)
563
+
564
+ # --- Process Object Predictions (for overlaying bboxes) ---
565
+ if phase == "unary":
566
+ objs = []
567
+ for ((_, f_id, obj_id), bbox, gt_label) in zip(object_ids, bboxes, gt_labels):
568
+ gt_label = clean_label(gt_label)
569
+ objs.append((obj_id, bbox, gt_label))
570
+
571
+ formatted_cate_preds = format_cate_preds(cate_preds)
572
+ all_colors, all_texts = color_for_cate_correctness(formatted_cate_preds, objs, topk_object)
573
+ updated_frame_img = plot_unary(frame_img, objs, all_colors, all_texts)
574
+ return updated_frame_img
575
+
576
+ else:
577
+ # --- Process Binary Predictions & Ground Truth for the Text Pane ---
578
+ formatted_binary_preds = format_binary_cate_preds(binary_preds)
579
+
580
+ # Ground truth binary relations for the frame.
581
+ # Clean ground truth relations.
582
+ gt_relations = [(clean_label(str(s)), clean_label(str(o)), clean_label(rel)) for s, o, rel in gt_relations]
583
+
584
+ pane_width = 600 # increased pane width for more horizontal space
585
+ pane_height = frame_img.shape[0]
586
+
587
+ # --- Add header labels to each text pane with extra space ---
588
+ header_height = 50 # increased header space
589
+ white_pane = get_white_pane(pane_height, pane_width, header_height=header_height)
590
+
591
+ combined_image = plot_binary_sg(frame_img, white_pane, formatted_binary_preds, gt_relations, topk_binary)
592
+
593
+ return combined_image
594
+
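A hedged sketch of calling visualized_frame for a single frame in the unary phase; the object ids, ground-truth labels, and predictions below are placeholders chosen only to satisfy the expected shapes.

frame = np.zeros((240, 320, 3), dtype=np.uint8)      # BGR frame placeholder
object_ids = [("video_0", 0, 7)]                     # (video_id, frame_id, obj_id)
frame_bboxes = [[20, 20, 120, 180]]
gt_labels = ["person"]
cate_preds = {(7, "person"): 0.88, (7, "dog"): 0.07}

annotated = visualized_frame(
    frame, frame_bboxes, object_ids, gt_labels, cate_preds,
    binary_preds={}, gt_relations=[], topk_object=3, topk_binary=5, phase="unary",
)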
595
+ def show_mask(mask, ax, obj_id=None, det_class=None, random_color=False):
596
+ # Ensure mask is a numpy array
597
+ mask = np.array(mask)
598
+ # Handle different mask shapes
599
+ if mask.ndim == 3:
600
+ # (1, H, W) -> (H, W)
601
+ if mask.shape[0] == 1:
602
+ mask = mask.squeeze(0)
603
+ # (H, W, 1) -> (H, W)
604
+ elif mask.shape[2] == 1:
605
+ mask = mask.squeeze(2)
606
+ # Now mask should be (H, W)
607
+ assert mask.ndim == 2, f"Mask must be 2D after squeezing, got shape {mask.shape}"
608
+
609
+ if random_color:
610
+ color = np.concatenate([np.random.random(3), np.array([0.8])], axis=0)
611
+ else:
612
+ cmap = plt.get_cmap("gist_rainbow")
613
+ cmap_idx = 0 if obj_id is None else obj_id
614
+ color = list(cmap((cmap_idx * 47) % 256))
615
+ color[3] = 0.5
616
+ color = np.array(color)
617
+
618
+ # Expand mask to (H, W, 1) for broadcasting
619
+ mask_expanded = mask[..., None]
620
+ mask_image = mask_expanded * color.reshape(1, 1, -1)
621
+
622
+ # draw a box around the mask with the det_class as the label
623
+ if det_class is not None:
624
+ # Find the bounding box coordinates
625
+ y_indices, x_indices = np.where(mask > 0)
626
+ if y_indices.size > 0 and x_indices.size > 0:
627
+ x_min, x_max = x_indices.min(), x_indices.max()
628
+ y_min, y_max = y_indices.min(), y_indices.max()
629
+ rect = Rectangle(
630
+ (x_min, y_min),
631
+ x_max - x_min,
632
+ y_max - y_min,
633
+ linewidth=1.5,
634
+ edgecolor=color[:3],
635
+ facecolor="none",
636
+ alpha=color[3]
637
+ )
638
+ ax.add_patch(rect)
639
+ ax.text(
640
+ x_min,
641
+ y_min - 5,
642
+ f"{det_class}",
643
+ color="white",
644
+ fontsize=6,
645
+ backgroundcolor=np.array(color),
646
+ alpha=1
647
+ )
648
+ ax.imshow(mask_image)
649
+
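A small matplotlib usage sketch for show_mask; the mask, object id, and class name are invented for illustration, and the output path is hypothetical.

fig, ax = plt.subplots(figsize=(4, 4))
frame = np.zeros((64, 64, 3), dtype=np.uint8)
mask = np.zeros((64, 64), dtype=bool)
mask[16:48, 16:48] = True

ax.imshow(frame)
ax.axis("off")
show_mask(mask, ax, obj_id=3, det_class="cup")
fig.savefig("/tmp/mask_demo.png", bbox_inches="tight", pad_inches=0)
plt.close(fig)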
650
+ def save_mask_one_image(frame_image, masks, save_path):
651
+ """Render masks on top of a frame and store the visualization on disk."""
652
+ fig, ax = plt.subplots(1, figsize=(6, 6))
653
+
654
+ frame_np = (
655
+ frame_image.detach().cpu().numpy()
656
+ if torch.is_tensor(frame_image)
657
+ else np.asarray(frame_image)
658
+ )
659
+ frame_np = np.ascontiguousarray(frame_np)
660
+
661
+ if isinstance(masks, dict):
662
+ mask_iter = masks.items()
663
+ else:
664
+ mask_iter = enumerate(masks)
665
+
666
+ prepared_masks = {
667
+ obj_id: (
668
+ mask.detach().cpu().numpy()
669
+ if torch.is_tensor(mask)
670
+ else np.asarray(mask)
671
+ )
672
+ for obj_id, mask in mask_iter
673
+ }
674
+
675
+ ax.imshow(frame_np)
676
+ ax.axis("off")
677
+
678
+ for obj_id, mask_np in prepared_masks.items():
679
+ show_mask(mask_np, ax, obj_id=obj_id, det_class=None, random_color=False)
680
+
681
+ fig.savefig(save_path, bbox_inches="tight", pad_inches=0)
682
+ plt.close(fig)
683
+ return save_path
684
+
685
+ def get_video_masks_visualization(video_tensor,
686
+ video_masks,
687
+ video_id,
688
+ video_save_base_dir,
689
+ oid_class_pred=None,
690
+ sample_rate = 1):
691
+
692
+ video_save_dir = os.path.join(video_save_base_dir, video_id)
693
+ if not os.path.exists(video_save_dir):
694
+ os.makedirs(video_save_dir, exist_ok=True)
695
+
696
+ for frame_id, image in enumerate(video_tensor):
697
+ if frame_id not in video_masks:
698
+ print("No mask for Frame", frame_id)
699
+ continue
700
+
701
+ masks = video_masks[frame_id]
702
+ save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
703
+ fig, _ = get_mask_one_image(image, masks, oid_class_pred)
+ fig.savefig(save_path, bbox_inches="tight", pad_inches=0)
+ plt.close(fig)
704
+
705
+ def get_mask_one_image(frame_image, masks, oid_class_pred=None):
706
+ # Create a figure and axis
707
+ fig, ax = plt.subplots(1, figsize=(6, 6))
708
+
709
+ # Display the frame image
710
+ ax.imshow(frame_image)
711
+ ax.axis('off')
712
+
713
+ if isinstance(masks, list):
714
+ masks = {i: m for i, m in enumerate(masks)}
715
+
716
+ # Add the masks
717
+ for obj_id, mask in masks.items():
718
+ det_class = f"{obj_id}. {oid_class_pred[obj_id]}" if oid_class_pred is not None else None
719
+ show_mask(mask, ax, obj_id=obj_id, det_class=det_class, random_color=False)
720
+
721
+ # Show the plot
722
+ return fig, ax
723
+
724
+ def save_video(frames, output_filename, output_fps):
725
+
726
+ # --- Create a video from all frames ---
727
+ num_frames = len(frames)
728
+ frame_h, frame_w = frames[0].shape[:2]  # per-frame height/width (frames.shape[:2] would give (T, H))
729
+
730
+ # Use a codec supported by VS Code (H.264 via 'avc1').
731
+ fourcc = cv2.VideoWriter_fourcc(*'avc1')
732
+ out = cv2.VideoWriter(output_filename, fourcc, output_fps, (frame_w, frame_h))
733
+
734
+ print(f"Processing {num_frames} frames...")
735
+ for i in range(num_frames):
736
+ vis_frame = cv2.cvtColor(np.asarray(frames[i]), cv2.COLOR_RGB2BGR)  # renderers in this module return RGB
737
+ out.write(vis_frame)
738
+ if i % 10 == 0:
739
+ print(f"Processed frame {i+1}/{num_frames}")
740
+
741
+ out.release()
742
+ print(f"Video saved as {output_filename}")
743
+
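A short usage note: with the fix above, save_video expects RGB frames such as those returned by the renderers in this module; the path and fps below are placeholders.

toy_frames = [np.full((64, 64, 3), 40, dtype=np.uint8) for _ in range(4)]
toy_bboxes = {i: {0: [8, 8, 40, 40]} for i in range(4)}
rendered = render_dino_frames(toy_frames, toy_bboxes, dino_labels={0: "cat"})
save_video(rendered, "/tmp/dino_preview.mp4", output_fps=2)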
744
+
745
+ def list_depth(lst):
746
+ """Calculates the depth of a nested list."""
747
+ if not (isinstance(lst, list) or isinstance(lst, torch.Tensor)):
748
+ return 0
749
+ elif (isinstance(lst, torch.Tensor) and lst.shape == torch.Size([])) or (isinstance(lst, list) and len(lst) == 0):
750
+ return 1
751
+ else:
752
+ return 1 + max(list_depth(item) for item in lst)
753
+
754
+ def normalize_prompt(points, labels):
755
+ if list_depth(points) == 3:
756
+ points = torch.stack([p.unsqueeze(0) for p in points])
757
+ labels = torch.stack([l.unsqueeze(0) for l in labels])
758
+ return points, labels
759
+
760
+
761
+ def show_box(box, ax, object_id):
762
+ if len(box) == 0:
763
+ return
764
+
765
+ cmap = plt.get_cmap("gist_rainbow")
766
+ cmap_idx = 0 if object_id is None else object_id
767
+ color = list(cmap((cmap_idx * 47) % 256))
768
+
769
+ x0, y0 = box[0], box[1]
770
+ w, h = box[2] - box[0], box[3] - box[1]
771
+ ax.add_patch(plt.Rectangle((x0, y0), w, h, edgecolor=color, facecolor=(0,0,0,0), lw=2))
772
+
773
+ def show_points(coords, labels, ax, object_id=None, marker_size=375):
774
+ if len(labels) == 0:
775
+ return
776
+
777
+ pos_points = coords[labels==1]
778
+ neg_points = coords[labels==0]
779
+
780
+ cmap = plt.get_cmap("gist_rainbow")
781
+ cmap_idx = 0 if object_id is None else object_id
782
+ color = list(cmap((cmap_idx * 47) % 256))
783
+
784
+ ax.scatter(pos_points[:, 0], pos_points[:, 1], color='green', marker='P', s=marker_size, edgecolor=color, linewidth=1.25)
785
+ ax.scatter(neg_points[:, 0], neg_points[:, 1], color='red', marker='s', s=marker_size, edgecolor=color, linewidth=1.25)
786
+
787
+ def save_prompts_one_image(frame_image, boxes, points, labels, save_path):
788
+ # Create a figure and axis
789
+ fig, ax = plt.subplots(1, figsize=(6, 6))
790
+
791
+ # Display the frame image
792
+ ax.imshow(frame_image)
793
+ ax.axis('off')
794
+
795
+ points, labels = normalize_prompt(points, labels)
796
+ if isinstance(boxes, torch.Tensor):
797
+ for object_id, box in enumerate(boxes):
798
+ # Add the bounding boxes
799
+ if not box is None:
800
+ show_box(box.cpu(), ax, object_id=object_id)
801
+ elif isinstance(boxes, dict):
802
+ for object_id, box in boxes.items():
803
+ # Add the bounding boxes
804
+ if not box is None:
805
+ show_box(box.cpu(), ax, object_id=object_id)
806
+ elif isinstance(boxes, list) and len(boxes) == 0:
807
+ pass
808
+ else:
809
+ raise TypeError(f"Unsupported boxes type: {type(boxes)}")
810
+
811
+ for object_id, (point_ls, label_ls) in enumerate(zip(points, labels)):
812
+ if not len(point_ls) == 0:
813
+ show_points(point_ls.cpu(), label_ls.cpu(), ax, object_id=object_id)
814
+
815
+ # Show the plot
816
+ plt.savefig(save_path)
817
+ plt.close()
818
+
819
+ def save_video_prompts_visualization(video_tensor, video_boxes, video_points, video_labels, video_id, video_save_base_dir):
820
+ video_save_dir = os.path.join(video_save_base_dir, video_id)
821
+ if not os.path.exists(video_save_dir):
822
+ os.makedirs(video_save_dir, exist_ok=True)
823
+
824
+ for frame_id, image in enumerate(video_tensor):
825
+ boxes, points, labels = [], [], []
826
+
827
+ if frame_id in video_boxes:
828
+ boxes = video_boxes[frame_id]
829
+
830
+ if frame_id in video_points:
831
+ points = video_points[frame_id]
832
+ if frame_id in video_labels:
833
+ labels = video_labels[frame_id]
834
+
835
+ save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
836
+ save_prompts_one_image(image, boxes, points, labels, save_path)
837
+
838
+
839
+ def save_video_masks_visualization(video_tensor, video_masks, video_id, video_save_base_dir, oid_class_pred=None, sample_rate = 1):
840
+ video_save_dir = os.path.join(video_save_base_dir, video_id)
841
+ if not os.path.exists(video_save_dir):
842
+ os.makedirs(video_save_dir, exist_ok=True)
843
+
844
+ for frame_id, image in enumerate(video_tensor):
845
+ if random.random() > sample_rate:
846
+ continue
847
+ if frame_id not in video_masks:
848
+ print("No mask for Frame", frame_id)
849
+ continue
850
+ masks = video_masks[frame_id]
851
+ save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
852
+ save_mask_one_image(image, masks, save_path)
853
+
854
+
855
+
856
+ def get_color(obj_id, cmap_name="gist_rainbow",alpha=0.5):
857
+ cmap = plt.get_cmap(cmap_name)
858
+ cmap_idx = 0 if obj_id is None else obj_id
859
+ color = list(cmap((cmap_idx * 47) % 256))
860
+ color[3] = alpha
861
+ color = np.array(color)
862
+ return color
863
+
864
+
865
+ def _bbox_center(bbox: Tuple[int, int, int, int]) -> Tuple[float, float]:
866
+ return ((bbox[0] + bbox[2]) / 2.0, (bbox[1] + bbox[3]) / 2.0)
867
+
868
+
869
+ def relation_line(
870
+ bbox1: Tuple[int, int, int, int],
871
+ bbox2: Tuple[int, int, int, int],
872
+ ) -> Tuple[Tuple[int, int], Tuple[int, int]]:
873
+ """
874
+ Returns integer pixel centers suitable for drawing a relation line. For
875
+ coincident boxes, nudges the target center to ensure the segment has span.
876
+ """
877
+ center1 = _bbox_center(bbox1)
878
+ center2 = _bbox_center(bbox2)
879
+ if math.isclose(center1[0], center2[0], abs_tol=1e-3) and math.isclose(center1[1], center2[1], abs_tol=1e-3):
880
+ offset = max(1.0, (bbox2[2] - bbox2[0]) * 0.05)
881
+ center2 = (center2[0] + offset, center2[1])
882
+ start = (int(round(center1[0])), int(round(center1[1])))
883
+ end = (int(round(center2[0])), int(round(center2[1])))
884
+ if start == end:
885
+ end = (end[0] + 1, end[1])
886
+ return start, end
887
+
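A quick check of the coincident-box behaviour described in the docstring above (values are illustrative):

# Identical boxes: the end point is nudged so the segment has nonzero length.
start, end = relation_line((10, 10, 30, 30), (10, 10, 30, 30))
assert start == (20, 20) and end != start
# Distinct boxes: a plain center-to-center segment.
assert relation_line((0, 0, 10, 10), (20, 20, 40, 40)) == ((5, 5), (30, 30))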
888
+ def get_binary_mask_one_image(frame_image, masks, rel_pred_ls=None):
889
+ # Create a figure and axis
890
+ fig, ax = plt.subplots(1, figsize=(6, 6))
891
+
892
+ # Display the frame image
893
+ ax.imshow(frame_image)
894
+ ax.axis('off')
895
+
896
+ all_objs_to_show = set()
897
+ all_lines_to_show = []
898
+
899
+ # print(rel_pred_ls[0])
900
+ for (from_obj_id, to_obj_id), rel_text in rel_pred_ls.items():
901
+ all_objs_to_show.add(from_obj_id)
902
+ all_objs_to_show.add(to_obj_id)
903
+
904
+ from_mask = masks[from_obj_id]
905
+ bbox1 = mask_to_bbox(from_mask)
906
+ to_mask = masks[to_obj_id]
907
+ bbox2 = mask_to_bbox(to_mask)
908
+
909
+ c1, c2 = relation_line(bbox1, bbox2)  # draw between bbox centers (shortest_line_between_bboxes is not defined in this module)
910
+
911
+ line_color = get_color(from_obj_id)
912
+ face_color = get_color(to_obj_id)
913
+ line = c1, c2, face_color, line_color, rel_text
914
+ all_lines_to_show.append(line)
915
+
916
+ masks_to_show = {}
917
+ for oid in all_objs_to_show:
918
+ masks_to_show[oid] = masks[oid]
919
+
920
+ # Add the masks
921
+ for obj_id, mask in masks_to_show.items():
922
+ show_mask(mask, ax, obj_id=obj_id, random_color=False)
923
+
924
+ for (from_pt_x, from_pt_y), (to_pt_x, to_pt_y), face_color, line_color, rel_text in all_lines_to_show:
925
+
926
+ plt.plot([from_pt_x, to_pt_x], [from_pt_y, to_pt_y], color=line_color, linestyle='-', linewidth=3)
927
+ mid_pt_x = (from_pt_x + to_pt_x) / 2
928
+ mid_pt_y = (from_pt_y + to_pt_y) / 2
929
+ ax.text(
930
+ mid_pt_x - 5,
931
+ mid_pt_y,
932
+ rel_text,
933
+ color="white",
934
+ fontsize=6,
935
+ backgroundcolor=np.array(line_color),
936
+ bbox=dict(facecolor=face_color, edgecolor=line_color, boxstyle='round,pad=1'),
937
+ alpha=1
938
+ )
939
+
940
+ # Show the plot
941
+ return fig, ax