jiani-huang committed
Commit d3c563b · Parent: 21f4849

copying in saved code

app.py CHANGED
@@ -60,12 +60,155 @@ print(
 )
 
 
+def _split_top_level_commas(s: str):
+    """
+    Split a string on commas that are NOT inside parentheses.
+
+    Example:
+        "behind(person, dog), bite(dog, frisbee)"
+        -> ["behind(person, dog)", "bite(dog, frisbee)"]
+    """
+    parts = []
+    buf = []
+    depth = 0
+    for ch in s:
+        if ch == "(":
+            depth += 1
+            buf.append(ch)
+        elif ch == ")":
+            if depth > 0:
+                depth -= 1
+            buf.append(ch)
+        elif ch == "," and depth == 0:
+            part = "".join(buf).strip()
+            if part:
+                parts.append(part)
+            buf = []
+        else:
+            buf.append(ch)
+    if buf:
+        part = "".join(buf).strip()
+        if part:
+            parts.append(part)
+    return parts
+
+
+def _extract_categories_from_binary(binary_keywords_str: str) -> list[str]:
+    """
+    Pull candidate category tokens from binary keyword strings, e.g. relation(a, b).
+    Only returns tokens when parentheses and two comma-separated entries exist.
+    """
+    categories: list[str] = []
+    for kw in _split_top_level_commas(binary_keywords_str or ""):
+        lpar = kw.find("(")
+        rpar = kw.rfind(")")
+        if lpar == -1 or rpar <= lpar:
+            continue
+        inside = kw[lpar + 1 : rpar]
+        parts = [p.strip() for p in inside.split(",") if p.strip()]
+        if len(parts) == 2:
+            categories.extend(parts)
+    return categories
+
+
+def _parse_binary_keywords(binary_keywords_str: str, categorical_keywords: list[str]):
+    """
+    Parse a binary keyword string like:
+        "behind(person, dog), bite(dog, frisbee)"
+    into:
+    - binary_keywords_list: list of raw strings (used as CLIP text)
+    - batched_binary_predicates: {0: [(rel_text, from_cat, to_cat), ...]} or None
+    - warnings: list of warning strings about invalid/mismatched categories
+    """
+    if not binary_keywords_str:
+        return [], None, []
+
+    cat_map = {
+        kw.strip().lower(): kw.strip()
+        for kw in categorical_keywords
+        if isinstance(kw, str) and kw.strip()
+    }
+
+    entries = _split_top_level_commas(binary_keywords_str)
+    binary_keywords_list: list[str] = []
+    predicates: list[tuple[str, str, str]] = []
+    warnings: list[str] = []
+
+    for raw in entries:
+        kw = raw.strip()
+        if not kw:
+            continue
+        # Always use the full raw keyword as the CLIP text string
+        binary_keywords_list.append(kw)
+
+        lpar = kw.find("(")
+        rpar = kw.rfind(")")
+        if (lpar == -1 and rpar != -1) or (lpar != -1 and rpar == -1) or rpar < lpar:
+            msg = (
+                f"Binary keyword '{kw}' has mismatched parentheses; expected "
+                "relation(from_category, to_category)."
+            )
+            print(msg)
+            warnings.append(msg)
+            continue
+
+        if lpar == -1 or rpar <= lpar:
+            # No explicit (from,to) part; treat as plain relation (no category filter)
+            continue
+
+        inside = kw[lpar + 1 : rpar]
+        parts = inside.split(",")
+        if len(parts) != 2:
+            msg = (
+                f"Ignoring '(from,to)' part in binary keyword '{kw}': "
+                f"expected exactly two comma-separated items."
+            )
+            print(msg)
+            warnings.append(msg)
+            continue
+
+        from_raw = parts[0].strip()
+        to_raw = parts[1].strip()
+        if not from_raw or not to_raw:
+            msg = f"Ignoring binary keyword '{kw}': empty from/to category."
+            print(msg)
+            warnings.append(msg)
+            continue
+
+        canonical_from = cat_map.get(from_raw.lower())
+        canonical_to = cat_map.get(to_raw.lower())
+
+        if canonical_from is None:
+            msg = (
+                f"Binary keyword '{kw}': from-category '{from_raw}' does not "
+                f"match any categorical keyword {categorical_keywords}."
+            )
+            print(msg)
+            warnings.append(msg)
+        if canonical_to is None:
+            msg = (
+                f"Binary keyword '{kw}': to-category '{to_raw}' does not "
+                f"match any categorical keyword {categorical_keywords}."
+            )
+            print(msg)
+            warnings.append(msg)
+
+        if canonical_from is None or canonical_to is None:
+            continue
+
+        # Store (relation_text, from_category, to_category)
+        predicates.append((kw, canonical_from, canonical_to))
+
+    if not predicates:
+        return binary_keywords_list, None, warnings
+
+    return binary_keywords_list, {0: predicates}, warnings
 
 
 @lru_cache(maxsize=1)
 def _load_vine_pipeline():
     """
-    Lazy-load and cache the Vine pipeline so we don't re-download/rebuild it on every request.
+    Lazy-load and cache the LASER (VINE HF) pipeline so we don't re-download/rebuild it on every request.
     """
     from vine_hf import VineConfig, VineModel, VinePipeline
 
@@ -84,6 +227,7 @@ def _load_vine_pipeline():
         debug_visualizations=False,
         device="cuda",
         categorical_pool="max",
+        auto_add_not_unary=False,  # UI will control this per-call
     )
     model = VineModel(config)
     return VinePipeline(
@@ -104,6 +248,7 @@ def process_video(
     categorical_keywords,
     unary_keywords,
     binary_keywords,
+    auto_add_not_unary,
     output_fps,
     box_threshold,
     text_threshold,
@@ -121,34 +266,86 @@ def process_video(
     if not isinstance(video_file, (str, Path)):
         raise ValueError(f"Unsupported video input type: {type(video_file)}")
 
+    video_path = Path(video_file)
+    if video_path.suffix.lower() != ".mp4":
+        msg = (
+            "Please upload an MP4 file. LASER currently supports MP4 inputs for "
+            "scene-graph generation."
+        )
+        print(msg)
+        return None, {"error": msg}
+    video_file = str(video_path)
+
+    # Keep original strings for parsing
+    categorical_keywords_str = categorical_keywords
+    unary_keywords_str = unary_keywords
+    binary_keywords_str = binary_keywords
+
     categorical_keywords = (
-        [kw.strip() for kw in categorical_keywords.split(",")]
-        if categorical_keywords
+        [kw.strip() for kw in categorical_keywords_str.split(",")]
+        if categorical_keywords_str
         else []
     )
     unary_keywords = (
-        [kw.strip() for kw in unary_keywords.split(",")] if unary_keywords else []
-    )
-    binary_keywords = (
-        [kw.strip() for kw in binary_keywords.split(",")] if binary_keywords else []
+        [kw.strip() for kw in unary_keywords_str.split(",")]
+        if unary_keywords_str
+        else []
     )
 
+    # Preprocess: pull categories referenced in binary keywords and add any missing ones
+    added_categories: list[str] = []
+    extra_cats = _extract_categories_from_binary(binary_keywords_str or "")
+    if extra_cats:
+        existing_lower = {kw.lower() for kw in categorical_keywords}
+        for cat in extra_cats:
+            if cat and cat.lower() not in existing_lower:
+                categorical_keywords.append(cat)
+                existing_lower.add(cat.lower())
+                added_categories.append(cat)
+
+    # Parse binary keywords with category info (if provided)
+    (
+        binary_keywords_list,
+        batched_binary_predicates,
+        binary_input_warnings,
+    ) = _parse_binary_keywords(binary_keywords_str or "", categorical_keywords)
+    if added_categories:
+        binary_input_warnings.append(
+            "Auto-added categorical keywords from binary relations: "
+            + ", ".join(added_categories)
+        )
+
+    skip_binary = len(binary_keywords_list) == 0
+
     # Debug: Print what we're sending to the pipeline
     print("\n" + "=" * 80)
-    print("INPUT TO VINE PIPELINE:")
+    print("INPUT TO LASER PIPELINE:")
     print(f" categorical_keywords: {categorical_keywords}")
     print(f" unary_keywords: {unary_keywords}")
-    print(f" binary_keywords: {binary_keywords}")
+    print(f" binary_keywords (raw parsed): {binary_keywords_list}")
+    print(f" batched_binary_predicates: {batched_binary_predicates}")
+    print(f" auto_add_not_unary: {auto_add_not_unary}")
+    print(f" skip_binary: {skip_binary}")
     print("=" * 80 + "\n")
 
     # Object pairs is now optional - empty list will auto-generate all pairs in vine_model.py
-    object_pairs = []
+    object_pairs: list[tuple[int, int]] = []
+
+    extra_forward_kwargs = {}
+    if batched_binary_predicates is not None and not skip_binary:
+        # Use category-based filtering of binary pairs
+        extra_forward_kwargs["batched_binary_predicates"] = batched_binary_predicates
+        extra_forward_kwargs["topk_cate"] = 1  # as requested
+
+    extra_forward_kwargs["auto_add_not_unary"] = bool(auto_add_not_unary)
+    if skip_binary:
+        extra_forward_kwargs["disable_binary"] = True
 
     results = vine_pipe(
         inputs=video_file,
         categorical_keywords=categorical_keywords,
         unary_keywords=unary_keywords,
-        binary_keywords=binary_keywords,
+        binary_keywords=binary_keywords_list,
        object_pairs=object_pairs,
         segmentation_method="grounding_dino_sam2",
         return_top_k=5,
@@ -159,6 +356,7 @@ def process_video(
         text_threshold=text_threshold,
         target_fps=output_fps,
         binary_confidence_threshold=binary_confidence_threshold,
+        **extra_forward_kwargs,
     )
 
     # Debug: Print what the pipeline returned
@@ -193,6 +391,13 @@ def process_video(
     result_video_path = str(candidates[0]) if candidates else None
     summary = results_dict.get("summary") or {}
 
+    # Attach any binary category parsing warnings into the summary JSON
+    if binary_input_warnings:
+        if "binary_input_warnings" in summary:
+            summary["binary_input_warnings"].extend(binary_input_warnings)
+        else:
+            summary["binary_input_warnings"] = binary_input_warnings
+
     if result_video_path and os.path.exists(result_video_path):
         gradio_tmp = (
             Path(os.environ.get("GRADIO_TEMP_DIR", tempfile.gettempdir()))
@@ -246,7 +451,7 @@ def _create_blocks():
     """
     Build a Blocks context that works across Gradio versions.
     """
-    blocks_kwargs = {"title": "VINE Demo"}
+    blocks_kwargs = {"title": "LASER Scene Graph Demo"}
     soft_theme = None
 
     if hasattr(gr, "themes") and hasattr(gr.themes, "Soft"):
@@ -265,21 +470,23 @@ def _create_blocks():
 with _create_blocks() as demo:
     gr.Markdown(
         """
-        # 🎬 VINE: Video-based Interaction and Event Detection
+        # 🎬 LASER: Spatio-temporal Scene Graphs for Video
 
-        Upload an MP4 video and specify keywords to detect objects, actions, and interactions in your video.
+        Turn any MP4 into a spatio-temporal scene graph with LASER - our 100-million-parameter foundation model for scene-graph generation. LASER trains on 87K+ open-domain videos using a neurosymbolic caption-to-scene alignment pipeline, so it learns fine-grained video semantics without human labels.
+
+        Upload an MP4 and sketch the scene graph you care about: specify the objects, actions, and interactions you want, and LASER will assemble a spatio-temporal scene graph plus an annotated video.
         """
     )
 
     with gr.Row():
         # Left column: Inputs
         with gr.Column(scale=1):
-            gr.Markdown("### Input Configuration")
+            gr.Markdown("### Scene Graph Inputs")
 
             video_input = _video_component("Upload Video (MP4 only)", is_output=False)
             gr.Markdown("*Note: Only MP4 format is currently supported*")
 
-            gr.Markdown("#### Detection Keywords")
+            gr.Markdown("#### Scene Graph Queries")
             categorical_input = gr.Textbox(
                 label="Categorical Keywords",
                 placeholder="e.g., person, car, dog",
@@ -294,8 +501,20 @@ with _create_blocks() as demo:
             )
             binary_input = gr.Textbox(
                 label="Binary Keywords",
-                placeholder="e.g., chasing, carrying",
-                info="Object-to-object interactions to detect (comma-separated)",
+                placeholder="e.g., behind(person, dog), bite(dog, frisbee)",
+                info=(
+                    "Object-to-object interactions to detect. "
+                    "Use format: relation(from_category, to_category). "
+                    "Example: 'behind(person, dog), bite(dog, frisbee)'. "
+                    "If you omit '(from,to)', the relation will be applied to all object pairs (default behavior). "
+                    "Leave blank to skip binary relation search entirely."
+                ),
+            )
+
+            add_not_unary_checkbox = gr.Checkbox(
+                label="Also query 'not <unary>' predicates",
+                value=False,
+                info="If enabled, for each unary keyword X, also query 'not X'.",
             )
 
             gr.Markdown("#### Processing Settings")
@@ -326,7 +545,7 @@ with _create_blocks() as demo:
                 label="Binary Relation Confidence Threshold",
                 minimum=0.0,
                 maximum=1.0,
-                value=0.8,
+                value=0.5,
                 step=0.05,
                 info="Minimum confidence to show binary relations and object pairs",
             )
@@ -335,24 +554,31 @@ with _create_blocks() as demo:
 
        # Right column: Outputs
        with gr.Column(scale=1):
-            gr.Markdown("### Results")
+            gr.Markdown("### Scene Graph Results")
 
            video_output = _video_component("Annotated Video Output", is_output=True)
 
-            gr.Markdown("### Detection Summary")
-            summary_output = gr.JSON(label="Summary of Detected Events")
+            gr.Markdown("### Scene Graph Summary")
+            summary_output = gr.JSON(label="Scene Graph / Detected Events")
 
     gr.Markdown(
         """
         ---
-        ### How to Use
-        1. Upload an MP4 video file
-        2. Specify the objects, actions, and interactions you want to detect
-        3. Adjust processing settings if needed (including binary relation confidence threshold)
-        4. Click "Process Video" to analyze
-
-        The system will automatically detect all binary relations between detected objects
-        and show only those with confidence above the threshold (default: 0.8).
+        ### How to Use LASER
+        1. Upload an MP4 (we validate the format for you).
+        2. Describe the **nodes** of your spatio-temporal scene graph with categorical keywords (objects) and unary keywords (single-object actions).
+        3. Wire up **binary** relations:
+           - Use the structured form `relation(from_category, to_category)` (e.g., `behind(person, dog), bite(dog, frisbee)`) to limit relations to those category pairs.
+           - Or list relation names (`chasing, carrying`) to evaluate all object pairs.
+           - Leave the field blank to skip binary relations entirely (no pair search or binary predicates).
+           - Categories referenced inside binary relations are auto-added to the categorical list for you.
+        4. Optionally enable automatic `'not <unary>'` predicates.
+        5. Adjust processing settings if needed and click **Process Video** to receive an annotated video plus the serialized scene graph.
+
+        More to explore:
+        - LASER paper (ICLR'25): https://arxiv.org/abs/2304.07647 | Demo: https://huggingface.co/spaces/jiani-huang/LASER | Code: https://github.com/video-fm/LASER
+        - ESCA paper: https://arxiv.org/abs/2510.15963 | Code: https://github.com/video-fm/ESCA | Model: https://huggingface.co/video-fm/vine_v0 | Dataset: https://huggingface.co/datasets/video-fm/ESCA-video-87K
+        - Meet us at **NeurIPS 2025** (San Diego, Exhibit Hall C/D/E, Booth #4908 - Wed, Dec 3 - 11:00 a.m.-2:00 p.m. PST) for the foundation model demo, code, and full paper.
         """
     )
 
@@ -363,6 +589,7 @@ with _create_blocks() as demo:
         categorical_input,
         unary_input,
        binary_input,
+        add_not_unary_checkbox,
         fps_input,
         box_threshold_input,
         text_threshold_input,
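
For reference, a minimal sketch (not part of the commit) of how the new keyword-parsing helpers behave; the helper names come from the diff above, and the expected values follow directly from their implementations.

# Sketch: exercising app.py's binary-keyword parsing helpers.
from app import _split_top_level_commas, _parse_binary_keywords  # assumes app.py is importable

cats = ["person", "dog", "frisbee"]

# Commas inside parentheses do not split at the top level:
assert _split_top_level_commas("behind(person, dog), bite(dog, frisbee)") == [
    "behind(person, dog)",
    "bite(dog, frisbee)",
]

# Structured entries yield (relation_text, from_category, to_category) triples
# keyed by video id 0; every raw entry is also kept as CLIP text:
kws, preds, warns = _parse_binary_keywords("behind(person, dog), chasing", cats)
assert kws == ["behind(person, dog)", "chasing"]
assert preds == {0: [("behind(person, dog)", "person", "dog")]}
assert warns == []

# A category that matches no categorical keyword produces a warning and drops
# the predicate, but the raw keyword is still used as plain CLIP text:
kws, preds, warns = _parse_binary_keywords("bite(dog, ball)", cats)
assert kws == ["bite(dog, ball)"] and preds is None and len(warns) == 1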
vine_hf/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/__init__.cpython-310.pyc and b/vine_hf/__pycache__/__init__.cpython-310.pyc differ
 
vine_hf/__pycache__/vine_config.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/vine_config.cpython-310.pyc and b/vine_hf/__pycache__/vine_config.cpython-310.pyc differ
 
vine_hf/__pycache__/vine_model.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/vine_model.cpython-310.pyc and b/vine_hf/__pycache__/vine_model.cpython-310.pyc differ
 
vine_hf/vine_config.py CHANGED
@@ -41,6 +41,7 @@ class VineConfig(PretrainedConfig):
         interested_object_pairs: Optional[List[Tuple[int, int]]] = None,
         debug_visualizations: bool = False,
         device: Optional[Union[str, int]] = None,
+        auto_add_not_unary: bool = False,
         **kwargs: Any,
     ):
         self.model_name = model_name
@@ -77,6 +78,7 @@ class VineConfig(PretrainedConfig):
         self.return_valid_pairs = return_valid_pairs
         self.interested_object_pairs = interested_object_pairs or []
         self.debug_visualizations = debug_visualizations
+        self.auto_add_not_unary = auto_add_not_unary
 
         if isinstance(device, int):
             self._device = f"cuda:{device}" if torch.cuda.is_available() else "cpu"
vine_hf/vine_model.py CHANGED
@@ -326,6 +326,7 @@ class VineModel(PreTrainedModel):
         debug_visualizations: Optional[bool] = None,
         **kwargs: Any,
     ) -> Dict[str, Any]:
+        disable_binary = kwargs.pop("disable_binary", False)
         if unary_keywords is None:
             unary_keywords = []
         if binary_keywords is None:
@@ -353,6 +354,8 @@
         multi_class = kwargs.pop("multi_class", getattr(self.config, "multi_class", False))
         output_logit = kwargs.pop("output_logit", getattr(self.config, "output_logit", False))
         output_embeddings = kwargs.pop("output_embeddings", False)
+        batched_binary_predicates_arg = kwargs.pop("batched_binary_predicates", None)
+        skip_binary = disable_binary or len(binary_keywords) == 0
 
         batched_video_ids = [0]
 
@@ -385,12 +388,12 @@
 
         batched_names = [list(categorical_keywords)]
         batched_unary_kws = [list(unary_keywords)]
-        batched_binary_kws = [list(binary_keywords)]
+        batched_binary_kws = [list(binary_keywords)] if not skip_binary else [[]]
 
         batched_obj_pairs: List[Tuple[int, int, Tuple[int, int]]] = []
 
         # Auto-generate all object pairs if binary_keywords provided but object_pairs is empty
-        if not object_pairs and binary_keywords:
+        if not object_pairs and binary_keywords and not skip_binary:
             # Get all unique object IDs across all frames
             all_object_ids = set()
             for frame_masks in masks.values():
@@ -404,7 +407,10 @@
                     if from_oid != to_oid:
                         object_pairs.append((from_oid, to_oid))
 
-            print(f"Auto-generated {len(object_pairs)} bidirectional object pairs for binary relation detection: {object_pairs}")
+            print(
+                f"Auto-generated {len(object_pairs)} bidirectional object "
+                f"pairs for binary relation detection: {object_pairs}"
+            )
 
         if object_pairs:
             for frame_id, frame_masks in masks.items():
@@ -416,12 +422,34 @@
                         batched_obj_pairs.append((0, frame_id, (from_oid, to_oid)))
 
         batched_video_splits = [0]
-        batched_binary_predicates = [None]
 
-        def fill_empty(batched_kw):
+        # Prepare binary predicates per video (single-video setup)
+        if batched_binary_predicates_arg is None:
+            batched_binary_predicates = [None]
+        elif skip_binary:
+            batched_binary_predicates = [None]
+        else:
+            if isinstance(batched_binary_predicates_arg, dict):
+                preds_for_vid0 = batched_binary_predicates_arg.get(0, [])
+                if preds_for_vid0:
+                    batched_binary_predicates = [preds_for_vid0]
+                else:
+                    batched_binary_predicates = [None]
+            else:
+                if isinstance(batched_binary_predicates_arg, (list, tuple)) and len(
+                    batched_binary_predicates_arg
+                ) > 0:
+                    batched_binary_predicates = [list(batched_binary_predicates_arg)]
+                else:
+                    batched_binary_predicates = [None]
+
+        def fill_empty(batched_kw, *, allow_dummy: bool = True):
             new_batched = []
             for kw_ls in batched_kw:
                 if len(kw_ls) == 0:
+                    if not allow_dummy:
+                        new_batched.append([])
+                        continue
                     new_batched.append([dummy_str])
                 else:
                     new_batched.append(list(kw_ls))
@@ -429,7 +457,7 @@
 
         batched_names = fill_empty(batched_names)
         batched_unary_kws = fill_empty(batched_unary_kws)
-        batched_binary_kws = fill_empty(batched_binary_kws)
+        batched_binary_kws = fill_empty(batched_binary_kws, allow_dummy=not skip_binary)
 
         dummy_prob = torch.tensor(0.0, device=self._device)
 
@@ -673,6 +701,31 @@
             batched_obj_per_cate[vid_id] = obj_per_cate
 
         # Step 4: binary pairs
+        if skip_binary:
+            batched_image_binary_probs = [{} for _ in range(batch_size)]
+            batched_obj_pair_features: Dict[int, torch.Tensor] = {
+                vid: torch.tensor([]) for vid in range(batch_size)
+            }
+            result: Dict[str, Any] = {
+                "categorical_probs": batched_image_cate_probs,
+                "unary_probs": batched_image_unary_probs,
+                "binary_probs": batched_image_binary_probs,
+                "dummy_prob": dummy_prob,
+            }
+
+            if output_embeddings:
+                embeddings_dict = {
+                    "cate_obj_clip_features": batched_obj_cate_features,
+                    "cate_obj_name_features": batched_obj_name_features,
+                    "unary_obj_features": batched_obj_unary_features,
+                    "unary_nl_features": batched_unary_nl_features,
+                    "binary_obj_pair_features": batched_obj_pair_features,
+                    "binary_nl_features": batched_binary_nl_features,
+                }
+                result.update(embeddings_dict)
+
+            return result
+
         batched_cropped_obj_pairs: Dict[int, List[np.ndarray]] = {}
         frame_splits: Dict[Tuple[int, int], Dict[str, int]] = {}
         current_info = (0, 0)
@@ -701,6 +754,8 @@
                 selected_pairs = set(batched_obj_pairs)
             else:
                 for bp_vid, binary_predicates in enumerate(batched_binary_predicates):
+                    if binary_predicates is None:
+                        continue
                     topk_cate_candidates = batched_topk_cate_candidates[bp_vid]
                     for (rel_name, from_obj_name, to_obj_name) in binary_predicates:
                         if (
@@ -925,6 +980,21 @@
         inputs = self.clip_processor(images=image, return_tensors="pt").to(self._device)
         return self._image_features_checkpoint(model, inputs["pixel_values"])
 
+    def _augment_unary_with_negation(self, unary_keywords: List[str]) -> List[str]:
+        """
+        Given unary predicates like ["running", "walking"], add "not running",
+        "not walking" if they are not already present (case-insensitive).
+        """
+        base = [kw for kw in unary_keywords if isinstance(kw, str) and kw.strip()]
+        seen_lower = {kw.lower() for kw in base}
+        augmented = list(base)
+        for kw in base:
+            neg = f"not {kw}"
+            if neg.lower() not in seen_lower:
+                augmented.append(neg)
+                seen_lower.add(neg.lower())
+        return augmented
+
     # ------------------------------------------------------------------ #
     # High-level predict API
     # ------------------------------------------------------------------ #
@@ -942,7 +1012,35 @@
         return_valid_pairs: Optional[bool] = None,
         interested_object_pairs: Optional[List[Tuple[int, int]]] = None,
         debug_visualizations: Optional[bool] = None,
+        auto_add_not_unary: Optional[bool] = None,
+        batched_binary_predicates: Optional[Dict[int, List[Tuple[str, str, str]]]] = None,
+        topk_cate: Optional[int] = None,
     ) -> Dict[str, Any]:
+        if unary_keywords is None:
+            unary_keywords = []
+        else:
+            unary_keywords = list(unary_keywords)
+
+        if binary_keywords is None:
+            binary_keywords = []
+        else:
+            binary_keywords = list(binary_keywords)
+
+        if object_pairs is None:
+            object_pairs = []
+
+        if auto_add_not_unary is None:
+            auto_add_not_unary = getattr(self.config, "auto_add_not_unary", False)
+
+        if auto_add_not_unary:
+            unary_keywords = self._augment_unary_with_negation(unary_keywords)
+
+        forward_extra_kwargs: Dict[str, Any] = {}
+        if batched_binary_predicates is not None:
+            forward_extra_kwargs["batched_binary_predicates"] = batched_binary_predicates
+        if topk_cate is not None:
+            forward_extra_kwargs["topk_cate"] = topk_cate
+
         with torch.no_grad():
             outputs = self.forward(
                 video_frames=video_frames,
@@ -956,6 +1054,7 @@
                 return_valid_pairs=return_valid_pairs,
                 interested_object_pairs=interested_object_pairs,
                 debug_visualizations=debug_visualizations,
+                **forward_extra_kwargs,
             )
 
         formatted_categorical: Dict[int, List[Tuple[float, str]]] = {}
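
Putting the model-side pieces together, a rough sketch of the high-level predict API after this change. Only auto_add_not_unary, batched_binary_predicates, and topk_cate are taken from this diff; the remaining argument names are assumed from the forward() call, and video_frames/masks stand in for real segmentation outputs.

# Sketch: the new predict() kwargs. With auto_add_not_unary=True,
# _augment_unary_with_negation also queries "not running".
outputs = model.predict(
    video_frames=video_frames,  # placeholder: decoded frames
    masks=masks,                # placeholder: per-frame segmentation masks
    categorical_keywords=["person", "dog"],
    unary_keywords=["running"],
    binary_keywords=["behind(person, dog)"],
    auto_add_not_unary=True,
    batched_binary_predicates={0: [("behind(person, dog)", "person", "dog")]},
    topk_cate=1,  # restrict category candidates when filtering object pairs
)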
vine_hf/vine_pipeline.py CHANGED
@@ -107,6 +107,14 @@ class VinePipeline(Pipeline):
             forward_kwargs["binary_keywords"] = kwargs["binary_keywords"]
         if "object_pairs" in kwargs:
             forward_kwargs["object_pairs"] = kwargs["object_pairs"]
+        if "batched_binary_predicates" in kwargs:
+            # New: per-video (rel, from_cat, to_cat) triples for binary filtering
+            forward_kwargs["batched_binary_predicates"] = kwargs["batched_binary_predicates"]
+        if "topk_cate" in kwargs:
+            # New: override topk_cate when binary filtering is requested
+            forward_kwargs["topk_cate"] = kwargs["topk_cate"]
+        if "auto_add_not_unary" in kwargs:
+            forward_kwargs["auto_add_not_unary"] = kwargs["auto_add_not_unary"]
         if "return_flattened_segments" in kwargs:
             forward_kwargs["return_flattened_segments"] = kwargs[
                 "return_flattened_segments"
@@ -126,7 +134,9 @@ class VinePipeline(Pipeline):
         if "self.visualize" in kwargs:
             postprocess_kwargs["self.visualize"] = kwargs["self.visualize"]
         if "binary_confidence_threshold" in kwargs:
-            postprocess_kwargs["binary_confidence_threshold"] = kwargs["binary_confidence_threshold"]
+            postprocess_kwargs["binary_confidence_threshold"] = kwargs[
+                "binary_confidence_threshold"
+            ]
 
         return preprocess_kwargs, forward_kwargs, postprocess_kwargs
 
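
End to end, _sanitize_parameters now routes the three new kwargs to forward, so the Gradio handler in app.py can drive the pipeline roughly like this (a sketch; vine_pipe is the cached VinePipeline and the video path and keyword values are illustrative):

results = vine_pipe(
    inputs="example.mp4",
    categorical_keywords=["person", "dog"],
    unary_keywords=["running"],
    binary_keywords=["behind(person, dog)"],
    object_pairs=[],  # empty list -> vine_model.py auto-generates all pairs
    batched_binary_predicates={0: [("behind(person, dog)", "person", "dog")]},
    topk_cate=1,
    auto_add_not_unary=False,
    binary_confidence_threshold=0.5,  # routed to postprocess; matches the new UI default
)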