moqingyan123 committed on
Commit 17f21ca · 1 Parent(s): f71f431
app.py CHANGED
@@ -60,6 +60,208 @@ print(
 )
 
 
+def format_summary(summary, binary_confidence_threshold=0.8):
+    """
+    Format the summary dictionary into a readable markdown string.
+    Filters binary relations by confidence threshold.
+    """
+    if not summary or not isinstance(summary, dict):
+        return "# Detection Summary\n\nNo events detected or processing in progress..."
+
+    output_lines = ["# Detection Summary\n"]
+    has_content = False
+
+    # Categorical keywords
+    if "categorical_keywords" in summary and summary["categorical_keywords"]:
+        output_lines.append("## Categorical Keywords\n")
+        cate = summary["categorical_keywords"]
+        if isinstance(cate, dict) and cate:
+            has_content = True
+            for kw, info in cate.items():
+                output_lines.append(f"**{kw}**")
+                if isinstance(info, dict):
+                    for key, val in info.items():
+                        output_lines.append(f"  - {key}: {val}")
+                else:
+                    output_lines.append(f"  - {info}")
+                output_lines.append("")
+        elif isinstance(cate, list) and cate:
+            has_content = True
+            for item in cate:
+                output_lines.append(f"- {item}")
+            output_lines.append("")
+
+    # Unary keywords
+    if "unary_keywords" in summary and summary["unary_keywords"]:
+        output_lines.append("## Unary Keywords\n")
+        unary = summary["unary_keywords"]
+        if isinstance(unary, dict) and unary:
+            has_content = True
+            for kw, info in unary.items():
+                output_lines.append(f"**{kw}**")
+                if isinstance(info, dict):
+                    for key, val in info.items():
+                        output_lines.append(f"  - {key}: {val}")
+                else:
+                    output_lines.append(f"  - {info}")
+                output_lines.append("")
+        elif isinstance(unary, list) and unary:
+            has_content = True
+            for item in unary:
+                output_lines.append(f"- {item}")
+            output_lines.append("")
+
+    # Binary keywords - show ALL binary relations for debugging
+    print("DEBUG: Checking binary_keywords...")
+    print(f"  'binary_keywords' in summary: {'binary_keywords' in summary}")
+    if 'binary_keywords' in summary:
+        print(f"  summary['binary_keywords'] truthy: {bool(summary['binary_keywords'])}")
+        print(f"  summary['binary_keywords'] type: {type(summary['binary_keywords'])}")
+        print(f"  summary['binary_keywords'] value: {summary['binary_keywords']}")
+
+    if "binary_keywords" in summary and summary["binary_keywords"]:
+        output_lines.append("## Binary Keywords\n")
+        binary = summary["binary_keywords"]
+        print(f"DEBUG: Processing binary keywords, type: {type(binary)}, length: {len(binary) if isinstance(binary, (dict, list)) else 'N/A'}")
+        if isinstance(binary, dict) and binary:
+            has_content = True
+            # Show all binary relations, sorted by confidence
+            binary_items = []
+            for kw, info in binary.items():
+                if isinstance(info, dict):
+                    confidence = info.get("confidence", info.get("score", 0))
+                    binary_items.append((kw, info, confidence))
+                else:
+                    binary_items.append((kw, info, 0))
+
+            # Sort by confidence descending
+            binary_items.sort(key=lambda x: x[2], reverse=True)
+
+            high_conf_count = 0
+            low_conf_count = 0
+
+            # Show high confidence items first
+            output_lines.append(f"### High Confidence (≥ {binary_confidence_threshold})\n")
+            for kw, info, confidence in binary_items:
+                if confidence >= binary_confidence_threshold:
+                    high_conf_count += 1
+                    if isinstance(info, dict):
+                        output_lines.append(f"**{kw}** (confidence: {confidence:.2f})")
+                        for key, val in info.items():
+                            if key not in ["confidence", "score"]:
+                                output_lines.append(f"  - {key}: {val}")
+                    else:
+                        output_lines.append(f"**{kw}**: {info}")
+                    output_lines.append("")
+
+            if high_conf_count == 0:
+                output_lines.append(f"*No binary relations found with confidence ≥ {binary_confidence_threshold}*\n")
+
+            # Show lower confidence items for debugging
+            output_lines.append(f"### Lower Confidence (< {binary_confidence_threshold})\n")
+            for kw, info, confidence in binary_items:
+                if confidence < binary_confidence_threshold:
+                    low_conf_count += 1
+                    if isinstance(info, dict):
+                        output_lines.append(f"**{kw}** (confidence: {confidence:.2f})")
+                        for key, val in info.items():
+                            if key not in ["confidence", "score"]:
+                                output_lines.append(f"  - {key}: {val}")
+                    else:
+                        output_lines.append(f"**{kw}**: {info}")
+                    output_lines.append("")
+
+            if low_conf_count == 0:
+                output_lines.append(f"*No binary relations found with confidence < {binary_confidence_threshold}*\n")
+
+            output_lines.append(f"**Total binary relations detected: {len(binary_items)}**\n")
+        elif isinstance(binary, list) and binary:
+            has_content = True
+            for item in binary:
+                output_lines.append(f"- {item}")
+            output_lines.append("")
+
+    # Object pairs - show ALL object pair interactions for debugging
+    print("DEBUG: Checking object_pairs...")
+    print(f"  'object_pairs' in summary: {'object_pairs' in summary}")
+    if 'object_pairs' in summary:
+        print(f"  summary['object_pairs'] truthy: {bool(summary['object_pairs'])}")
+        print(f"  summary['object_pairs'] type: {type(summary['object_pairs'])}")
+        print(f"  summary['object_pairs'] value: {summary['object_pairs']}")
+
+    if "object_pairs" in summary and summary["object_pairs"]:
+        output_lines.append("## Object Pair Interactions\n")
+        pairs = summary["object_pairs"]
+        print(f"DEBUG: Processing object pairs, type: {type(pairs)}, length: {len(pairs) if isinstance(pairs, (dict, list)) else 'N/A'}")
+        if isinstance(pairs, dict) and pairs:
+            has_content = True
+            # Show all object pairs, sorted by confidence
+            pair_items = []
+            for pair, info in pairs.items():
+                if isinstance(info, dict):
+                    confidence = info.get("confidence", info.get("score", 0))
+                    pair_items.append((pair, info, confidence))
+                else:
+                    pair_items.append((pair, info, 0))
+
+            # Sort by confidence descending
+            pair_items.sort(key=lambda x: x[2], reverse=True)
+
+            high_conf_count = 0
+            low_conf_count = 0
+
+            # Show high confidence items first
+            output_lines.append(f"### High Confidence (≥ {binary_confidence_threshold})\n")
+            for pair, info, confidence in pair_items:
+                if confidence >= binary_confidence_threshold:
+                    high_conf_count += 1
+                    if isinstance(info, dict):
+                        output_lines.append(f"**{pair}** (confidence: {confidence:.2f})")
+                        for key, val in info.items():
+                            if key not in ["confidence", "score"]:
+                                output_lines.append(f"  - {key}: {val}")
+                    else:
+                        output_lines.append(f"**{pair}**: {info}")
+                    output_lines.append("")
+
+            if high_conf_count == 0:
+                output_lines.append(f"*No object pairs found with confidence ≥ {binary_confidence_threshold}*\n")
+
+            # Show lower confidence items for debugging
+            output_lines.append(f"### Lower Confidence (< {binary_confidence_threshold})\n")
+            for pair, info, confidence in pair_items:
+                if confidence < binary_confidence_threshold:
+                    low_conf_count += 1
+                    if isinstance(info, dict):
+                        output_lines.append(f"**{pair}** (confidence: {confidence:.2f})")
+                        for key, val in info.items():
+                            if key not in ["confidence", "score"]:
+                                output_lines.append(f"  - {key}: {val}")
+                    else:
+                        output_lines.append(f"**{pair}**: {info}")
+                    output_lines.append("")
+
+            if low_conf_count == 0:
+                output_lines.append(f"*No object pairs found with confidence < {binary_confidence_threshold}*\n")
+
+            output_lines.append(f"**Total object pairs detected: {len(pair_items)}**\n")
+        elif isinstance(pairs, list) and pairs:
+            has_content = True
+            for item in pairs:
+                output_lines.append(f"- {item}")
+            output_lines.append("")
+
+    # If no content was added, show the raw summary for debugging
+    if not has_content:
+        output_lines.append("## Raw Summary Data\n")
+        output_lines.append("```json")
+        import json
+        output_lines.append(json.dumps(summary, indent=2, default=str))
+        output_lines.append("```")
+
+    return "\n".join(output_lines)
+
+
 @lru_cache(maxsize=1)
 def _load_vine_pipeline():
     """
@@ -96,16 +298,16 @@ def _load_vine_pipeline():
     )
 
 
-@spaces.GPU(duration=300)  # Up to ~5 minutes of H200 ZeroGPU time per call
+@spaces.GPU(duration=120)  # Up to ~2 minutes of H200 ZeroGPU time per call
 def process_video(
     video_file,
     categorical_keywords,
     unary_keywords,
     binary_keywords,
-    object_pairs,
     output_fps,
     box_threshold,
     text_threshold,
+    binary_confidence_threshold,
 ):
     vine_pipe = _load_vine_pipeline()
 
@@ -130,11 +332,17 @@ def process_video(
     binary_keywords = (
         [kw.strip() for kw in binary_keywords.split(",")] if binary_keywords else []
     )
-    object_pairs = (
-        [tuple(map(int, pair.split("-"))) for pair in object_pairs.split(",")]
-        if object_pairs
-        else []
-    )
+
+    # Debug: Print what we're sending to the pipeline
+    print("\n" + "=" * 80)
+    print("INPUT TO VINE PIPELINE:")
+    print(f"  categorical_keywords: {categorical_keywords}")
+    print(f"  unary_keywords: {unary_keywords}")
+    print(f"  binary_keywords: {binary_keywords}")
+    print("=" * 80 + "\n")
+
+    # Object pairs are now optional - an empty list will auto-generate all pairs in vine_model.py
+    object_pairs = []
 
     results = vine_pipe(
         inputs=video_file,
@@ -150,8 +358,17 @@ def process_video(
         box_threshold=box_threshold,
         text_threshold=text_threshold,
         target_fps=output_fps,
+        binary_confidence_threshold=binary_confidence_threshold,
     )
 
+    # Debug: Print what the pipeline returned
+    print("\n" + "=" * 80)
+    print("PIPELINE RESULTS DEBUG:")
+    print(f"  results type: {type(results)}")
+    if isinstance(results, dict):
+        print(f"  results keys: {list(results.keys())}")
+    print("=" * 80 + "\n")
+
     vine_pipe.box_threshold = box_threshold
     vine_pipe.text_threshold = text_threshold
     vine_pipe.target_fps = output_fps
@@ -194,7 +411,47 @@ def process_video(
             "Warning: annotated video not found or empty; check visualization settings."
         )
 
-    return video_path_for_ui, summary
+    # Debug: Print summary structure
+    import json
+    print("=" * 80)
+    print("SUMMARY DEBUG OUTPUT:")
+    print(f"Summary type: {type(summary)}")
+    print(f"Summary keys: {summary.keys() if isinstance(summary, dict) else 'N/A'}")
+    if isinstance(summary, dict):
+        print("\nFULL SUMMARY JSON:")
+        print(json.dumps(summary, indent=2, default=str))
+    print("\n" + "=" * 80)
+
+    # Check for any keys that might contain binary relation data
+    print("\nLOOKING FOR BINARY RELATION DATA:")
+    possible_keys = ['binary', 'binary_keywords', 'binary_relations', 'object_pairs',
+                     'pairs', 'relations', 'interactions', 'pairwise']
+    for pkey in possible_keys:
+        if pkey in summary:
+            print(f"  FOUND: '{pkey}' -> {summary[pkey]}")
+
+    print("\nALL KEYS IN SUMMARY:")
+    for key in summary.keys():
+        print(f"\n{key}:")
+        print(f"  Type: {type(summary[key])}")
+        if isinstance(summary[key], dict):
+            print(f"  Length: {len(summary[key])}")
+            print(f"  Keys (first 10): {list(summary[key].keys())[:10]}")
+            # Print all items for anything that might be binary relations
+            if any(term in key.lower() for term in ['binary', 'pair', 'relation', 'interaction']):
+                print("  ALL ITEMS:")
+                for k, v in list(summary[key].items())[:20]:  # First 20 items
+                    print(f"    {k}: {v}")
+            else:
+                print(f"  Sample: {dict(list(summary[key].items())[:2])}")
+        elif isinstance(summary[key], list):
+            print(f"  Length: {len(summary[key])}")
+            print(f"  Sample: {summary[key][:2]}")
+    print("=" * 80)
+
+    # Format summary as readable markdown text, filtering by confidence threshold
+    formatted_summary = format_summary(summary, binary_confidence_threshold)
+    return video_path_for_ui, formatted_summary
 
 
 def _video_component(label: str, *, is_output: bool = False):
@@ -214,6 +471,9 @@ def _video_component(label: str, *, is_output: bool = False):
         kwargs["type"] = "filepath"
     if "sources" in sig.parameters:
         kwargs["sources"] = ["upload"]
+    # Restrict to MP4 files only
+    if "file_types" in sig.parameters:
+        kwargs["file_types"] = [".mp4"]
 
     if is_output and "autoplay" in sig.parameters:
         kwargs["autoplay"] = True
@@ -240,40 +500,103 @@ def _create_blocks():
     return gr.Blocks(**blocks_kwargs)
 
 
-# Create Gradio interface
+# Create Gradio interface with two-column layout
 with _create_blocks() as demo:
-    video_input = _video_component("Upload Video", is_output=False)
-    categorical_input = gr.Textbox(
-        label="Categorical Keywords (comma-separated)",
-        value="person, car, tree, background",
-    )
-    unary_input = gr.Textbox(
-        label="Unary Keywords (comma-separated)", value="walking, running, standing"
-    )
-    binary_input = gr.Textbox(
-        label="Binary Keywords (comma-separated)",
-        placeholder="e.g., chasing, carrying",
-    )
-    pairs_input = gr.Textbox(
-        label="Object Pairs (comma-separated indices)",
-        placeholder="e.g., 0-1,0-2 for pairs of objects",
-    )
-    fps_input = gr.Number(
-        label="Output FPS (affects processing speed)", value=1  # default 1 FPS
-    )
-
-    with gr.Accordion("Advanced Settings", open=False):
-        box_threshold_input = gr.Slider(
-            label="Box Threshold", minimum=0.1, maximum=0.9, value=0.35, step=0.05
-        )
-        text_threshold_input = gr.Slider(
-            label="Text Threshold", minimum=0.1, maximum=0.9, value=0.25, step=0.05
-        )
-
-    submit_btn = gr.Button("Process Video", variant="primary")
-
-    video_output = _video_component("Output Video with Annotations", is_output=True)
-    json_output = gr.JSON(label="Summary of Detected Events")
+    gr.Markdown(
+        """
+        # 🎬 VINE: Video-based Interaction and Event Detection
+
+        Upload an MP4 video and specify keywords to detect objects, actions, and interactions in your video.
+        """
+    )
+
+    with gr.Row():
+        # Left column: Inputs
+        with gr.Column(scale=1):
+            gr.Markdown("### Input Configuration")
+
+            video_input = _video_component("Upload Video (MP4 only)", is_output=False)
+            gr.Markdown("*Note: Only MP4 format is currently supported*")
+
+            gr.Markdown("#### Detection Keywords")
+            categorical_input = gr.Textbox(
+                label="Categorical Keywords",
+                placeholder="e.g., person, car, dog",
+                value="person, car, dog",
+                info="Objects to detect in the video (comma-separated)"
+            )
+            unary_input = gr.Textbox(
+                label="Unary Keywords",
+                placeholder="e.g., walking, running, standing",
+                value="walking, running, standing",
+                info="Single-object actions to detect (comma-separated)"
+            )
+            binary_input = gr.Textbox(
+                label="Binary Keywords",
+                placeholder="e.g., chasing, carrying",
+                info="Object-to-object interactions to detect (comma-separated)"
+            )
+
+            gr.Markdown("#### Processing Settings")
+            fps_input = gr.Number(
+                label="Output FPS",
+                value=1,
+                info="Frames per second for processing (lower = faster)"
+            )
+
+            with gr.Accordion("Advanced Settings", open=False):
+                box_threshold_input = gr.Slider(
+                    label="Box Threshold",
+                    minimum=0.1,
+                    maximum=0.9,
+                    value=0.35,
+                    step=0.05,
+                    info="Confidence threshold for object detection"
+                )
+                text_threshold_input = gr.Slider(
+                    label="Text Threshold",
+                    minimum=0.1,
+                    maximum=0.9,
+                    value=0.25,
+                    step=0.05,
+                    info="Confidence threshold for text-based detection"
+                )
+                binary_confidence_input = gr.Slider(
+                    label="Binary Relation Confidence Threshold",
+                    minimum=0.0,
+                    maximum=1.0,
+                    value=0.8,
+                    step=0.05,
+                    info="Minimum confidence to show binary relations and object pairs"
+                )
+
+            submit_btn = gr.Button("🚀 Process Video", variant="primary", size="lg")
+
+        # Right column: Outputs
+        with gr.Column(scale=1):
+            gr.Markdown("### Results")
+
+            video_output = _video_component("Annotated Video Output", is_output=True)
+
+            gr.Markdown("### Detection Summary")
+            summary_output = gr.Markdown(
+                value="Results will appear here after processing...",
+                elem_classes=["summary-output"]
+            )
+
+    gr.Markdown(
+        """
+        ---
+        ### How to Use
+        1. Upload an MP4 video file
+        2. Specify the objects, actions, and interactions you want to detect
+        3. Adjust processing settings if needed (including binary relation confidence threshold)
+        4. Click "Process Video" to analyze
+
+        The system will automatically detect all binary relations between detected objects
+        and show only those with confidence above the threshold (default: 0.8).
+        """
+    )
 
     submit_btn.click(
         fn=process_video,
@@ -282,12 +605,12 @@ with _create_blocks() as demo:
             categorical_input,
            unary_input,
            binary_input,
-            pairs_input,
            fps_input,
            box_threshold_input,
            text_threshold_input,
+            binary_confidence_input,
        ],
-        outputs=[video_output, json_output],
+        outputs=[video_output, summary_output],
    )
 
 if __name__ == "__main__":
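
A quick way to sanity-check the new format_summary helper outside the Space is to feed it a hand-built summary dict. This is a minimal sketch, not part of the commit: the sample values are invented, and the "binary_keywords" payload shape follows the docstring added to _generate_summary in vine_hf/vine_pipeline.py below.

    # Smoke test for format_summary; the sample summary dict is invented to
    # match the "<from_id>-<to_id>" payload documented in vine_pipeline.py.
    sample_summary = {
        "categorical_keywords": {"person": {"count": 2}},
        "unary_keywords": {"walking": {"frames": [0, 1]}},
        "binary_keywords": {
            "0-1": {"predicate": "chasing", "confidence": 0.91, "frame_id": 3},
            "1-0": {"predicate": "chasing", "confidence": 0.42, "frame_id": 3},
        },
    }

    markdown = format_summary(sample_summary, binary_confidence_threshold=0.8)
    # "0-1" lands under "High Confidence"; "1-0" under "Lower Confidence".
    print(markdown)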
outputs/debug_crops/frame_0_obj_0.jpg CHANGED
outputs/debug_crops/frame_0_obj_1.jpg CHANGED
outputs/debug_crops/frame_0_obj_2.jpg CHANGED
outputs/debug_crops/frame_0_obj_3.jpg CHANGED
outputs/debug_crops/frame_0_obj_4.jpg CHANGED
outputs/debug_crops/frame_0_obj_5.jpg CHANGED
outputs/debug_crops/frame_1_obj_0.jpg CHANGED
outputs/debug_crops/frame_1_obj_1.jpg CHANGED
outputs/debug_crops/frame_1_obj_2.jpg CHANGED
outputs/debug_crops/frame_1_obj_3.jpg CHANGED
outputs/debug_crops/frame_1_obj_5.jpg CHANGED
src/LASER/laser/models/model_utils.py CHANGED
@@ -117,7 +117,12 @@ def crop_image_contain_bboxes(img, bbox_ls, data_id):
     return img[y1:y2, x1:x2]
 
 def extract_object_subject(img, red_mask, blue_mask, alpha=0.5, white_alpha=0.8):
-    # Ensure the masks are binary (0 or 1)
+    # Ensure the masks are 2D and binary (0 or 1)
+    if red_mask.ndim == 3:
+        red_mask = red_mask[:, :, 0]
+    if blue_mask.ndim == 3:
+        blue_mask = blue_mask[:, :, 0]
+
     red_mask = red_mask.astype(bool)
     blue_mask = blue_mask.astype(bool)
     non_masked_area = ~(red_mask | blue_mask)
@@ -126,16 +131,18 @@ def extract_object_subject(img, red_mask, blue_mask, alpha=0.5, white_alpha=0.8)
     b, g, r = cv2.split(img)
 
     # Adjust the red channel based on the red mask
-    r = np.where(red_mask[:, :, 0], np.clip(r + (255 - r) * alpha, 0, 255), r).astype(np.uint8)
+    r = np.where(red_mask, np.clip(r + (255 - r) * alpha, 0, 255), r).astype(np.uint8)
 
     # Adjust the blue channel based on the blue mask
-    b = np.where(blue_mask[:, :, 0], np.clip(b + (255 - b) * alpha, 0, 255), b).astype(np.uint8)
+    b = np.where(blue_mask, np.clip(b + (255 - b) * alpha, 0, 255), b).astype(np.uint8)
 
     # Merge the channels back together
     output_img = cv2.merge((b, g, r))
 
     white_img = np.full_like(output_img, 255, dtype=np.uint8)
-    output_img = np.where(non_masked_area, cv2.addWeighted(output_img, 1 - white_alpha, white_img, white_alpha, 0), output_img)
+    # Expand non_masked_area to 3D for proper broadcasting with 3-channel images
+    non_masked_area_3d = np.expand_dims(non_masked_area, axis=-1)
+    output_img = np.where(non_masked_area_3d, cv2.addWeighted(output_img, 1 - white_alpha, white_img, white_alpha, 0), output_img)
 
     return output_img
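
The broadcasting fix above matters because NumPy aligns shapes from the trailing axis: a (H, W) boolean mask cannot broadcast against an (H, W, 3) image. A minimal sketch of the failure mode and the fix, with made-up shapes:

    import numpy as np

    img = np.zeros((4, 4, 3), dtype=np.uint8)   # 3-channel BGR image
    mask = np.ones((4, 4), dtype=bool)          # 2D mask, as after the patch

    # np.where(mask, img, img) raises: trailing axes 4 vs 3 do not match.
    # Expanding to (4, 4, 1) lets each mask pixel broadcast across channels.
    mask_3d = np.expand_dims(mask, axis=-1)
    out = np.where(mask_3d, img, img)
    print(out.shape)  # (4, 4, 3)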
 
vine_hf/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/__init__.cpython-310.pyc and b/vine_hf/__pycache__/__init__.cpython-310.pyc differ
 
vine_hf/__pycache__/flattening.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/flattening.cpython-310.pyc and b/vine_hf/__pycache__/flattening.cpython-310.pyc differ
 
vine_hf/__pycache__/vine_config.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/vine_config.cpython-310.pyc and b/vine_hf/__pycache__/vine_config.cpython-310.pyc differ
 
vine_hf/__pycache__/vine_model.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/vine_model.cpython-310.pyc and b/vine_hf/__pycache__/vine_model.cpython-310.pyc differ
 
vine_hf/__pycache__/vine_pipeline.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/vine_pipeline.cpython-310.pyc and b/vine_hf/__pycache__/vine_pipeline.cpython-310.pyc differ
 
vine_hf/__pycache__/vis_utils.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/vis_utils.cpython-310.pyc and b/vine_hf/__pycache__/vis_utils.cpython-310.pyc differ
 
vine_hf/vine_model.py CHANGED
@@ -388,6 +388,24 @@ class VineModel(PreTrainedModel):
         batched_binary_kws = [list(binary_keywords)]
 
         batched_obj_pairs: List[Tuple[int, int, Tuple[int, int]]] = []
+
+        # Auto-generate all object pairs if binary_keywords provided but object_pairs is empty
+        if not object_pairs and binary_keywords:
+            # Get all unique object IDs across all frames
+            all_object_ids = set()
+            for frame_masks in masks.values():
+                all_object_ids.update(frame_masks.keys())
+
+            # Generate all bidirectional pairs (i, j) where i != j
+            object_pairs = []
+            sorted_ids = sorted(all_object_ids)
+            for from_oid in sorted_ids:
+                for to_oid in sorted_ids:
+                    if from_oid != to_oid:
+                        object_pairs.append((from_oid, to_oid))
+
+            print(f"Auto-generated {len(object_pairs)} bidirectional object pairs for binary relation detection: {object_pairs}")
+
         if object_pairs:
             for frame_id, frame_masks in masks.items():
                 if frame_id >= num_frames:
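
For reference, the nested loop added above enumerates every ordered pair of distinct object IDs; it is equivalent to itertools.permutations over the sorted IDs. A sketch with invented sample IDs:

    from itertools import permutations

    all_object_ids = {0, 1, 2}  # hypothetical detected object IDs
    object_pairs = list(permutations(sorted(all_object_ids), 2))
    print(object_pairs)  # [(0, 1), (0, 2), (1, 0), (1, 2), (2, 0), (2, 1)]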
vine_hf/vine_pipeline.py CHANGED
@@ -125,6 +125,8 @@ class VinePipeline(Pipeline):
             postprocess_kwargs["return_top_k"] = kwargs["return_top_k"]
         if "self.visualize" in kwargs:
             postprocess_kwargs["self.visualize"] = kwargs["self.visualize"]
+        if "binary_confidence_threshold" in kwargs:
+            postprocess_kwargs["binary_confidence_threshold"] = kwargs["binary_confidence_threshold"]
 
         return preprocess_kwargs, forward_kwargs, postprocess_kwargs
 
@@ -781,6 +783,9 @@ class VinePipeline(Pipeline):
         if debug_visualizations is None:
             debug_visualizations = self.debug_visualizations
 
+        # Get binary confidence threshold from kwargs (default 0.0 means show all)
+        binary_confidence_threshold = kwargs.get("binary_confidence_threshold", 0.0)
+
         vine_frame_sets = render_vine_frame_sets(
             frames_np,
             bboxes,
@@ -788,6 +793,7 @@ class VinePipeline(Pipeline):
             unary_lookup,
             binary_lookup,
             visualization_data.get("sam_masks"),
+            binary_confidence_threshold,
         )
 
         vine_visuals: Dict[str, Dict[str, Any]] = {}
@@ -872,11 +878,27 @@ class VinePipeline(Pipeline):
                     "top_categories": [{"label": str, "probability": float}, ...],
                     "top_unary": [{"frame_id": int, "predicate": str, "probability": float}, ...],
                 }
+            },
+            "binary_keywords": {
+                "<from_id>-<to_id>": {"predicate": str, "confidence": float, "frame_id": int}
             }
         }
         """
         categorical_preds = model_outputs.get("categorical_predictions", {})
         unary_preds = model_outputs.get("unary_predictions", {})
+        binary_preds = model_outputs.get("binary_predictions", {})
+
+        # Debug: Print binary predictions
+        print("\n" + "=" * 80)
+        print("DEBUG _generate_summary: Binary predictions from model")
+        print(f"  Type: {type(binary_preds)}")
+        print(f"  Length: {len(binary_preds) if isinstance(binary_preds, dict) else 'N/A'}")
+        print(f"  Keys (first 20): {list(binary_preds.keys())[:20] if isinstance(binary_preds, dict) else 'N/A'}")
+        if isinstance(binary_preds, dict) and len(binary_preds) > 0:
+            print("  Sample entries:")
+            for i, (key, val) in enumerate(list(binary_preds.items())[:5]):
+                print(f"    {key}: {val}")
+        print("=" * 80 + "\n")
 
         unary_by_obj: Dict[int, List[Tuple[float, str, int]]] = {}
         for (frame_id, obj_id), preds in unary_preds.items():
@@ -886,6 +908,24 @@ class VinePipeline(Pipeline):
             )
             unary_by_obj.setdefault(obj_id, []).append((prob_val, predicate, frame_id))
 
+        # Process binary predictions
+        binary_keywords: Dict[str, Dict[str, Any]] = {}
+        for (frame_id, (from_id, to_id)), preds in binary_preds.items():
+            for prob, predicate in preds:
+                prob_val = (
+                    float(prob.detach().cpu()) if torch.is_tensor(prob) else float(prob)
+                )
+                pair_key = f"{from_id}-{to_id}"
+                # Keep only the highest confidence prediction for each pair
+                if pair_key not in binary_keywords or prob_val > binary_keywords[pair_key]["confidence"]:
+                    binary_keywords[pair_key] = {
+                        "predicate": predicate,
+                        "confidence": prob_val,
+                        "frame_id": int(frame_id),
+                        "from_id": int(from_id),
+                        "to_id": int(to_id),
+                    }
+
         objects_summary: Dict[str, Dict[str, Any]] = {}
         all_obj_ids = set(categorical_preds.keys()) | set(unary_by_obj.keys())
 
@@ -927,4 +967,10 @@ class VinePipeline(Pipeline):
             "num_objects_detected": len(objects_summary),
             "objects": objects_summary,
         }
+
+        # Add binary keywords to summary if any exist
+        if binary_keywords:
+            summary["binary_keywords"] = binary_keywords
+            print(f"\nDEBUG: Added {len(binary_keywords)} binary keywords to summary")
+
         return summary
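
For context, a hedged sketch of how the new kwarg travels through a pipeline call, mirroring process_video in app.py; the video path is a placeholder and other kwargs such as box_threshold and target_fps are elided:

    results = vine_pipe(
        inputs="example.mp4",             # placeholder path
        categorical_keywords=["person", "car"],
        unary_keywords=["walking"],
        binary_keywords=["chasing"],
        object_pairs=[],                  # empty -> auto-generated in vine_model.py
        binary_confidence_threshold=0.8,  # routed to postprocess by _sanitize_parameters
    )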
vine_hf/vis_utils.py CHANGED
@@ -330,6 +330,7 @@ def render_vine_frame_sets(
     unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]],
     binary_lookup: Dict[int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]],
     masks: Union[Dict[int, Dict[int, Union[np.ndarray, torch.Tensor]]], List, None] = None,
+    binary_confidence_threshold: float = 0.0,
 ) -> Dict[str, List[np.ndarray]]:
     frame_groups: Dict[str, List[np.ndarray]] = {
         "object": [],
@@ -403,6 +404,9 @@ def render_vine_frame_sets(
             anchor, direction = _label_anchor_and_direction(bbox, "bottom")
             _draw_label_block(all_bgr, unary_lines, anchor, _object_color_bgr(obj_id), direction=direction)
 
+        # First pass: collect all pairs above threshold and deduplicate bidirectional pairs
+        pairs_to_draw = {}  # (min_id, max_id) -> (subj_id, obj_id, prob, relation)
+
         for obj_pair, relation_preds in binary_lookup.get(frame_idx, []):
             if len(obj_pair) != 2 or not relation_preds:
                 continue
@@ -411,17 +415,33 @@ def render_vine_frame_sets(
             obj_bbox = bbox_lookup.get(obj_id)
             if not subj_bbox or not obj_bbox:
                 continue
+            prob, relation = relation_preds[0]
+            # Filter by confidence threshold
+            if prob < binary_confidence_threshold:
+                continue
+
+            # Create canonical key (smaller_id, larger_id) for deduplication
+            pair_key = (min(subj_id, obj_id), max(subj_id, obj_id))
+
+            # Keep the higher confidence direction
+            if pair_key not in pairs_to_draw or prob > pairs_to_draw[pair_key][2]:
+                pairs_to_draw[pair_key] = (subj_id, obj_id, prob, relation)
+
+        # Second pass: draw the selected pairs
+        for subj_id, obj_id, prob, relation in pairs_to_draw.values():
+            subj_bbox = bbox_lookup.get(subj_id)
+            obj_bbox = bbox_lookup.get(obj_id)
             start, end = relation_line(subj_bbox, obj_bbox)
             color = tuple(int(c) for c in np.clip(
                 (np.array(_object_color_bgr(subj_id), dtype=np.float32) +
                  np.array(_object_color_bgr(obj_id), dtype=np.float32)) / 2.0,
                 0, 255
             ))
-            prob, relation = relation_preds[0]
             label_text = f"{relation} {prob:.2f}"
             mid_point = (int((start[0] + end[0]) / 2), int((start[1] + end[1]) / 2))
-            cv2.line(binary_bgr, start, end, color, 6, cv2.LINE_AA)
-            cv2.line(all_bgr, start, end, color, 6, cv2.LINE_AA)
+            # Draw arrowed lines showing direction from subject to object (smaller arrow tip)
+            cv2.arrowedLine(binary_bgr, start, end, color, 6, cv2.LINE_AA, tipLength=0.05)
+            cv2.arrowedLine(all_bgr, start, end, color, 6, cv2.LINE_AA, tipLength=0.05)
             _draw_centered_label(binary_bgr, label_text, mid_point, color)
             _draw_centered_label(all_bgr, label_text, mid_point, color)
 
@@ -440,6 +460,7 @@ def render_vine_frames(
     unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]],
    binary_lookup: Dict[int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]],
    masks: Union[Dict[int, Dict[int, Union[np.ndarray, torch.Tensor]]], List, None] = None,
+    binary_confidence_threshold: float = 0.0,
 ) -> List[np.ndarray]:
     return render_vine_frame_sets(
         frames,
@@ -448,6 +469,7 @@ def render_vine_frames(
         unary_lookup,
         binary_lookup,
         masks,
+        binary_confidence_threshold,
     ).get("all", [])
 
 def color_for_cate_correctness(obj_pred_dict, gt_labels, topk_object):
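
The two-pass deduplication above can be exercised standalone. This sketch uses invented predictions and keeps, for each unordered pair, only the higher-confidence direction:

    # Hypothetical per-frame predictions: (subj_id, obj_id) -> (prob, relation)
    preds = {(0, 1): (0.9, "chasing"), (1, 0): (0.4, "chasing"), (2, 0): (0.7, "carrying")}

    pairs_to_draw = {}
    for (subj_id, obj_id), (prob, relation) in preds.items():
        pair_key = (min(subj_id, obj_id), max(subj_id, obj_id))  # canonical unordered key
        if pair_key not in pairs_to_draw or prob > pairs_to_draw[pair_key][2]:
            pairs_to_draw[pair_key] = (subj_id, obj_id, prob, relation)

    print(sorted(pairs_to_draw.values()))
    # [(0, 1, 0.9, 'chasing'), (2, 0, 0.7, 'carrying')]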