Spaces: Running on T4

updates

- app.py +13 -255
- outputs/debug_crops/frame_0_obj_0.jpg +0 -0
- outputs/debug_crops/frame_0_obj_1.jpg +0 -0
- outputs/debug_crops/frame_0_obj_2.jpg +0 -0
- outputs/debug_crops/frame_0_obj_3.jpg +0 -0
- outputs/debug_crops/frame_0_obj_4.jpg +0 -0
- outputs/debug_crops/frame_0_obj_5.jpg +0 -0
- outputs/debug_crops/frame_1_obj_0.jpg +0 -0
- outputs/debug_crops/frame_1_obj_1.jpg +0 -0
- outputs/debug_crops/frame_1_obj_2.jpg +0 -0
- outputs/debug_crops/frame_1_obj_3.jpg +0 -0
- outputs/debug_crops/frame_1_obj_4.jpg +0 -0
- outputs/debug_crops/frame_1_obj_5.jpg +0 -0
- outputs/debug_crops/frame_1_obj_6.jpg +0 -0
- src/LASER/laser/models/model_utils.py +127 -52
- vine_hf/__pycache__/__init__.cpython-310.pyc +0 -0
- vine_hf/__pycache__/flattening.cpython-310.pyc +0 -0
- vine_hf/__pycache__/vine_config.cpython-310.pyc +0 -0
- vine_hf/__pycache__/vine_model.cpython-310.pyc +0 -0
- vine_hf/__pycache__/vine_pipeline.cpython-310.pyc +0 -0
- vine_hf/__pycache__/vis_utils.cpython-310.pyc +0 -0
- vine_hf/vine_pipeline.py +32 -1
- vine_hf/vis_utils.py +372 -175
app.py
CHANGED

@@ -60,206 +60,6 @@ print(
 )
 
 
-def format_summary(summary, binary_confidence_threshold=0.8):
-    """
-    Format the summary dictionary into a readable markdown string.
-    Filters binary relations by confidence threshold.
-    """
-    if not summary or not isinstance(summary, dict):
-        return "# Detection Summary\n\nNo events detected or processing in progress..."
-
-    output_lines = ["# Detection Summary\n"]
-    has_content = False
-
-    # Categorical keywords
-    if "categorical_keywords" in summary and summary["categorical_keywords"]:
-        output_lines.append("## Categorical Keywords\n")
-        cate = summary["categorical_keywords"]
-        if isinstance(cate, dict) and cate:
-            has_content = True
-            for kw, info in cate.items():
-                output_lines.append(f"**{kw}**")
-                if isinstance(info, dict):
-                    for key, val in info.items():
-                        output_lines.append(f"  - {key}: {val}")
-                else:
-                    output_lines.append(f"  - {info}")
-                output_lines.append("")
-        elif isinstance(cate, list) and cate:
-            has_content = True
-            for item in cate:
-                output_lines.append(f"- {item}")
-            output_lines.append("")
-
-    # Unary keywords
-    if "unary_keywords" in summary and summary["unary_keywords"]:
-        output_lines.append("## Unary Keywords\n")
-        unary = summary["unary_keywords"]
-        if isinstance(unary, dict) and unary:
-            has_content = True
-            for kw, info in unary.items():
-                output_lines.append(f"**{kw}**")
-                if isinstance(info, dict):
-                    for key, val in info.items():
-                        output_lines.append(f"  - {key}: {val}")
-                else:
-                    output_lines.append(f"  - {info}")
-                output_lines.append("")
-        elif isinstance(unary, list) and unary:
-            has_content = True
-            for item in unary:
-                output_lines.append(f"- {item}")
-            output_lines.append("")
-
-    # Binary keywords - show ALL binary relations for debugging
-    print(f"DEBUG: Checking binary_keywords...")
-    print(f"  'binary_keywords' in summary: {'binary_keywords' in summary}")
-    if 'binary_keywords' in summary:
-        print(f"  summary['binary_keywords'] truthy: {bool(summary['binary_keywords'])}")
-        print(f"  summary['binary_keywords'] type: {type(summary['binary_keywords'])}")
-        print(f"  summary['binary_keywords'] value: {summary['binary_keywords']}")
-
-    if "binary_keywords" in summary and summary["binary_keywords"]:
-        output_lines.append(f"## Binary Keywords\n")
-        binary = summary["binary_keywords"]
-        print(f"DEBUG: Processing binary keywords, type: {type(binary)}, length: {len(binary) if isinstance(binary, (dict, list)) else 'N/A'}")
-        if isinstance(binary, dict) and binary:
-            has_content = True
-            # Show all binary relations, sorted by confidence
-            binary_items = []
-            for kw, info in binary.items():
-                if isinstance(info, dict):
-                    confidence = info.get("confidence", info.get("score", 0))
-                    binary_items.append((kw, info, confidence))
-                else:
-                    binary_items.append((kw, info, 0))
-
-            # Sort by confidence descending
-            binary_items.sort(key=lambda x: x[2], reverse=True)
-
-            high_conf_count = 0
-            low_conf_count = 0
-
-            # Show high confidence items first
-            output_lines.append(f"### High Confidence (≥ {binary_confidence_threshold})\n")
-            for kw, info, confidence in binary_items:
-                if confidence >= binary_confidence_threshold:
-                    high_conf_count += 1
-                    if isinstance(info, dict):
-                        output_lines.append(f"**{kw}** (confidence: {confidence:.2f})")
-                        for key, val in info.items():
-                            if key not in ["confidence", "score"]:
-                                output_lines.append(f"  - {key}: {val}")
-                    else:
-                        output_lines.append(f"**{kw}**: {info}")
-                    output_lines.append("")
-
-            if high_conf_count == 0:
-                output_lines.append(f"*No binary relations found with confidence ≥ {binary_confidence_threshold}*\n")
-
-            # Show lower confidence items for debugging
-            output_lines.append(f"### Lower Confidence (< {binary_confidence_threshold})\n")
-            for kw, info, confidence in binary_items:
-                if confidence < binary_confidence_threshold:
-                    low_conf_count += 1
-                    if isinstance(info, dict):
-                        output_lines.append(f"**{kw}** (confidence: {confidence:.2f})")
-                        for key, val in info.items():
-                            if key not in ["confidence", "score"]:
-                                output_lines.append(f"  - {key}: {val}")
-                    else:
-                        output_lines.append(f"**{kw}**: {info}")
-                    output_lines.append("")
-
-            if low_conf_count == 0:
-                output_lines.append(f"*No binary relations found with confidence < {binary_confidence_threshold}*\n")
-
-            output_lines.append(f"**Total binary relations detected: {len(binary_items)}**\n")
-        elif isinstance(binary, list) and binary:
-            has_content = True
-            for item in binary:
-                output_lines.append(f"- {item}")
-            output_lines.append("")
-
-    # Object pairs - show ALL object pair interactions for debugging
-    print(f"DEBUG: Checking object_pairs...")
-    print(f"  'object_pairs' in summary: {'object_pairs' in summary}")
-    if 'object_pairs' in summary:
-        print(f"  summary['object_pairs'] truthy: {bool(summary['object_pairs'])}")
-        print(f"  summary['object_pairs'] type: {type(summary['object_pairs'])}")
-        print(f"  summary['object_pairs'] value: {summary['object_pairs']}")
-
-    if "object_pairs" in summary and summary["object_pairs"]:
-        output_lines.append(f"## Object Pair Interactions\n")
-        pairs = summary["object_pairs"]
-        print(f"DEBUG: Processing object pairs, type: {type(pairs)}, length: {len(pairs) if isinstance(pairs, (dict, list)) else 'N/A'}")
-        if isinstance(pairs, dict) and pairs:
-            has_content = True
-            # Show all object pairs, sorted by confidence
-            pair_items = []
-            for pair, info in pairs.items():
-                if isinstance(info, dict):
-                    confidence = info.get("confidence", info.get("score", 0))
-                    pair_items.append((pair, info, confidence))
-                else:
-                    pair_items.append((pair, info, 0))
-
-            # Sort by confidence descending
-            pair_items.sort(key=lambda x: x[2], reverse=True)
-
-            high_conf_count = 0
-            low_conf_count = 0
-
-            # Show high confidence items first
-            output_lines.append(f"### High Confidence (≥ {binary_confidence_threshold})\n")
-            for pair, info, confidence in pair_items:
-                if confidence >= binary_confidence_threshold:
-                    high_conf_count += 1
-                    if isinstance(info, dict):
-                        output_lines.append(f"**{pair}** (confidence: {confidence:.2f})")
-                        for key, val in info.items():
-                            if key not in ["confidence", "score"]:
-                                output_lines.append(f"  - {key}: {val}")
-                    else:
-                        output_lines.append(f"**{pair}**: {info}")
-                    output_lines.append("")
-
-            if high_conf_count == 0:
-                output_lines.append(f"*No object pairs found with confidence ≥ {binary_confidence_threshold}*\n")
-
-            # Show lower confidence items for debugging
-            output_lines.append(f"### Lower Confidence (< {binary_confidence_threshold})\n")
-            for pair, info, confidence in pair_items:
-                if confidence < binary_confidence_threshold:
-                    low_conf_count += 1
-                    if isinstance(info, dict):
-                        output_lines.append(f"**{pair}** (confidence: {confidence:.2f})")
-                        for key, val in info.items():
-                            if key not in ["confidence", "score"]:
-                                output_lines.append(f"  - {key}: {val}")
-                    else:
-                        output_lines.append(f"**{pair}**: {info}")
-                    output_lines.append("")
-
-            if low_conf_count == 0:
-                output_lines.append(f"*No object pairs found with confidence < {binary_confidence_threshold}*\n")
-
-            output_lines.append(f"**Total object pairs detected: {len(pair_items)}**\n")
-        elif isinstance(pairs, list) and pairs:
-            has_content = True
-            for item in pairs:
-                output_lines.append(f"- {item}")
-            output_lines.append("")
-
-    # If no content was added, show the raw summary for debugging
-    if not has_content:
-        output_lines.append("## Raw Summary Data\n")
-        output_lines.append("```json")
-        import json
-        output_lines.append(json.dumps(summary, indent=2, default=str))
-        output_lines.append("```")
-
-    return "\n".join(output_lines)
 
 
 @lru_cache(maxsize=1)

@@ -394,9 +194,10 @@ def process_video(
     summary = results_dict.get("summary") or {}
 
     if result_video_path and os.path.exists(result_video_path):
-        gradio_tmp =
-        os.environ.get("GRADIO_TEMP_DIR", tempfile.gettempdir())
-
+        gradio_tmp = (
+            Path(os.environ.get("GRADIO_TEMP_DIR", tempfile.gettempdir()))
+            / "vine_outputs"
+        )
         gradio_tmp.mkdir(parents=True, exist_ok=True)
         dest_path = gradio_tmp / Path(result_video_path).name
         try:

@@ -411,47 +212,7 @@ def process_video(
             "Warning: annotated video not found or empty; check visualization settings."
         )
 
-
-    import json
-    print("=" * 80)
-    print("SUMMARY DEBUG OUTPUT:")
-    print(f"Summary type: {type(summary)}")
-    print(f"Summary keys: {summary.keys() if isinstance(summary, dict) else 'N/A'}")
-    if isinstance(summary, dict):
-        print("\nFULL SUMMARY JSON:")
-        print(json.dumps(summary, indent=2, default=str))
-        print("\n" + "=" * 80)
-
-        # Check for any keys that might contain binary relation data
-        print("\nLOOKING FOR BINARY RELATION DATA:")
-        possible_keys = ['binary', 'binary_keywords', 'binary_relations', 'object_pairs',
-                         'pairs', 'relations', 'interactions', 'pairwise']
-        for pkey in possible_keys:
-            if pkey in summary:
-                print(f"  FOUND: '{pkey}' -> {summary[pkey]}")
-
-        print("\nALL KEYS IN SUMMARY:")
-        for key in summary.keys():
-            print(f"\n{key}:")
-            print(f"  Type: {type(summary[key])}")
-            if isinstance(summary[key], dict):
-                print(f"  Length: {len(summary[key])}")
-                print(f"  Keys (first 10): {list(summary[key].keys())[:10]}")
-                # Print all items for anything that might be binary relations
-                if any(term in key.lower() for term in ['binary', 'pair', 'relation', 'interaction']):
-                    print(f"  ALL ITEMS:")
-                    for k, v in list(summary[key].items())[:20]:  # First 20 items
-                        print(f"    {k}: {v}")
-                else:
-                    print(f"  Sample: {dict(list(summary[key].items())[:2])}")
-            elif isinstance(summary[key], list):
-                print(f"  Length: {len(summary[key])}")
-                print(f"  Sample: {summary[key][:2]}")
-        print("=" * 80)
-
-    # Format summary as readable markdown text, filtering by confidence threshold
-    formatted_summary = format_summary(summary, binary_confidence_threshold)
-    return video_path_for_ui, formatted_summary
+    return video_path_for_ui, summary
 
 
 def _video_component(label: str, *, is_output: bool = False):

@@ -523,25 +284,25 @@ with _create_blocks() as demo:
     label="Categorical Keywords",
     placeholder="e.g., person, car, dog",
     value="person, car, dog",
-    info="Objects to detect in the video (comma-separated)"
+    info="Objects to detect in the video (comma-separated)",
 )
 unary_input = gr.Textbox(
     label="Unary Keywords",
     placeholder="e.g., walking, running, standing",
     value="walking, running, standing",
-    info="Single-object actions to detect (comma-separated)"
+    info="Single-object actions to detect (comma-separated)",
 )
 binary_input = gr.Textbox(
     label="Binary Keywords",
     placeholder="e.g., chasing, carrying",
-    info="Object-to-object interactions to detect (comma-separated)"
+    info="Object-to-object interactions to detect (comma-separated)",
 )
 
 gr.Markdown("#### Processing Settings")
 fps_input = gr.Number(
     label="Output FPS",
     value=1,
-    info="Frames per second for processing (lower = faster)"
+    info="Frames per second for processing (lower = faster)",
 )
 
 with gr.Accordion("Advanced Settings", open=False):

@@ -551,7 +312,7 @@ with _create_blocks() as demo:
     maximum=0.9,
     value=0.35,
     step=0.05,
-    info="Confidence threshold for object detection"
+    info="Confidence threshold for object detection",
 )
 text_threshold_input = gr.Slider(
     label="Text Threshold",

@@ -559,7 +320,7 @@ with _create_blocks() as demo:
     maximum=0.9,
     value=0.25,
     step=0.05,
-    info="Confidence threshold for text-based detection"
+    info="Confidence threshold for text-based detection",
 )
 binary_confidence_input = gr.Slider(
     label="Binary Relation Confidence Threshold",

@@ -567,7 +328,7 @@ with _create_blocks() as demo:
     maximum=1.0,
     value=0.8,
     step=0.05,
-    info="Minimum confidence to show binary relations and object pairs"
+    info="Minimum confidence to show binary relations and object pairs",
 )
 
 submit_btn = gr.Button("🚀 Process Video", variant="primary", size="lg")

@@ -579,10 +340,7 @@ with _create_blocks() as demo:
 video_output = _video_component("Annotated Video Output", is_output=True)
 
 gr.Markdown("### Detection Summary")
-summary_output = gr.
-    value="Results will appear here after processing...",
-    elem_classes=["summary-output"]
-)
+summary_output = gr.JSON(label="Summary of Detected Events")
 
 gr.Markdown(
     """
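The net effect of the app.py changes: the long format_summary() markdown renderer and the debug dumps are removed, process_video() now returns the raw summary dict, and the UI shows it in a gr.JSON component. Below is a minimal sketch of that wiring, assuming a process_video() with the same return shape as in the diff; the component layout and the dummy summary values are illustrative, not taken from the actual Space.

import gradio as gr

def process_video(video_path, binary_confidence_threshold=0.8):
    # ... detection pipeline runs here (omitted) ...
    summary = {
        "categorical_keywords": {"person": {"count": 2}},                 # dummy values
        "binary_keywords": {"person chasing dog": {"confidence": 0.91}},  # dummy values
    }
    return video_path, summary  # raw dict; no markdown formatting step anymore

with gr.Blocks() as demo:
    video_input = gr.Video(label="Input Video")
    video_output = gr.Video(label="Annotated Video Output")
    gr.Markdown("### Detection Summary")
    # gr.JSON renders the nested dict directly, replacing the old format_summary() text
    summary_output = gr.JSON(label="Summary of Detected Events")
    gr.Button("Process Video").click(
        process_video, inputs=video_input, outputs=[video_output, summary_output]
    )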
outputs/debug_crops/frame_0_obj_0.jpg CHANGED
outputs/debug_crops/frame_0_obj_1.jpg CHANGED
outputs/debug_crops/frame_0_obj_2.jpg CHANGED
outputs/debug_crops/frame_0_obj_3.jpg CHANGED
outputs/debug_crops/frame_0_obj_4.jpg CHANGED
outputs/debug_crops/frame_0_obj_5.jpg CHANGED
outputs/debug_crops/frame_1_obj_0.jpg CHANGED
outputs/debug_crops/frame_1_obj_1.jpg CHANGED
outputs/debug_crops/frame_1_obj_2.jpg CHANGED
outputs/debug_crops/frame_1_obj_3.jpg CHANGED
outputs/debug_crops/frame_1_obj_4.jpg CHANGED
outputs/debug_crops/frame_1_obj_5.jpg CHANGED
outputs/debug_crops/frame_1_obj_6.jpg CHANGED
src/LASER/laser/models/model_utils.py
CHANGED

@@ -6,20 +6,22 @@ import torch
 import jax.numpy as jnp
 import jax
 
+
 def increase_brightness(img, alpha=0.2):
     height, width, _ = img.shape
-    white_img = np.zeros([height,width,3],dtype=np.uint8)
-    white_img.fill(255)
+    white_img = np.zeros([height, width, 3], dtype=np.uint8)
+    white_img.fill(255)  # or img[:] = 255
 
-    dst = cv2.addWeighted(img, alpha
+    dst = cv2.addWeighted(img, alpha, white_img, 1 - alpha, 0)
     return dst
 
+
 def increase_brightness_except(img, bbox_ls, alpha=0.2):
     height, width, _ = img.shape
-    white_img = np.zeros([height,width,3],dtype=np.uint8)
-    white_img.fill(255)
+    white_img = np.zeros([height, width, 3], dtype=np.uint8)
+    white_img.fill(255)  # or img[:] = 255
 
-    output_img = cv2.addWeighted(img, alpha
+    output_img = cv2.addWeighted(img, alpha, white_img, 1 - alpha, 0)
 
     for x1, y1, x2, y2 in bbox_ls:
         output_img[y1:y2, x1:x2] = img[y1:y2, x1:x2]

@@ -28,12 +30,12 @@ def increase_brightness_except(img, bbox_ls, alpha=0.2):
 
 def extract_single_object(img, mask, alpha=0.8):
     """OpenCV version of extract_single_object that works with numpy arrays.
-
+
     Args:
         img: numpy array of shape (height, width, 3)
         mask: numpy array of shape (height, width, 1) or (height, width)
         alpha: float between 0 and 1 for blending
-
+
     Returns:
         numpy array of shape (height, width, 3)
     """

@@ -51,18 +53,21 @@ def extract_single_object(img, mask, alpha=0.8):
     masked_white_img = np.where(mask, white_img, img)
 
     # Blend the original image with the masked white image
-    output_img = cv2.addWeighted(
+    output_img = cv2.addWeighted(
+        img.astype(np.uint8), 1 - alpha, masked_white_img.astype(np.uint8), alpha, 0
+    )
 
     return output_img
 
+
 def extract_single_object_jax(img, mask, alpha=0.8):
     """JAX version of extract_single_object that works with JAX arrays.
-
+
     Args:
         img: JAX array of shape (height, width, 3)
         mask: JAX array of shape (height, width, 1) or (height, width)
         alpha: float between 0 and 1 for blending
-
+
     Returns:
         JAX array of shape (height, width, 3)
     """

@@ -80,10 +85,11 @@ def extract_single_object_jax(img, mask, alpha=0.8):
     masked_white_img = jnp.where(mask, white_img, img)
 
     # Blend the original image with the masked white image
-    output_img = img * (1-alpha) + masked_white_img * alpha
+    output_img = img * (1 - alpha) + masked_white_img * alpha
 
     return output_img
 
+
 def crop_image_contain_bboxes(img, bbox_ls, data_id):
     all_bx1 = []
     all_by1 = []

@@ -92,9 +98,11 @@ def crop_image_contain_bboxes(img, bbox_ls, data_id):
 
     for bbox in bbox_ls:
         if isinstance(bbox, dict):
-            bx1, by1, bx2, by2 = bbox[
+            bx1, by1, bx2, by2 = bbox["x1"], bbox["y1"], bbox["x2"], bbox["y2"]
         elif isinstance(bbox, (list, tuple, np.ndarray)):
-            bx1, by1, bx2, by2 = map(
+            bx1, by1, bx2, by2 = map(
+                int, bbox[:4]
+            )  # Convert first 4 elements to integers
         else:
             raise ValueError(f"Unsupported bbox format: {type(bbox)}")
 

@@ -111,13 +119,36 @@ def crop_image_contain_bboxes(img, bbox_ls, data_id):
     y1 = min(all_by1)
     y2 = max(all_by2)
 
-    assert
-    assert
+    assert x1 < x2, f"image bbox issue: {data_id}"
+    assert y1 < y2, f"image bbox issue: {data_id}"
 
     return img[y1:y2, x1:x2]
 
+
+import numpy as np
+import cv2
+
+
 def extract_object_subject(img, red_mask, blue_mask, alpha=0.5, white_alpha=0.8):
-
+    """
+    Blend subject/object regions into the image:
+      - red_mask: subject
+      - blue_mask: object
+      - alpha: how strong color highlight is
+      - white_alpha: how strongly to fade background toward white
+    """
+
+    # Ensure img is uint8 HxWx3
+    img = np.asarray(img)
+    if img.ndim == 2:
+        img = cv2.cvtColor(img, cv2.COLOR_GRAY2BGR)
+    if img.dtype != np.uint8:
+        img = (img * 255).astype(np.uint8) if img.max() <= 1.0 else img.astype(np.uint8)
+
+    # Normalize masks to 2D
+    red_mask = np.asarray(red_mask)
+    blue_mask = np.asarray(blue_mask)
+
     if red_mask.ndim == 3:
         red_mask = red_mask[:, :, 0]
     if blue_mask.ndim == 3:

@@ -125,44 +156,62 @@ def extract_object_subject(img, red_mask, blue_mask, alpha=0.5, white_alpha=0.8)
 
     red_mask = red_mask.astype(bool)
     blue_mask = blue_mask.astype(bool)
+
+    # Background = areas not in either mask
     non_masked_area = ~(red_mask | blue_mask)
 
-    # Split
+    # Split channels
     b, g, r = cv2.split(img)
 
-    #
-    r = np.where(
+    # Highlight red region
+    r = np.where(
+        red_mask,
+        np.clip(r + (255 - r) * alpha, 0, 255),
+        r,
+    )
 
-    # Adjust the blue channel based on the blue mask
-    b = np.where(blue_mask, np.clip(b + (255 - b) * alpha, 0, 255), b).astype(np.uint8)
-
-    # Merge the channels back together
+    # Highlight blue region
+    b = np.where(
+        blue_mask,
+        np.clip(b + (255 - b) * alpha, 0, 255),
+        b,
+    )
+
+    # Ensure proper dtype
+    b = b.astype(np.uint8)
+    g = g.astype(np.uint8)
+    r = r.astype(np.uint8)
 
     output_img = cv2.merge((b, g, r))
 
+    # Fade non-masked area toward white
     white_img = np.full_like(output_img, 255, dtype=np.uint8)
-
-
-
+    non_masked_area_3d = non_masked_area[
+        ..., None
+    ]  # (H, W, 1) -> broadcast to (H, W, 3)
+
+    faded = cv2.addWeighted(output_img, 1 - white_alpha, white_img, white_alpha, 0)
+    output_img = np.where(non_masked_area_3d, faded, output_img)
 
     return output_img
 
 
 def extract_object_subject_jax(img, red_mask, blue_mask, alpha=0.5, white_alpha=0.8):
     """JAX version of extract_object_subject that works with JAX arrays.
-
+
     Args:
         img: JAX array of shape (height, width, 3) in BGR format
         red_mask: JAX array of shape (height, width, 1) or (height, width)
         blue_mask: JAX array of shape (height, width, 1) or (height, width)
         alpha: float between 0 and 1 for color highlighting
         white_alpha: float between 0 and 1 for background blending
-
+
     Returns:
         JAX array of shape (height, width, 3) in BGR format with uint8 dtype
     """
     # Convert input image to float32 for calculations
     img = img.astype(jnp.float32)
-
+
     # Ensure the masks are binary (0 or 1)
     red_mask = red_mask.astype(bool)
     blue_mask = blue_mask.astype(bool)

@@ -179,54 +228,58 @@ def extract_object_subject_jax(img, red_mask, blue_mask, alpha=0.5, white_alpha=
     r = img[..., 2]  # Red channel
 
     # Adjust the red channel based on the red mask
-    r = jnp.where(red_mask[..., 0],
-                  jnp.clip(r + (255 - r) * alpha, 0, 255),
-                  r)
+    r = jnp.where(red_mask[..., 0], jnp.clip(r + (255 - r) * alpha, 0, 255), r)
 
     # Adjust the blue channel based on the blue mask
-    b = jnp.where(blue_mask[..., 0],
-                  jnp.clip(b + (255 - b) * alpha, 0, 255),
-                  b)
+    b = jnp.where(blue_mask[..., 0], jnp.clip(b + (255 - b) * alpha, 0, 255), b)
 
     # Stack the channels back together
     output_img = jnp.stack([b, g, r], axis=-1)
 
     # Create white background and blend
     white_img = jnp.full_like(output_img, 255.0, dtype=jnp.float32)
-    output_img = jnp.where(
-
-
+    output_img = jnp.where(
+        non_masked_area,
+        output_img * (1 - white_alpha) + white_img * white_alpha,
+        output_img,
+    )
 
     # Round to nearest integer and cast to uint8
     output_img = jnp.round(output_img)
     return output_img.astype(jnp.uint8)
 
-
+
+def increase_brightness_draw_outer_edge(
+    img, bbox_ls, alpha=0.2, colormap_name="Set1", thickness=2
+):
     if isinstance(img, torch.Tensor):
         img = img.cpu().numpy().astype(np.uint8)
     else:
         img = img.astype(np.uint8)
     height, width, _ = img.shape
-    white_img = np.zeros([height,width,3],dtype=np.uint8)
-    white_img.fill(255)
+    white_img = np.zeros([height, width, 3], dtype=np.uint8)
+    white_img.fill(255)  # or img[:] = 255
 
-    output_img = cv2.addWeighted(img, alpha
+    output_img = cv2.addWeighted(img, alpha, white_img, 1 - alpha, 0)
     colormap = plt.colormaps[colormap_name]
 
     for bbox_id, (x1, y1, x2, y2) in enumerate(bbox_ls):
         output_img[y1:y2, x1:x2] = img[y1:y2, x1:x2]
-        color =
+        color = [c * 255 for c in mpl.colors.to_rgb(colormap(bbox_id))]
         # print(f"color: {color}")
         output_img = cv2.rectangle(output_img, (x1, y1), (x2, y2), color, thickness)
 
     return torch.tensor(output_img, dtype=torch.float32)
 
+
 def get_print_hook(name):
     def print_hook(grad):
         print(f"{name}: \n {grad} \n")
         return grad
+
     return print_hook
 
+
 def segment_list(l, n=5):
     current_seg = []
     all_segs = []

@@ -242,18 +295,22 @@ def segment_list(l, n=5):
 
     return all_segs
 
+
 def get_tensor_size(a):
     return a.element_size() * a.nelement()
 
+
 def comp_diff(v1, v2):
     return 2 * torch.abs(v1 - v2) / (v1 + v2)
 
+
 def gather_names(pred_res):
     all_names = set()
     for name, _ in pred_res:
         all_names.add(name)
     return list(all_names)
 
+
 def extract_nl_feats(tokenizer, model, names, device):
     if len(names) == 0:
         features = []

@@ -262,14 +319,23 @@ def extract_nl_feats(tokenizer, model, names, device):
     features = model.get_text_features(**name_tokens)
     return features
 
-
+
+def extract_all_nl_feats(
+    tokenizer,
+    model,
+    batch_size,
+    batched_names,
+    batched_unary_kws,
+    batched_binary_kws,
+    device,
+):
     batched_obj_name_features = [[] for _ in range(batch_size)]
     batched_unary_nl_features = [[] for _ in range(batch_size)]
     batched_binary_nl_features = [[] for _ in range(batch_size)]
-
-    for vid, (object_names, unary_kws, binary_kws) in \
-        enumerate(zip(batched_names, batched_unary_kws, batched_binary_kws)):
+
+    for vid, (object_names, unary_kws, binary_kws) in enumerate(
+        zip(batched_names, batched_unary_kws, batched_binary_kws)
+    ):
         obj_name_features = extract_nl_feats(tokenizer, model, object_names, device)
         batched_obj_name_features[vid] = obj_name_features
 

@@ -279,22 +345,31 @@ def extract_all_nl_feats(tokenizer, model, batch_size, batched_names, batched_un
     binary_features = extract_nl_feats(tokenizer, model, binary_kws, device)
     batched_binary_nl_features[vid] = binary_features
 
-    return
+    return (
+        batched_obj_name_features,
+        batched_unary_nl_features,
+        batched_binary_nl_features,
+    )
+
 
-def single_object_crop(
+def single_object_crop(
+    batch_size, batched_videos, batched_object_ids, batched_bboxes, batched_video_splits
+):
     batched_frame_bboxes = {}
     batched_cropped_objs = [[] for _ in range(batch_size)]
 
     for (video_id, frame_id, obj_id), bbox in zip(batched_object_ids, batched_bboxes):
         overall_frame_id = batched_video_splits[video_id] + frame_id
         if type(bbox) == dict:
-            bx1, by1, bx2, by2 = bbox[
+            bx1, by1, bx2, by2 = bbox["x1"], bbox["y1"], bbox["x2"], bbox["y2"]
         else:
            bx1, by1, bx2, by2 = int(bbox[0]), int(bbox[1]), int(bbox[2]), int(bbox[3])
 
         assert by2 > by1
         assert bx2 > bx1
-        batched_cropped_objs[video_id].append(
+        batched_cropped_objs[video_id].append(
+            (batched_videos[overall_frame_id][by1:by2, bx1:bx2])
+        )
         batched_frame_bboxes[video_id, frame_id, obj_id] = (bx1, by1, bx2, by2)
 
     return batched_cropped_objs, batched_frame_bboxes
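Several of the helpers in this file (increase_brightness, increase_brightness_except, increase_brightness_draw_outer_edge) rely on the same cv2.addWeighted blend toward a white image. A standalone sketch of just that blend, using a synthetic frame rather than real pipeline data:

import cv2
import numpy as np

alpha = 0.2
img = np.full((120, 160, 3), 60, dtype=np.uint8)  # dark dummy frame
white = np.full_like(img, 255)

# out = alpha * img + (1 - alpha) * white, i.e. the frame fades toward white
brightened = cv2.addWeighted(img, alpha, white, 1 - alpha, 0)
print(brightened[0, 0])  # [216 216 216] for alpha=0.2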
vine_hf/__pycache__/__init__.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/__init__.cpython-310.pyc and b/vine_hf/__pycache__/__init__.cpython-310.pyc differ

vine_hf/__pycache__/flattening.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/flattening.cpython-310.pyc and b/vine_hf/__pycache__/flattening.cpython-310.pyc differ

vine_hf/__pycache__/vine_config.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/vine_config.cpython-310.pyc and b/vine_hf/__pycache__/vine_config.cpython-310.pyc differ

vine_hf/__pycache__/vine_model.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/vine_model.cpython-310.pyc and b/vine_hf/__pycache__/vine_model.cpython-310.pyc differ

vine_hf/__pycache__/vine_pipeline.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/vine_pipeline.cpython-310.pyc and b/vine_hf/__pycache__/vine_pipeline.cpython-310.pyc differ

vine_hf/__pycache__/vis_utils.cpython-310.pyc CHANGED
Binary files a/vine_hf/__pycache__/vis_utils.cpython-310.pyc and b/vine_hf/__pycache__/vis_utils.cpython-310.pyc differ
vine_hf/vine_pipeline.py
CHANGED

@@ -586,8 +586,17 @@ class VinePipeline(Pipeline):
         import subprocess
 
         try:
+            # Try to get FFmpeg from imageio-ffmpeg first, then fall back to system FFmpeg
+            try:
+                import imageio_ffmpeg
+                ffmpeg_exe = imageio_ffmpeg.get_ffmpeg_exe()
+                print(f"Using FFmpeg from imageio-ffmpeg: {ffmpeg_exe}")
+            except ImportError:
+                ffmpeg_exe = "ffmpeg"
+                print("Using system FFmpeg")
+
             ffmpeg_cmd = [
-
+                ffmpeg_exe,
                 "-y",
                 "-f",
                 "rawvideo",

@@ -657,6 +666,10 @@ class VinePipeline(Pipeline):
         out = None
         used_codec = None
 
+        # Debug: Print video tensor info
+        print(f"DEBUG: video_tensor shape: {video_tensor.shape}, dtype: {video_tensor.dtype}")
+        print(f"DEBUG: Expected dimensions - width: {width}, height: {height}, fps: {fps}")
+
         for codec in codecs_to_try:
             try:
                 fourcc = cv2.VideoWriter_fourcc(*codec)

@@ -679,19 +692,37 @@ class VinePipeline(Pipeline):
 
         print(f"Using OpenCV with codec: {used_codec}")
 
+        frame_count = 0
         for frame in video_tensor:
+            # Debug: Print first frame info
+            if frame_count == 0:
+                print(f"DEBUG: First frame shape: {frame.shape}, dtype: {frame.dtype}")
+                print(f"DEBUG: First frame min: {frame.min()}, max: {frame.max()}, mean: {frame.mean()}")
+
             if len(frame.shape) == 3 and frame.shape[2] == 3:
                 frame_bgr = cv2.cvtColor(frame, cv2.COLOR_RGB2BGR)
             else:
                 frame_bgr = frame
+
             if frame_bgr.dtype != np.uint8:
                 frame_bgr = (
                     (frame_bgr * 255).astype(np.uint8)
                     if frame_bgr.max() <= 1
                     else frame_bgr.astype(np.uint8)
                 )
+
+            # Debug: Check if frame dimensions match VideoWriter expectations
+            if frame_count == 0:
+                print(f"DEBUG: After conversion - frame_bgr shape: {frame_bgr.shape}, dtype: {frame_bgr.dtype}")
+                print(f"DEBUG: After conversion - min: {frame_bgr.min()}, max: {frame_bgr.max()}")
+                actual_height, actual_width = frame_bgr.shape[:2]
+                if actual_height != height or actual_width != width:
+                    print(f"WARNING: Frame size mismatch! Expected ({height}, {width}), got ({actual_height}, {actual_width})")
+
             out.write(frame_bgr)
+            frame_count += 1
 
+        print(f"DEBUG: Wrote {frame_count} frames to video")
         out.release()
         return temp_path
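The first vine_pipeline.py hunk resolves the FFmpeg binary through imageio-ffmpeg before falling back to whatever `ffmpeg` is on PATH. The same pattern in isolation; the rawvideo arguments below are placeholders, not the pipeline's actual encoding settings:

import subprocess

def resolve_ffmpeg():
    try:
        import imageio_ffmpeg
        return imageio_ffmpeg.get_ffmpeg_exe()  # bundled binary, no system install needed
    except ImportError:
        return "ffmpeg"                         # fall back to system FFmpeg

ffmpeg_cmd = [
    resolve_ffmpeg(), "-y",
    "-f", "rawvideo", "-pix_fmt", "rgb24", "-s", "640x480", "-r", "1", "-i", "-",
    "-c:v", "libx264", "-pix_fmt", "yuv420p", "output.mp4",
]
# frames would then be piped in via:
# proc = subprocess.Popen(ffmpeg_cmd, stdin=subprocess.PIPE)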
vine_hf/vis_utils.py
CHANGED

(Only the removed side of this diff survived extraction; several removed lines are truncated and are left as bare "-" markers.)

@@ -54,10 +54,12 @@ from laser.preprocess.mask_generation_grounding_dino import mask_to_bbox
 # All rendered frames returned by functions are RGB np.ndarray images suitable for saving or video writing.
 ########################################################################################
 
 def clean_label(label):
     """Replace underscores and slashes with spaces for uniformity."""
     return label.replace("_", " ").replace("/", " ")
 
 # Should be performed somewhere else I believe
 def format_cate_preds(cate_preds):
     # Group object predictions from the model output.

@@ -72,6 +74,7 @@ def format_cate_preds(cate_preds):
     obj_pred_dict[oid].sort(key=lambda x: x[1], reverse=True)
     return obj_pred_dict
 
 def format_binary_cate_preds(binary_preds):
     frame_binary_preds = []
     for key, score in binary_preds.items():

@@ -85,6 +88,7 @@ def format_binary_cate_preds(binary_preds):
     frame_binary_preds.sort(key=lambda x: x[3], reverse=True)
     return frame_binary_preds
 
 _FONT = cv2.FONT_HERSHEY_SIMPLEX
 

@@ -106,7 +110,9 @@ def _to_numpy_mask(mask: Union[np.ndarray, torch.Tensor, None]) -> Optional[np.n
     return mask_np > 0
 
 
-def _sanitize_bbox(
     if bbox is None:
         return None
     if isinstance(bbox, (list, tuple)) and len(bbox) >= 4:

@@ -164,7 +170,16 @@ def _draw_label_block(
     cv2.rectangle(image, (left_x, top_y), (right_x, bottom_y), bg_color, -1)
     text_x = left_x + 4
     text_y = min(bottom_y - baseline - 2, img_h - 1)
-    cv2.putText(
     y_cursor = bottom_y
 else:
     for text in lines:

@@ -177,7 +192,16 @@ def _draw_label_block(
     cv2.rectangle(image, (left_x, top_y), (right_x, bottom_y), bg_color, -1)
     text_x = left_x + 4
     text_y = min(bottom_y - baseline - 2, img_h - 1)
-    cv2.putText(
     y_cursor = top_y
 

@@ -198,13 +222,26 @@ def _draw_centered_label(
     top_y = int(np.clip(cy - th // 2 - baseline - 4, 0, img_h - 1))
     right_x = int(np.clip(left_x + tw + 8, 0, img_w - 1))
     bottom_y = int(np.clip(top_y + th + baseline + 6, 0, img_h - 1))
-    cv2.rectangle(
     text_x = left_x + 4
     text_y = min(bottom_y - baseline - 2, img_h - 1)
-    cv2.putText(
 
 
-def _extract_frame_entities(
     if isinstance(store, dict):
         frame_entry = store.get(frame_idx, {})
     elif isinstance(store, list) and 0 <= frame_idx < len(store):

@@ -271,7 +308,9 @@ def render_sam_frames(
         continue
     color = _object_color_bgr(obj_id)
     alpha = 0.45
-    overlay[mask_np] = (1.0 - alpha) * overlay[mask_np] + alpha * np.array(
 
     annotated = np.clip(overlay, 0, 255).astype(np.uint8)
     frame_h, frame_w = annotated.shape[:2]

@@ -329,7 +368,9 @@ def render_vine_frame_sets(
     cat_label_lookup: Dict[int, Tuple[str, float]],
     unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]],
     binary_lookup: Dict[int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]],
-    masks: Union[
     binary_confidence_threshold: float = 0.0,
 ) -> Dict[str, List[np.ndarray]]:
     frame_groups: Dict[str, List[np.ndarray]] = {

@@ -347,7 +388,9 @@ def render_vine_frame_sets(
     base_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
     frame_h, frame_w = base_bgr.shape[:2]
     frame_bboxes = _extract_frame_entities(bboxes, frame_idx)
-    frame_masks =
 
     objects_bgr = base_bgr.copy()
     unary_bgr = base_bgr.copy()

@@ -393,16 +436,36 @@ def render_vine_frame_sets(
     for obj_id, bbox in bbox_lookup.items():
         title = titles_lookup.get(obj_id)
         unary_lines = unary_lines_lookup.get(obj_id, [])
-        _draw_bbox_with_label(
-
         if unary_lines:
             anchor, direction = _label_anchor_and_direction(bbox, "bottom")
-            _draw_label_block(
-
-
         if unary_lines:
             anchor, direction = _label_anchor_and_direction(bbox, "bottom")
-            _draw_label_block(
 
     # First pass: collect all pairs above threshold and deduplicate bidirectional pairs
     pairs_to_draw = {}  # (min_id, max_id) -> (subj_id, obj_id, prob, relation)

@@ -432,15 +495,24 @@ def render_vine_frame_sets(
     subj_bbox = bbox_lookup.get(subj_id)
     obj_bbox = bbox_lookup.get(obj_id)
     start, end = relation_line(subj_bbox, obj_bbox)
-    color = tuple(
-        (
-
-
-
     label_text = f"{relation} {prob:.2f}"
     mid_point = (int((start[0] + end[0]) / 2), int((start[1] + end[1]) / 2))
     # Draw arrowed lines showing direction from subject to object (smaller arrow tip)
-    cv2.arrowedLine(
     cv2.arrowedLine(all_bgr, start, end, color, 6, cv2.LINE_AA, tipLength=0.05)
     _draw_centered_label(binary_bgr, label_text, mid_point, color)
     _draw_centered_label(all_bgr, label_text, mid_point, color)

@@ -459,7 +531,9 @@ def render_vine_frames(
     cat_label_lookup: Dict[int, Tuple[str, float]],
     unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]],
     binary_lookup: Dict[int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]],
-    masks: Union[
     binary_confidence_threshold: float = 0.0,
 ) -> List[np.ndarray]:
     return render_vine_frame_sets(

@@ -471,11 +545,12 @@ def render_vine_frames(
         masks,
         binary_confidence_threshold,
     ).get("all", [])
-
 def color_for_cate_correctness(obj_pred_dict, gt_labels, topk_object):
     all_colors = []
     all_texts = []
-    for
         preds = obj_pred_dict.get(obj_id, [])
         if len(preds) == 0:
             top1 = "N/A"

@@ -485,143 +560,214 @@ def color_for_cate_correctness(obj_pred_dict, gt_labels, topk_object):
         topk_labels = [p[0] for p in preds[:topk_object]]
         # Compare cleaned labels.
         if top1.lower() == gt_label.lower():
-            box_color = (0, 255, 0)
         elif gt_label.lower() in [p.lower() for p in topk_labels]:
-            box_color = (0, 165, 255)
         else:
-            box_color = (0, 0, 255)
-
         label_text = f"ID:{obj_id}/P:{top1}/GT:{gt_label}"
         all_colors.append(box_color)
         all_texts.append(label_text)
     return all_colors, all_texts
 
 def plot_unary(frame_img, gt_labels, all_colors, all_texts):
-
-
         x1, y1, x2, y2 = map(int, bbox)
         cv2.rectangle(frame_img, (x1, y1), (x2, y2), color=box_color, thickness=2)
-        (tw, th), baseline = cv2.getTextSize(
-
-
-
-
     return frame_img
 
-
-
-
-
-
-
-
-
     white_pane = 255 * np.ones((pane_height, pane_width, 3), dtype=np.uint8)
-
     # --- Adjust pane split: make predictions column wider (60% vs. 40%) ---
     left_width = int(pane_width * 0.6)
     right_width = pane_width - left_width
     left_pane = white_pane[:, :left_width, :].copy()
     right_pane = white_pane[:, left_width:, :].copy()
-
-    cv2.putText(
-
-
-
     return white_pane
 
 # This is for ploting binary prediction results with frame-based scene graphs
-def plot_binary_sg(
-
-
-
-
-
-
-
     line_height = 30  # vertical spacing per line
-    x_text = 10
     y_text_left = header_height + 10  # starting y for left pane text
-    y_text_right = header_height + 10
-
     # Left section: top-k binary predictions.
     left_width = int(pane_width * 0.6)
     right_width = pane_width - left_width
     left_pane = white_pane[:, :left_width, :].copy()
     right_pane = white_pane[:, left_width:, :].copy()
-
-    for
-        correct = any(
-
         indicator_color = (0, 255, 0) if correct else (0, 0, 255)
-        cv2.rectangle(
-
         text = f"{subj} - {pred_rel} - {obj} :: {score:.2f}"
-        cv2.putText(
-
         y_text_left += line_height
-
     # Right section: ground truth binary relations.
     for gt in gt_relations:
         if len(gt) != 3:
             continue
         text = f"{gt[0]} - {gt[2]} - {gt[1]}"
-        cv2.putText(
-
         y_text_right += line_height
-
     # Combine the two text panes and then with the frame image.
     combined_pane = np.hstack((left_pane, right_pane))
     combined_image = np.hstack((frame_img, combined_pane))
     return combined_image
 
-
-
-
-
-
-
-
-
-
-
     """Return the combined annotated frame for frame index i as an image (in BGR)."""
     # Get the frame image (assuming batched_data['batched_reshaped_raw_videos'] is a list of frames)
 
     # --- Process Object Predictions (for overlaying bboxes) ---
     if phase == "unary":
         objs = []
-        for (
             gt_label = clean_label(gt_label)
             objs.append((obj_id, bbox, gt_label))
-
         formatted_cate_preds = format_cate_preds(cate_preds)
-        all_colors, all_texts = color_for_cate_correctness(
         updated_frame_img = plot_unary(frame_img, gt_labels, all_colors, all_texts)
         return updated_frame_img
-
     else:
         # --- Process Binary Predictions & Ground Truth for the Text Pane ---
         formatted_binary_preds = format_binary_cate_preds(binary_preds)
-
         # Ground truth binary relations for the frame.
         # Clean ground truth relations.
-        gt_relations = [
-
         pane_width = 600  # increased pane width for more horizontal space
         pane_height = frame_img.shape[0]
-
         # --- Add header labels to each text pane with extra space ---
         header_height = 50  # increased header space
-        white_pane = get_white_pane(
-
-
-
         return combined_image
 
 def show_mask(mask, ax, obj_id=None, det_class=None, random_color=False):
     # Ensure mask is a numpy array
     mask = np.array(mask)

@@ -644,7 +790,7 @@ def show_mask(mask, ax, obj_id=None, det_class=None, random_color=False):
     color = list(cmap((cmap_idx * 47) % 256))
     color[3] = 0.5
     color = np.array(color)
-
     # Expand mask to (H, W, 1) for broadcasting
     mask_expanded = mask[..., None]
     mask_image = mask_expanded * color.reshape(1, 1, -1)

@@ -663,7 +809,7 @@ def show_mask(mask, ax, obj_id=None, det_class=None, random_color=False):
     linewidth=1.5,
     edgecolor=color[:3],
     facecolor="none",
-    alpha=color[3]
 )
 ax.add_patch(rect)
 ax.text(

@@ -673,10 +819,11 @@ def show_mask(mask, ax, obj_id=None, det_class=None, random_color=False):
     color="white",
     fontsize=6,
     backgroundcolor=np.array(color),
-    alpha=1
 )
 ax.imshow(mask_image)
 
 def save_mask_one_image(frame_image, masks, save_path):
     """Render masks on top of a frame and store the visualization on disk."""
     fig, ax = plt.subplots(1, figsize=(6, 6))

@@ -695,9 +842,7 @@ def save_mask_one_image(frame_image, masks, save_path):
 
     prepared_masks = {
         obj_id: (
-            mask.detach().cpu().numpy()
-            if torch.is_tensor(mask)
-            else np.asarray(mask)
         )
         for obj_id, mask in mask_iter
     }

@@ -711,54 +856,61 @@ def save_mask_one_image(frame_image, masks, save_path):
     fig.savefig(save_path, bbox_inches="tight", pad_inches=0)
     plt.close(fig)
     return save_path
-
-
-
-
-
-
-
-
|
|
|
|
|
|
|
| 722 |
video_save_dir = os.path.join(video_save_base_dir, video_id)
|
| 723 |
if not os.path.exists(video_save_dir):
|
| 724 |
os.makedirs(video_save_dir, exist_ok=True)
|
| 725 |
-
|
| 726 |
for frame_id, image in enumerate(video_tensor):
|
| 727 |
if frame_id not in video_masks:
|
| 728 |
print("No mask for Frame", frame_id)
|
| 729 |
continue
|
| 730 |
-
|
| 731 |
masks = video_masks[frame_id]
|
| 732 |
save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
|
| 733 |
get_mask_one_image(image, masks, oid_class_pred)
|
| 734 |
|
|
|
|
| 735 |
def get_mask_one_image(frame_image, masks, oid_class_pred=None):
|
| 736 |
# Create a figure and axis
|
| 737 |
fig, ax = plt.subplots(1, figsize=(6, 6))
|
| 738 |
|
| 739 |
# Display the frame image
|
| 740 |
ax.imshow(frame_image)
|
| 741 |
-
ax.axis(
|
| 742 |
|
| 743 |
if type(masks) == list:
|
| 744 |
masks = {i: m for i, m in enumerate(masks)}
|
| 745 |
-
|
| 746 |
# Add the masks
|
| 747 |
for obj_id, mask in masks.items():
|
| 748 |
-
det_class =
|
|
|
|
|
|
|
|
|
|
|
|
|
| 749 |
show_mask(mask, ax, obj_id=obj_id, det_class=det_class, random_color=False)
|
| 750 |
|
| 751 |
# Show the plot
|
| 752 |
return fig, ax
|
| 753 |
|
|
|
|
| 754 |
def save_video(frames, output_filename, output_fps):
|
| 755 |
-
|
| 756 |
# --- Create a video from all frames ---
|
| 757 |
num_frames = len(frames)
|
| 758 |
frame_h, frame_w = frames.shape[:2]
|
| 759 |
|
| 760 |
# Use a codec supported by VS Code (H.264 via 'avc1').
|
| 761 |
-
fourcc = cv2.VideoWriter_fourcc(*
|
| 762 |
out = cv2.VideoWriter(output_filename, fourcc, output_fps, (frame_w, frame_h))
|
| 763 |
|
| 764 |
print(f"Processing {num_frames} frames...")
|
|
@@ -766,23 +918,26 @@ def save_video(frames, output_filename, output_fps):
|
|
| 766 |
vis_frame = get_visualized_frame(i)
|
| 767 |
out.write(vis_frame)
|
| 768 |
if i % 10 == 0:
|
| 769 |
-
print(f"Processed frame {i+1}/{num_frames}")
|
| 770 |
|
| 771 |
out.release()
|
| 772 |
print(f"Video saved as {output_filename}")
|
| 773 |
-
|
| 774 |
|
| 775 |
def list_depth(lst):
|
| 776 |
"""Calculates the depth of a nested list."""
|
| 777 |
if not (isinstance(lst, list) or isinstance(lst, torch.Tensor)):
|
| 778 |
return 0
|
| 779 |
-
elif (isinstance(lst, torch.Tensor) and lst.shape == torch.Size([])) or (
|
|
|
|
|
|
|
| 780 |
return 1
|
| 781 |
else:
|
| 782 |
return 1 + max(list_depth(item) for item in lst)
|
| 783 |
-
|
|
|
|
| 784 |
def normalize_prompt(points, labels):
|
| 785 |
-
if list_depth(points) == 3:
|
| 786 |
points = torch.stack([p.unsqueeze(0) for p in points])
|
| 787 |
labels = torch.stack([l.unsqueeze(0) for l in labels])
|
| 788 |
return points, labels
|
|
@@ -791,36 +946,56 @@ def normalize_prompt(points, labels):
|
|
| 791 |
def show_box(box, ax, object_id):
|
| 792 |
if len(box) == 0:
|
| 793 |
return
|
| 794 |
-
|
| 795 |
cmap = plt.get_cmap("gist_rainbow")
|
| 796 |
cmap_idx = 0 if object_id is None else object_id
|
| 797 |
color = list(cmap((cmap_idx * 47) % 256))
|
| 798 |
-
|
| 799 |
x0, y0 = box[0], box[1]
|
| 800 |
w, h = box[2] - box[0], box[3] - box[1]
|
| 801 |
-
ax.add_patch(
|
| 802 |
-
|
|
|
|
|
|
|
|
|
|
| 803 |
def show_points(coords, labels, ax, object_id=None, marker_size=375):
|
| 804 |
if len(labels) == 0:
|
| 805 |
return
|
| 806 |
-
|
| 807 |
-
pos_points = coords[labels==1]
|
| 808 |
-
neg_points = coords[labels==0]
|
| 809 |
-
|
| 810 |
cmap = plt.get_cmap("gist_rainbow")
|
| 811 |
cmap_idx = 0 if object_id is None else object_id
|
| 812 |
color = list(cmap((cmap_idx * 47) % 256))
|
| 813 |
-
|
| 814 |
-
ax.scatter(
|
| 815 |
-
|
| 816 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 817 |
def save_prompts_one_image(frame_image, boxes, points, labels, save_path):
|
| 818 |
# Create a figure and axis
|
| 819 |
fig, ax = plt.subplots(1, figsize=(6, 6))
|
| 820 |
|
| 821 |
# Display the frame image
|
| 822 |
ax.imshow(frame_image)
|
| 823 |
-
ax.axis(
|
| 824 |
|
| 825 |
points, labels = normalize_prompt(points, labels)
|
| 826 |
if type(boxes) == torch.Tensor:
|
|
@@ -837,40 +1012,50 @@ def save_prompts_one_image(frame_image, boxes, points, labels, save_path):
|
|
| 837 |
pass
|
| 838 |
else:
|
| 839 |
raise Exception()
|
| 840 |
-
|
| 841 |
for object_id, (point_ls, label_ls) in enumerate(zip(points, labels)):
|
| 842 |
if not len(point_ls) == 0:
|
| 843 |
show_points(point_ls.cpu(), label_ls.cpu(), ax, object_id=object_id)
|
| 844 |
-
|
| 845 |
# Show the plot
|
| 846 |
plt.savefig(save_path)
|
| 847 |
plt.close()
|
| 848 |
-
|
| 849 |
-
|
|
|
|
|
|
|
|
|
|
| 850 |
video_save_dir = os.path.join(video_save_base_dir, video_id)
|
| 851 |
if not os.path.exists(video_save_dir):
|
| 852 |
os.makedirs(video_save_dir, exist_ok=True)
|
| 853 |
-
|
| 854 |
for frame_id, image in enumerate(video_tensor):
|
| 855 |
boxes, points, labels = [], [], []
|
| 856 |
-
|
| 857 |
if frame_id in video_boxes:
|
| 858 |
boxes = video_boxes[frame_id]
|
| 859 |
-
|
| 860 |
if frame_id in video_points:
|
| 861 |
points = video_points[frame_id]
|
| 862 |
if frame_id in video_labels:
|
| 863 |
labels = video_labels[frame_id]
|
| 864 |
-
|
| 865 |
save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
|
| 866 |
save_prompts_one_image(image, boxes, points, labels, save_path)
|
| 867 |
-
|
| 868 |
|
| 869 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 870 |
video_save_dir = os.path.join(video_save_base_dir, video_id)
|
| 871 |
if not os.path.exists(video_save_dir):
|
| 872 |
os.makedirs(video_save_dir, exist_ok=True)
|
| 873 |
-
|
| 874 |
for frame_id, image in enumerate(video_tensor):
|
| 875 |
if random.random() > sample_rate:
|
| 876 |
continue
|
|
@@ -880,18 +1065,17 @@ def save_video_masks_visualization(video_tensor, video_masks, video_id, video_sa
|
|
| 880 |
masks = video_masks[frame_id]
|
| 881 |
save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
|
| 882 |
save_mask_one_image(image, masks, save_path)
|
| 883 |
-
|
| 884 |
|
| 885 |
|
| 886 |
-
def get_color(obj_id, cmap_name="gist_rainbow",alpha=0.5):
|
| 887 |
cmap = plt.get_cmap(cmap_name)
|
| 888 |
cmap_idx = 0 if obj_id is None else obj_id
|
| 889 |
color = list(cmap((cmap_idx * 47) % 256))
|
| 890 |
color[3] = 0.5
|
| 891 |
color = np.array(color)
|
| 892 |
return color
|
| 893 |
-
|
| 894 |
-
|
| 895 |
def _bbox_center(bbox: Tuple[int, int, int, int]) -> Tuple[float, float]:
|
| 896 |
return ((bbox[0] + bbox[2]) / 2.0, (bbox[1] + bbox[3]) / 2.0)
|
| 897 |
|
|
@@ -906,7 +1090,9 @@ def relation_line(
|
|
| 906 |
"""
|
| 907 |
center1 = _bbox_center(bbox1)
|
| 908 |
center2 = _bbox_center(bbox2)
|
| 909 |
-
if math.isclose(center1[0], center2[0], abs_tol=1e-3) and math.isclose(
|
|
|
|
|
|
|
| 910 |
offset = max(1.0, (bbox2[2] - bbox2[0]) * 0.05)
|
| 911 |
center2 = (center2[0] + offset, center2[1])
|
| 912 |
start = (int(round(center1[0])), int(round(center1[1])))
|
|
@@ -915,57 +1101,68 @@ def relation_line(
|
|
| 915 |
end = (end[0] + 1, end[1])
|
| 916 |
return start, end
|
| 917 |
|
|
|
|
| 918 |
def get_binary_mask_one_image(frame_image, masks, rel_pred_ls=None):
|
| 919 |
# Create a figure and axis
|
| 920 |
fig, ax = plt.subplots(1, figsize=(6, 6))
|
| 921 |
|
| 922 |
# Display the frame image
|
| 923 |
ax.imshow(frame_image)
|
| 924 |
-
ax.axis(
|
| 925 |
-
|
| 926 |
all_objs_to_show = set()
|
| 927 |
all_lines_to_show = []
|
| 928 |
-
|
| 929 |
# print(rel_pred_ls[0])
|
| 930 |
for (from_obj_id, to_obj_id), rel_text in rel_pred_ls.items():
|
| 931 |
-
all_objs_to_show.add(from_obj_id)
|
| 932 |
-
all_objs_to_show.add(to_obj_id)
|
| 933 |
-
|
| 934 |
from_mask = masks[from_obj_id]
|
| 935 |
bbox1 = mask_to_bbox(from_mask)
|
| 936 |
to_mask = masks[to_obj_id]
|
| 937 |
bbox2 = mask_to_bbox(to_mask)
|
| 938 |
-
|
| 939 |
c1, c2 = shortest_line_between_bboxes(bbox1, bbox2)
|
| 940 |
-
|
| 941 |
line_color = get_color(from_obj_id)
|
| 942 |
face_color = get_color(to_obj_id)
|
| 943 |
line = c1, c2, face_color, line_color, rel_text
|
| 944 |
all_lines_to_show.append(line)
|
| 945 |
-
|
| 946 |
masks_to_show = {}
|
| 947 |
for oid in all_objs_to_show:
|
| 948 |
masks_to_show[oid] = masks[oid]
|
| 949 |
-
|
| 950 |
# Add the masks
|
| 951 |
for obj_id, mask in masks_to_show.items():
|
| 952 |
show_mask(mask, ax, obj_id=obj_id, random_color=False)
|
| 953 |
|
| 954 |
-
for (from_pt_x, from_pt_y), (
|
| 955 |
-
|
| 956 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 957 |
mid_pt_x = (from_pt_x + to_pt_x) / 2
|
| 958 |
mid_pt_y = (from_pt_y + to_pt_y) / 2
|
| 959 |
ax.text(
|
| 960 |
-
|
| 961 |
-
|
| 962 |
-
|
| 963 |
-
|
| 964 |
-
|
| 965 |
-
|
| 966 |
-
|
| 967 |
-
|
| 968 |
-
)
|
| 969 |
-
|
|
|
|
|
|
|
| 970 |
# Show the plot
|
| 971 |
return fig, ax
|
|
|
|
| 54 |
vine_hf/vis_utils.py (updated):

# All rendered frames returned by functions are RGB np.ndarray images suitable for saving or video writing.
########################################################################################


def clean_label(label):
    """Replace underscores and slashes with spaces for uniformity."""
    return label.replace("_", " ").replace("/", " ")


# Should be performed somewhere else I believe
def format_cate_preds(cate_preds):
    # Group object predictions from the model output.

        obj_pred_dict[oid].sort(key=lambda x: x[1], reverse=True)
    return obj_pred_dict


def format_binary_cate_preds(binary_preds):
    frame_binary_preds = []
    for key, score in binary_preds.items():

    frame_binary_preds.sort(key=lambda x: x[3], reverse=True)
    return frame_binary_preds
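The helper above flattens per-frame relation scores into (subject, relation, object, score) tuples sorted by confidence. A minimal sketch of that shape with made-up data, assuming each key identifies a subject/relation/object triple (the exact key layout is elided in this hunk):

# Hypothetical input; the real keys come from the VINE model output.
binary_preds = {("person", "holding", "cup"): 0.91, ("person", "near", "table"): 0.42}
frame_binary_preds = [
    (subj, rel, obj, score) for (subj, rel, obj), score in binary_preds.items()
]
frame_binary_preds.sort(key=lambda x: x[3], reverse=True)  # highest-confidence triples first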
_FONT = cv2.FONT_HERSHEY_SIMPLEX


    return mask_np > 0


def _sanitize_bbox(
    bbox: Union[List[float], Tuple[float, ...], None], width: int, height: int
) -> Optional[Tuple[int, int, int, int]]:
    if bbox is None:
        return None
    if isinstance(bbox, (list, tuple)) and len(bbox) >= 4:

        cv2.rectangle(image, (left_x, top_y), (right_x, bottom_y), bg_color, -1)
        text_x = left_x + 4
        text_y = min(bottom_y - baseline - 2, img_h - 1)
        cv2.putText(
            image,
            text,
            (text_x, text_y),
            _FONT,
            font_scale,
            (0, 0, 0),
            thickness,
            cv2.LINE_AA,
        )
        y_cursor = bottom_y
    else:
        for text in lines:

            cv2.rectangle(image, (left_x, top_y), (right_x, bottom_y), bg_color, -1)
            text_x = left_x + 4
            text_y = min(bottom_y - baseline - 2, img_h - 1)
            cv2.putText(
                image,
                text,
                (text_x, text_y),
                _FONT,
                font_scale,
                (0, 0, 0),
                thickness,
                cv2.LINE_AA,
            )
            y_cursor = top_y


    top_y = int(np.clip(cy - th // 2 - baseline - 4, 0, img_h - 1))
    right_x = int(np.clip(left_x + tw + 8, 0, img_w - 1))
    bottom_y = int(np.clip(top_y + th + baseline + 6, 0, img_h - 1))
    cv2.rectangle(
        image, (left_x, top_y), (right_x, bottom_y), _background_color(color), -1
    )
    text_x = left_x + 4
    text_y = min(bottom_y - baseline - 2, img_h - 1)
    cv2.putText(
        image,
        text,
        (text_x, text_y),
        _FONT,
        font_scale,
        (0, 0, 0),
        thickness,
        cv2.LINE_AA,
    )


def _extract_frame_entities(
    store: Union[Dict[int, Dict[int, Any]], List, None], frame_idx: int
) -> Dict[int, Any]:
    if isinstance(store, dict):
        frame_entry = store.get(frame_idx, {})
    elif isinstance(store, list) and 0 <= frame_idx < len(store):

            continue
        color = _object_color_bgr(obj_id)
        alpha = 0.45
        overlay[mask_np] = (1.0 - alpha) * overlay[mask_np] + alpha * np.array(
            color, dtype=np.float32
        )

    annotated = np.clip(overlay, 0, 255).astype(np.uint8)
    frame_h, frame_w = annotated.shape[:2]

    cat_label_lookup: Dict[int, Tuple[str, float]],
    unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]],
    binary_lookup: Dict[int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]],
    masks: Union[
        Dict[int, Dict[int, Union[np.ndarray, torch.Tensor]]], List, None
    ] = None,
    binary_confidence_threshold: float = 0.0,
) -> Dict[str, List[np.ndarray]]:
    frame_groups: Dict[str, List[np.ndarray]] = {

        base_bgr = cv2.cvtColor(frame_rgb, cv2.COLOR_RGB2BGR)
        frame_h, frame_w = base_bgr.shape[:2]
        frame_bboxes = _extract_frame_entities(bboxes, frame_idx)
        frame_masks = (
            _extract_frame_entities(masks, frame_idx) if masks is not None else {}
        )

        objects_bgr = base_bgr.copy()
        unary_bgr = base_bgr.copy()

        for obj_id, bbox in bbox_lookup.items():
            title = titles_lookup.get(obj_id)
            unary_lines = unary_lines_lookup.get(obj_id, [])
            _draw_bbox_with_label(
                objects_bgr, bbox, obj_id, title=title, label_position="top"
            )
            _draw_bbox_with_label(
                unary_bgr, bbox, obj_id, title=title, label_position="top"
            )
            if unary_lines:
                anchor, direction = _label_anchor_and_direction(bbox, "bottom")
                _draw_label_block(
                    unary_bgr,
                    unary_lines,
                    anchor,
                    _object_color_bgr(obj_id),
                    direction=direction,
                )
            _draw_bbox_with_label(
                binary_bgr, bbox, obj_id, title=title, label_position="top"
            )
            _draw_bbox_with_label(
                all_bgr, bbox, obj_id, title=title, label_position="top"
            )
            if unary_lines:
                anchor, direction = _label_anchor_and_direction(bbox, "bottom")
                _draw_label_block(
                    all_bgr,
                    unary_lines,
                    anchor,
                    _object_color_bgr(obj_id),
                    direction=direction,
                )
        # First pass: collect all pairs above threshold and deduplicate bidirectional pairs
        pairs_to_draw = {}  # (min_id, max_id) -> (subj_id, obj_id, prob, relation)

            subj_bbox = bbox_lookup.get(subj_id)
            obj_bbox = bbox_lookup.get(obj_id)
            start, end = relation_line(subj_bbox, obj_bbox)
            color = tuple(
                int(c)
                for c in np.clip(
                    (
                        np.array(_object_color_bgr(subj_id), dtype=np.float32)
                        + np.array(_object_color_bgr(obj_id), dtype=np.float32)
                    )
                    / 2.0,
                    0,
                    255,
                )
            )
            label_text = f"{relation} {prob:.2f}"
            mid_point = (int((start[0] + end[0]) / 2), int((start[1] + end[1]) / 2))
            # Draw arrowed lines showing direction from subject to object (smaller arrow tip)
            cv2.arrowedLine(
                binary_bgr, start, end, color, 6, cv2.LINE_AA, tipLength=0.05
            )
            cv2.arrowedLine(all_bgr, start, end, color, 6, cv2.LINE_AA, tipLength=0.05)
            _draw_centered_label(binary_bgr, label_text, mid_point, color)
            _draw_centered_label(all_bgr, label_text, mid_point, color)
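The relation arrows above reuse the two endpoint object colors by averaging them. A small self-contained sketch of the same blend (the BGR tuples here are made up, not the module's palette):

import numpy as np

subj_color, obj_color = (255, 64, 0), (0, 128, 255)   # hypothetical BGR colors
blend = tuple(
    int(c)
    for c in np.clip(
        (np.array(subj_color, np.float32) + np.array(obj_color, np.float32)) / 2.0,
        0,
        255,
    )
)
print(blend)  # (127, 96, 127): the channel-wise midpoint, clipped to the valid 0-255 range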

    cat_label_lookup: Dict[int, Tuple[str, float]],
    unary_lookup: Dict[int, Dict[int, List[Tuple[float, str]]]],
    binary_lookup: Dict[int, List[Tuple[Tuple[int, int], List[Tuple[float, str]]]]],
    masks: Union[
        Dict[int, Dict[int, Union[np.ndarray, torch.Tensor]]], List, None
    ] = None,
    binary_confidence_threshold: float = 0.0,
) -> List[np.ndarray]:
    return render_vine_frame_sets(

        masks,
        binary_confidence_threshold,
    ).get("all", [])

def color_for_cate_correctness(obj_pred_dict, gt_labels, topk_object):
    all_colors = []
    all_texts = []
    for obj_id, bbox, gt_label in gt_labels:
        preds = obj_pred_dict.get(obj_id, [])
        if len(preds) == 0:
            top1 = "N/A"

            topk_labels = [p[0] for p in preds[:topk_object]]
        # Compare cleaned labels.
        if top1.lower() == gt_label.lower():
            box_color = (0, 255, 0)  # bright green for correct
        elif gt_label.lower() in [p.lower() for p in topk_labels]:
            box_color = (0, 165, 255)  # bright orange for partial match
        else:
            box_color = (0, 0, 255)  # bright red for incorrect

        label_text = f"ID:{obj_id}/P:{top1}/GT:{gt_label}"
        all_colors.append(box_color)
        all_texts.append(label_text)
    return all_colors, all_texts
return all_colors, all_texts
|
| 573 |
|
| 574 |
+
|
| 575 |
def plot_unary(frame_img, gt_labels, all_colors, all_texts):
|
| 576 |
+
for (obj_id, bbox, gt_label), box_color, label_text in zip(
|
| 577 |
+
gt_labels, all_colors, all_texts
|
| 578 |
+
):
|
| 579 |
x1, y1, x2, y2 = map(int, bbox)
|
| 580 |
cv2.rectangle(frame_img, (x1, y1), (x2, y2), color=box_color, thickness=2)
|
| 581 |
+
(tw, th), baseline = cv2.getTextSize(
|
| 582 |
+
label_text, cv2.FONT_HERSHEY_SIMPLEX, 0.5, 1
|
| 583 |
+
)
|
| 584 |
+
cv2.rectangle(
|
| 585 |
+
frame_img, (x1, y1 - th - baseline - 4), (x1 + tw, y1), box_color, -1
|
| 586 |
+
)
|
| 587 |
+
cv2.putText(
|
| 588 |
+
frame_img,
|
| 589 |
+
label_text,
|
| 590 |
+
(x1, y1 - 2),
|
| 591 |
+
cv2.FONT_HERSHEY_SIMPLEX,
|
| 592 |
+
0.5,
|
| 593 |
+
(0, 0, 0),
|
| 594 |
+
1,
|
| 595 |
+
cv2.LINE_AA,
|
| 596 |
+
)
|
| 597 |
+
|
| 598 |
return frame_img
|
| 599 |
|
| 600 |
+
|
| 601 |
+
def get_white_pane(
|
| 602 |
+
pane_height,
|
| 603 |
+
pane_width=600,
|
| 604 |
+
header_height=50,
|
| 605 |
+
header_font=cv2.FONT_HERSHEY_SIMPLEX,
|
| 606 |
+
header_font_scale=0.7,
|
| 607 |
+
header_thickness=2,
|
| 608 |
+
header_color=(0, 0, 0),
|
| 609 |
+
):
|
| 610 |
+
# Create an expanded white pane to display text info.
|
| 611 |
white_pane = 255 * np.ones((pane_height, pane_width, 3), dtype=np.uint8)
|
| 612 |
+
|
| 613 |
# --- Adjust pane split: make predictions column wider (60% vs. 40%) ---
|
| 614 |
left_width = int(pane_width * 0.6)
|
| 615 |
right_width = pane_width - left_width
|
| 616 |
left_pane = white_pane[:, :left_width, :].copy()
|
| 617 |
right_pane = white_pane[:, left_width:, :].copy()
|
| 618 |
+
|
| 619 |
+
cv2.putText(
|
| 620 |
+
left_pane,
|
| 621 |
+
"Binary Predictions",
|
| 622 |
+
(10, header_height - 30),
|
| 623 |
+
header_font,
|
| 624 |
+
header_font_scale,
|
| 625 |
+
header_color,
|
| 626 |
+
header_thickness,
|
| 627 |
+
cv2.LINE_AA,
|
| 628 |
+
)
|
| 629 |
+
cv2.putText(
|
| 630 |
+
right_pane,
|
| 631 |
+
"Ground Truth",
|
| 632 |
+
(10, header_height - 30),
|
| 633 |
+
header_font,
|
| 634 |
+
header_font_scale,
|
| 635 |
+
header_color,
|
| 636 |
+
header_thickness,
|
| 637 |
+
cv2.LINE_AA,
|
| 638 |
+
)
|
| 639 |
+
|
| 640 |
return white_pane
|
| 641 |
|
| 642 |
+
|
| 643 |
# This is for ploting binary prediction results with frame-based scene graphs
|
| 644 |
+
def plot_binary_sg(
|
| 645 |
+
frame_img,
|
| 646 |
+
white_pane,
|
| 647 |
+
bin_preds,
|
| 648 |
+
gt_relations,
|
| 649 |
+
topk_binary,
|
| 650 |
+
header_height=50,
|
| 651 |
+
indicator_size=20,
|
| 652 |
+
pane_width=600,
|
| 653 |
+
):
|
| 654 |
+
# Leave vertical space for the headers.
|
| 655 |
line_height = 30 # vertical spacing per line
|
| 656 |
+
x_text = 10 # left margin for text
|
| 657 |
y_text_left = header_height + 10 # starting y for left pane text
|
| 658 |
+
y_text_right = header_height + 10 # starting y for right pane text
|
| 659 |
+
|
| 660 |
# Left section: top-k binary predictions.
|
| 661 |
left_width = int(pane_width * 0.6)
|
| 662 |
right_width = pane_width - left_width
|
| 663 |
left_pane = white_pane[:, :left_width, :].copy()
|
| 664 |
right_pane = white_pane[:, left_width:, :].copy()
|
| 665 |
+
|
| 666 |
+
for subj, pred_rel, obj, score in bin_preds[:topk_binary]:
|
| 667 |
+
correct = any(
|
| 668 |
+
(subj == gt[0] and pred_rel.lower() == gt[2].lower() and obj == gt[1])
|
| 669 |
+
for gt in gt_relations
|
| 670 |
+
)
|
| 671 |
indicator_color = (0, 255, 0) if correct else (0, 0, 255)
|
| 672 |
+
cv2.rectangle(
|
| 673 |
+
left_pane,
|
| 674 |
+
(x_text, y_text_left - indicator_size + 5),
|
| 675 |
+
(x_text + indicator_size, y_text_left + 5),
|
| 676 |
+
indicator_color,
|
| 677 |
+
-1,
|
| 678 |
+
)
|
| 679 |
text = f"{subj} - {pred_rel} - {obj} :: {score:.2f}"
|
| 680 |
+
cv2.putText(
|
| 681 |
+
left_pane,
|
| 682 |
+
text,
|
| 683 |
+
(x_text + indicator_size + 5, y_text_left + 5),
|
| 684 |
+
cv2.FONT_HERSHEY_SIMPLEX,
|
| 685 |
+
0.6,
|
| 686 |
+
(0, 0, 0),
|
| 687 |
+
1,
|
| 688 |
+
cv2.LINE_AA,
|
| 689 |
+
)
|
| 690 |
y_text_left += line_height
|
| 691 |
+
|
| 692 |
# Right section: ground truth binary relations.
|
| 693 |
for gt in gt_relations:
|
| 694 |
if len(gt) != 3:
|
| 695 |
continue
|
| 696 |
text = f"{gt[0]} - {gt[2]} - {gt[1]}"
|
| 697 |
+
cv2.putText(
|
| 698 |
+
right_pane,
|
| 699 |
+
text,
|
| 700 |
+
(x_text, y_text_right + 5),
|
| 701 |
+
cv2.FONT_HERSHEY_SIMPLEX,
|
| 702 |
+
0.6,
|
| 703 |
+
(0, 0, 0),
|
| 704 |
+
1,
|
| 705 |
+
cv2.LINE_AA,
|
| 706 |
+
)
|
| 707 |
y_text_right += line_height
|
| 708 |
+
|
| 709 |
# Combine the two text panes and then with the frame image.
|
| 710 |
combined_pane = np.hstack((left_pane, right_pane))
|
| 711 |
combined_image = np.hstack((frame_img, combined_pane))
|
| 712 |
return combined_image
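Note that the ground-truth tuples are ordered (subject, object, relation), so the correctness check compares the predicted relation against gt[2]. A tiny illustration with made-up triples:

gt_relations = [("person", "cup", "holding")]        # (subject, object, relation)
subj, pred_rel, obj = "person", "Holding", "cup"     # one hypothetical prediction
correct = any(
    (subj == gt[0] and pred_rel.lower() == gt[2].lower() and obj == gt[1])
    for gt in gt_relations
)
print(correct)  # True: case-insensitive match on the relation, exact match on the endpoints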


def visualized_frame(
    frame_img,
    bboxes,
    object_ids,
    gt_labels,
    cate_preds,
    binary_preds,
    gt_relations,
    topk_object,
    topk_binary,
    phase="unary",
):
    """Return the combined annotated frame for frame index i as an image (in BGR)."""
    # Get the frame image (assuming batched_data['batched_reshaped_raw_videos'] is a list of frames)

    # --- Process Object Predictions (for overlaying bboxes) ---
    if phase == "unary":
        objs = []
        for (_, f_id, obj_id), bbox, gt_label in zip(object_ids, bboxes, gt_labels):
            gt_label = clean_label(gt_label)
            objs.append((obj_id, bbox, gt_label))

        formatted_cate_preds = format_cate_preds(cate_preds)
        all_colors, all_texts = color_for_cate_correctness(
            formatted_cate_preds, gt_labels, topk_object
        )
        updated_frame_img = plot_unary(frame_img, gt_labels, all_colors, all_texts)
        return updated_frame_img

    else:
        # --- Process Binary Predictions & Ground Truth for the Text Pane ---
        formatted_binary_preds = format_binary_cate_preds(binary_preds)

        # Ground truth binary relations for the frame.
        # Clean ground truth relations.
        gt_relations = [
            (clean_label(str(s)), clean_label(str(o)), clean_label(rel))
            for s, o, rel in gt_relations
        ]

        pane_width = 600  # increased pane width for more horizontal space
        pane_height = frame_img.shape[0]

        # --- Add header labels to each text pane with extra space ---
        header_height = 50  # increased header space
        white_pane = get_white_pane(
            pane_height, pane_width, header_height=header_height
        )

        combined_image = plot_binary_sg(
            frame_img, white_pane, formatted_binary_preds, gt_relations, topk_binary
        )

        return combined_image


def show_mask(mask, ax, obj_id=None, det_class=None, random_color=False):
    # Ensure mask is a numpy array
    mask = np.array(mask)

        color = list(cmap((cmap_idx * 47) % 256))
        color[3] = 0.5
        color = np.array(color)

    # Expand mask to (H, W, 1) for broadcasting
    mask_expanded = mask[..., None]
    mask_image = mask_expanded * color.reshape(1, 1, -1)

            linewidth=1.5,
            edgecolor=color[:3],
            facecolor="none",
            alpha=color[3],
        )
        ax.add_patch(rect)
        ax.text(

            color="white",
            fontsize=6,
            backgroundcolor=np.array(color),
            alpha=1,
        )
    ax.imshow(mask_image)

def save_mask_one_image(frame_image, masks, save_path):
    """Render masks on top of a frame and store the visualization on disk."""
    fig, ax = plt.subplots(1, figsize=(6, 6))


    prepared_masks = {
        obj_id: (
            mask.detach().cpu().numpy() if torch.is_tensor(mask) else np.asarray(mask)
        )
        for obj_id, mask in mask_iter
    }

    fig.savefig(save_path, bbox_inches="tight", pad_inches=0)
    plt.close(fig)
    return save_path


def get_video_masks_visualization(
    video_tensor,
    video_masks,
    video_id,
    video_save_base_dir,
    oid_class_pred=None,
    sample_rate=1,
):
    video_save_dir = os.path.join(video_save_base_dir, video_id)
    if not os.path.exists(video_save_dir):
        os.makedirs(video_save_dir, exist_ok=True)

    for frame_id, image in enumerate(video_tensor):
        if frame_id not in video_masks:
            print("No mask for Frame", frame_id)
            continue

        masks = video_masks[frame_id]
        save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
        get_mask_one_image(image, masks, oid_class_pred)


def get_mask_one_image(frame_image, masks, oid_class_pred=None):
    # Create a figure and axis
    fig, ax = plt.subplots(1, figsize=(6, 6))

    # Display the frame image
    ax.imshow(frame_image)
    ax.axis("off")

    if type(masks) == list:
        masks = {i: m for i, m in enumerate(masks)}

    # Add the masks
    for obj_id, mask in masks.items():
        det_class = (
            f"{obj_id}. {oid_class_pred[obj_id]}"
            if not oid_class_pred is None
            else None
        )
        show_mask(mask, ax, obj_id=obj_id, det_class=det_class, random_color=False)

    # Show the plot
    return fig, ax
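A minimal usage sketch with synthetic data, assuming masks are boolean H x W arrays keyed by object id (show_mask's elided portion handles the outline and text placement):

import numpy as np

frame = np.zeros((64, 64, 3), dtype=np.uint8)   # dummy frame
mask = np.zeros((64, 64), dtype=bool)
mask[10:30, 10:30] = True                        # one fake object region
fig, ax = get_mask_one_image(frame, {0: mask}, oid_class_pred={0: "cup"})
fig.savefig("mask_preview.jpg", bbox_inches="tight", pad_inches=0)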


def save_video(frames, output_filename, output_fps):
    # --- Create a video from all frames ---
    num_frames = len(frames)
    frame_h, frame_w = frames.shape[:2]

    # Use a codec supported by VS Code (H.264 via 'avc1').
    fourcc = cv2.VideoWriter_fourcc(*"avc1")
    out = cv2.VideoWriter(output_filename, fourcc, output_fps, (frame_w, frame_h))

    print(f"Processing {num_frames} frames...")

        vis_frame = get_visualized_frame(i)
        out.write(vis_frame)
        if i % 10 == 0:
            print(f"Processed frame {i + 1}/{num_frames}")

    out.release()
    print(f"Video saved as {output_filename}")


def list_depth(lst):
    """Calculates the depth of a nested list."""
    if not (isinstance(lst, list) or isinstance(lst, torch.Tensor)):
        return 0
    elif (isinstance(lst, torch.Tensor) and lst.shape == torch.Size([])) or (
        isinstance(lst, list) and len(lst) == 0
    ):
        return 1
    else:
        return 1 + max(list_depth(item) for item in lst)


def normalize_prompt(points, labels):
    if list_depth(points) == 3:
        points = torch.stack([p.unsqueeze(0) for p in points])
        labels = torch.stack([l.unsqueeze(0) for l in labels])
    return points, labels
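A quick illustration of the depth convention these helpers rely on, with made-up prompts; depth-3 point lists gain an extra axis so each object contributes one prompt row:

import torch

# One positive click per object, given as flat (x, y) tensors.
points = [torch.tensor([10.0, 20.0]), torch.tensor([30.0, 40.0])]
labels = [torch.tensor([1]), torch.tensor([1])]
print(list_depth(points))             # 3: list -> (x, y) tensor -> scalar entries
norm_points, norm_labels = normalize_prompt(points, labels)
print(norm_points.shape)              # torch.Size([2, 1, 2]): one prompt row per object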


def show_box(box, ax, object_id):
    if len(box) == 0:
        return

    cmap = plt.get_cmap("gist_rainbow")
    cmap_idx = 0 if object_id is None else object_id
    color = list(cmap((cmap_idx * 47) % 256))

    x0, y0 = box[0], box[1]
    w, h = box[2] - box[0], box[3] - box[1]
    ax.add_patch(
        plt.Rectangle((x0, y0), w, h, edgecolor=color, facecolor=(0, 0, 0, 0), lw=2)
    )


def show_points(coords, labels, ax, object_id=None, marker_size=375):
    if len(labels) == 0:
        return

    pos_points = coords[labels == 1]
    neg_points = coords[labels == 0]

    cmap = plt.get_cmap("gist_rainbow")
    cmap_idx = 0 if object_id is None else object_id
    color = list(cmap((cmap_idx * 47) % 256))

    ax.scatter(
        pos_points[:, 0],
        pos_points[:, 1],
        color="green",
        marker="P",
        s=marker_size,
        edgecolor=color,
        linewidth=1.25,
    )
    ax.scatter(
        neg_points[:, 0],
        neg_points[:, 1],
        color="red",
        marker="s",
        s=marker_size,
        edgecolor=color,
        linewidth=1.25,
    )

def save_prompts_one_image(frame_image, boxes, points, labels, save_path):
    # Create a figure and axis
    fig, ax = plt.subplots(1, figsize=(6, 6))

    # Display the frame image
    ax.imshow(frame_image)
    ax.axis("off")

    points, labels = normalize_prompt(points, labels)
    if type(boxes) == torch.Tensor:

        pass
    else:
        raise Exception()

    for object_id, (point_ls, label_ls) in enumerate(zip(points, labels)):
        if not len(point_ls) == 0:
            show_points(point_ls.cpu(), label_ls.cpu(), ax, object_id=object_id)

    # Show the plot
    plt.savefig(save_path)
    plt.close()


def save_video_prompts_visualization(
    video_tensor, video_boxes, video_points, video_labels, video_id, video_save_base_dir
):
    video_save_dir = os.path.join(video_save_base_dir, video_id)
    if not os.path.exists(video_save_dir):
        os.makedirs(video_save_dir, exist_ok=True)

    for frame_id, image in enumerate(video_tensor):
        boxes, points, labels = [], [], []

        if frame_id in video_boxes:
            boxes = video_boxes[frame_id]

        if frame_id in video_points:
            points = video_points[frame_id]
        if frame_id in video_labels:
            labels = video_labels[frame_id]

        save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
        save_prompts_one_image(image, boxes, points, labels, save_path)


def save_video_masks_visualization(
    video_tensor,
    video_masks,
    video_id,
    video_save_base_dir,
    oid_class_pred=None,
    sample_rate=1,
):
    video_save_dir = os.path.join(video_save_base_dir, video_id)
    if not os.path.exists(video_save_dir):
        os.makedirs(video_save_dir, exist_ok=True)

    for frame_id, image in enumerate(video_tensor):
        if random.random() > sample_rate:
            continue

        masks = video_masks[frame_id]
        save_path = os.path.join(video_save_dir, f"{frame_id}.jpg")
        save_mask_one_image(image, masks, save_path)


def get_color(obj_id, cmap_name="gist_rainbow", alpha=0.5):
    cmap = plt.get_cmap(cmap_name)
    cmap_idx = 0 if obj_id is None else obj_id
    color = list(cmap((cmap_idx * 47) % 256))
    color[3] = 0.5
    color = np.array(color)
    return color


def _bbox_center(bbox: Tuple[int, int, int, int]) -> Tuple[float, float]:
    return ((bbox[0] + bbox[2]) / 2.0, (bbox[1] + bbox[3]) / 2.0)

    """
    center1 = _bbox_center(bbox1)
    center2 = _bbox_center(bbox2)
    if math.isclose(center1[0], center2[0], abs_tol=1e-3) and math.isclose(
        center1[1], center2[1], abs_tol=1e-3
    ):
        offset = max(1.0, (bbox2[2] - bbox2[0]) * 0.05)
        center2 = (center2[0] + offset, center2[1])
    start = (int(round(center1[0])), int(round(center1[1])))

        end = (end[0] + 1, end[1])
    return start, end
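The nudge above keeps a relation arrow from collapsing to a point when two boxes share a center. A small check of that edge case (relation_line's full signature sits in an elided hunk, but it takes the two boxes as (x1, y1, x2, y2) tuples):

bbox = (100, 100, 140, 160)            # hypothetical box used for both endpoints
start, end = relation_line(bbox, bbox)
print(start, end)                      # centers coincide, so the end point is offset slightly
assert start != end                    # the drawn arrow always has nonzero length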


def get_binary_mask_one_image(frame_image, masks, rel_pred_ls=None):
    # Create a figure and axis
    fig, ax = plt.subplots(1, figsize=(6, 6))

    # Display the frame image
    ax.imshow(frame_image)
    ax.axis("off")

    all_objs_to_show = set()
    all_lines_to_show = []

    # print(rel_pred_ls[0])
    for (from_obj_id, to_obj_id), rel_text in rel_pred_ls.items():
        all_objs_to_show.add(from_obj_id)
        all_objs_to_show.add(to_obj_id)

        from_mask = masks[from_obj_id]
        bbox1 = mask_to_bbox(from_mask)
        to_mask = masks[to_obj_id]
        bbox2 = mask_to_bbox(to_mask)

        c1, c2 = shortest_line_between_bboxes(bbox1, bbox2)

        line_color = get_color(from_obj_id)
        face_color = get_color(to_obj_id)
        line = c1, c2, face_color, line_color, rel_text
        all_lines_to_show.append(line)

    masks_to_show = {}
    for oid in all_objs_to_show:
        masks_to_show[oid] = masks[oid]

    # Add the masks
    for obj_id, mask in masks_to_show.items():
        show_mask(mask, ax, obj_id=obj_id, random_color=False)

    for (from_pt_x, from_pt_y), (
        to_pt_x,
        to_pt_y,
    ), face_color, line_color, rel_text in all_lines_to_show:
        plt.plot(
            [from_pt_x, to_pt_x],
            [from_pt_y, to_pt_y],
            color=line_color,
            linestyle="-",
            linewidth=3,
        )
        mid_pt_x = (from_pt_x + to_pt_x) / 2
        mid_pt_y = (from_pt_y + to_pt_y) / 2
        ax.text(
            mid_pt_x - 5,
            mid_pt_y,
            rel_text,
            color="white",
            fontsize=6,
            backgroundcolor=np.array(line_color),
            bbox=dict(
                facecolor=face_color, edgecolor=line_color, boxstyle="round,pad=1"
            ),
            alpha=1,
        )

    # Show the plot
    return fig, ax
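A minimal call sketch, assuming two boolean masks and one predicted relation between them (mask_to_bbox and shortest_line_between_bboxes are defined elsewhere in this module):

import numpy as np

frame = np.zeros((64, 64, 3), dtype=np.uint8)
mask_a = np.zeros((64, 64), dtype=bool)
mask_a[5:20, 5:20] = True
mask_b = np.zeros((64, 64), dtype=bool)
mask_b[40:60, 40:60] = True
fig, ax = get_binary_mask_one_image(frame, {0: mask_a, 1: mask_b}, {(0, 1): "next to"})
fig.savefig("relation_preview.jpg")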