# Example visualization runner for VINE
# - Loads a video (path, demo, or random)
# - Runs the VINE pipeline
# - Saves annotated frames and an MP4 if available

import os
import sys
import argparse
import cv2
import numpy as np
from collections.abc import Mapping, Sequence

from transformers.pipelines import PIPELINE_REGISTRY
from transformers import pipeline

# Provide your OpenAI API key via the OPENAI_API_KEY environment variable.
# A placeholder is installed only when the variable is not already set, so a
# real key exported by the user is never overwritten.
os.environ.setdefault("OPENAI_API_KEY", "dummy-key")

# Local imports (workspace)
sys.path.append(os.path.dirname(__file__))

from vine_hf.vine_pipeline import VinePipeline  # https://github.com link not needed; local path used
from vine_hf.vine_model import VineModel
from vine_hf.vine_config import VineConfig
from laser.loading import load_video


def build_pipeline(args) -> VinePipeline:
    """Register the VINE pipeline type and build a ready-to-call pipeline.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``box_threshold``, ``text_threshold``, ``fps``, ``topk_cate``,
            ``out_dir`` and ``device``.

    Returns:
        A ``VinePipeline`` wrapping a freshly constructed ``VineModel``.
    """
    # Register pipeline type
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    config = VineConfig(
        # NOTE(review): fixed here even though the CLI exposes --method; the
        # per-call ``segmentation_method`` kwarg presumably overrides it -- confirm.
        segmentation_method="grounding_dino_sam2",
        model_name="openai/clip-vit-base-patch32",
        # Example: load from HF repo
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",
        # Alternatively use a local path by setting use_hf_repo=False and local_dir/local_filename
        box_threshold=args.box_threshold,
        text_threshold=args.text_threshold,
        target_fps=args.fps,
        topk_cate=args.topk_cate,
        visualization_dir=args.out_dir,
        visualize=True,
        # NOTE(review): always enabled in the config regardless of the
        # --debug_visualizations flag -- confirm this is intended.
        debug_visualizations=True,
        device=args.device,
    )

    model = VineModel(config)

    # Create pipeline instance with segmentation model paths (if provided).
    # These paths are machine-specific; adjust them for your checkout.
    vine_pipe = VinePipeline(
        model=model,
        tokenizer=None,
        sam_config_path="//home/kevinx/LASER/video-sam2/sam2/sam2_hiera_t.yaml",
        sam_checkpoint_path="//home/kevinx/LASER/video-sam2/sam2_hiera_tiny.pt",
        gd_config_path="//home/kevinx/LASER/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="//home/kevinx/LASER/GroundingDINO/weights/groundingdino_swint_ogc.pth",
        device=args.device,
        trust_remote_code=True,
    )
    return vine_pipe

def resolve_video(args) -> np.ndarray | str: # Priority: user --video -> demo video -> random frames if args.video and os.path.exists(args.video): return args.video demo_video = "//home/kevinx/LASER/LASER/demo/videos/v1.mp4" demo_alt = "//home/kevinx/LASER/LASER/demo/videos/v2.mp4" if os.path.exists(demo_video): return demo_video if os.path.exists(demo_alt): return demo_alt # Fallback to random frames (uint8 HxWx3) shaped as T x H x W x 3 print("No video found; using random frames.") rng = np.random.default_rng(0) frames = rng.integers(0, 255, size=(args.rand_frames, args.height, args.width, 3), dtype=np.uint8) return frames def main(): parser = argparse.ArgumentParser(description="VINE visualization example") parser.add_argument("--video", type=str, default=None, help="Path to a video file") parser.add_argument("--out_dir", type=str, default="output", help="Output directory") parser.add_argument("--method", type=str, default="grounding_dino_sam2", choices=["sam2", "grounding_dino_sam2"], help="Segmentation method") parser.add_argument("--fps", type=int, default=5, help="Target FPS for processing") parser.add_argument("--box_threshold", type=float, default=0.3, help="GroundingDINO box threshold") parser.add_argument("--text_threshold", type=float, default=0.3, help="GroundingDINO text threshold") parser.add_argument("--topk_cate", type=int, default=5, help="Top-K categories to display") parser.add_argument("--device", type=int, default=0, help="CUDA device index or -1 for CPU") parser.add_argument("--debug_visualizations", action="store_true", help="Enable debug visualizations") args = parser.parse_args() vine_pipe = build_pipeline(args) video = resolve_video(args) # Keywords similar to examples/tests categorical_keywords = ["dog", "frisbee", "cat"] unary_keywords = ["running", "jumping", "sitting", "flying"] binary_keywords = ["behind", "next to", "chasing","biting"] object_pairs = [(0,1), (0, 2), (1, 2), (1, 3), (2, 3)] print("Running VINE pipeline...") 
call_kwargs = dict( categorical_keywords=categorical_keywords, unary_keywords=unary_keywords, binary_keywords=binary_keywords, object_pairs=object_pairs, segmentation_method=args.method, return_top_k=args.topk_cate, include_visualizations=True, debug_visualizations=args.debug_visualizations, ) results = vine_pipe( video, **call_kwargs, ) # Normalize pipeline output to a dict (can be dict or list[dict]) if isinstance(results, Mapping): result = results elif isinstance(results, Sequence) and results and isinstance(results[0], Mapping): result = results[0] else: result = {} # Print brief summary summary = result.get("summary", {}) if isinstance(result, dict) else {} print("Summary:", summary) if __name__ == "__main__": main()