Spaces:
Running
on
Zero
Running
on
Zero
| # Example visualization runner for VINE | |
| # - Loads a video (path, demo, or random) | |
| # - Runs the VINE pipeline | |
| # - Saves annotated frames and an MP4 if available | |
| import os | |
| import sys | |
| from pathlib import Path | |
| import argparse | |
| import cv2 | |
| import numpy as np | |
| from collections.abc import Mapping, Sequence | |
| from transformers.pipelines import PIPELINE_REGISTRY | |
| from transformers import pipeline | |
# Provide a placeholder OpenAI API key only when none is configured.
# setdefault() keeps a real key exported in the environment intact instead of
# overwriting it with the dummy value.
os.environ.setdefault("OPENAI_API_KEY", "dummy-key")
# Make the sibling src/ tree importable so that LASER, video-sam2 and
# GroundingDINO can be imported. The path is prepended only when the directory
# exists and is not already on sys.path.
current_dir = Path(__file__).resolve().parent
src_dir = current_dir.parent / "src"
if src_dir.is_dir():
    if str(src_dir) not in sys.path:
        sys.path.insert(0, str(src_dir))
| from vine_hf.vine_pipeline import VinePipeline # https://github.com link not needed; local path used | |
| from vine_hf.vine_model import VineModel | |
| from vine_hf.vine_config import VineConfig | |
| from laser.loading import load_video | |
def build_pipeline(args) -> VinePipeline:
    """Register the VINE pipeline type and build a ``VinePipeline`` instance.

    Args:
        args: Parsed CLI namespace. Reads ``box_threshold``,
            ``text_threshold``, ``fps``, ``topk_cate``, ``out_dir`` and
            ``device``. ``method`` is used when present and otherwise falls
            back to ``"grounding_dino_sam2"``.

    Returns:
        A ``VinePipeline`` wrapping a newly constructed ``VineModel``.
    """
    # Register the custom task name with the transformers pipeline registry.
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    config = VineConfig(
        # Honor --method so the config agrees with the segmentation_method
        # that main() passes at call time; the fallback keeps the previous
        # hard-coded value for namespaces without a ``method`` attribute.
        segmentation_method=getattr(args, "method", "grounding_dino_sam2"),
        model_name="openai/clip-vit-base-patch32",
        # Example: load from HF repo
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",
        # Alternatively use a local path by setting use_hf_repo=False and
        # local_dir/local_filename
        box_threshold=args.box_threshold,
        text_threshold=args.text_threshold,
        target_fps=args.fps,
        topk_cate=args.topk_cate,
        visualization_dir=args.out_dir,
        visualize=True,
        debug_visualizations=True,
        device=args.device,
    )
    model = VineModel(config)

    # NOTE(review): the segmentation config/checkpoint paths below are
    # absolute paths on one developer machine; adjust them for your setup.
    vine_pipe = VinePipeline(
        model=model,
        tokenizer=None,
        sam_config_path="//home/kevinx/LASER/video-sam2/sam2/sam2_hiera_t.yaml",
        sam_checkpoint_path="//home/kevinx/LASER/video-sam2/sam2_hiera_tiny.pt",
        gd_config_path="//home/kevinx/LASER/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="//home/kevinx/LASER/GroundingDINO/weights/groundingdino_swint_ogc.pth",
        device=args.device,
        trust_remote_code=True,
    )
    return vine_pipe
| def resolve_video(args) -> np.ndarray | str: | |
| # Priority: user --video -> demo video -> random frames | |
| if args.video and os.path.exists(args.video): | |
| return args.video | |
| demo_video = "//home/kevinx/LASER/LASER/demo/videos/v1.mp4" | |
| demo_alt = "//home/kevinx/LASER/LASER/demo/videos/v2.mp4" | |
| if os.path.exists(demo_video): | |
| return demo_video | |
| if os.path.exists(demo_alt): | |
| return demo_alt | |
| # Fallback to random frames (uint8 HxWx3) shaped as T x H x W x 3 | |
| print("No video found; using random frames.") | |
| rng = np.random.default_rng(0) | |
| frames = rng.integers(0, 255, size=(args.rand_frames, args.height, args.width, 3), dtype=np.uint8) | |
| return frames | |
def main():
    """Parse CLI options, run the VINE pipeline on a video, and print its summary."""
    parser = argparse.ArgumentParser(description="VINE visualization example")
    parser.add_argument("--video", type=str, default=None, help="Path to a video file")
    parser.add_argument("--out_dir", type=str, default="output", help="Output directory")
    parser.add_argument(
        "--method",
        type=str,
        default="grounding_dino_sam2",
        choices=["sam2", "grounding_dino_sam2"],
        help="Segmentation method",
    )
    parser.add_argument("--fps", type=int, default=5, help="Target FPS for processing")
    parser.add_argument("--box_threshold", type=float, default=0.3, help="GroundingDINO box threshold")
    parser.add_argument("--text_threshold", type=float, default=0.3, help="GroundingDINO text threshold")
    parser.add_argument("--topk_cate", type=int, default=5, help="Top-K categories to display")
    parser.add_argument("--device", type=int, default=0, help="CUDA device index or -1 for CPU")
    parser.add_argument("--debug_visualizations", action="store_true", help="Enable debug visualizations")
    # resolve_video() reads these three attributes for its random-frame
    # fallback; without them that path raises AttributeError.
    parser.add_argument("--rand_frames", type=int, default=8, help="Number of random frames if no video is found")
    parser.add_argument("--height", type=int, default=224, help="Height of random fallback frames")
    parser.add_argument("--width", type=int, default=224, help="Width of random fallback frames")
    args = parser.parse_args()

    vine_pipe = build_pipeline(args)
    video = resolve_video(args)

    # Keywords similar to examples/tests
    categorical_keywords = ["dog", "frisbee", "cat"]
    unary_keywords = ["running", "jumping", "sitting", "flying"]
    binary_keywords = ["behind", "next to", "chasing", "biting"]
    object_pairs = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]

    print("Running VINE pipeline...")
    call_kwargs = dict(
        categorical_keywords=categorical_keywords,
        unary_keywords=unary_keywords,
        binary_keywords=binary_keywords,
        object_pairs=object_pairs,
        segmentation_method=args.method,
        return_top_k=args.topk_cate,
        include_visualizations=True,
        debug_visualizations=args.debug_visualizations,
    )
    results = vine_pipe(video, **call_kwargs)

    # Normalize pipeline output to a mapping (can be a mapping or a sequence
    # of mappings); anything else is treated as an empty result.
    if isinstance(results, Mapping):
        result = results
    elif isinstance(results, Sequence) and results and isinstance(results[0], Mapping):
        result = results[0]
    else:
        result = {}

    # ``result`` is always a Mapping here, so .get() is safe even when the
    # pipeline returns a dict-like object that is not a plain dict.
    summary = result.get("summary", {})
    print("Summary:", summary)


if __name__ == "__main__":
    main()