File size: 5,502 Bytes
f9a6349
 
 
 
 
 
 
f71f431
f9a6349
 
 
 
 
 
 
 
 
 
 
f71f431
 
 
 
 
f9a6349
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
# Example visualization runner for VINE
# - Loads a video (path, demo, or random)
# - Runs the VINE pipeline
# - Saves annotated frames and an MP4 if available

import os
import sys
from pathlib import Path
import argparse
import cv2
import numpy as np
from collections.abc import Mapping, Sequence

from transformers.pipelines import PIPELINE_REGISTRY
from transformers import pipeline

# Set your OpenAI API key here or via environment variable.
# NOTE(review): a placeholder key is hard-coded and will clobber any real
# OPENAI_API_KEY already exported in the environment — confirm this is intended
# for the example, and never commit a real key here.
os.environ['OPENAI_API_KEY'] = "dummy-key"

# Add src/ to sys.path so the LASER, video-sam2, and GroundingDINO packages
# that live under <repo>/src are importable when running this file directly.
current_dir = Path(__file__).resolve().parent
src_dir = current_dir.parent / "src"
if src_dir.is_dir() and str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))

# Project-local imports; these depend on the sys.path insertion above.
from vine_hf.vine_pipeline import VinePipeline
from vine_hf.vine_model import VineModel
from vine_hf.vine_config import VineConfig
from laser.loading import load_video  # NOTE(review): appears unused in this script — confirm before removing


def build_pipeline(args) -> VinePipeline:
    """Register the VINE pipeline type and construct a ready-to-run VinePipeline.

    Args:
        args: Parsed CLI namespace. Reads ``method``, ``box_threshold``,
            ``text_threshold``, ``fps``, ``topk_cate``, ``out_dir``,
            ``debug_visualizations``, and ``device``.

    Returns:
        A VinePipeline wrapping a freshly constructed VineModel, configured
        with the segmentation model paths below.
    """
    # Make "vine-video-understanding" known to transformers' pipeline factory.
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    config = VineConfig(
        # Honor the CLI choice; previously this was pinned to
        # "grounding_dino_sam2" regardless of --method.
        segmentation_method=args.method,
        model_name="openai/clip-vit-base-patch32",
        # Example: load weights from an HF repo.
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",
        # Alternatively use a local path by setting use_hf_repo=False and
        # local_dir/local_filename.
        box_threshold=args.box_threshold,
        text_threshold=args.text_threshold,
        target_fps=args.fps,
        topk_cate=args.topk_cate,
        visualization_dir=args.out_dir,
        visualize=True,
        # Follow the --debug_visualizations flag; previously forced to True.
        debug_visualizations=args.debug_visualizations,
        device=args.device,
    )

    model = VineModel(config)

    # Create the pipeline instance with segmentation model paths.
    # NOTE(review): these are machine-specific absolute paths (with a stray
    # leading "//") — consider promoting them to CLI flags or env vars.
    vine_pipe = VinePipeline(
        model=model,
        tokenizer=None,
        sam_config_path="//home/kevinx/LASER/video-sam2/sam2/sam2_hiera_t.yaml",
        sam_checkpoint_path="//home/kevinx/LASER/video-sam2/sam2_hiera_tiny.pt",
        gd_config_path="//home/kevinx/LASER/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="//home/kevinx/LASER/GroundingDINO/weights/groundingdino_swint_ogc.pth",
        device=args.device,
        trust_remote_code=True,
    )
    return vine_pipe


def resolve_video(args) -> np.ndarray | str:
    # Priority: user --video -> demo video -> random frames
    if args.video and os.path.exists(args.video):
        return args.video

    demo_video = "//home/kevinx/LASER/LASER/demo/videos/v1.mp4"
    demo_alt = "//home/kevinx/LASER/LASER/demo/videos/v2.mp4"
    if os.path.exists(demo_video):
        return demo_video
    if os.path.exists(demo_alt):
        return demo_alt

    # Fallback to random frames (uint8 HxWx3) shaped as T x H x W x 3
    print("No video found; using random frames.")
    rng = np.random.default_rng(0)
    frames = rng.integers(0, 255, size=(args.rand_frames, args.height, args.width, 3), dtype=np.uint8)
    return frames



def main():
    """CLI entry point: build the VINE pipeline, resolve a video, run it, print summary."""
    parser = argparse.ArgumentParser(description="VINE visualization example")
    parser.add_argument("--video", type=str, default=None, help="Path to a video file")
    parser.add_argument("--out_dir", type=str, default="output", help="Output directory")
    parser.add_argument("--method", type=str, default="grounding_dino_sam2", choices=["sam2", "grounding_dino_sam2"], help="Segmentation method")
    parser.add_argument("--fps", type=int, default=5, help="Target FPS for processing")
    parser.add_argument("--box_threshold", type=float, default=0.3, help="GroundingDINO box threshold")
    parser.add_argument("--text_threshold", type=float, default=0.3, help="GroundingDINO text threshold")
    parser.add_argument("--topk_cate", type=int, default=5, help="Top-K categories to display")
    parser.add_argument("--device", type=int, default=0, help="CUDA device index or -1 for CPU")
    parser.add_argument("--debug_visualizations", action="store_true", help="Enable debug visualizations")
    # These feed the random-frame fallback in resolve_video(); they were
    # previously read there but never registered, crashing when no video
    # file could be found.
    parser.add_argument("--rand_frames", type=int, default=8, help="Fallback: number of random frames")
    parser.add_argument("--height", type=int, default=240, help="Fallback: random frame height")
    parser.add_argument("--width", type=int, default=320, help="Fallback: random frame width")

    args = parser.parse_args()

    vine_pipe = build_pipeline(args)
    video = resolve_video(args)

    # Keywords similar to examples/tests.
    categorical_keywords = ["dog", "frisbee", "cat"]
    unary_keywords = ["running", "jumping", "sitting", "flying"]
    binary_keywords = ["behind", "next to", "chasing", "biting"]
    object_pairs = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]

    print("Running VINE pipeline...")
    call_kwargs = dict(
        categorical_keywords=categorical_keywords,
        unary_keywords=unary_keywords,
        binary_keywords=binary_keywords,
        object_pairs=object_pairs,
        segmentation_method=args.method,
        return_top_k=args.topk_cate,
        include_visualizations=True,
        debug_visualizations=args.debug_visualizations,
    )

    results = vine_pipe(
        video,
        **call_kwargs,
    )

    # Normalize pipeline output to a dict (pipeline may return dict or list[dict]).
    if isinstance(results, Mapping):
        result = results
    elif isinstance(results, Sequence) and results and isinstance(results[0], Mapping):
        result = results[0]
    else:
        result = {}

    # Print a brief summary (empty dict if the pipeline returned nothing usable).
    summary = result.get("summary", {}) if isinstance(result, dict) else {}
    print("Summary:", summary)


if __name__ == "__main__":
    main()