# Source: LASER/src/vine_hf/example_visualization.py
# (commit 888f9e4 "final fixes", author moqingyan123)
# Example visualization runner for VINE
# - Loads a video (path, demo, or random)
# - Runs the VINE pipeline
# - Saves annotated frames and an MP4 if available
import os
import sys
import argparse
import cv2
import numpy as np
from collections.abc import Mapping, Sequence
from transformers.pipelines import PIPELINE_REGISTRY
from transformers import pipeline
# Provide a placeholder OpenAI API key only when none is configured, so a real
# key exported in the environment is never overwritten by this example.
os.environ.setdefault("OPENAI_API_KEY", "dummy-key")
# Local imports (workspace): put this file's directory on the import path.
# NOTE(review): the imports below use the package-qualified name `vine_hf.*`,
# which normally needs the *parent* of this directory on sys.path -- confirm
# the package is installed or launched from the right working directory.
sys.path.append(os.path.dirname(__file__))
from vine_hf.vine_pipeline import VinePipeline # https://github.com link not needed; local path used
from vine_hf.vine_model import VineModel
from vine_hf.vine_config import VineConfig
from laser.loading import load_video
def build_pipeline(args) -> VinePipeline:
    """Register the VINE pipeline task and return a configured pipeline.

    Args:
        args: Parsed command-line namespace. The attributes read here are
            ``box_threshold``, ``text_threshold``, ``fps``, ``topk_cate``,
            ``out_dir`` and ``device``.

    Returns:
        A ``VinePipeline`` wrapping a freshly constructed ``VineModel``.

    Note:
        The SAM2 / GroundingDINO config and checkpoint locations are
        hard-coded absolute paths for one specific machine; adjust them
        before running elsewhere.
    """
    # Register pipeline type
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    config_kwargs = {
        "segmentation_method": "grounding_dino_sam2",
        "model_name": "openai/clip-vit-base-patch32",
        # Example: load from HF repo. Alternatively use a local path by
        # setting use_hf_repo=False and local_dir/local_filename.
        "use_hf_repo": True,
        "model_repo": "video-fm/vine_v0",
        "box_threshold": args.box_threshold,
        "text_threshold": args.text_threshold,
        "target_fps": args.fps,
        "topk_cate": args.topk_cate,
        "visualization_dir": args.out_dir,
        "visualize": True,
        "debug_visualizations": True,
        "device": args.device,
    }
    vine_model = VineModel(VineConfig(**config_kwargs))

    # Segmentation model paths handed to the pipeline constructor.
    segmentation_assets = {
        "sam_config_path": "//home/kevinx/LASER/video-sam2/sam2/sam2_hiera_t.yaml",
        "sam_checkpoint_path": "//home/kevinx/LASER/video-sam2/sam2_hiera_tiny.pt",
        "gd_config_path": "//home/kevinx/LASER/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
        "gd_checkpoint_path": "//home/kevinx/LASER/GroundingDINO/weights/groundingdino_swint_ogc.pth",
    }
    return VinePipeline(
        model=vine_model,
        tokenizer=None,
        **segmentation_assets,
        device=args.device,
        trust_remote_code=True,
    )
def resolve_video(args) -> np.ndarray | str:
# Priority: user --video -> demo video -> random frames
if args.video and os.path.exists(args.video):
return args.video
demo_video = "//home/kevinx/LASER/LASER/demo/videos/v1.mp4"
demo_alt = "//home/kevinx/LASER/LASER/demo/videos/v2.mp4"
if os.path.exists(demo_video):
return demo_video
if os.path.exists(demo_alt):
return demo_alt
# Fallback to random frames (uint8 HxWx3) shaped as T x H x W x 3
print("No video found; using random frames.")
rng = np.random.default_rng(0)
frames = rng.integers(0, 255, size=(args.rand_frames, args.height, args.width, 3), dtype=np.uint8)
return frames
def main():
    """Parse CLI options, run the VINE pipeline on a video, print its summary.

    The video comes from ``resolve_video`` and the pipeline from
    ``build_pipeline``. The pipeline is called with fixed example keywords
    and object pairs, and the ``"summary"`` entry of its result is printed.
    """
    parser = argparse.ArgumentParser(description="VINE visualization example")
    parser.add_argument("--video", type=str, default=None, help="Path to a video file")
    parser.add_argument("--out_dir", type=str, default="output", help="Output directory")
    parser.add_argument(
        "--method",
        type=str,
        default="grounding_dino_sam2",
        choices=["sam2", "grounding_dino_sam2"],
        help="Segmentation method",
    )
    parser.add_argument("--fps", type=int, default=5, help="Target FPS for processing")
    parser.add_argument("--box_threshold", type=float, default=0.3, help="GroundingDINO box threshold")
    parser.add_argument("--text_threshold", type=float, default=0.3, help="GroundingDINO text threshold")
    parser.add_argument("--topk_cate", type=int, default=5, help="Top-K categories to display")
    parser.add_argument("--device", type=int, default=0, help="CUDA device index or -1 for CPU")
    parser.add_argument("--debug_visualizations", action="store_true", help="Enable debug visualizations")
    # Size of the synthetic clip used when no video file can be found;
    # resolve_video() reads these three attributes from the namespace.
    parser.add_argument("--rand_frames", type=int, default=8, help="Number of random fallback frames")
    parser.add_argument("--height", type=int, default=240, help="Height of random fallback frames")
    parser.add_argument("--width", type=int, default=320, help="Width of random fallback frames")
    args = parser.parse_args()

    vine_pipe = build_pipeline(args)
    video = resolve_video(args)

    # Keywords similar to examples/tests
    categorical_keywords = ["dog", "frisbee", "cat"]
    unary_keywords = ["running", "jumping", "sitting", "flying"]
    binary_keywords = ["behind", "next to", "chasing", "biting"]
    object_pairs = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]

    print("Running VINE pipeline...")
    call_kwargs = dict(
        categorical_keywords=categorical_keywords,
        unary_keywords=unary_keywords,
        binary_keywords=binary_keywords,
        object_pairs=object_pairs,
        segmentation_method=args.method,
        return_top_k=args.topk_cate,
        include_visualizations=True,
        debug_visualizations=args.debug_visualizations,
    )
    results = vine_pipe(
        video,
        **call_kwargs,
    )

    # Normalize pipeline output to a mapping (can be dict or list[dict])
    if isinstance(results, Mapping):
        result = results
    elif isinstance(results, Sequence) and results and isinstance(results[0], Mapping):
        result = results[0]
    else:
        result = {}

    # Print brief summary. `result` is always a Mapping at this point, so read
    # it through the Mapping API rather than requiring a plain dict.
    summary = result.get("summary", {})
    print("Summary:", summary)
# Script entry point: run the example only when this file is executed
# directly, not when it is imported as a module.
if __name__ == "__main__":
    main()