Spaces:

jiani-huang
/

LASER

Running on Zero

LASER/src/vine_hf/example_visualization.py

moqingyan123

final fixes

888f9e4 15 days ago

5.3 kB

	# Example visualization runner for VINE
	# - Loads a video (path, demo, or random)
	# - Runs the VINE pipeline
	# - Saves annotated frames and an MP4 if available

	import os
	import sys
	import argparse
	import cv2
	import numpy as np
	from collections.abc import Mapping, Sequence

	from transformers.pipelines import PIPELINE_REGISTRY
	from transformers import pipeline

	# Provide a placeholder OpenAI API key only when none is configured, so a
	# real key exported in the environment is never overwritten by this example.
	os.environ.setdefault("OPENAI_API_KEY", "dummy-key")

	# Local imports (workspace): make this file's directory and its parent
	# importable. The parent is needed for the `vine_hf.*` imports below when
	# this script is run directly from inside the `vine_hf` directory.
	_HERE = os.path.dirname(os.path.abspath(__file__))
	for _path in (_HERE, os.path.dirname(_HERE)):
	    # Appending (not inserting) keeps already-configured locations first.
	    if _path not in sys.path:
	        sys.path.append(_path)

	from vine_hf.vine_pipeline import VinePipeline # https://github.com link not needed; local path used
	from vine_hf.vine_model import VineModel
	from vine_hf.vine_config import VineConfig
	from laser.loading import load_video


	def build_pipeline(args) -> VinePipeline:
	    """Register the VINE pipeline type and build a pipeline instance.

	    Args:
	        args: Parsed CLI namespace. Reads ``box_threshold``,
	            ``text_threshold``, ``fps``, ``topk_cate``, ``out_dir`` and
	            ``device``; ``method`` is used when present.

	    Returns:
	        A ``VinePipeline`` wrapping a newly constructed ``VineModel``.
	    """
	    # Register pipeline type
	    PIPELINE_REGISTRY.register_pipeline(
	        "vine-video-understanding",
	        pipeline_class=VinePipeline,
	        pt_model=VineModel,
	        type="multimodal",
	    )

	    config = VineConfig(
	        # Honour --method so the config agrees with the per-call
	        # segmentation method; fall back to the previous hard-coded value.
	        segmentation_method=getattr(args, "method", "grounding_dino_sam2"),
	        model_name="openai/clip-vit-base-patch32",
	        # Example: load from HF repo
	        use_hf_repo=True,
	        model_repo="video-fm/vine_v0",
	        # Alternatively use a local path by setting use_hf_repo=False and
	        # local_dir/local_filename
	        box_threshold=args.box_threshold,
	        text_threshold=args.text_threshold,
	        target_fps=args.fps,
	        topk_cate=args.topk_cate,
	        visualization_dir=args.out_dir,
	        visualize=True,
	        # NOTE(review): always on here, independent of the
	        # --debug_visualizations flag — confirm this is intended.
	        debug_visualizations=True,
	        device=args.device,
	    )

	    model = VineModel(config)

	    # Create pipeline instance with segmentation model paths.
	    # NOTE(review): these are absolute paths on the original author's
	    # machine; point them at your local SAM2 / GroundingDINO checkouts.
	    vine_pipe = VinePipeline(
	        model=model,
	        tokenizer=None,
	        sam_config_path="//home/kevinx/LASER/video-sam2/sam2/sam2_hiera_t.yaml",
	        sam_checkpoint_path="//home/kevinx/LASER/video-sam2/sam2_hiera_tiny.pt",
	        gd_config_path="//home/kevinx/LASER/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
	        gd_checkpoint_path="//home/kevinx/LASER/GroundingDINO/weights/groundingdino_swint_ogc.pth",
	        device=args.device,
	        trust_remote_code=True,
	    )
	    return vine_pipe


	def resolve_video(args) -> "np.ndarray | str":
	    """Choose the video input for the run.

	    Priority: ``args.video`` (when that file exists) -> the first demo video
	    that exists -> synthetic random frames.

	    Args:
	        args: Namespace with a ``video`` attribute. Optional ``rand_frames``,
	            ``height`` and ``width`` size the random fallback and default to
	            8, 224 and 224 when absent.

	    Returns:
	        A path string, or a ``uint8`` array shaped ``(T, H, W, 3)``.
	    """
	    requested = getattr(args, "video", None)
	    if requested:
	        if os.path.exists(requested):
	            return requested
	        # Tell the user their path was ignored instead of silently
	        # switching to a different input.
	        print(f"Video not found: {requested}; trying demo videos.")

	    demo_candidates = (
	        "//home/kevinx/LASER/LASER/demo/videos/v1.mp4",
	        "//home/kevinx/LASER/LASER/demo/videos/v2.mp4",
	    )
	    for candidate in demo_candidates:
	        if os.path.exists(candidate):
	            return candidate

	    # Fallback to random frames (uint8 HxWx3) shaped as T x H x W x 3
	    print("No video found; using random frames.")
	    num_frames = getattr(args, "rand_frames", 8)
	    frame_height = getattr(args, "height", 224)
	    frame_width = getattr(args, "width", 224)
	    rng = np.random.default_rng(0)
	    # `high` is exclusive, so 256 is required to cover the full uint8 range.
	    return rng.integers(
	        0,
	        256,
	        size=(num_frames, frame_height, frame_width, 3),
	        dtype=np.uint8,
	    )



	def main(argv=None):
	    """Parse CLI options, run the VINE pipeline on a video, print the summary.

	    Args:
	        argv: Optional list of command-line arguments. ``None`` (the default)
	            makes argparse read ``sys.argv[1:]``.
	    """
	    parser = argparse.ArgumentParser(description="VINE visualization example")
	    parser.add_argument("--video", type=str, default=None, help="Path to a video file")
	    parser.add_argument("--out_dir", type=str, default="output", help="Output directory")
	    parser.add_argument("--method", type=str, default="grounding_dino_sam2", choices=["sam2", "grounding_dino_sam2"], help="Segmentation method")
	    parser.add_argument("--fps", type=int, default=5, help="Target FPS for processing")
	    parser.add_argument("--box_threshold", type=float, default=0.3, help="GroundingDINO box threshold")
	    parser.add_argument("--text_threshold", type=float, default=0.3, help="GroundingDINO text threshold")
	    parser.add_argument("--topk_cate", type=int, default=5, help="Top-K categories to display")
	    parser.add_argument("--device", type=int, default=0, help="CUDA device index or -1 for CPU")
	    parser.add_argument("--debug_visualizations", action="store_true", help="Enable debug visualizations")
	    # Options read by resolve_video() for its random-frame fallback; without
	    # them that fallback fails with AttributeError.
	    parser.add_argument("--rand_frames", type=int, default=8, help="Number of random fallback frames")
	    parser.add_argument("--height", type=int, default=224, help="Height of random fallback frames")
	    parser.add_argument("--width", type=int, default=224, help="Width of random fallback frames")

	    args = parser.parse_args(argv)

	    vine_pipe = build_pipeline(args)
	    video = resolve_video(args)

	    # Keywords similar to examples/tests
	    categorical_keywords = ["dog", "frisbee", "cat"]
	    unary_keywords = ["running", "jumping", "sitting", "flying"]
	    binary_keywords = ["behind", "next to", "chasing", "biting"]
	    object_pairs = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]

	    print("Running VINE pipeline...")
	    results = vine_pipe(
	        video,
	        categorical_keywords=categorical_keywords,
	        unary_keywords=unary_keywords,
	        binary_keywords=binary_keywords,
	        object_pairs=object_pairs,
	        segmentation_method=args.method,
	        return_top_k=args.topk_cate,
	        include_visualizations=True,
	        debug_visualizations=args.debug_visualizations,
	    )

	    # Normalize pipeline output to a mapping (can be dict or list[dict])
	    if isinstance(results, Mapping):
	        result = results
	    elif isinstance(results, Sequence) and results and isinstance(results[0], Mapping):
	        result = results[0]
	    else:
	        result = {}

	    # Print brief summary. `result` is always a Mapping at this point, so
	    # `.get` is safe even for mapping types that are not `dict` subclasses.
	    summary = result.get("summary", {})
	    print("Summary:", summary)


	if __name__ == "__main__":
	    main()