| """ | |
| Example usage of VINE HuggingFace interface | |
| This script demonstrates how to use the VINE model through the HuggingFace interface | |
| for video understanding with categorical, unary, and binary keyword predictions. | |
| """ | |
| import os | |
| import sys | |
| import torch | |
| from transformers import pipeline, AutoModel | |
| from transformers.pipelines import PIPELINE_REGISTRY | |
| # Add the parent directory to the path to import vine_hf | |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| # Uncomment or set your own | |
| #os.environ['OPENAI_API_KEY'] = 'dummy-key' | |
| from vine_hf import VineConfig, VineModel, VinePipeline | |
| def example_direct_model_usage(): | |
| """Example of using the VINE model directly.""" | |
| print("=== Direct Model Usage ===") | |
| # Create configuration | |
| config = VineConfig( | |
| model_name="openai/clip-vit-base-patch32", | |
| segmentation_method="grounding_dino_sam2", | |
| use_hf_repo=True, | |
| model_repo="video-fm/vine_v0", # Your HF Hub model | |
| debug_visualizations=True, | |
| debug_visualization_path=os.path.join(os.getcwd(), "debug_masks.png"), | |
| target_fps=30, | |
| box_threshold=0.35, | |
| text_threshold=0.25 | |
| ) | |
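
    # Note (assumption): box_threshold and text_threshold correspond to Grounding
    # DINO's standard box/text confidence thresholds, and target_fps is assumed to
    # control how densely frames are sampled from the input video.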

    # Initialize model
    model = VineModel(config)
    print(f"Model initialized with CLIP backbone: {config.model_name}")
    print(f"Segmentation method: {config.segmentation_method}")
    print(f"Device: {model.device}")

    # Example video data (placeholder - in real usage, load from video file)
    num_frames, height, width = 3, 224, 224
    video_frames = torch.randn(num_frames, height, width, 3) * 255
    video_frames = video_frames.clamp(0, 255).byte()
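
    # In real usage the frames would come from disk rather than random noise.
    # A minimal sketch using OpenCV (assumes `opencv-python` is installed and
    # the file path is a placeholder):
    #
    #   import cv2
    #   cap = cv2.VideoCapture("path/to/your/video.mp4")
    #   frames = []
    #   while True:
    #       ok, frame = cap.read()
    #       if not ok:
    #           break
    #       frames.append(torch.from_numpy(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    #   cap.release()
    #   video_frames = torch.stack(frames)  # (num_frames, H, W, 3), uint8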

    # Example masks and bboxes (placeholder - in real usage, generated by segmentation)
    masks = {
        0: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
        1: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
        2: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
    }
    bboxes = {
        0: {1: [50, 50, 150, 150], 2: [100, 100, 200, 200]},
        1: {1: [52, 52, 152, 152], 2: [102, 102, 202, 202]},
        2: {1: [54, 54, 154, 154], 2: [104, 104, 204, 204]},
    }
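
    # Both dicts are keyed by frame index, then by object id: each mask is a
    # (height, width, 1) tensor, and each bbox is assumed here to be in
    # [x1, y1, x2, y2] pixel coordinates.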

    # Define keywords
    categorical_keywords = ["human", "dog", "frisbee"]
    unary_keywords = ["running", "jumping", "sitting", "standing"]
    binary_keywords = ["behind", "in front of", "next to", "throwing to", "catching from"]
    object_pairs = [(1, 2)]  # Object 1 relates to Object 2

    # Run prediction
    print("\nRunning prediction...")
    results = model.predict(
        video_frames=video_frames,
        masks=masks,
        bboxes=bboxes,
        categorical_keywords=categorical_keywords,
        unary_keywords=unary_keywords,
        binary_keywords=binary_keywords,
        object_pairs=object_pairs,
        return_top_k=3,
    )

    print("\nResults:")
    print(f"Categorical predictions: {len(results['categorical_predictions'])} objects")
    print(f"Unary predictions: {len(results['unary_predictions'])} actions")
    print(f"Binary predictions: {len(results['binary_predictions'])} relations")
    print(f"Confidence scores: {results['confidence_scores']}")


def example_pipeline_usage():
    """Example of using the VINE pipeline."""
    print("\n=== Pipeline Usage ===")

    # Register the pipeline
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    vine_config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",  # Your HF Hub model
        segmentation_method="grounding_dino_sam2",
        debug_visualizations=True,
    )

    vine_pipeline = VinePipeline(
        model=VineModel(vine_config),
        tokenizer=None,
        trust_remote_code=True,
        # SAM2 configuration
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device=0,
    )
    print("Pipeline created successfully!")

    # Example usage with video path
    video_path = "path/to/your/video.mp4"  # Replace with actual video path

    # For demonstration, we show the expected usage format
    print("\nExample pipeline call (replace with actual video path):")
    print("results = vine_pipeline(")
    print(f"    '{video_path}',")
    print("    categorical_keywords=['human', 'dog', 'frisbee'],")
    print("    unary_keywords=['running', 'jumping', 'sitting'],")
    print("    binary_keywords=['behind', 'in front of', 'next to'],")
    print("    object_pairs=[(1, 2)],")
    print("    segmentation_method='grounding_dino_sam2',")
    print("    return_top_k=3,")
    print("    return_flattened_segments=True,")
    print("    return_valid_pairs=True,")
    print("    include_visualizations=True,")
    print("    debug_visualizations=True")
    print(")")
    # Note: Actual execution would require a proper video file and segmentation models


def example_huggingface_hub_usage():
    """Example of how to push to and load from the HuggingFace Hub."""
    print("\n=== HuggingFace Hub Usage ===")

    # Example of preparing the model for the Hub
    config = VineConfig()
    model = VineModel(config)

    # Register for auto classes
    config.register_for_auto_class()
    model.register_for_auto_class("AutoModel")
    print("Model registered for auto classes")

    # Example push to Hub (commented out - requires actual model weights and credentials)
    # config.push_to_hub('your-username/vine-model')
    # model.push_to_hub('your-username/vine-model')

    # Example load from Hub (commented out - requires an actual model on the Hub)
    # model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)
    # pipe = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)

    print("To push to Hub:")
    print("1. config.push_to_hub('your-username/vine-model')")
    print("2. model.push_to_hub('your-username/vine-model')")
    print("\nTo load from Hub:")
    print("model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)")
    print("pipe = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)")
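
    # A minimal sketch of authenticating before pushing (assumes `huggingface_hub`
    # is installed and you have a token with write access):
    #
    #   from huggingface_hub import login
    #   login(token="hf_...")  # or run `huggingface-cli login` in a terminal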


def example_with_real_video():
    """Example showing how to use VINE with a real video file."""
    print("\n=== Real Video Usage Example ===")

    # Check if the demo video exists
    demo_video_path = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")

    if os.path.exists(demo_video_path):
        print(f"Found demo video: {demo_video_path}")

        # Create pipeline with segmentation model paths
        PIPELINE_REGISTRY.register_pipeline(
            "vine-video-understanding",
            pipeline_class=VinePipeline,
            pt_model=VineModel,
            type="multimodal",
        )

        vine_config = VineConfig(
            model_name="openai/clip-vit-base-patch32",
            use_hf_repo=True,
            model_repo="video-fm/vine_v0",  # Your HF Hub model
            segmentation_method="grounding_dino_sam2",
            debug_visualizations=True,
            debug_visualization_path=os.path.join(os.getcwd(), "real_video_debug_masks.png"),
        )

        vine_pipeline = VinePipeline(
            model=VineModel(vine_config),
            tokenizer=None,
            trust_remote_code=True,
            # SAM2 configuration
            sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
            sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
            gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
            gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        )

        # Define keywords based on the demo
        categorical_keywords = ['human', 'dog', 'frisbee']
        unary_keywords = ['running', 'jumping', 'catching', 'throwing']
        binary_keywords = ['behind', 'in front of', 'next to', 'chasing']
        object_pairs = [(0, 1), (0, 2), (1, 2)]  # human-dog, human-frisbee, dog-frisbee relationships

        print("\nProcessing video with VINE...")
        print("Keywords:")
        print(f"  Categorical: {categorical_keywords}")
        print(f"  Unary: {unary_keywords}")
        print(f"  Binary: {binary_keywords}")
        print(f"  Object pairs: {object_pairs}")

        # Note: This would require proper segmentation models to be set up
        try:
            results = vine_pipeline(
                demo_video_path,
                categorical_keywords=categorical_keywords,
                unary_keywords=unary_keywords,
                binary_keywords=binary_keywords,
                object_pairs=object_pairs,
                segmentation_method='grounding_dino_sam2',
                return_top_k=3,
                include_visualizations=False,
                debug_visualizations=True,
            )
            print("\nResults:")
            print(f"Summary: {results['summary']}")
        except Exception as e:
            print("Note: Full execution requires segmentation models to be properly set up.")
            print(f"Error: {e}")
    else:
        print(f"Demo video not found at: {demo_video_path}")
        print("To use with a real video, provide the path to your video file.")


if __name__ == "__main__":
    print("VINE HuggingFace Interface Examples")
    print("=" * 50)

    # Run examples
    try:
        example_direct_model_usage()
    except Exception as e:
        print(f"Direct model usage failed: {e}")

    try:
        example_pipeline_usage()
    except Exception as e:
        print(f"Pipeline usage failed: {e}")

    try:
        example_huggingface_hub_usage()
    except Exception as e:
        print(f"Hub usage example failed: {e}")

    try:
        example_with_real_video()
    except Exception as e:
        print(f"Real video example failed: {e}")

    print("\n" + "=" * 50)
    print("Examples completed!")
    print("\nNext steps:")
    print("1. Set up Grounding DINO and SAM2 models for segmentation")
    print("2. Load your pretrained VINE model weights")
    print("3. Test with your own videos")
    print("4. Push to HuggingFace Hub for sharing")