""" Example usage of VINE HuggingFace interface This script demonstrates how to use the VINE model through the HuggingFace interface for video understanding with categorical, unary, and binary keyword predictions. """ import os import sys from pathlib import Path import torch from transformers import pipeline, AutoModel from transformers.pipelines import PIPELINE_REGISTRY # Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable current_dir = Path(__file__).resolve().parent src_dir = current_dir.parent / "src" if src_dir.is_dir() and str(src_dir) not in sys.path: sys.path.insert(0, str(src_dir)) # Uncomment or set your own #os.environ['OPENAI_API_KEY'] = 'dummy-key' from vine_hf import VineConfig, VineModel, VinePipeline def example_direct_model_usage(): """Example of using the VINE model directly.""" print("=== Direct Model Usage ===") # Create configuration config = VineConfig( model_name="openai/clip-vit-base-patch32", segmentation_method="grounding_dino_sam2", use_hf_repo=True, model_repo="video-fm/vine_v0", # Your HF Hub model debug_visualizations=True, debug_visualization_path=os.path.join(os.getcwd(), "debug_masks.png"), target_fps=30, box_threshold=0.35, text_threshold=0.25 ) # Initialize model model = VineModel(config) print(f"Model initialized with CLIP backbone: {config.model_name}") print(f"Segmentation method: {config.segmentation_method}") print(f"Device: {model.device}") # Example video data (placeholder - in real usage, load from video file) num_frames, height, width = 3, 224, 224 video_frames = torch.randn(num_frames, height, width, 3) * 255 video_frames = video_frames.clamp(0, 255).byte() # Example masks and bboxes (placeholder - in real usage, generated by segmentation) masks = { 0: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)}, 1: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)}, 2: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)} } bboxes = { 0: {1: [50, 50, 150, 150], 2: [100, 100, 200, 200]}, 1: {1: [52, 52, 152, 152], 2: [102, 102, 202, 202]}, 2: {1: [54, 54, 154, 154], 2: [104, 104, 204, 204]} } # Define keywords categorical_keywords = ["human", "dog", "frisbee"] unary_keywords = ["running", "jumping", "sitting", "standing"] binary_keywords = ["behind", "in front of", "next to", "throwing to", "catching from"] object_pairs = [(1, 2)] # Object 1 relates to Object 2 # Run prediction print("\nRunning prediction...") results = model.predict( video_frames=video_frames, masks=masks, bboxes=bboxes, categorical_keywords=categorical_keywords, unary_keywords=unary_keywords, binary_keywords=binary_keywords, object_pairs=object_pairs, return_top_k=3 ) print("\nResults:") print(f"Categorical predictions: {len(results['categorical_predictions'])} objects") print(f"Unary predictions: {len(results['unary_predictions'])} actions") print(f"Binary predictions: {len(results['binary_predictions'])} relations") print(f"Confidence scores: {results['confidence_scores']}") def example_pipeline_usage(): """Example of using the VINE pipeline.""" print("\n=== Pipeline Usage ===") # Register the pipeline PIPELINE_REGISTRY.register_pipeline( "vine-video-understanding", pipeline_class=VinePipeline, pt_model=VineModel, type="multimodal", ) vine_config = VineConfig( model_name="openai/clip-vit-base-patch32", use_hf_repo=True, model_repo="video-fm/vine_v0", # Your HF Hub model segmentation_method="grounding_dino_sam2", debug_visualizations=True, ) vine_pipe = VinePipeline( model=VineModel(vine_config), tokenizer=None, 
def example_pipeline_usage():
    """Example of using the VINE pipeline."""
    print("\n=== Pipeline Usage ===")

    # Register the pipeline.
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    vine_config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",  # Your HF Hub model
        segmentation_method="grounding_dino_sam2",
        debug_visualizations=True,
    )

    vine_pipeline = VinePipeline(
        model=VineModel(vine_config),
        tokenizer=None,
        trust_remote_code=True,
        # SAM2 configuration
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        # Grounding DINO configuration
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device=0,
    )
    print("Pipeline created successfully!")

    # Example usage with a video path.
    video_path = "path/to/your/video.mp4"  # Replace with an actual video path

    # For demonstration, show the expected usage format.
    print("\nExample pipeline call (replace with an actual video path):")
    print("results = vine_pipeline(")
    print(f" '{video_path}',")
    print(" categorical_keywords=['human', 'dog', 'frisbee'],")
    print(" unary_keywords=['running', 'jumping', 'sitting'],")
    print(" binary_keywords=['behind', 'in front of', 'next to'],")
    print(" object_pairs=[(1, 2)],")
    print(" segmentation_method='grounding_dino_sam2',")
    print(" return_top_k=3,")
    print(" return_flattened_segments=True,")
    print(" return_valid_pairs=True,")
    print(" include_visualizations=True,")
    print(" debug_visualizations=True")
    print(")")

    # Note: actual execution requires a real video file and the segmentation
    # models to be set up.
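# Because the task is registered with PIPELINE_REGISTRY above, the same
# pipeline can also be built through the standard transformers factory
# function. A sketch, kept commented out since it needs the segmentation
# checkpoints configured just like the explicit VinePipeline(...) call:
#
# vine_pipeline = pipeline(
#     "vine-video-understanding",
#     model=VineModel(VineConfig()),
#     trust_remote_code=True,
# )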
def example_huggingface_hub_usage():
    """Example of how to push to and load from the HuggingFace Hub."""
    print("\n=== HuggingFace Hub Usage ===")

    # Example of preparing the model for the Hub.
    config = VineConfig()
    model = VineModel(config)

    # Register for auto classes.
    config.register_for_auto_class()
    model.register_for_auto_class("AutoModel")
    print("Model registered for auto classes")

    # Example push to hub (commented out; requires actual model weights and credentials):
    # config.push_to_hub('your-username/vine-model')
    # model.push_to_hub('your-username/vine-model')

    # Example load from hub (commented out; requires an actual model on the Hub):
    # model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)
    # pipe = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)

    print("To push to Hub:")
    print("1. config.push_to_hub('your-username/vine-model')")
    print("2. model.push_to_hub('your-username/vine-model')")

    print("\nTo load from Hub:")
    print("model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)")
    print("pipe = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)")


def example_with_real_video():
    """Example showing how to use VINE with a real video file."""
    print("\n=== Real Video Usage Example ===")

    # Check whether the demo video exists.
    demo_video_path = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")

    if os.path.exists(demo_video_path):
        print(f"Found demo video: {demo_video_path}")

        # Create the pipeline with segmentation model paths.
        PIPELINE_REGISTRY.register_pipeline(
            "vine-video-understanding",
            pipeline_class=VinePipeline,
            pt_model=VineModel,
            type="multimodal",
        )

        vine_config = VineConfig(
            model_name="openai/clip-vit-base-patch32",
            use_hf_repo=True,
            model_repo="video-fm/vine_v0",  # Your HF Hub model
            segmentation_method="grounding_dino_sam2",
            debug_visualizations=True,
            debug_visualization_path=os.path.join(os.getcwd(), "real_video_debug_masks.png"),
        )

        vine_pipeline = VinePipeline(
            model=VineModel(vine_config),
            tokenizer=None,
            trust_remote_code=True,
            # SAM2 configuration
            sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
            sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
            # Grounding DINO configuration
            gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
            gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        )

        # Define keywords based on the demo.
        categorical_keywords = ['human', 'dog', 'frisbee']
        unary_keywords = ['running', 'jumping', 'catching', 'throwing']
        binary_keywords = ['behind', 'in front of', 'next to', 'chasing']
        object_pairs = [(0, 1), (0, 2), (1, 2)]  # human-dog, human-frisbee, dog-frisbee

        print("\nProcessing video with VINE...")
        print("Keywords:")
        print(f"  Categorical: {categorical_keywords}")
        print(f"  Unary: {unary_keywords}")
        print(f"  Binary: {binary_keywords}")
        print(f"  Object pairs: {object_pairs}")

        # Note: this requires the segmentation models to be properly set up.
        try:
            results = vine_pipeline(
                demo_video_path,
                categorical_keywords=categorical_keywords,
                unary_keywords=unary_keywords,
                binary_keywords=binary_keywords,
                object_pairs=object_pairs,
                segmentation_method='grounding_dino_sam2',
                return_top_k=3,
                include_visualizations=False,
                debug_visualizations=True,
            )
            print("\nResults:")
            print(f"Summary: {results['summary']}")
        except Exception as e:
            print("Note: Full execution requires segmentation models to be properly set up.")
            print(f"Error: {e}")
    else:
        print(f"Demo video not found at: {demo_video_path}")
        print("To use with a real video, provide the path to your video file.")


if __name__ == "__main__":
    print("VINE HuggingFace Interface Examples")
    print("=" * 50)

    # Run the examples.
    try:
        example_direct_model_usage()
    except Exception as e:
        print(f"Direct model usage failed: {e}")

    try:
        example_pipeline_usage()
    except Exception as e:
        print(f"Pipeline usage failed: {e}")

    try:
        example_huggingface_hub_usage()
    except Exception as e:
        print(f"Hub usage example failed: {e}")

    try:
        example_with_real_video()
    except Exception as e:
        print(f"Real video example failed: {e}")

    print("\n" + "=" * 50)
    print("Examples completed!")
    print("\nNext steps:")
    print("1. Set up Grounding DINO and SAM2 models for segmentation")
    print("2. Load your pretrained VINE model weights")
    print("3. Test with your own videos")
    print("4. Push to HuggingFace Hub for sharing")