| """ | |
| Example usage of VINE HuggingFace interface | |
| This script demonstrates how to use the VINE model through the HuggingFace interface | |
| for video understanding with categorical, unary, and binary keyword predictions. | |
| """ | |
| import os | |
| import sys | |
| import torch | |
| from transformers import pipeline, AutoModel | |
| from transformers.pipelines import PIPELINE_REGISTRY | |
| # Add the parent directory to the path to import vine_hf | |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| # Uncomment or set your own | |
| #os.environ['OPENAI_API_KEY'] = 'dummy-key' | |
| from vine_hf import VineConfig, VineModel, VinePipeline | |
| def example_direct_model_usage(): | |
| """Example of using the VINE model directly.""" | |
| print("=== Direct Model Usage ===") | |
| # Create configuration | |
| config = VineConfig( | |
| model_name="openai/clip-vit-base-patch32", | |
| segmentation_method="grounding_dino_sam2", | |
| use_hf_repo=True, | |
| model_repo="video-fm/vine_v0", # Your HF Hub model | |
| debug_visualizations=True, | |
| debug_visualization_path=os.path.join(os.getcwd(), "debug_masks.png"), | |
| target_fps=30, | |
| box_threshold=0.35, | |
| text_threshold=0.25 | |
| ) | |
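
    # Note (assumption): box_threshold and text_threshold correspond to Grounding
    # DINO's standard box/text confidence thresholds, and target_fps is assumed to
    # control how densely frames are sampled from the input video.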

    # Initialize model
    model = VineModel(config)
    print(f"Model initialized with CLIP backbone: {config.model_name}")
    print(f"Segmentation method: {config.segmentation_method}")
    print(f"Device: {model.device}")

    # Example video data (placeholder - in real usage, load from video file)
    num_frames, height, width = 3, 224, 224
    video_frames = torch.randn(num_frames, height, width, 3) * 255
    video_frames = video_frames.clamp(0, 255).byte()
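
    # In real usage the frames would come from disk rather than random noise.
    # A minimal sketch using OpenCV (assumes `opencv-python` is installed and
    # the file path is a placeholder):
    #
    #   import cv2
    #   cap = cv2.VideoCapture("path/to/your/video.mp4")
    #   frames = []
    #   while True:
    #       ok, frame = cap.read()
    #       if not ok:
    #           break
    #       frames.append(torch.from_numpy(cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)))
    #   cap.release()
    #   video_frames = torch.stack(frames)  # (num_frames, H, W, 3), uint8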

    # Example masks and bboxes (placeholder - in real usage, generated by segmentation)
    masks = {
        0: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
        1: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
        2: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
    }
    bboxes = {
        0: {1: [50, 50, 150, 150], 2: [100, 100, 200, 200]},
        1: {1: [52, 52, 152, 152], 2: [102, 102, 202, 202]},
        2: {1: [54, 54, 154, 154], 2: [104, 104, 204, 204]},
    }
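
    # Both dicts are keyed by frame index, then by object id: each mask is a
    # (height, width, 1) tensor, and each bbox is assumed here to be in
    # [x1, y1, x2, y2] pixel coordinates.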

    # Define keywords
    categorical_keywords = ["human", "dog", "frisbee"]
    unary_keywords = ["running", "jumping", "sitting", "standing"]
    binary_keywords = ["behind", "in front of", "next to", "throwing to", "catching from"]
    object_pairs = [(1, 2)]  # Object 1 relates to Object 2

    # Run prediction
    print("\nRunning prediction...")
    results = model.predict(
        video_frames=video_frames,
        masks=masks,
        bboxes=bboxes,
        categorical_keywords=categorical_keywords,
        unary_keywords=unary_keywords,
        binary_keywords=binary_keywords,
        object_pairs=object_pairs,
        return_top_k=3,
    )

    print("\nResults:")
    print(f"Categorical predictions: {len(results['categorical_predictions'])} objects")
    print(f"Unary predictions: {len(results['unary_predictions'])} actions")
    print(f"Binary predictions: {len(results['binary_predictions'])} relations")
    print(f"Confidence scores: {results['confidence_scores']}")


def example_pipeline_usage():
    """Example of using the VINE pipeline."""
    print("\n=== Pipeline Usage ===")

    # Register the pipeline
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    vine_config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",  # Your HF Hub model
        segmentation_method="grounding_dino_sam2",
        debug_visualizations=True,
    )

    vine_pipeline = VinePipeline(
        model=VineModel(vine_config),
        tokenizer=None,
        trust_remote_code=True,
        # SAM2 configuration
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device=0,
    )
    print("Pipeline created successfully!")

    # Example usage with video path
    video_path = "path/to/your/video.mp4"  # Replace with actual video path

    # For demonstration, we show the expected usage format
    print("\nExample pipeline call (replace with actual video path):")
    print("results = vine_pipeline(")
    print(f"    '{video_path}',")
    print("    categorical_keywords=['human', 'dog', 'frisbee'],")
    print("    unary_keywords=['running', 'jumping', 'sitting'],")
    print("    binary_keywords=['behind', 'in front of', 'next to'],")
    print("    object_pairs=[(1, 2)],")
    print("    segmentation_method='grounding_dino_sam2',")
    print("    return_top_k=3,")
    print("    return_flattened_segments=True,")
    print("    return_valid_pairs=True,")
    print("    include_visualizations=True,")
    print("    debug_visualizations=True")
    print(")")
    # Note: Actual execution would require a proper video file and segmentation models


def example_huggingface_hub_usage():
    """Example of how to push to and load from the HuggingFace Hub."""
    print("\n=== HuggingFace Hub Usage ===")

    # Example of preparing the model for the Hub
    config = VineConfig()
    model = VineModel(config)

    # Register for auto classes
    config.register_for_auto_class()
    model.register_for_auto_class("AutoModel")
    print("Model registered for auto classes")

    # Example push to Hub (commented out - requires actual model weights and credentials)
    # config.push_to_hub('your-username/vine-model')
    # model.push_to_hub('your-username/vine-model')

    # Example load from Hub (commented out - requires an actual model on the Hub)
    # model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)
    # pipe = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)

    print("To push to Hub:")
    print("1. config.push_to_hub('your-username/vine-model')")
    print("2. model.push_to_hub('your-username/vine-model')")
    print("\nTo load from Hub:")
    print("model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)")
    print("pipe = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)")
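
    # A minimal sketch of authenticating before pushing (assumes `huggingface_hub`
    # is installed and you have a token with write access):
    #
    #   from huggingface_hub import login
    #   login(token="hf_...")  # or run `huggingface-cli login` in a terminal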


def example_with_real_video():
    """Example showing how to use VINE with a real video file."""
    print("\n=== Real Video Usage Example ===")

    # Check if the demo video exists
    demo_video_path = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")

    if os.path.exists(demo_video_path):
        print(f"Found demo video: {demo_video_path}")

        # Create pipeline with segmentation model paths
        PIPELINE_REGISTRY.register_pipeline(
            "vine-video-understanding",
            pipeline_class=VinePipeline,
            pt_model=VineModel,
            type="multimodal",
        )

        vine_config = VineConfig(
            model_name="openai/clip-vit-base-patch32",
            use_hf_repo=True,
            model_repo="video-fm/vine_v0",  # Your HF Hub model
            segmentation_method="grounding_dino_sam2",
            debug_visualizations=True,
            debug_visualization_path=os.path.join(os.getcwd(), "real_video_debug_masks.png"),
        )

        vine_pipeline = VinePipeline(
            model=VineModel(vine_config),
            tokenizer=None,
            trust_remote_code=True,
            # SAM2 configuration
            sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
            sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
            gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
            gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        )

        # Define keywords based on the demo
        categorical_keywords = ['human', 'dog', 'frisbee']
        unary_keywords = ['running', 'jumping', 'catching', 'throwing']
        binary_keywords = ['behind', 'in front of', 'next to', 'chasing']
        object_pairs = [(0, 1), (0, 2), (1, 2)]  # human-dog, human-frisbee, dog-frisbee relationships

        print("\nProcessing video with VINE...")
        print("Keywords:")
        print(f"  Categorical: {categorical_keywords}")
        print(f"  Unary: {unary_keywords}")
        print(f"  Binary: {binary_keywords}")
        print(f"  Object pairs: {object_pairs}")

        # Note: This would require proper segmentation models to be set up
        try:
            results = vine_pipeline(
                demo_video_path,
                categorical_keywords=categorical_keywords,
                unary_keywords=unary_keywords,
                binary_keywords=binary_keywords,
                object_pairs=object_pairs,
                segmentation_method='grounding_dino_sam2',
                return_top_k=3,
                include_visualizations=False,
                debug_visualizations=True,
            )
            print("\nResults:")
            print(f"Summary: {results['summary']}")
        except Exception as e:
            print("Note: Full execution requires segmentation models to be properly set up.")
            print(f"Error: {e}")
    else:
        print(f"Demo video not found at: {demo_video_path}")
        print("To use with a real video, provide the path to your video file.")


if __name__ == "__main__":
    print("VINE HuggingFace Interface Examples")
    print("=" * 50)

    # Run examples
    try:
        example_direct_model_usage()
    except Exception as e:
        print(f"Direct model usage failed: {e}")

    try:
        example_pipeline_usage()
    except Exception as e:
        print(f"Pipeline usage failed: {e}")

    try:
        example_huggingface_hub_usage()
    except Exception as e:
        print(f"Hub usage example failed: {e}")

    try:
        example_with_real_video()
    except Exception as e:
        print(f"Real video example failed: {e}")

    print("\n" + "=" * 50)
    print("Examples completed!")
    print("\nNext steps:")
    print("1. Set up Grounding DINO and SAM2 models for segmentation")
    print("2. Load your pretrained VINE model weights")
    print("3. Test with your own videos")
    print("4. Push to HuggingFace Hub for sharing")