| """ | |
| Script to convert existing inference.py workflow to use VINE HuggingFace interface | |
| This script demonstrates how to migrate from the original inference.py approach | |
| to the new HuggingFace-compatible interface. | |
| """ | |
| import os | |
| import sys | |
| import torch | |
| import numpy as np | |
| from typing import Dict, List, Tuple, Any | |
| # Add paths for imports | |
| sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) | |
| from vine_hf import VineConfig, VineModel, VinePipeline | |
| from laser.loading import load_video | |
def load_pretrained_vine_model(model_dir: str, model_name: str, epoch: int = 0) -> VineModel:
    """
    Load a pretrained VINE model from the original format into HuggingFace format.

    Args:
        model_dir: Directory containing the model.
        model_name: Name of the model file (without the .{epoch}.model extension).
        epoch: Epoch number to load.

    Returns:
        VineModel instance with loaded weights.
    """
    print(f"Loading pretrained VINE model from {model_dir}")

    # Create the configuration (adjust parameters as needed).
    # We expect local ensemble weights in `model_dir`, so configure
    # VineConfig to load from a local directory/filename.
    model_file = f"{model_name}.{epoch}.model"
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        segmentation_method="grounding_dino_sam2",
        target_fps=1,
        box_threshold=0.35,
        text_threshold=0.25,
        use_hf_repo=False,
        local_dir=model_dir,
        local_filename=model_file,
    )

    # Initialize the model (VineModel consults the config when loading)
    vine_model = VineModel(config)

    # Load the original weights (model_file was computed above)
    model_path = os.path.join(model_dir, model_file)
    if os.path.exists(model_path):
        print(f"Loading weights from: {model_path}")
        try:
            # Allowlist the pickled class for PyTorch 2.6+ safe loading
            import torch.serialization
            from laser.models.llava_clip_model_v3 import PredicateModel
            torch.serialization.add_safe_globals([PredicateModel])

            # Load the original model
            original_model = torch.load(model_path, map_location='cpu', weights_only=False)

            # Transfer weights to the HuggingFace model. This assumes the
            # original model has the same submodule structure; you may need
            # to adjust this for your specific model.
            if hasattr(original_model, 'clip_cate_model'):
                vine_model.clip_cate_model.load_state_dict(original_model.clip_cate_model.state_dict())
            if hasattr(original_model, 'clip_unary_model'):
                vine_model.clip_unary_model.load_state_dict(original_model.clip_unary_model.state_dict())
            if hasattr(original_model, 'clip_binary_model'):
                vine_model.clip_binary_model.load_state_dict(original_model.clip_binary_model.state_dict())
            if hasattr(original_model, 'clip_tokenizer'):
                vine_model.clip_tokenizer = original_model.clip_tokenizer
            if hasattr(original_model, 'clip_processor'):
                vine_model.clip_processor = original_model.clip_processor

            print("✓ Weights transferred successfully")
        except Exception as e:
            print(f"✗ Error loading weights: {e}")
            print("You may need to adjust the weight-loading logic for your specific model")
    else:
        print(f"✗ Model file not found: {model_path}")

    return vine_model
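

# A minimal usage sketch for the loader above. The directory, model name, and
# epoch are illustrative placeholders, not real checkpoints; substitute your own.
#
#   model = load_pretrained_vine_model(
#       model_dir="/path/to/models/ensemble-02-10",   # hypothetical path
#       model_name="ensemble-2025-02-10-14-57-22",
#       epoch=0,
#   )
#   model.eval()  # standard torch.nn.Module call, assuming VineModel subclasses it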
def convert_inference_workflow():
    """
    Convert the original inference.py workflow to use the HuggingFace interface.

    This function demonstrates how to replicate the original inference workflow
    using the new HuggingFace-compatible components.
    """
    print("=== Converting Inference Workflow ===")

    # Original parameters from inference.py
    video_id = 'v1'
    target_fps = 1
    classes = ['human', 'dog', 'frisbee']
    unary_keywords = ['running', 'jumping', 'sitting', 'standing']
    binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left']

    # Paths (adjust these to match your setup)
    demo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../demo"))
    video_dir = os.path.join(demo_dir, "videos")
    video_path = os.path.join(video_dir, f"{video_id}.mp4")

    # Model paths (adjust these to match your setup)
    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
    model_name = "ensemble-2025-02-10-14-57-22"

    # Segmentation model paths (adjust these to your actual paths)
    sam_config_path = "/path/to/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml"
    sam_checkpoint_path = "/path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt"
    gd_config_path = "/path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py"
    gd_checkpoint_path = "/path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth"

    print(f"Video path: {video_path}")
    print(f"Model dir: {model_dir}")
    print(f"SAM2 config: {sam_config_path}")
    print(f"GroundingDINO config: {gd_config_path}")

    # Check that the video exists
    if not os.path.exists(video_path):
        print(f"✗ Video not found: {video_path}")
        print("Please adjust the video path or use your own video file")
        return

    # 1. Load the video (same as the original workflow)
    print(f"Loading video: {video_id}")
    video_tensor = load_video(video_path, target_fps=target_fps)
    print(f"Video shape: {video_tensor.shape}")

    # 2. Load the VINE model through the HuggingFace interface
    print("Loading VINE model...")
    if os.path.exists(model_dir):
        vine_model = load_pretrained_vine_model(model_dir, model_name, epoch=0)
    else:
        print(f"Model directory not found: {model_dir}")
        print("Creating new model with random weights for demonstration")
        config = VineConfig()
        vine_model = VineModel(config)

    # 3. Create a pipeline for easier use
    print("Creating VINE pipeline...")
    from transformers.pipelines import PIPELINE_REGISTRY

    # Register the pipeline if it is not already registered
    try:
        PIPELINE_REGISTRY.register_pipeline(
            "vine-video-understanding",
            pipeline_class=VinePipeline,
            pt_model=VineModel,
            type="multimodal",
        )
    except Exception:
        pass  # Already registered

    # Create the pipeline instance with segmentation model paths
    vine_pipeline = VinePipeline(
        model=vine_model,
        tokenizer=None,
        # SAM2 configuration
        sam_config_path=sam_config_path,
        sam_checkpoint_path=sam_checkpoint_path,
        # GroundingDINO configuration
        gd_config_path=gd_config_path,
        gd_checkpoint_path=gd_checkpoint_path,
    )
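
    # Hedged aside: if VineModel subclasses transformers.PreTrainedModel (the
    # usual pattern for HF-compatible models, though not confirmed here), the
    # converted weights can be persisted and reloaded with the standard calls:
    #
    #   vine_model.save_pretrained("./vine-converted")   # writes config + weights
    #   vine_model = VineModel.from_pretrained("./vine-converted")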
    # 4. Process the video with the new interface
    print("Processing video with VINE HuggingFace interface...")
    try:
        # Use the pipeline to process the video
        results = vine_pipeline(
            video_path,
            categorical_keywords=classes,
            unary_keywords=unary_keywords,
            binary_keywords=binary_keywords,
            object_pairs=[(1, 2), (2, 3)],  # Example object pairs
            segmentation_method='grounding_dino_sam2',
            target_fps=target_fps,
            return_top_k=3,
            include_visualizations=False,
        )

        # 5. Display the results (similar to the original format)
        print("\n=== VINE Results (HuggingFace Interface) ===")

        # Categorical predictions
        print("\nCategorical Predictions:")
        for obj_id, predictions in results['categorical_predictions'].items():
            print(f"  Object {obj_id}:")
            for prob, category in predictions:
                print(f"    {prob:.3f}: {category}")

        # Unary predictions
        print("\nUnary Predictions:")
        for (frame_id, obj_id), predictions in results['unary_predictions'].items():
            print(f"  Frame {frame_id}, Object {obj_id}:")
            for prob, action in predictions:
                print(f"    {prob:.3f}: {action}")

        # Binary predictions
        print("\nBinary Predictions:")
        for (frame_id, obj_pair), predictions in results['binary_predictions'].items():
            print(f"  Frame {frame_id}, Objects {obj_pair}:")
            for prob, relation in predictions:
                print(f"    {prob:.3f}: {relation}")

        # Summary
        print("\nSummary:")
        print(f"  Objects detected: {results['summary']['num_objects_detected']}")
        print(f"  Top categories: {results['summary']['top_categories']}")
        print(f"  Top actions: {results['summary']['top_actions']}")
        print(f"  Top relations: {results['summary']['top_relations']}")

        print("\n✓ Successfully processed video with the VINE HuggingFace interface!")

    except Exception as e:
        print(f"✗ Error processing video: {e}")
        print("This may be due to missing segmentation models or other dependencies")
        print("The interface is set up correctly, but full functionality requires:")
        print("  1. A working Grounding DINO and SAM2 installation")
        print("  2. Correct model weights")
        print("  3. Proper configuration paths")
def compare_interfaces():
    """
    Compare the original inference.py approach with the new HuggingFace interface.
    """
    print("\n=== Interface Comparison ===")

    print("\nOriginal inference.py approach:")
    print("✓ Direct access to model internals")
    print("✓ Full control over the segmentation pipeline")
    print("✗ Complex setup and configuration")
    print("✗ Not compatible with the HuggingFace ecosystem")
    print("✗ Requires manual handling of all components")

    print("\nNew HuggingFace interface:")
    print("✓ Easy-to-use pipeline interface")
    print("✓ Compatible with the HuggingFace Hub")
    print("✓ Standardized configuration")
    print("✓ Automatic handling of preprocessing/postprocessing")
    print("✓ Easy sharing and distribution")
    print("✓ Configurable segmentation model paths")
    print("✗ Slightly less direct control (you can still access the model directly)")

    print("\nMigration benefits:")
    print("• Share your model easily on the HuggingFace Hub")
    print("• Users can load your model with a single line")
    print("• Standardized interface for video understanding")
    print("• Better integration with other HuggingFace tools")
    print("• Simplified deployment and inference")
    print("• Flexible segmentation model configuration")
if __name__ == "__main__":
    print("VINE HuggingFace Interface Conversion")
    print("=" * 50)

    # Run the conversion demonstration
    convert_inference_workflow()

    # Show the comparison
    compare_interfaces()

    print("\n" + "=" * 50)
    print("Next steps:")
    print("1. Install the SAM2 and GroundingDINO dependencies")
    print("2. Download the required model checkpoints")
    print("3. Update the paths in this script to point to your models")
    print("4. Test the interface with your specific model weights")
    print("5. Adjust configuration parameters as needed")
    print("6. Push your model to the HuggingFace Hub using push_to_hub.py")
    print("7. Share with the community!")