""" Script to convert existing inference.py workflow to use VINE HuggingFace interface This script demonstrates how to migrate from the original inference.py approach to the new HuggingFace-compatible interface. """ import os import sys from pathlib import Path import torch import numpy as np from typing import Dict, List, Tuple, Any # Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable current_dir = Path(__file__).resolve().parent src_dir = current_dir.parent / "src" if src_dir.is_dir() and str(src_dir) not in sys.path: sys.path.insert(0, str(src_dir)) from vine_hf import VineConfig, VineModel, VinePipeline from laser.loading import load_video def load_pretrained_vine_model(model_dir: str, model_name: str, epoch: int = 0) -> VineModel: """ Load a pretrained VINE model from the original format into HuggingFace format. Args: model_dir: Directory containing the model model_name: Name of the model file (without .{epoch}.model extension) epoch: Epoch number to load Returns: VineModel instance with loaded weights """ print(f"Loading pretrained VINE model from {model_dir}") # Create configuration (adjust parameters as needed) # We expect local ensemble weights in `model_dir`, so configure # VineConfig to load from local directory/filename. model_file = f"{model_name}.{epoch}.model" config = VineConfig( model_name="openai/clip-vit-base-patch32", segmentation_method="grounding_dino_sam2", target_fps=1, box_threshold=0.35, text_threshold=0.25, use_hf_repo=False, local_dir=model_dir, local_filename=model_file, ) # Initialize model (VineModel will consult the config when loading) vine_model = VineModel(config) # Load original weights model_file = f"{model_name}.{epoch}.model" model_path = os.path.join(model_dir, model_file) if os.path.exists(model_path): print(f"Loading weights from: {model_path}") try: # Add safe globals for PyTorch 2.6+ import torch.serialization from laser.models.llava_clip_model_v3 import PredicateModel torch.serialization.add_safe_globals([PredicateModel]) # Load the original model original_model = torch.load(model_path, map_location='cpu', weights_only=False) # Transfer weights to HuggingFace model # This assumes the original model has the same structure # You may need to adjust this based on your specific model structure if hasattr(original_model, 'clip_cate_model'): vine_model.clip_cate_model.load_state_dict(original_model.clip_cate_model.state_dict()) if hasattr(original_model, 'clip_unary_model'): vine_model.clip_unary_model.load_state_dict(original_model.clip_unary_model.state_dict()) if hasattr(original_model, 'clip_binary_model'): vine_model.clip_binary_model.load_state_dict(original_model.clip_binary_model.state_dict()) if hasattr(original_model, 'clip_tokenizer'): vine_model.clip_tokenizer = original_model.clip_tokenizer if hasattr(original_model, 'clip_processor'): vine_model.clip_processor = original_model.clip_processor print("✓ Weights transferred successfully") except Exception as e: print(f"✗ Error loading weights: {e}") print("You may need to adjust the weight loading logic for your specific model") else: print(f"✗ Model file not found: {model_path}") return vine_model def convert_inference_workflow(): """ Convert the original inference.py workflow to use HuggingFace interface. This function demonstrates how to replicate the original inference workflow using the new HuggingFace-compatible components. 
""" print("=== Converting Inference Workflow ===") # Original parameters from inference.py video_id = 'v1' target_fps = 1 classes = ['human', 'dog', 'frisbee'] unary_keywords = ['running', 'jumping', 'sitting', 'standing'] binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left'] # Paths (adjust these to match your setup) demo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../demo")) video_dir = os.path.join(demo_dir, "videos") video_path = os.path.join(video_dir, f"{video_id}.mp4") # Model paths (adjust these to match your setup) data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data")) model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10") model_name = "ensemble-2025-02-10-14-57-22" # Segmentation model paths (adjust these to your actual paths) sam_config_path = "/path/to/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml" sam_checkpoint_path = "/path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt" gd_config_path = "/path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py" gd_checkpoint_path = "/path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth" print(f"Video path: {video_path}") print(f"Model dir: {model_dir}") print(f"SAM2 config: {sam_config_path}") print(f"GroundingDINO config: {gd_config_path}") # Check if video exists if not os.path.exists(video_path): print(f"✗ Video not found: {video_path}") print("Please adjust the video path or use your own video file") return # 1. Load video (same as original) print(f"Loading video: {video_id}") video_tensor = load_video(video_path, target_fps=target_fps) print(f"Video shape: {video_tensor.shape}") # 2. Load VINE model with HuggingFace interface print("Loading VINE model...") if os.path.exists(model_dir): vine_model = load_pretrained_vine_model(model_dir, model_name, epoch=0) else: print(f"Model directory not found: {model_dir}") print("Creating new model with random weights for demonstration") config = VineConfig() vine_model = VineModel(config) # 3. Create pipeline for easier use print("Creating VINE pipeline...") from transformers.pipelines import PIPELINE_REGISTRY # Register pipeline if not already registered try: PIPELINE_REGISTRY.register_pipeline( "vine-video-understanding", pipeline_class=VinePipeline, pt_model=VineModel, type="multimodal", ) except Exception: pass # Already registered # Create pipeline instance with segmentation model paths vine_pipeline = VinePipeline( model=vine_model, tokenizer=None, # SAM2 configuration sam_config_path=sam_config_path, sam_checkpoint_path=sam_checkpoint_path, # GroundingDINO configuration gd_config_path=gd_config_path, gd_checkpoint_path=gd_checkpoint_path ) # 4. Process video with new interface print("Processing video with VINE HuggingFace interface...") try: # Use the pipeline to process the video results = vine_pipeline( video_path, categorical_keywords=classes, unary_keywords=unary_keywords, binary_keywords=binary_keywords, object_pairs=[(1, 2), (2, 3)], # Example object pairs segmentation_method='grounding_dino_sam2', target_fps=target_fps, return_top_k=3, include_visualizations=False ) # 5. 

        # 5. Display results (similar to original format)
        print("\n=== VINE Results (HuggingFace Interface) ===")

        # Categorical predictions
        print("\nCategorical Predictions:")
        for obj_id, predictions in results['categorical_predictions'].items():
            print(f"  Object {obj_id}:")
            for prob, category in predictions:
                print(f"    {prob:.3f}: {category}")

        # Unary predictions
        print("\nUnary Predictions:")
        for (frame_id, obj_id), predictions in results['unary_predictions'].items():
            print(f"  Frame {frame_id}, Object {obj_id}:")
            for prob, action in predictions:
                print(f"    {prob:.3f}: {action}")

        # Binary predictions
        print("\nBinary Predictions:")
        for (frame_id, obj_pair), predictions in results['binary_predictions'].items():
            print(f"  Frame {frame_id}, Objects {obj_pair}:")
            for prob, relation in predictions:
                print(f"    {prob:.3f}: {relation}")

        # Summary
        print("\nSummary:")
        print(f"  Objects detected: {results['summary']['num_objects_detected']}")
        print(f"  Top categories: {results['summary']['top_categories']}")
        print(f"  Top actions: {results['summary']['top_actions']}")
        print(f"  Top relations: {results['summary']['top_relations']}")

        print("\n✓ Successfully processed video with VINE HuggingFace interface!")

    except Exception as e:
        print(f"✗ Error processing video: {e}")
        print("This may be due to missing segmentation models or other dependencies")
        print("The interface is set up correctly, but full functionality requires:")
        print("  1. Properly installed Grounding DINO and SAM2")
        print("  2. Correct model weights")
        print("  3. Proper configuration paths")


def compare_interfaces():
    """
    Compare the original inference.py approach with the new HuggingFace interface.
    """
    print("\n=== Interface Comparison ===")

    print("\nOriginal inference.py approach:")
    print("✓ Direct access to model internals")
    print("✓ Full control over segmentation pipeline")
    print("✗ Complex setup and configuration")
    print("✗ Not compatible with HuggingFace ecosystem")
    print("✗ Requires manual handling of all components")

    print("\nNew HuggingFace interface:")
    print("✓ Easy to use pipeline interface")
    print("✓ Compatible with HuggingFace Hub")
    print("✓ Standardized configuration")
    print("✓ Automatic handling of preprocessing/postprocessing")
    print("✓ Easy sharing and distribution")
    print("✓ Configurable segmentation model paths")
    print("✗ Slightly less direct control (can still access model directly)")

    print("\nMigration benefits:")
    print("• Share your model easily on HuggingFace Hub")
    print("• Users can load your model with a single line")
    print("• Standardized interface for video understanding")
    print("• Better integration with other HuggingFace tools")
    print("• Simplified deployment and inference")
    print("• Flexible segmentation model configuration")


if __name__ == "__main__":
    print("VINE HuggingFace Interface Conversion")
    print("=" * 50)

    # Run conversion demonstration
    convert_inference_workflow()

    # Show comparison
    compare_interfaces()

    print("\n" + "=" * 50)
    print("Next steps:")
    print("1. Install SAM2 and GroundingDINO dependencies")
    print("2. Download the required model checkpoints")
    print("3. Update the paths in this script to point to your models")
    print("4. Test the interface with your specific model weights")
    print("5. Adjust configuration parameters as needed")
    print("6. Push your model to HuggingFace Hub using push_to_hub.py")
    print("7. Share with the community!")
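
# Once the model has been pushed to the Hub (step 6 above), it can be loaded
# with a single line, for example (hypothetical repo id, assuming the standard
# from_pretrained interface mentioned in compare_interfaces):
#
#   model = VineModel.from_pretrained("your-username/vine-model")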