""" Example demonstrating how to load and use VINE ensemble weights This script shows the correct way to load your pretrained VINE ensemble weights and use them with the HuggingFace interface, based on the actual inference.py workflow. """ import os import sys from pathlib import Path import torch import numpy as np from transformers.pipelines import PIPELINE_REGISTRY #os.environ["OPENAI_API_KEY"]="dummy-key" # Set your OpenAI API key here or via environment variable # Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable current_dir = Path(__file__).resolve().parent src_dir = current_dir.parent / "src" if src_dir.is_dir() and str(src_dir) not in sys.path: sys.path.insert(0, str(src_dir)) from vine_hf import VineConfig, VineModel, VinePipeline from laser.loading import load_video def example_load_ensemble_weights(): """Example of loading ensemble weights correctly.""" print("=== Loading Ensemble VINE Weights ===") # Path to your ensemble model (adjust this to your actual path) data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data")) model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10") print(f"Looking for ensemble weights in: {model_dir}") if os.path.exists(model_dir): print("✓ Model directory found") # List available model files model_files = [f for f in os.listdir(model_dir) if f.endswith('.model')] print(f"Available model files: {model_files}") if model_files: # Create configuration with ensemble path (local directory with .model files) config = VineConfig( segmentation_method="grounding_dino_sam2", use_hf_repo=False, local_dir=model_dir, local_filename=None, ) print("Creating VINE model with ensemble weights...") vine_model = VineModel(config) print("✓ VINE model created with ensemble weights!") return vine_model else: print("✗ No .model files found in directory") return None else: print(f"✗ Model directory not found: {model_dir}") print("Please adjust the path to point to your ensemble weights") return None def example_direct_ensemble_loading(): """Example of loading ensemble weights using from_pretrained_vine.""" print("\n=== Direct Ensemble Loading ===") # Path to specific ensemble file data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data")) model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10") if os.path.exists(model_dir): try: # Use the class method for direct loading vine_model = VineModel.from_pretrained_vine( model_path=model_dir, epoch=0 # Load epoch 0 ) print("✓ Model loaded using from_pretrained_vine!") return vine_model except Exception as e: print(f"✗ Error loading with from_pretrained_vine: {e}") return None else: print(f"✗ Model directory not found: {model_dir}") return None def example_compare_original_vs_hf(): """Compare the original inference.py approach with HuggingFace interface.""" print("\n=== Comparing Original vs HuggingFace Interface ===") data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data")) model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10") model_name = "ensemble-2025-02-10-14-57-22" epoch = 0 if not os.path.exists(model_dir): print(f"Model directory not found: {model_dir}") return print("Original approach (from inference.py):") print("```python") print("def load_model(model_dir, model_name, epoch, device):") print(" model_name = model_name + f'.{epoch}.model'") print(" predicate_model = torch.load(os.path.join(model_dir, model_name), map_location=device, weights_only=False)") print(" return predicate_model") print("") print("predicate_model = load_model(model_dir, model_name, epoch, device)") print("```") print("\nNew HuggingFace approach:") print("```python") print("config = VineConfig(pretrained_vine_path=model_dir)") print("vine_model = VineModel(config)") print("# or") print("vine_model = VineModel.from_pretrained_vine(model_dir, epoch=0)") print("```") # Try to load with both approaches if possible try: # Original approach def load_model(model_dir, model_name, epoch, device): model_name = model_name + f'.{epoch}.model' model_path = os.path.join(model_dir, model_name) if os.path.exists(model_path): return torch.load(model_path, map_location=device, weights_only=False) else: print(f"Model file not found: {model_path}") return None device = "cuda" if torch.cuda.is_available() else "cpu" original_model = load_model(model_dir, model_name, epoch, device) if original_model: print(f"✓ Original model loaded: {type(original_model)}") print(f" Has clip_cate_model: {hasattr(original_model, 'clip_cate_model')}") print(f" Has clip_unary_model: {hasattr(original_model, 'clip_unary_model')}") print(f" Has clip_binary_model: {hasattr(original_model, 'clip_binary_model')}") # HuggingFace approach vine_model = VineModel.from_pretrained_vine(model_dir, epoch=epoch) if vine_model: print(f"✓ HuggingFace model loaded: {type(vine_model)}") print(f" Has clip_cate_model: {hasattr(vine_model, 'clip_cate_model')}") print(f" Has clip_unary_model: {hasattr(vine_model, 'clip_unary_model')}") print(f" Has clip_binary_model: {hasattr(vine_model, 'clip_binary_model')}") print("\n✓ Both approaches work! HuggingFace interface successfully loads ensemble weights.") except Exception as e: print(f"Error in comparison: {e}") def example_ensemble_with_pipeline(): """Example using ensemble weights with the pipeline.""" print("\n=== Using Ensemble Weights with Pipeline ===") data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data")) model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10") if not os.path.exists(model_dir): print(f"Model directory not found: {model_dir}") return # Register pipeline PIPELINE_REGISTRY.register_pipeline( "vine-video-understanding", pipeline_class=VinePipeline, pt_model=VineModel, type="multimodal", ) # Create model with ensemble weights (local directory) config = VineConfig( segmentation_method="grounding_dino_sam2", use_hf_repo=False, local_dir=model_dir, local_filename=None, ) vine_model = VineModel(config) # Create pipeline with segmentation model paths vine_pipeline = VinePipeline( model=vine_model, tokenizer=None, # SAM2 configuration sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml", sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt", # GroundingDINO configuration gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py", gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth", device="cuda" if torch.cuda.is_available() else "cpu", ) print("✓ Pipeline created with ensemble VINE weights") # Check for demo video demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4") if os.path.exists(demo_video): print(f"Found demo video: {demo_video}") # Use the same keywords as in the original inference.py categorical_keywords = ['human', 'dog', 'frisbee'] unary_keywords = ['running', 'jumping', 'catching', 'throwing'] binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left'] print("Example pipeline usage:") print("```python") print("results = vine_pipeline(") print(f" '{demo_video}',") print(f" categorical_keywords={categorical_keywords},") print(f" unary_keywords={unary_keywords},") print(f" binary_keywords={binary_keywords},") print(" segmentation_method='grounding_dino_sam2'") print(")") print("```") # Uncomment to actually run (requires segmentation models) # try: # results = vine_pipeline( # demo_video, # categorical_keywords=categorical_keywords, # unary_keywords=unary_keywords, # binary_keywords=binary_keywords, # segmentation_method='grounding_dino_sam2' # ) # print("Results:", results['summary']) # except Exception as e: # print(f"Pipeline execution failed: {e}") # print("This is expected if segmentation models are not set up") return vine_pipeline def demonstrate_weight_transfer(): """Demonstrate how weights are transferred from ensemble to HuggingFace format.""" print("\n=== Weight Transfer Demonstration ===") print("The ensemble model structure (PredicateModel):") print("- clip_cate_model: CLIP model for categorical classification") print("- clip_unary_model: CLIP model for unary predicates") print("- clip_binary_model: CLIP model for binary relations") print("- clip_tokenizer: Tokenizer for text processing") print("- clip_processor: Processor for image processing") print("\nWeight transfer process:") print("1. Load ensemble model with torch.load()") print("2. Initialize base CLIP models in HuggingFace format") print("3. Transfer state_dict from ensemble to HuggingFace models:") print(" - ensemble.clip_cate_model → hf.clip_cate_model") print(" - ensemble.clip_unary_model → hf.clip_unary_model") print(" - ensemble.clip_binary_model → hf.clip_binary_model") print("4. Transfer tokenizer and processor") print("\nThis preserves all your fine-tuned weights while making them HuggingFace compatible!") def troubleshooting_guide(): """Provide troubleshooting guide for common issues.""" print("\n=== Troubleshooting Guide ===") print("Common Issues:") print("1. 'No model file found for epoch X'") print(" → Check that .model files exist in the directory") print(" → Verify the epoch number is correct") print(" → List files: ls /path/to/model/dir/*.model") print("\n2. 'Error loading VINE weights'") print(" → Check file permissions") print(" → Verify the model file is not corrupted") print(" → Try loading with torch.load() directly first") print("\n3. 'CLIP model mismatch'") print(" → Ensure config.model_name matches the base model used in training") print("\n4. 'Device mismatch errors'") print(" → Models are loaded to CPU first, then moved to device") print(" → Check CUDA availability with torch.cuda.is_available()") print("\nDebugging steps:") print("1. Test loading ensemble model directly:") print(" model = torch.load('path/to/model.0.model', map_location='cpu')") print("2. Check model attributes:") print(" print(dir(model))") print("3. Verify state_dict keys:") print(" print(model.clip_cate_model.state_dict().keys())") if __name__ == "__main__": print("VINE Ensemble Weights Loading Examples") print("=" * 50) # Test ensemble weight loading try: model1 = example_load_ensemble_weights() except Exception as e: print(f"Ensemble loading example failed: {e}") try: model2 = example_direct_ensemble_loading() except Exception as e: print(f"Direct loading example failed: {e}") # Compare approaches try: example_compare_original_vs_hf() except Exception as e: print(f"Comparison example failed: {e}") # Test pipeline with ensemble weights try: pipeline = example_ensemble_with_pipeline() except Exception as e: print(f"Pipeline example failed: {e}") # Educational content demonstrate_weight_transfer() troubleshooting_guide() print("\n" + "=" * 50) print("Key Points:") print("1. AutoModel.from_pretrained() won't work with .pt ensemble weights") print("2. Use torch.load() to load the ensemble, then transfer weights") print("3. The HuggingFace interface preserves your fine-tuned weights") print("4. Specify pretrained_vine_path in VineConfig to auto-load weights") print("5. Use VineModel.from_pretrained_vine() for direct loading")