| """ | |
| Example demonstrating how to load and use VINE ensemble weights | |
| This script shows the correct way to load your pretrained VINE ensemble weights | |
| and use them with the HuggingFace interface, based on the actual inference.py workflow. | |
| """ | |
import os
import sys
from pathlib import Path

import torch
import numpy as np
from transformers.pipelines import PIPELINE_REGISTRY

# os.environ["OPENAI_API_KEY"] = "dummy-key"  # Set your OpenAI API key here or via an environment variable

# Add src/ to sys.path so LASER, video-sam2, and GroundingDINO are importable
current_dir = Path(__file__).resolve().parent
src_dir = current_dir.parent / "src"
if src_dir.is_dir() and str(src_dir) not in sys.path:
    sys.path.insert(0, str(src_dir))

from vine_hf import VineConfig, VineModel, VinePipeline
from laser.loading import load_video


def example_load_ensemble_weights():
    """Example of loading ensemble weights correctly."""
    print("=== Loading Ensemble VINE Weights ===")

    # Path to your ensemble model (adjust this to your actual path)
    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")

    print(f"Looking for ensemble weights in: {model_dir}")

    if os.path.exists(model_dir):
        print("✓ Model directory found")

        # List available model files
        model_files = [f for f in os.listdir(model_dir) if f.endswith('.model')]
        print(f"Available model files: {model_files}")

        if model_files:
            # Create configuration with ensemble path (local directory with .model files)
            config = VineConfig(
                segmentation_method="grounding_dino_sam2",
                use_hf_repo=False,
                local_dir=model_dir,
                local_filename=None,
            )

            print("Creating VINE model with ensemble weights...")
            vine_model = VineModel(config)
            print("✓ VINE model created with ensemble weights!")
            return vine_model
        else:
            print("✗ No .model files found in directory")
            return None
    else:
        print(f"✗ Model directory not found: {model_dir}")
        print("Please adjust the path to point to your ensemble weights")
        return None
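

# Hypothetical helper (not part of vine_hf): list which epochs are available in
# an ensemble directory, assuming the `<model_name>.<epoch>.model` naming
# convention used by inference.py's load_model() (shown further below).
def list_available_epochs(model_dir):
    """Return the sorted epoch numbers found among model_dir's .model files."""
    epochs = []
    for filename in os.listdir(model_dir):
        parts = filename.split(".")
        # e.g. "ensemble-2025-02-10-14-57-22.0.model" -> epoch 0
        if len(parts) >= 3 and parts[-1] == "model" and parts[-2].isdigit():
            epochs.append(int(parts[-2]))
    return sorted(set(epochs))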


def example_direct_ensemble_loading():
    """Example of loading ensemble weights using from_pretrained_vine."""
    print("\n=== Direct Ensemble Loading ===")

    # Path to the ensemble directory
    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")

    if os.path.exists(model_dir):
        try:
            # Use the class method for direct loading
            vine_model = VineModel.from_pretrained_vine(
                model_path=model_dir,
                epoch=0,  # Load epoch 0
            )
            print("✓ Model loaded using from_pretrained_vine!")
            return vine_model
        except Exception as e:
            print(f"✗ Error loading with from_pretrained_vine: {e}")
            return None
    else:
        print(f"✗ Model directory not found: {model_dir}")
        return None


def example_compare_original_vs_hf():
    """Compare the original inference.py approach with the HuggingFace interface."""
    print("\n=== Comparing Original vs HuggingFace Interface ===")

    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
    model_name = "ensemble-2025-02-10-14-57-22"
    epoch = 0

    if not os.path.exists(model_dir):
        print(f"Model directory not found: {model_dir}")
        return

    print("Original approach (from inference.py):")
    print("```python")
    print("def load_model(model_dir, model_name, epoch, device):")
    print("    model_name = model_name + f'.{epoch}.model'")
    print("    predicate_model = torch.load(os.path.join(model_dir, model_name), map_location=device, weights_only=False)")
    print("    return predicate_model")
    print("")
    print("predicate_model = load_model(model_dir, model_name, epoch, device)")
    print("```")

    print("\nNew HuggingFace approach:")
    print("```python")
    print("config = VineConfig(pretrained_vine_path=model_dir)")
    print("vine_model = VineModel(config)")
    print("# or")
    print("vine_model = VineModel.from_pretrained_vine(model_dir, epoch=0)")
    print("```")

    # Try to load with both approaches if possible
    try:
        # Original approach
        def load_model(model_dir, model_name, epoch, device):
            model_name = model_name + f'.{epoch}.model'
            model_path = os.path.join(model_dir, model_name)
            if os.path.exists(model_path):
                return torch.load(model_path, map_location=device, weights_only=False)
            else:
                print(f"Model file not found: {model_path}")
                return None

        device = "cuda" if torch.cuda.is_available() else "cpu"
        original_model = load_model(model_dir, model_name, epoch, device)

        if original_model:
            print(f"✓ Original model loaded: {type(original_model)}")
            print(f"  Has clip_cate_model: {hasattr(original_model, 'clip_cate_model')}")
            print(f"  Has clip_unary_model: {hasattr(original_model, 'clip_unary_model')}")
            print(f"  Has clip_binary_model: {hasattr(original_model, 'clip_binary_model')}")

        # HuggingFace approach
        vine_model = VineModel.from_pretrained_vine(model_dir, epoch=epoch)
        if vine_model:
            print(f"✓ HuggingFace model loaded: {type(vine_model)}")
            print(f"  Has clip_cate_model: {hasattr(vine_model, 'clip_cate_model')}")
            print(f"  Has clip_unary_model: {hasattr(vine_model, 'clip_unary_model')}")
            print(f"  Has clip_binary_model: {hasattr(vine_model, 'clip_binary_model')}")
            print("\n✓ Both approaches work! The HuggingFace interface successfully loads ensemble weights.")
    except Exception as e:
        print(f"Error in comparison: {e}")


def example_ensemble_with_pipeline():
    """Example using ensemble weights with the pipeline."""
    print("\n=== Using Ensemble Weights with Pipeline ===")

    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")

    if not os.path.exists(model_dir):
        print(f"Model directory not found: {model_dir}")
        return None

    # Register the custom pipeline
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    # Create model with ensemble weights (local directory)
    config = VineConfig(
        segmentation_method="grounding_dino_sam2",
        use_hf_repo=False,
        local_dir=model_dir,
        local_filename=None,
    )
    vine_model = VineModel(config)

    # Create pipeline with segmentation model paths
    vine_pipeline = VinePipeline(
        model=vine_model,
        tokenizer=None,
        # SAM2 configuration
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        # GroundingDINO configuration
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device="cuda" if torch.cuda.is_available() else "cpu",
    )
    print("✓ Pipeline created with ensemble VINE weights")

    # Check for demo video
    demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")
    if os.path.exists(demo_video):
        print(f"Found demo video: {demo_video}")

        # Use the same keywords as in the original inference.py
        categorical_keywords = ['human', 'dog', 'frisbee']
        unary_keywords = ['running', 'jumping', 'catching', 'throwing']
        binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left']

        print("Example pipeline usage:")
        print("```python")
        print("results = vine_pipeline(")
        print(f"    '{demo_video}',")
        print(f"    categorical_keywords={categorical_keywords},")
        print(f"    unary_keywords={unary_keywords},")
        print(f"    binary_keywords={binary_keywords},")
        print("    segmentation_method='grounding_dino_sam2'")
        print(")")
        print("```")

        # Uncomment to actually run (requires segmentation models):
        # try:
        #     results = vine_pipeline(
        #         demo_video,
        #         categorical_keywords=categorical_keywords,
        #         unary_keywords=unary_keywords,
        #         binary_keywords=binary_keywords,
        #         segmentation_method='grounding_dino_sam2'
        #     )
        #     print("Results:", results['summary'])
        # except Exception as e:
        #     print(f"Pipeline execution failed: {e}")
        #     print("This is expected if segmentation models are not set up")

    return vine_pipeline


def demonstrate_weight_transfer():
    """Demonstrate how weights are transferred from ensemble to HuggingFace format."""
    print("\n=== Weight Transfer Demonstration ===")

    print("The ensemble model structure (PredicateModel):")
    print("- clip_cate_model: CLIP model for categorical classification")
    print("- clip_unary_model: CLIP model for unary predicates")
    print("- clip_binary_model: CLIP model for binary relations")
    print("- clip_tokenizer: Tokenizer for text processing")
    print("- clip_processor: Processor for image processing")

    print("\nWeight transfer process:")
    print("1. Load the ensemble model with torch.load()")
    print("2. Initialize base CLIP models in HuggingFace format")
    print("3. Transfer each state_dict from the ensemble to the HuggingFace models:")
    print("   - ensemble.clip_cate_model → hf.clip_cate_model")
    print("   - ensemble.clip_unary_model → hf.clip_unary_model")
    print("   - ensemble.clip_binary_model → hf.clip_binary_model")
    print("4. Transfer the tokenizer and processor")

    print("\nThis preserves all your fine-tuned weights while making them HuggingFace compatible!")


def troubleshooting_guide():
    """Provide a troubleshooting guide for common issues."""
    print("\n=== Troubleshooting Guide ===")

    print("Common issues:")
    print("1. 'No model file found for epoch X'")
    print("   → Check that .model files exist in the directory")
    print("   → Verify the epoch number is correct")
    print("   → List files: ls /path/to/model/dir/*.model")

    print("\n2. 'Error loading VINE weights'")
    print("   → Check file permissions")
    print("   → Verify the model file is not corrupted")
    print("   → Try loading with torch.load() directly first")

    print("\n3. 'CLIP model mismatch'")
    print("   → Ensure config.model_name matches the base model used in training")

    print("\n4. 'Device mismatch errors'")
    print("   → Models are loaded to CPU first, then moved to the target device")
    print("   → Check CUDA availability with torch.cuda.is_available()")

    print("\nDebugging steps:")
    print("1. Load the ensemble model directly:")
    print("   model = torch.load('path/to/model.0.model', map_location='cpu', weights_only=False)")
    print("2. Check model attributes:")
    print("   print(dir(model))")
    print("3. Verify state_dict keys:")
    print("   print(model.clip_cate_model.state_dict().keys())")


if __name__ == "__main__":
    print("VINE Ensemble Weights Loading Examples")
    print("=" * 50)

    # Test ensemble weight loading
    try:
        model1 = example_load_ensemble_weights()
    except Exception as e:
        print(f"Ensemble loading example failed: {e}")

    try:
        model2 = example_direct_ensemble_loading()
    except Exception as e:
        print(f"Direct loading example failed: {e}")

    # Compare approaches
    try:
        example_compare_original_vs_hf()
    except Exception as e:
        print(f"Comparison example failed: {e}")

    # Test pipeline with ensemble weights
    try:
        pipeline = example_ensemble_with_pipeline()
    except Exception as e:
        print(f"Pipeline example failed: {e}")

    # Educational content
    demonstrate_weight_transfer()
    troubleshooting_guide()

    print("\n" + "=" * 50)
    print("Key Points:")
    print("1. AutoModel.from_pretrained() won't work with raw .model ensemble checkpoints")
    print("2. Use torch.load() to load the ensemble, then transfer weights")
    print("3. The HuggingFace interface preserves your fine-tuned weights")
    print("4. Specify pretrained_vine_path in VineConfig to auto-load weights")
    print("5. Use VineModel.from_pretrained_vine() for direct loading")