# Source: Hugging Face Space jiani-huang/LASER (Running on Zero), commit 888f9e4, file size 13,168 bytes.
"""
Example demonstrating how to load and use VINE ensemble weights

This script shows the correct way to load your pretrained VINE ensemble weights
and use them with the HuggingFace interface, based on the actual inference.py workflow.
"""

import os
import sys
import torch
import numpy as np
from transformers.pipelines import PIPELINE_REGISTRY

#os.environ["OPENAI_API_KEY"]="dummy-key"  # Set your OpenAI API key here or via environment variable

# Add the parent directory to the path to import vine_hf
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from vine_hf import VineConfig, VineModel, VinePipeline
from laser.loading import load_video


def example_load_ensemble_weights():
    """Build a ``VineModel`` from a local directory of ensemble checkpoints.

    The ensemble directory is resolved relative to this file. Every file in
    it whose name ends with ``.model`` is reported, and a ``VineModel`` is
    then constructed from a ``VineConfig`` that points at that directory.

    Returns:
        The constructed ``VineModel``, or ``None`` when the directory does
        not exist or contains no ``.model`` files.
    """
    print("=== Loading Ensemble VINE Weights ===")

    # Location of the ensemble checkpoints (adjust to your actual layout).
    script_dir = os.path.dirname(__file__)
    data_dir = os.path.abspath(os.path.join(script_dir, "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")

    print(f"Looking for ensemble weights in: {model_dir}")

    # Guard: nothing to do without the directory.
    if not os.path.exists(model_dir):
        print(f"✗ Model directory not found: {model_dir}")
        print("Please adjust the path to point to your ensemble weights")
        return None

    print("✓ Model directory found")

    # Report the checkpoint files that are available.
    checkpoint_names = [name for name in os.listdir(model_dir) if name.endswith('.model')]
    print(f"Available model files: {checkpoint_names}")

    # Guard: the directory exists but holds no checkpoints.
    if not checkpoint_names:
        print("✗ No .model files found in directory")
        return None

    # Point the configuration at the local directory of .model files.
    ensemble_config = VineConfig(
        segmentation_method="grounding_dino_sam2",
        use_hf_repo=False,
        local_dir=model_dir,
        local_filename=None,
    )

    print("Creating VINE model with ensemble weights...")
    model = VineModel(ensemble_config)

    print("✓ VINE model created with ensemble weights!")
    return model


def example_direct_ensemble_loading():
    """Load ensemble weights through ``VineModel.from_pretrained_vine``.

    Returns:
        The loaded model, or ``None`` when the ensemble directory is missing
        or ``from_pretrained_vine`` raises.
    """
    print("\n=== Direct Ensemble Loading ===")

    # Directory holding the ensemble checkpoint files.
    script_dir = os.path.dirname(__file__)
    data_dir = os.path.abspath(os.path.join(script_dir, "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")

    # Guard: bail out early when there is nothing to load from.
    if not os.path.exists(model_dir):
        print(f"✗ Model directory not found: {model_dir}")
        return None

    try:
        # Class-method shortcut; epoch 0 selects which checkpoint to load.
        loaded = VineModel.from_pretrained_vine(model_path=model_dir, epoch=0)
        print("✓ Model loaded using from_pretrained_vine!")
        return loaded
    except Exception as e:
        # Best-effort example: report the failure instead of propagating it.
        print(f"✗ Error loading with from_pretrained_vine: {e}")
        return None


def example_compare_original_vs_hf():
    """Compare the original inference.py loading path with the HuggingFace one.

    Prints both loading recipes as code snippets, then tries each of them
    against the local ensemble directory and reports which CLIP sub-models
    the resulting objects expose. The success banner is printed only when
    BOTH loads produced a model.

    Returns:
        None. Returns early when the ensemble directory does not exist; any
        exception raised while loading is reported and swallowed.
    """
    print("\n=== Comparing Original vs HuggingFace Interface ===")

    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
    model_name = "ensemble-2025-02-10-14-57-22"
    epoch = 0

    if not os.path.exists(model_dir):
        print(f"Model directory not found: {model_dir}")
        return

    print("Original approach (from inference.py):")
    print("```python")
    print("def load_model(model_dir, model_name, epoch, device):")
    print("    model_name = model_name + f'.{epoch}.model'")
    print("    predicate_model = torch.load(os.path.join(model_dir, model_name), map_location=device, weights_only=False)")
    print("    return predicate_model")
    print("")
    print("predicate_model = load_model(model_dir, model_name, epoch, device)")
    print("```")

    # NOTE(review): the snippet below advertises `pretrained_vine_path`, while
    # the other examples in this file build VineConfig with `local_dir=` and
    # `use_hf_repo=False` — confirm which keyword VineConfig actually accepts.
    print("\nNew HuggingFace approach:")
    print("```python")
    print("config = VineConfig(pretrained_vine_path=model_dir)")
    print("vine_model = VineModel(config)")
    print("# or")
    print("vine_model = VineModel.from_pretrained_vine(model_dir, epoch=0)")
    print("```")

    def load_model(model_dir, model_name, epoch, device):
        """Load ``<model_name>.<epoch>.model`` from *model_dir*, or return None."""
        model_path = os.path.join(model_dir, model_name + f'.{epoch}.model')
        if not os.path.exists(model_path):
            print(f"Model file not found: {model_path}")
            return None
        # SECURITY: weights_only=False unpickles arbitrary Python objects.
        # Only load checkpoint files you trust.
        return torch.load(model_path, map_location=device, weights_only=False)

    def report(label, model):
        """Print the model type and which CLIP sub-models it exposes."""
        print(f"✓ {label} model loaded: {type(model)}")
        for attr in ("clip_cate_model", "clip_unary_model", "clip_binary_model"):
            print(f"  Has {attr}: {hasattr(model, attr)}")

    # Try to load with both approaches if possible
    try:
        device = "cuda" if torch.cuda.is_available() else "cpu"

        # Original approach
        original_model = load_model(model_dir, model_name, epoch, device)
        # Explicit None check: truthiness of a model object is not a reliable
        # "was it loaded" signal (container-like objects can be falsy).
        if original_model is not None:
            report("Original", original_model)

        # HuggingFace approach
        vine_model = VineModel.from_pretrained_vine(model_dir, epoch=epoch)
        if vine_model is not None:
            report("HuggingFace", vine_model)

        # Claim success for "both" only when both loads actually succeeded.
        if original_model is not None and vine_model is not None:
            print("\n✓ Both approaches work! HuggingFace interface successfully loads ensemble weights.")

    except Exception as e:
        # Best-effort demo: report and continue rather than crash the script.
        print(f"Error in comparison: {e}")


def example_ensemble_with_pipeline():
    """Wrap ensemble weights in a ``VinePipeline`` and show how to call it.

    Registers ``VinePipeline`` under the ``"vine-video-understanding"`` task,
    builds a ``VineModel`` from the local ensemble directory, and constructs
    a pipeline around it. When the demo video exists, an example invocation
    is printed; the pipeline itself is not executed here.

    Returns:
        The ``VinePipeline`` instance, or ``None`` when the ensemble
        directory does not exist.
    """
    print("\n=== Using Ensemble Weights with Pipeline ===")

    script_dir = os.path.dirname(__file__)
    data_dir = os.path.abspath(os.path.join(script_dir, "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")

    if not os.path.exists(model_dir):
        print(f"Model directory not found: {model_dir}")
        return

    # Make the custom pipeline class known to the transformers registry.
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    # Model backed by the local directory of ensemble checkpoints.
    ensemble_config = VineConfig(
        segmentation_method="grounding_dino_sam2",
        use_hf_repo=False,
        local_dir=model_dir,
        local_filename=None,
    )
    vine_model = VineModel(ensemble_config)

    # The SAM2 / GroundingDINO paths below look like placeholders
    # ("path/to/...") — replace them with real locations before running.
    run_device = "cuda" if torch.cuda.is_available() else "cpu"
    vine_pipeline = VinePipeline(
        model=vine_model,
        tokenizer=None,
        # SAM2 configuration
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        # GroundingDINO configuration
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device=run_device,
    )

    print("✓ Pipeline created with ensemble VINE weights")

    # Without the demo video there is nothing further to show.
    demo_video = os.path.join(script_dir, "../demo/videos/v1.mp4")
    if not os.path.exists(demo_video):
        return vine_pipeline

    print(f"Found demo video: {demo_video}")

    # Use the same keywords as in the original inference.py
    categorical_keywords = ['human', 'dog', 'frisbee']
    unary_keywords = ['running', 'jumping', 'catching', 'throwing']
    binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left']

    usage_lines = [
        "Example pipeline usage:",
        "```python",
        "results = vine_pipeline(",
        f"    '{demo_video}',",
        f"    categorical_keywords={categorical_keywords},",
        f"    unary_keywords={unary_keywords},",
        f"    binary_keywords={binary_keywords},",
        "    segmentation_method='grounding_dino_sam2'",
        ")",
        "```",
    ]
    for usage_line in usage_lines:
        print(usage_line)

    # Uncomment to actually run (requires segmentation models)
    # try:
    #     results = vine_pipeline(
    #         demo_video,
    #         categorical_keywords=categorical_keywords,
    #         unary_keywords=unary_keywords,
    #         binary_keywords=binary_keywords,
    #         segmentation_method='grounding_dino_sam2'
    #     )
    #     print("Results:", results['summary'])
    # except Exception as e:
    #     print(f"Pipeline execution failed: {e}")
    #     print("This is expected if segmentation models are not set up")

    return vine_pipeline



def demonstrate_weight_transfer():
    """Print a description of how ensemble weights map onto the HF model.

    Purely informational: the text is static and nothing is loaded or
    transferred by this function.
    """
    explanation = (
        "\n=== Weight Transfer Demonstration ===",
        # Layout of the ensemble object.
        "The ensemble model structure (PredicateModel):",
        "- clip_cate_model: CLIP model for categorical classification",
        "- clip_unary_model: CLIP model for unary predicates",
        "- clip_binary_model: CLIP model for binary relations",
        "- clip_tokenizer: Tokenizer for text processing",
        "- clip_processor: Processor for image processing",
        # Steps of the transfer.
        "\nWeight transfer process:",
        "1. Load ensemble model with torch.load()",
        "2. Initialize base CLIP models in HuggingFace format",
        "3. Transfer state_dict from ensemble to HuggingFace models:",
        "   - ensemble.clip_cate_model → hf.clip_cate_model",
        "   - ensemble.clip_unary_model → hf.clip_unary_model",
        "   - ensemble.clip_binary_model → hf.clip_binary_model",
        "4. Transfer tokenizer and processor",
        "\nThis preserves all your fine-tuned weights while making them HuggingFace compatible!",
    )
    for text in explanation:
        print(text)


def troubleshooting_guide():
    """Print a troubleshooting checklist for loading ensemble weights.

    Purely informational: the text is static and no model is touched.
    """
    guide = (
        "\n=== Troubleshooting Guide ===",
        "Common Issues:",
        "1. 'No model file found for epoch X'",
        "   → Check that .model files exist in the directory",
        "   → Verify the epoch number is correct",
        "   → List files: ls /path/to/model/dir/*.model",
        "\n2. 'Error loading VINE weights'",
        "   → Check file permissions",
        "   → Verify the model file is not corrupted",
        "   → Try loading with torch.load() directly first",
        "\n3. 'CLIP model mismatch'",
        "   → Ensure config.model_name matches the base model used in training",
        "\n4. 'Device mismatch errors'",
        "   → Models are loaded to CPU first, then moved to device",
        "   → Check CUDA availability with torch.cuda.is_available()",
        "\nDebugging steps:",
        "1. Test loading ensemble model directly:",
        # The hint passes weights_only=False, like the working loader in this
        # file: the checkpoints are whole pickled model objects, and
        # torch.load defaults to weights_only=True on PyTorch >= 2.6, which
        # would reject them.
        "   model = torch.load('path/to/model.0.model', map_location='cpu', weights_only=False)",
        "2. Check model attributes:",
        "   print(dir(model))",
        "3. Verify state_dict keys:",
        "   print(model.clip_cate_model.state_dict().keys())",
    )
    for text in guide:
        print(text)


def main():
    """Run every example in order, then print the educational summaries.

    Each example runs inside its own ``try`` so that one failure does not
    prevent the remaining examples from running. Return values are
    deliberately discarded so a loaded model is not kept alive while the
    next example loads another one.
    """
    print("VINE Ensemble Weights Loading Examples")
    print("=" * 50)

    # (label used in the failure message, example to run)
    examples = (
        ("Ensemble loading", example_load_ensemble_weights),
        ("Direct loading", example_direct_ensemble_loading),
        ("Comparison", example_compare_original_vs_hf),
        ("Pipeline", example_ensemble_with_pipeline),
    )
    for label, run_example in examples:
        try:
            run_example()
        except Exception as e:
            # Top-level boundary: report and move on to the next example.
            print(f"{label} example failed: {e}")

    # Educational content
    demonstrate_weight_transfer()
    troubleshooting_guide()

    # NOTE(review): the examples in this file load `.model` files and build
    # VineConfig with `local_dir=`; points 1 and 4 below mention `.pt` and
    # `pretrained_vine_path` — confirm the wording against VineConfig.
    print("\n" + "=" * 50)
    print("Key Points:")
    print("1. AutoModel.from_pretrained() won't work with .pt ensemble weights")
    print("2. Use torch.load() to load the ensemble, then transfer weights")
    print("3. The HuggingFace interface preserves your fine-tuned weights")
    print("4. Specify pretrained_vine_path in VineConfig to auto-load weights")
    print("5. Use VineModel.from_pretrained_vine() for direct loading")


if __name__ == "__main__":
    main()