"""
Example usage of VINE HuggingFace interface with pretrained VINE weights
This script demonstrates how to use the VINE model with your pretrained weights
from the ensemble format or from video-fm/vine_v0.
"""
import os
import sys

import torch
from transformers import pipeline
from transformers.pipelines import PIPELINE_REGISTRY

# Set your OpenAI API key here or via an environment variable:
# os.environ['OPENAI_API_KEY'] = "dummy-key"

# Add the parent directory to the path so that vine_hf can be imported
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from vine_hf import VineConfig, VineModel, VinePipeline
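
# Note: the pipeline examples below assume SAM2 and GroundingDINO configs and
# checkpoints are available locally; replace the "path/to/..." placeholders
# with real paths before running anything end-to-end.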


def example_with_local_pretrained_weights():
    """Example using a local pretrained VINE checkpoint."""
    print("=== Using Local Pretrained VINE Weights ===")

    # Download https://huggingface.co/video-fm/vine_v0/tree/main/laser_model_v1.pt
    pretrained_vine_file = "/path/to/your/local/laser_model_v1.pt"  # Replace with your local path

    # Create configuration pointing at your pretrained weights (local file)
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        segmentation_method="grounding_dino_sam2",
        target_fps=1,
        visualize=True,
        visualization_dir="path/to/visualization/dir",
        debug_visualizations=True,
        use_hf_repo=False,
        local_dir=os.path.dirname(pretrained_vine_file),
        local_filename=os.path.basename(pretrained_vine_file),
    )

    # Method 1: Initialize the model directly
    print("Method 1: Direct model initialization")
    vine_model = VineModel(config)
    print(f"✓ Model initialized with pretrained weights from: {pretrained_vine_file}")

    # Method 2: Use the from_pretrained_vine class method
    print("\nMethod 2: Using from_pretrained_vine class method")
    vine_model_2 = VineModel.from_pretrained_vine(
        model_path=pretrained_vine_file,
        config=config,
        epoch=0,  # Specify the checkpoint epoch number
    )
    print("✓ Model loaded using from_pretrained_vine method")

    return vine_model


def example_with_huggingface_hub():
    """Example using VINE weights from the HuggingFace Hub."""
    print("\n=== Using HuggingFace Hub Weights ===")

    # Create configuration that loads weights from the HuggingFace Hub
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",  # Your HF Hub model
        segmentation_method="grounding_dino_sam2",
        visualize=True,
        visualization_dir="path/to/visualization/dir",
        debug_visualizations=True,
    )

    try:
        # Initialize the model (will try to load from the HF Hub)
        vine_model = VineModel(config)
        print("✓ Model loaded from HuggingFace Hub: video-fm/vine_v0")
        return vine_model
    except Exception as e:
        print(f"✗ Could not load from HuggingFace Hub: {e}")
        print("Make sure your model is pushed to video-fm/vine_v0")
        return None
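

# Optional helper (a sketch, not part of the original example): pre-fetch the
# checkpoint from the Hub with huggingface_hub. This assumes the file is stored
# as laser_model_v1.pt in video-fm/vine_v0, matching the download link above.
def download_vine_checkpoint():
    """Download the VINE checkpoint from the Hub and return its local path."""
    from huggingface_hub import hf_hub_download

    return hf_hub_download(repo_id="video-fm/vine_v0", filename="laser_model_v1.pt")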


def example_pipeline_with_pretrained():
    """Example using the pipeline with pretrained VINE weights."""
    print("\n=== Pipeline with Pretrained VINE ===")

    # Register the custom pipeline with transformers
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    # Create configuration with your weights
    pretrained_vine_file = "/path/to/your/local/laser_model_v1.pt"  # Replace with your local path
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        segmentation_method="grounding_dino_sam2",
        visualize=True,
        visualization_dir="path/to/visualization/dir",
        debug_visualizations=True,
        use_hf_repo=False,
        local_dir=os.path.dirname(pretrained_vine_file),
        local_filename=os.path.basename(pretrained_vine_file),
    )

    # Create the model with pretrained weights
    vine_model = VineModel(config)

    # Create the pipeline with segmentation model paths
    vine_pipeline = VinePipeline(
        model=vine_model,
        tokenizer=None,
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device=0,
    )
    print("✓ Pipeline created with pretrained VINE weights")

    # Example usage (requires an actual video file)
    demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")
    if os.path.exists(demo_video):
        print(f"Found demo video: {demo_video}")
        print("Example pipeline call:")
        print("results = vine_pipeline(")
        print(f"    '{demo_video}',")
        print("    categorical_keywords=['human', 'dog', 'frisbee'],")
        print("    unary_keywords=['running', 'jumping', 'sitting'],")
        print("    binary_keywords=['behind', 'chasing', 'next to'],")
        print("    debug_visualizations=True,")
        print(")")

        # Uncomment to actually run (requires the segmentation models):
        # results = vine_pipeline(
        #     demo_video,
        #     categorical_keywords=['human', 'dog', 'frisbee'],
        #     unary_keywords=['running', 'jumping', 'sitting'],
        #     binary_keywords=['behind', 'chasing', 'next to'],
        #     debug_visualizations=True,
        # )
        # print("Results:", results['summary'])

    return vine_pipeline
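

# Alternative construction (an untested sketch): once the task is registered via
# PIPELINE_REGISTRY above, the standard transformers factory should also work,
# with the extra keyword arguments forwarded to VinePipeline. `vine_model` here
# would be a model built as in example_pipeline_with_pretrained():
#
# from transformers import pipeline
# vine_pipeline = pipeline(
#     "vine-video-understanding",
#     model=vine_model,
#     sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
#     sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
#     gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
#     gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
#     device=0,
# )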


def example_manual_weight_loading():
    """Example of manually loading weights after model creation."""
    print("\n=== Manual Weight Loading ===")

    # Create a model with base CLIP weights
    # (no pretrained path: base config with no HF repo or local file configured)
    config = VineConfig()
    vine_model = VineModel(config)
    print("✓ Model created with base CLIP weights")

    model_dir = "/path/to/your/local/ensemble/model_dir.pt"  # Replace with your model directory
    if os.path.exists(model_dir):
        success = vine_model.load_pretrained_vine_weights(model_dir, epoch=0)
        if success:
            print("✓ Successfully loaded pretrained VINE weights manually")
        else:
            print("✗ Failed to load pretrained weights")
    else:
        print(f"✗ Model directory not found: {model_dir}")

    return vine_model
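

# Optional (a hedged sketch): if VineModel inherits from transformers.PreTrainedModel,
# a manually loaded model can be re-saved in the standard HF format, which is one way
# to produce the Hub layout expected by example_with_huggingface_hub():
#
# vine_model = example_manual_weight_loading()
# vine_model.save_pretrained("vine_hf_checkpoint")
# vine_model.push_to_hub("video-fm/vine_v0")  # requires `huggingface-cli login`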


def compare_model_outputs():
    """Compare outputs between base CLIP and pretrained VINE."""
    print("\n=== Comparing Model Outputs ===")

    # Create dummy data for testing
    video_frames = torch.randn(3, 224, 224, 3) * 255  # 3 frames
    video_frames = video_frames.clamp(0, 255).byte()

    # Masks and boxes are keyed by frame index, then by object track id.
    # Box format is assumed to be [x1, y1, x2, y2].
    masks = {
        0: {1: torch.ones(224, 224, 1)},
        1: {1: torch.ones(224, 224, 1)},
        2: {1: torch.ones(224, 224, 1)},
    }
    bboxes = {
        0: {1: [50, 50, 150, 150]},
        1: {1: [52, 52, 152, 152]},
        2: {1: [54, 54, 154, 154]},
    }
    keywords = ['human', 'dog', 'frisbee']

    # Model 1: Base CLIP
    print("Creating model with base CLIP weights...")
    config_base = VineConfig()
    model_base = VineModel(config_base)

    # Model 2: Pretrained VINE (if available)
    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")

    if os.path.exists(model_dir):
        print("Creating model with pretrained VINE weights...")
        config_vine = VineConfig(
            use_hf_repo=False,
            local_dir=model_dir,
            local_filename=None,
        )
        model_vine = VineModel(config_vine)

        print("\nComparing predictions...")
        # Get predictions from both models
        with torch.no_grad():
            results_base = model_base.predict(
                video_frames=video_frames,
                masks=masks,
                bboxes=bboxes,
                categorical_keywords=keywords,
                return_top_k=3,
            )
            results_vine = model_vine.predict(
                video_frames=video_frames,
                masks=masks,
                bboxes=bboxes,
                categorical_keywords=keywords,
                return_top_k=3,
            )

        print("Base CLIP confidence scores:", results_base['confidence_scores'])
        print("Pretrained VINE confidence scores:", results_vine['confidence_scores'])
        print("✓ Successfully compared both models")
    else:
        print(f"Pretrained model not found at: {model_dir}")
        print("Skipping comparison")


if __name__ == "__main__":
    print("VINE HuggingFace Interface - Pretrained Weights Examples")
    print("=" * 60)

    try:
        # Test local pretrained weights
        model1 = example_with_local_pretrained_weights()
    except Exception as e:
        print(f"Local weights example failed: {e}")

    try:
        # Test HuggingFace Hub weights
        model2 = example_with_huggingface_hub()
    except Exception as e:
        print(f"HuggingFace Hub example failed: {e}")

    try:
        # Test the pipeline with pretrained weights
        vine_pipeline = example_pipeline_with_pretrained()
    except Exception as e:
        print(f"Pipeline example failed: {e}")

    # Uncomment to test manual weight loading:
    # try:
    #     model3 = example_manual_weight_loading()
    # except Exception as e:
    #     print(f"Manual loading example failed: {e}")

    # Uncomment to compare model outputs:
    # try:
    #     compare_model_outputs()
    # except Exception as e:
    #     print(f"Comparison example failed: {e}")

    print("\n" + "=" * 60)
    print("Examples completed!")

    print("\nUsage Summary:")
    print("1. Configure VineConfig with `use_hf_repo=True` + `model_repo` for Hub models,")
    print("   or `use_hf_repo=False` + `local_dir`/`local_filename` for local weights")
    print("2. Use VineModel.from_pretrained_vine() for direct loading")
    print("3. Register VinePipeline with PIPELINE_REGISTRY for end-to-end video inference")