"""
Script to convert existing inference.py workflow to use VINE HuggingFace interface

This script demonstrates how to migrate from the original inference.py approach
to the new HuggingFace-compatible interface.
"""

import os
import sys
import torch

# Add paths for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from vine_hf import VineConfig, VineModel, VinePipeline
from laser.loading import load_video


def load_pretrained_vine_model(model_dir: str, model_name: str, epoch: int = 0) -> VineModel:
    """
    Load a pretrained VINE model from the original format into HuggingFace format.
    
    Args:
        model_dir: Directory containing the model
        model_name: Name of the model file (without .{epoch}.model extension)
        epoch: Epoch number to load
        
    Returns:
        VineModel instance with loaded weights
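
    Example (the paths mirror the demo layout used further below; adjust to your setup):
        >>> model = load_pretrained_vine_model(
        ...     model_dir="../../data/LLaVA-Video-178K-v2/models/ensemble-02-10",
        ...     model_name="ensemble-2025-02-10-14-57-22",
        ...     epoch=0,
        ... )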
    """
    print(f"Loading pretrained VINE model from {model_dir}")
    
    # Create configuration (adjust parameters as needed)
    # We expect local ensemble weights in `model_dir`, so configure
    # VineConfig to load from local directory/filename.
    model_file = f"{model_name}.{epoch}.model"
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        segmentation_method="grounding_dino_sam2",
        target_fps=1,
        box_threshold=0.35,
        text_threshold=0.25,
        use_hf_repo=False,
        local_dir=model_dir,
        local_filename=model_file,
    )

    # Initialize model (VineModel will consult the config when loading)
    vine_model = VineModel(config)
    
    # Load original weights (reusing the filename computed above for the config)
    model_path = os.path.join(model_dir, model_file)
    
    if os.path.exists(model_path):
        print(f"Loading weights from: {model_path}")
        try:
            # Add safe globals for PyTorch 2.6+
            import torch.serialization
            from laser.models.llava_clip_model_v3 import PredicateModel
            torch.serialization.add_safe_globals([PredicateModel])
            
            # Load the original model
            original_model = torch.load(model_path, map_location='cpu', weights_only=False)
            
            # Transfer weights to HuggingFace model
            # This assumes the original model has the same structure
            # You may need to adjust this based on your specific model structure
            
            if hasattr(original_model, 'clip_cate_model'):
                vine_model.clip_cate_model.load_state_dict(original_model.clip_cate_model.state_dict())
            if hasattr(original_model, 'clip_unary_model'):
                vine_model.clip_unary_model.load_state_dict(original_model.clip_unary_model.state_dict())
            if hasattr(original_model, 'clip_binary_model'):
                vine_model.clip_binary_model.load_state_dict(original_model.clip_binary_model.state_dict())
            if hasattr(original_model, 'clip_tokenizer'):
                vine_model.clip_tokenizer = original_model.clip_tokenizer
            if hasattr(original_model, 'clip_processor'):
                vine_model.clip_processor = original_model.clip_processor
                
            print("βœ“ Weights transferred successfully")
            
        except Exception as e:
            print(f"βœ— Error loading weights: {e}")
            print("You may need to adjust the weight loading logic for your specific model")
            
    else:
        print(f"βœ— Model file not found: {model_path}")
        
    return vine_model


def convert_inference_workflow():
    """
    Convert the original inference.py workflow to use HuggingFace interface.
    
    This function demonstrates how to replicate the original inference workflow
    using the new HuggingFace-compatible components.
    """
    print("=== Converting Inference Workflow ===")
    
    # Original parameters from inference.py
    video_id = 'v1'
    target_fps = 1
    classes = ['human', 'dog', 'frisbee']
    unary_keywords = ['running', 'jumping', 'sitting', 'standing']
    binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left']
    
    # Paths (adjust these to match your setup)
    demo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../demo"))
    video_dir = os.path.join(demo_dir, "videos")
    video_path = os.path.join(video_dir, f"{video_id}.mp4")
    
    # Model paths (adjust these to match your setup)
    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
    model_name = "ensemble-2025-02-10-14-57-22"
    
    # Segmentation model paths (adjust these to your actual paths)
    sam_config_path = "/path/to/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml"
    sam_checkpoint_path = "/path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt"
    gd_config_path = "/path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py"
    gd_checkpoint_path = "/path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth"
    
    print(f"Video path: {video_path}")
    print(f"Model dir: {model_dir}")
    print(f"SAM2 config: {sam_config_path}")
    print(f"GroundingDINO config: {gd_config_path}")
    
    # Check if video exists
    if not os.path.exists(video_path):
        print(f"βœ— Video not found: {video_path}")
        print("Please adjust the video path or use your own video file")
        return
    
    # 1. Load video (same as original)
    print(f"Loading video: {video_id}")
    video_tensor = load_video(video_path, target_fps=target_fps)
    print(f"Video shape: {video_tensor.shape}")
    
    # 2. Load VINE model with HuggingFace interface
    print("Loading VINE model...")
    if os.path.exists(model_dir):
        vine_model = load_pretrained_vine_model(model_dir, model_name, epoch=0)
    else:
        print(f"Model directory not found: {model_dir}")
        print("Creating new model with random weights for demonstration")
        config = VineConfig()
        vine_model = VineModel(config)
    
    # 3. Create pipeline for easier use
    print("Creating VINE pipeline...")
    from transformers.pipelines import PIPELINE_REGISTRY
    
    # Register pipeline if not already registered
    try:
        PIPELINE_REGISTRY.register_pipeline(
            "vine-video-understanding",
            pipeline_class=VinePipeline,
            pt_model=VineModel,
            type="multimodal",
        )
    except Exception:
        pass  # Already registered
    
    # Create pipeline instance with segmentation model paths
    vine_pipeline = VinePipeline(
        model=vine_model, 
        tokenizer=None,
        # SAM2 configuration
        sam_config_path=sam_config_path,
        sam_checkpoint_path=sam_checkpoint_path,
        # GroundingDINO configuration
        gd_config_path=gd_config_path,
        gd_checkpoint_path=gd_checkpoint_path
    )
    
    # 4. Process video with new interface
    print("Processing video with VINE HuggingFace interface...")
    
    try:
        # Use the pipeline to process the video
        results = vine_pipeline(
            video_path,
            categorical_keywords=classes,
            unary_keywords=unary_keywords,
            binary_keywords=binary_keywords,
            object_pairs=[(1, 2), (2, 3)],  # Example object pairs
            segmentation_method='grounding_dino_sam2',
            target_fps=target_fps,
            return_top_k=3,
            include_visualizations=False
        )
        
        # 5. Display results (similar to original format)
        print("\n=== VINE Results (HuggingFace Interface) ===")
        
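        # The loops below assume the pipeline output is a dict with this shape
        # (inferred from how the results are consumed here; adjust if your
        # VinePipeline version returns a different structure):
        #   results['categorical_predictions']: {obj_id: [(prob, category), ...]}
        #   results['unary_predictions']:       {(frame_id, obj_id): [(prob, action), ...]}
        #   results['binary_predictions']:      {(frame_id, obj_pair): [(prob, relation), ...]}
        #   results['summary']:                 aggregate stats such as num_objects_detected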
        # Categorical predictions
        print("\nCategorical Predictions:")
        for obj_id, predictions in results['categorical_predictions'].items():
            print(f"  Object {obj_id}:")
            for prob, category in predictions:
                print(f"    {prob:.3f}: {category}")
        
        # Unary predictions  
        print("\nUnary Predictions:")
        for (frame_id, obj_id), predictions in results['unary_predictions'].items():
            print(f"  Frame {frame_id}, Object {obj_id}:")
            for prob, action in predictions:
                print(f"    {prob:.3f}: {action}")
        
        # Binary predictions
        print("\nBinary Predictions:")
        for (frame_id, obj_pair), predictions in results['binary_predictions'].items():
            print(f"  Frame {frame_id}, Objects {obj_pair}:")
            for prob, relation in predictions:
                print(f"    {prob:.3f}: {relation}")
        
        # Summary
        print(f"\nSummary:")
        print(f"  Objects detected: {results['summary']['num_objects_detected']}")
        print(f"  Top categories: {results['summary']['top_categories']}")
        print(f"  Top actions: {results['summary']['top_actions']}")
        print(f"  Top relations: {results['summary']['top_relations']}")
        
        print("\nβœ“ Successfully processed video with VINE HuggingFace interface!")
        
    except Exception as e:
        print(f"βœ— Error processing video: {e}")
        print("This may be due to missing segmentation models or other dependencies")
        print("The interface is set up correctly, but full functionality requires:")
        print("  1. Properly installed Grounding DINO and SAM2")
        print("  2. Correct model weights")
        print("  3. Proper configuration paths")


def compare_interfaces():
    """
    Compare the original inference.py approach with the new HuggingFace interface.
    """
    print("\n=== Interface Comparison ===")
    
    print("\nOriginal inference.py approach:")
    print("βœ“ Direct access to model internals")
    print("βœ“ Full control over segmentation pipeline")
    print("βœ— Complex setup and configuration")
    print("βœ— Not compatible with HuggingFace ecosystem")
    print("βœ— Requires manual handling of all components")
    
    print("\nNew HuggingFace interface:")
    print("βœ“ Easy to use pipeline interface")
    print("βœ“ Compatible with HuggingFace Hub")
    print("βœ“ Standardized configuration")
    print("βœ“ Automatic handling of preprocessing/postprocessing")
    print("βœ“ Easy sharing and distribution")
    print("βœ“ Configurable segmentation model paths")
    print("βœ— Slightly less direct control (can still access model directly)")
    
    print("\nMigration benefits:")
    print("β€’ Share your model easily on HuggingFace Hub")
    print("β€’ Users can load your model with a single line")
    print("β€’ Standardized interface for video understanding")
    print("β€’ Better integration with other HuggingFace tools")
    print("β€’ Simplified deployment and inference")
    print("β€’ Flexible segmentation model configuration")

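# A minimal sketch of the "single line" loading mentioned in compare_interfaces().
# This assumes the model has been pushed to the Hub (e.g. via push_to_hub.py) under
# a repo id such as "your-username/vine-model" (hypothetical), and that VineModel
# follows the standard HuggingFace from_pretrained() convention:
#
#   from vine_hf import VineModel
#   vine_model = VineModel.from_pretrained("your-username/vine-model")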

if __name__ == "__main__":
    print("VINE HuggingFace Interface Conversion")
    print("=" * 50)
    
    # Run conversion demonstration
    convert_inference_workflow()
    
    # Show comparison
    compare_interfaces()
    
    print("\n" + "=" * 50)
    print("Next steps:")
    print("1. Install SAM2 and GroundingDINO dependencies")
    print("2. Download the required model checkpoints")
    print("3. Update the paths in this script to point to your models")
    print("4. Test the interface with your specific model weights")
    print("5. Adjust configuration parameters as needed")
    print("6. Push your model to HuggingFace Hub using push_to_hub.py")
    print("7. Share with the community!")