"""
Example usage of VINE HuggingFace interface

This script demonstrates how to use the VINE model through the HuggingFace interface
for video understanding with categorical, unary, and binary keyword predictions.
"""

import os
import sys
import torch
from transformers import pipeline, AutoModel  # used by the commented-out Hub examples below
from transformers.pipelines import PIPELINE_REGISTRY

# Add the parent directory to the path to import vine_hf
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Uncomment and set your own key, or export OPENAI_API_KEY in your environment
# os.environ['OPENAI_API_KEY'] = 'dummy-key'
from vine_hf import VineConfig, VineModel, VinePipeline

def example_direct_model_usage():
    """Example of using the VINE model directly."""
    print("=== Direct Model Usage ===")
    
    # Create configuration
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        segmentation_method="grounding_dino_sam2",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",  # Your HF Hub model
        debug_visualizations=True,
        debug_visualization_path=os.path.join(os.getcwd(), "debug_masks.png"),
        target_fps=30,
        box_threshold=0.35,   # Grounding DINO box confidence threshold
        text_threshold=0.25,  # Grounding DINO text-match threshold
    )
    
    # Initialize model
    model = VineModel(config)
    
    print(f"Model initialized with CLIP backbone: {config.model_name}")
    print(f"Segmentation method: {config.segmentation_method}")
    print(f"Device: {model.device}")
    
    # Example video data (placeholder - in real usage, load from video file)
    num_frames, height, width = 3, 224, 224
    video_frames = torch.randn(num_frames, height, width, 3) * 255
    video_frames = video_frames.clamp(0, 255).byte()
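
    # Hedged alternative (assumes torchvision is installed): load frames from a
    # real file instead of the random placeholder. read_video returns a
    # (T, H, W, C) uint8 tensor, which matches the layout used below.
    # from torchvision.io import read_video
    # video_frames, _, _ = read_video("path/to/your/video.mp4", pts_unit="sec")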
    
    # Example masks and bboxes (placeholder - in real usage, generated by segmentation)
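    # Layout assumed from the placeholders below: masks[frame_idx][object_id]
    # is an (H, W, 1) mask tensor, and bboxes[frame_idx][object_id] appears to
    # be an [x1, y1, x2, y2] box in pixel coordinates.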
    masks = {
        0: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
        1: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
        2: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)}
    }
    
    bboxes = {
        0: {1: [50, 50, 150, 150], 2: [100, 100, 200, 200]},
        1: {1: [52, 52, 152, 152], 2: [102, 102, 202, 202]},
        2: {1: [54, 54, 154, 154], 2: [104, 104, 204, 204]}
    }
    
    # Define keywords
    categorical_keywords = ["human", "dog", "frisbee"]
    unary_keywords = ["running", "jumping", "sitting", "standing"]
    binary_keywords = ["behind", "in front of", "next to", "throwing to", "catching from"]
    object_pairs = [(1, 2)]  # Object 1 relates to Object 2
    
    # Run prediction
    print("\nRunning prediction...")
    results = model.predict(
        video_frames=video_frames,
        masks=masks,
        bboxes=bboxes,
        categorical_keywords=categorical_keywords,
        unary_keywords=unary_keywords,
        binary_keywords=binary_keywords,
        object_pairs=object_pairs,
        return_top_k=3
    )
    
    print("\nResults:")
    print(f"Categorical predictions: {len(results['categorical_predictions'])} objects")
    print(f"Unary predictions: {len(results['unary_predictions'])} actions")
    print(f"Binary predictions: {len(results['binary_predictions'])} relations")
    print(f"Confidence scores: {results['confidence_scores']}")


def example_pipeline_usage():
    """Example of using the VINE pipeline."""
    print("\n=== Pipeline Usage ===")
    
    # Register the pipeline
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )
    vine_config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",  # Your HF Hub model
        segmentation_method="grounding_dino_sam2",
        debug_visualizations=True,
    )

    vine_pipe = VinePipeline(
        model=VineModel(vine_config),
        tokenizer=None,
        trust_remote_code=True,
        # SAM2 configuration
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        # Grounding DINO configuration
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device=0,
    )

    print("Pipeline created successfully!")
    
    # Example usage with video path
    video_path = "path/to/your/video.mp4"  # Replace with actual video path
    
    # For demonstration, we'll show the expected usage format
    print(f"\nExample pipeline call (replace with actual video path):")
    print(f"results = vine_pipeline(")
    print(f"    '{video_path}',")
    print(f"    categorical_keywords=['human', 'dog', 'frisbee'],")
    print(f"    unary_keywords=['running', 'jumping', 'sitting'],")
    print(f"    binary_keywords=['behind', 'in front of', 'next to'],")
    print(f"    object_pairs=[(1, 2)],")
    print(f"    segmentation_method='grounding_dino_sam2',")
    print(f"    return_top_k=3,")
    print(f"    return_flattened_segments=True,")
    print(f"    return_valid_pairs=True,")
    print(f"    include_visualizations=True,")
    print(f"    debug_visualizations=True")
    print(f")")
    
    # Note: actual execution would require a proper video file and configured segmentation models


def example_huggingface_hub_usage():
    """Example of how to push and load from HuggingFace Hub."""
    print("\n=== HuggingFace Hub Usage ===")
    
    # Example of preparing model for Hub
    config = VineConfig()
    model = VineModel(config)
    
    # Register for auto classes
    config.register_for_auto_class()
    model.register_for_auto_class("AutoModel")
    
    print("Model registered for auto classes")
    
    # Example push to hub (commented out - requires actual model weights and credentials)
    # config.push_to_hub('your-username/vine-model')
    # model.push_to_hub('your-username/vine-model')
    
    # Example load from hub (commented out - requires actual model on hub)
    # model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)
    # pipeline = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)
    
    print("To push to Hub:")
    print("1. config.push_to_hub('your-username/vine-model')")
    print("2. model.push_to_hub('your-username/vine-model')")
    print("\nTo load from Hub:")
    print("model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)")
    print("pipe = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)")


def example_with_real_video():
    """Example showing how to use with a real video file."""
    print("\n=== Real Video Usage Example ===")
    
    # Check if demo video exists
    demo_video_path = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")
    
    if os.path.exists(demo_video_path):
        print(f"Found demo video: {demo_video_path}")
        
        # Create pipeline with segmentation model paths
        PIPELINE_REGISTRY.register_pipeline(
            "vine-video-understanding",
            pipeline_class=VinePipeline,
            pt_model=VineModel,
            type="multimodal",
        )
        
        vine_config = VineConfig(
            model_name="openai/clip-vit-base-patch32",
            use_hf_repo=True,
            model_repo="video-fm/vine_v0",  # Your HF Hub model
            segmentation_method="grounding_dino_sam2",
            debug_visualizations=True,
            debug_visualization_path=os.path.join(os.getcwd(), "real_video_debug_masks.png"),
        )
        
        vine_pipeline = VinePipeline(
            model=VineModel(vine_config),
            tokenizer=None,
            trust_remote_code=True,
            # SAM2 configuration
            sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
            sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
            # Grounding DINO configuration
            gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
            gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        )
        
        # Define keywords based on the demo
        categorical_keywords = ['human', 'dog', 'frisbee']
        unary_keywords = ['running', 'jumping', 'catching', 'throwing']
        binary_keywords = ['behind', 'in front of', 'next to', 'chasing']
        object_pairs = [(0, 1), (0, 2), (1, 2)]  # human-dog, human-frisbee, dog-frisbee relationships
        
        print("\nProcessing video with VINE...")
        print("Keywords:")
        print(f"  Categorical: {categorical_keywords}")
        print(f"  Unary: {unary_keywords}")
        print(f"  Binary: {binary_keywords}")
        print(f"  Object pairs: {object_pairs}")
        
        # Note: This would require proper segmentation models to be set up
        try:
            results = vine_pipeline(
                demo_video_path,
                categorical_keywords=categorical_keywords,
                unary_keywords=unary_keywords,
                binary_keywords=binary_keywords,
                object_pairs=object_pairs,
                segmentation_method='grounding_dino_sam2',
                return_top_k=3,
                include_visualizations=False,
                debug_visualizations=True,
            )
            
            print("\nResults:")
            print(f"Summary: {results['summary']}")
            
        except Exception as e:
            print(f"Note: Full execution requires segmentation models to be properly set up.")
            print(f"Error: {e}")
            
    else:
        print(f"Demo video not found at: {demo_video_path}")
        print("To use with a real video, provide the path to your video file.")


if __name__ == "__main__":
    print("VINE HuggingFace Interface Examples")
    print("=" * 50)
    
    # Run examples
    try:
        example_direct_model_usage()
    except Exception as e:
        print(f"Direct model usage failed: {e}")
    
    try:
        example_pipeline_usage()
    except Exception as e:
        print(f"Pipeline usage failed: {e}")
    
    try:
        example_huggingface_hub_usage()
    except Exception as e:
        print(f"Hub usage example failed: {e}")
    
    try:
        example_with_real_video()
    except Exception as e:
        print(f"Real video example failed: {e}")
    
    print("\n" + "=" * 50)
    print("Examples completed!")
    print("\nNext steps:")
    print("1. Set up Grounding DINO and SAM2 models for segmentation")
    print("2. Load your pretrained VINE model weights")
    print("3. Test with your own videos")
    print("4. Push to HuggingFace Hub for sharing")