"""
Example demonstrating SAM2 mask generation in VINE HuggingFace interface

This script shows how to use both SAM2-only and Grounding DINO + SAM2
segmentation methods with the VINE model.
"""

import os
import sys
from pathlib import Path
import torch
import numpy as np
from transformers.pipelines import PIPELINE_REGISTRY

# Put the sibling src/ directory at the front of sys.path (only when it exists
# and is not already listed) so LASER, video-sam2 and GroundingDINO can be imported.
current_dir = Path(__file__).resolve().parent
src_dir = current_dir.parent.joinpath("src")
if str(src_dir) not in sys.path and src_dir.is_dir():
    sys.path.insert(0, str(src_dir))

# Either uncomment the line below or set the OPENAI_API_KEY environment variable, though it isn't needed to run.
#os.environ['OPENAI_API_KEY'] = 'dummy-key' 

from vine_hf import VineConfig, VineModel, VinePipeline
from laser.loading import load_video


def example_sam2_only_segmentation():
    """Run the VINE pipeline on the demo video with SAM2-only segmentation.

    Builds a ``VineConfig`` with ``segmentation_method="sam2"``, registers
    ``VinePipeline`` under the ``"vine-video-understanding"`` task, and calls
    the pipeline on ``../demo/videos/output.mp4`` (relative to this script).
    A short summary of the results is printed on success.

    Returns:
        Whatever the pipeline call returns when it succeeds; ``None`` when the
        demo video does not exist or the pipeline call raises.
    """
    print("=== SAM2-Only Segmentation Example ===")

    # Configuration that selects SAM2 as the segmentation method.
    sam2_config = VineConfig(
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",
        segmentation_method="sam2",
        target_fps=1,
        debug_visualizations=True,
    )

    # Make the custom pipeline class known to the transformers registry.
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    # Placeholder locations -- replace with the paths of your local
    # SAM2 / GroundingDINO config and checkpoint files.
    model_paths = {
        "sam_config_path": "path/to/your/sam2/sam_config.yaml",
        "sam_checkpoint_path": "path/to/your/sam2/sam_checkpoint.pth",
        "gd_config_path": "path/to/your/groundingdino/config.py",
        "gd_checkpoint_path": "path/to/your/groundingdino/checkpoint.pth",
    }
    pipeline = VinePipeline(
        model=VineModel(sam2_config),
        tokenizer=None,
        **model_paths,
    )

    # Bail out early when the demo clip is not available.
    script_dir = os.path.dirname(__file__)
    demo_video = os.path.join(script_dir, "../demo/videos/output.mp4")
    if not os.path.exists(demo_video):
        print(f"Demo video not found: {demo_video}")
        return None

    print(f"Processing video: {demo_video}")

    # Keyword vocabularies and object-index pairs handed to the pipeline.
    prompts = {
        "categorical_keywords": ['human', 'dog', 'frisbee', 'object', 'person', 'animal'],
        "unary_keywords": ['running', 'jumping', 'sitting', 'standing', 'moving', 'static'],
        "binary_keywords": ['behind', 'in front of', 'next to', 'chasing', 'following'],
        "object_pairs": [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (0, 4)],
    }

    print("Using SAM2 automatic mask generation...")
    print("This will find all objects in the video automatically")

    try:
        results = pipeline(
            demo_video,
            **prompts,
            segmentation_method="sam2",
            return_top_k=3,
            debug_visualizations=True,
            debug_visualization_path=os.path.join(os.getcwd(), "sam2_debug_masks.png"),
        )

        print("\n✓ SAM2 segmentation completed!")
        print("Results summary:")
        # Each line is printed before the next key is looked up, so a missing
        # key still leaves the earlier lines on screen.
        summary_fields = (
            ("Objects detected", "num_objects_detected"),
            ("Top categories", "top_categories"),
            ("Top actions", "top_actions"),
        )
        for label, key in summary_fields:
            print(f"  {label}: {results['summary'][key]}")

        return results

    except Exception as exc:
        # Example-level boundary: report the problem and keep the script going.
        print(f"SAM2 segmentation failed: {exc}")
        print("Make sure SAM2 models are properly installed")
        return None

def example_grounding_dino_sam2_segmentation():
    """Run the VINE pipeline on the demo video with Grounding DINO + SAM2.

    Builds a ``VineConfig`` with ``segmentation_method="grounding_dino_sam2"``
    (box threshold 0.35, text threshold 0.25), creates a ``VinePipeline`` on
    ``device=0``, and calls it on ``../demo/videos/output.mp4`` (relative to
    this script). A short summary of the results is printed on success.

    Returns:
        Whatever the pipeline call returns when it succeeds; ``None`` when the
        demo video does not exist or the pipeline call raises.
    """
    print("\n=== Grounding DINO + SAM2 Segmentation Example ===")

    # Configuration that selects text-guided segmentation.
    gd_config = VineConfig(
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",
        segmentation_method="grounding_dino_sam2",
        box_threshold=0.35,
        text_threshold=0.25,
        target_fps=1,
        debug_visualizations=True,
    )

    # Placeholder locations -- replace with the paths of your local
    # SAM2 and GroundingDINO config and checkpoint files.
    model_paths = {
        "sam_config_path": "path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        "sam_checkpoint_path": "path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        "gd_config_path": "path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        "gd_checkpoint_path": "path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
    }
    pipeline = VinePipeline(
        model=VineModel(gd_config),
        tokenizer=None,
        **model_paths,
        device=0,
    )

    # Bail out early when the demo clip is not available.
    script_dir = os.path.dirname(__file__)
    demo_video = os.path.join(script_dir, "../demo/videos/output.mp4")
    if not os.path.exists(demo_video):
        print(f"Demo video not found: {demo_video}")
        return None

    print(f"Processing video: {demo_video}")

    # Keyword vocabularies and object-index pairs handed to the pipeline.
    prompts = {
        "categorical_keywords": ['human', 'dog', 'frisbee'],
        "unary_keywords": ['running', 'jumping', 'catching', 'throwing'],
        "binary_keywords": ['behind', 'chasing', 'next to', 'throwing to'],
        "object_pairs": [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3), (0, 4)],
    }
    print("Using Grounding DINO + SAM2 text-guided segmentation...")
    print(f"Looking specifically for: {prompts['categorical_keywords']}")

    try:
        results = pipeline(
            demo_video,
            **prompts,
            segmentation_method="grounding_dino_sam2",
            box_threshold=0.35,
            text_threshold=0.25,
            return_top_k=3,
            debug_visualizations=True,
        )

        print("\n✓ Grounding DINO + SAM2 segmentation completed!")
        print("Results summary:")
        # Each line is printed before the next key is looked up, so a missing
        # key still leaves the earlier lines on screen.
        summary_fields = (
            ("Objects detected", "num_objects_detected"),
            ("Top categories", "top_categories"),
            ("Top actions", "top_actions"),
            ("Top relations", "top_relations"),
        )
        for label, key in summary_fields:
            print(f"  {label}: {results['summary'][key]}")

        return results

    except Exception as exc:
        # Example-level boundary: report the problem and keep the script going.
        print(f"Grounding DINO + SAM2 segmentation failed: {exc}")
        print("Make sure both Grounding DINO and SAM2 models are properly installed")
        return None


def compare_segmentation_methods():
    """Print a pros/cons comparison of the two segmentation approaches.

    Writes a header followed by one section per approach to stdout. Lines
    starting with a check mark are advantages; lines starting with a cross
    are drawbacks. Returns ``None``.
    """
    print("\n=== Comparing Segmentation Methods ===")

    # (section heading, bullet lines) in the order they are printed.
    sections = (
        (
            "SAM2-Only Approach:",
            (
                "✓ Finds all objects automatically",
                "✓ No need to specify what to look for",
                "✓ Good for exploratory analysis",
                "✗ May find too many irrelevant objects",
                "✗ Less precise for specific object types",
            ),
        ),
        (
            "Grounding DINO + SAM2 Approach:",
            (
                "✓ Finds specific objects based on text prompts",
                "✓ More precise and targeted",
                "✓ Better for known object categories",
                "✓ Integrates object detection with segmentation",
                "✗ Limited to specified categories",
                "✗ Requires knowing what objects to look for",
            ),
        ),
    )

    for heading, bullets in sections:
        # A blank line separates each section from what came before it.
        print(f"\n{heading}")
        for bullet in bullets:
            print(bullet)


def demonstrate_mask_processing():
    """Show the pipeline's preprocessing output for the first two demo frames.

    Loads ``../demo/videos/output.mp4`` (relative to this script) at 1 fps,
    builds a SAM2-configured ``VinePipeline``, calls its ``preprocess`` method
    on the first two frames, and prints the ``num_frames``, ``masks`` and
    ``bboxes`` entries of the returned mapping. Prints a message and returns
    when the video is missing; failures inside preprocessing or reporting are
    printed rather than raised. Returns ``None``.
    """
    print("\n=== Mask Processing Demonstration ===")

    script_dir = os.path.dirname(__file__)
    demo_video = os.path.join(script_dir, "../demo/videos/output.mp4")
    if not os.path.exists(demo_video):
        print(f"Demo video not found: {demo_video}")
        return

    print("Loading video for mask processing demo...")

    # Decode the clip into an array so it can be sliced by frame below.
    frames = np.asarray(load_video(demo_video, target_fps=1))
    print(f"Video shape: {frames.shape}")

    # Placeholder locations -- replace with the paths of your local
    # SAM2 and GroundingDINO config and checkpoint files.
    model_paths = {
        "sam_config_path": "path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        "sam_checkpoint_path": "path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        "gd_config_path": "path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        "gd_checkpoint_path": "path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
    }
    sam2_config = VineConfig(segmentation_method="sam2")
    pipeline = VinePipeline(
        model=VineModel(sam2_config),
        tokenizer=None,
        **model_paths,
    )

    try:
        print("\nProcessing first 2 frames with SAM2...")

        # Call the preprocessing step directly to expose its intermediate output.
        processed = pipeline.preprocess(
            frames[:2],
            segmentation_method="sam2",
            categorical_keywords=['object'],
        )

        print("Mask processing results:")
        print(f"  Number of frames processed: {processed['num_frames']}")

        masks_by_frame = processed['masks']
        print(f"  Frames with masks: {list(masks_by_frame.keys())}")

        # Per-frame, per-object mask shapes.
        for frame_id, object_masks in masks_by_frame.items():
            print(f"  Frame {frame_id}: {len(object_masks)} objects detected")
            for obj_id, mask in object_masks.items():
                print(f"    Object {obj_id}: mask shape {mask.shape}")

        print("\nBounding box extraction:")
        # Per-frame, per-object bounding boxes.
        for frame_id, object_boxes in processed['bboxes'].items():
            print(f"  Frame {frame_id}: {len(object_boxes)} bounding boxes")
            for obj_id, box in object_boxes.items():
                print(f"    Object {obj_id}: bbox {box}")

    except Exception as exc:
        # Example-level boundary: report the problem and keep the script going.
        print(f"Mask processing failed: {exc}")
        print("This is expected if SAM2 models are not properly set up")


def test_mask_formats():
    """Print the shapes/dtypes of several mask representations and extract a bbox.

    Creates a random 224x224 boolean mask as a NumPy array, converts it to a
    PyTorch tensor, and adds a trailing singleton dimension to obtain a 3D
    variant, printing each one's shape. The 2D tensor is then passed to
    ``mask_to_bbox``; a failure of that call is printed rather than raised.
    The import of ``mask_to_bbox`` happens outside the ``try`` block, so an
    import error propagates to the caller. Returns ``None``.
    """
    print("\n=== Testing Mask Formats ===")

    # Spatial size of the dummy mask: (height, width).
    mask_size = (224, 224)

    print("Testing mask format conversions...")

    # Format 1: boolean NumPy array obtained by thresholding uniform noise.
    numpy_mask = np.random.rand(*mask_size) > 0.5
    print(f"NumPy mask: {numpy_mask.shape}, dtype: {numpy_mask.dtype}")

    # Format 2: the same data wrapped as a PyTorch tensor.
    tensor_mask = torch.from_numpy(numpy_mask)
    print(f"PyTorch mask: {tensor_mask.shape}, dtype: {tensor_mask.dtype}")

    # Format 3: trailing singleton dimension appended.
    tensor_mask_3d = torch.unsqueeze(tensor_mask, -1)
    print(f"3D mask: {tensor_mask_3d.shape}")

    # Imported lazily so the rest of the module loads without this helper.
    from laser.preprocess.mask_generation_grounding_dino import mask_to_bbox

    try:
        extracted = mask_to_bbox(tensor_mask)
        print(f"Extracted bbox: {extracted}")
        print("✓ Mask format testing successful")
    except Exception as exc:
        print(f"Mask format testing failed: {exc}")


if __name__ == "__main__":
    # Script driver: run every example in sequence. Each example that can
    # raise is wrapped individually so one failure does not stop the rest.
    banner = "=" * 50
    print("VINE SAM2 Mask Generation Examples")
    print(banner)

    # 1) SAM2-only segmentation.
    try:
        sam2_results = example_sam2_only_segmentation()
    except Exception as exc:
        print(f"SAM2-only example failed: {exc}")

    # 2) Grounding DINO + SAM2 segmentation.
    try:
        gd_sam2_results = example_grounding_dino_sam2_segmentation()
    except Exception as exc:
        print(f"Grounding DINO + SAM2 example failed: {exc}")

    # 3) Textual comparison of both approaches (only prints).
    compare_segmentation_methods()

    # 4) Walk through the preprocessing output.
    try:
        demonstrate_mask_processing()
    except Exception as exc:
        print(f"Mask processing demo failed: {exc}")

    # 5) Mask format conversions and bbox extraction.
    try:
        test_mask_formats()
    except Exception as exc:
        print(f"Mask format testing failed: {exc}")

    print("\n" + banner)
    print("Examples completed!")
    print("\nKey takeaways:")
    takeaways = (
        "1. SAM2-only: Automatic object detection and segmentation",
        "2. Grounding DINO + SAM2: Text-guided object detection and segmentation",
        "3. Both methods provide masks and bounding boxes for VINE model",
        "4. Choose method based on whether you know what objects to look for",
    )
    for takeaway in takeaways:
        print(takeaway)