# Example visualization runner for VINE
# - Loads a video (path, demo, or random)
# - Runs the VINE pipeline
# - Saves annotated frames and an MP4 if available

import os
import sys
import argparse
import cv2
import numpy as np
from collections.abc import Mapping, Sequence

from transformers.pipelines import PIPELINE_REGISTRY
from transformers import pipeline

# Set your OpenAI API key via the OPENAI_API_KEY environment variable.
# A placeholder is installed only when the variable is not already defined,
# so a real key exported in the shell is never overwritten.
os.environ.setdefault("OPENAI_API_KEY", "dummy-key")

# Local imports (workspace): make this script's directory importable.
sys.path.append(os.path.dirname(__file__))

from vine_hf.vine_pipeline import VinePipeline  # https://github.com link not needed; local path used
from vine_hf.vine_model import VineModel
from vine_hf.vine_config import VineConfig
from laser.loading import load_video


def build_pipeline(args) -> VinePipeline:
    """Register the VINE pipeline task and build a ready-to-call pipeline.

    Args:
        args: Parsed CLI namespace. Reads ``box_threshold``,
            ``text_threshold``, ``fps``, ``topk_cate``, ``out_dir`` and
            ``device``; ``method`` is read when present and otherwise
            defaults to ``"grounding_dino_sam2"``.

    Returns:
        A ``VinePipeline`` wrapping a freshly constructed ``VineModel``.
    """
    # Register pipeline type
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    config = VineConfig(
        # Keep the config in agreement with the --method CLI option instead of
        # hard-coding it; the default matches the previous fixed value.
        segmentation_method=getattr(args, "method", "grounding_dino_sam2"),
        model_name="openai/clip-vit-base-patch32",
        # Example: load from HF repo
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",
        # Alternatively use a local path by setting use_hf_repo=False and local_dir/local_filename
        box_threshold=args.box_threshold,
        text_threshold=args.text_threshold,
        target_fps=args.fps,
        topk_cate=args.topk_cate,
        visualization_dir=args.out_dir,
        visualize=True,
        # NOTE(review): always enabled here regardless of the
        # --debug_visualizations flag — confirm this is intended.
        debug_visualizations=True,
        device=args.device,
    )

    model = VineModel(config)

    # Create pipeline instance with segmentation model paths.
    # NOTE(review): these are machine-specific absolute paths; adjust them to
    # the local SAM2 / GroundingDINO checkouts before running elsewhere.
    vine_pipe = VinePipeline(
        model=model,
        tokenizer=None,
        sam_config_path="//home/kevinx/LASER/video-sam2/sam2/sam2_hiera_t.yaml",
        sam_checkpoint_path="//home/kevinx/LASER/video-sam2/sam2_hiera_tiny.pt",
        gd_config_path="//home/kevinx/LASER/GroundingDINO/groundingdino/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="//home/kevinx/LASER/GroundingDINO/weights/groundingdino_swint_ogc.pth",
        device=args.device,
        trust_remote_code=True,
    )
    return vine_pipe


def resolve_video(args) -> np.ndarray | str:
    # Priority: user --video -> demo video -> random frames
    if args.video and os.path.exists(args.video):
        return args.video

    demo_video = "//home/kevinx/LASER/LASER/demo/videos/v1.mp4"
    demo_alt = "//home/kevinx/LASER/LASER/demo/videos/v2.mp4"
    if os.path.exists(demo_video):
        return demo_video
    if os.path.exists(demo_alt):
        return demo_alt

    # Fallback to random frames (uint8 HxWx3) shaped as T x H x W x 3
    print("No video found; using random frames.")
    rng = np.random.default_rng(0)
    frames = rng.integers(0, 255, size=(args.rand_frames, args.height, args.width, 3), dtype=np.uint8)
    return frames


def main():
    """Parse CLI options, run the VINE pipeline on one video, and print its summary."""
    parser = argparse.ArgumentParser(description="VINE visualization example")
    parser.add_argument("--video", type=str, default=None, help="Path to a video file")
    parser.add_argument("--out_dir", type=str, default="output", help="Output directory")
    parser.add_argument("--method", type=str, default="grounding_dino_sam2", choices=["sam2", "grounding_dino_sam2"], help="Segmentation method")
    parser.add_argument("--fps", type=int, default=5, help="Target FPS for processing")
    parser.add_argument("--box_threshold", type=float, default=0.3, help="GroundingDINO box threshold")
    parser.add_argument("--text_threshold", type=float, default=0.3, help="GroundingDINO text threshold")
    parser.add_argument("--topk_cate", type=int, default=5, help="Top-K categories to display")
    parser.add_argument("--device", type=int, default=0, help="CUDA device index or -1 for CPU")
    parser.add_argument("--debug_visualizations", action="store_true", help="Enable debug visualizations")
    # Dimensions of the synthetic clip that resolve_video() generates when no
    # video file can be found; it reads these attributes from the namespace.
    parser.add_argument("--rand_frames", type=int, default=8, help="Number of random frames used when no video is found")
    parser.add_argument("--height", type=int, default=224, help="Height of the random fallback frames")
    parser.add_argument("--width", type=int, default=224, help="Width of the random fallback frames")

    args = parser.parse_args()

    vine_pipe = build_pipeline(args)
    video = resolve_video(args)

    # Keywords similar to examples/tests
    categorical_keywords = ["dog", "frisbee", "cat"]
    unary_keywords = ["running", "jumping", "sitting", "flying"]
    binary_keywords = ["behind", "next to", "chasing", "biting"]
    object_pairs = [(0, 1), (0, 2), (1, 2), (1, 3), (2, 3)]

    print("Running VINE pipeline...")
    call_kwargs = dict(
        categorical_keywords=categorical_keywords,
        unary_keywords=unary_keywords,
        binary_keywords=binary_keywords,
        object_pairs=object_pairs,
        segmentation_method=args.method,
        return_top_k=args.topk_cate,
        include_visualizations=True,
        debug_visualizations=args.debug_visualizations,
    )

    results = vine_pipe(
        video,
        **call_kwargs,
    )

    # Normalize pipeline output to a mapping (can be dict or list[dict])
    if isinstance(results, Mapping):
        result = results
    elif isinstance(results, Sequence) and results and isinstance(results[0], Mapping):
        result = results[0]
    else:
        result = {}

    # Print brief summary; `result` is always a Mapping here, so .get is safe.
    summary = result.get("summary", {})
    print("Summary:", summary)


# Run the example only when this file is executed as a script, not on import.
if __name__ == "__main__":
    main()