{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "44d53281",
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/kevinx/miniconda3/envs/laser_env/lib/python3.10/site-packages/pydantic/_internal/_config.py:383: UserWarning: Valid config keys have changed in V2:\n",
      "* 'schema_extra' has been renamed to 'json_schema_extra'\n",
      "  warnings.warn(message, UserWarning)\n",
      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n",
      "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n"
     ]
    }
   ],
   "source": [
    "import os\n",
    "import sys\n",
    "import torch\n",
    "from transformers import pipeline, AutoModel\n",
    "from transformers.pipelines import PIPELINE_REGISTRY\n",
    "\n",
    "# Uncomment or set your own\n",
    "#os.environ['OPENAI_API_KEY'] = 'dummy-key'\n",
    "from vine_hf import VineConfig, VineModel, VinePipeline"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "id": "174e479f",
   "metadata": {},
   "outputs": [],
   "source": [
    "PIPELINE_REGISTRY.register_pipeline(\n",
    "            \"vine-video-understanding\",\n",
    "            pipeline_class=VinePipeline,\n",
    "            pt_model=VineModel,\n",
    "            type=\"multimodal\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "a9af2770",
   "metadata": {},
   "outputs": [],
   "source": [
    "vine_config = VineConfig(\n",
    "    model_name=\"openai/clip-vit-base-patch32\",\n",
    "    # Local file example: set use_hf_repo=False and provide local_dir/local_filename\n",
    "    use_hf_repo=False,\n",
    "    local_dir=os.path.dirname('/path/to/your/pretrained/model.pt'),\n",
    "    local_filename=os.path.basename('/path/to/your/pretrained/model.pt'),  # Local file path\n",
    "    segmentation_method=\"grounding_dino_sam2\",\n",
    "    visualize=True,\n",
    "    visualization_dir=\"path/to/visualization/dir\",\n",
    "    debug_visualizations=True,\n",
    "    device=0,  # Change to your desired device\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "274e6515",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Loaded state type: <class 'collections.OrderedDict'>\n"
     ]
    }
   ],
   "source": [
    "vine_pipeline = VinePipeline(\n",
    "    model=VineModel(vine_config),        \n",
    "    tokenizer=None,\n",
    "    sam_config_path=\"path/to/sam2/configs/sam2_hiera_base_plus.yaml\",\n",
    "    sam_checkpoint_path=\"path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt\",\n",
    "    gd_config_path=\"path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py\",\n",
    "    gd_checkpoint_path=\"path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth\",\n",
    ")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "id": "123a090d",
   "metadata": {},
   "outputs": [],
   "source": [
    "categorical_keywords = ['human', 'dog', 'frisbee']\n",
    "unary_keywords = ['running', 'jumping', 'catching', 'throwing']\n",
    "binary_keywords = ['behind', 'in front of', 'next to', 'chasing']\n",
    "object_pairs = [(0, 1), (0, 2), (1, 2)]  # human-dog, dog-frisbee relationships "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "id": "0b42f032",
   "metadata": {},
   "outputs": [],
   "source": [
    "demo_video_path = \"/home/kevinx/LASER/LASER/demo/videos/v1.mp4\"  # Replace with your video file path"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 8,
   "id": "8202c654",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Segmentation method: grounding_dino_sam2\n",
      "Generating Grounding DINO + SAM2 masks...\n",
      "<class 'int'>\n",
      "✓ SAM2 models initialized successfully\n",
      "<class 'int'>\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. (Triggered internally at /pytorch/aten/src/ATen/native/TensorShape.cpp:4314.)\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "final text_encoder_type: bert-base-uncased\n",
      "✓ GroundingDINO model initialized successfully\n",
      "Start detecting objects at time  05:08:58.178592\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Detecting objects:   0%|          | 0/3 [00:00<?, ?it/s]FutureWarning: The `device` argument is deprecated and will be removed in v5 of Transformers.\n",
      "UserWarning: torch.utils.checkpoint: the use_reentrant parameter should be passed explicitly. In version 2.5 we will raise an exception if use_reentrant is not passed. use_reentrant=False is recommended, but if you need to preserve the current default behavior, you can pass use_reentrant=True. Refer to docs for more details on the differences between the two variants.\n",
      "UserWarning: None of the inputs have requires_grad=True. Gradients will be None\n",
      "FutureWarning: `torch.cuda.amp.autocast(args...)` is deprecated. Please use `torch.amp.autocast('cuda', args...)` instead.\n",
      "Detecting objects: 100%|██████████| 3/3 [00:01<00:00,  2.82it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Finished detecting objects at time  05:08:59.250419\n",
      "Loading inference state at time  05:08:59.544425\n",
      "Number of frames:  3\n",
      "None\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "Processing frames: 100%|██████████| 3/3 [00:00<00:00, 11.77it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Annotated frames:  []\n",
      "Find the most dense prompt at time  05:09:01.413703\n",
      "Most dense frame: 0\n",
      "\n",
      "\n",
      "Start propagating objects at time  05:09:01.416367\n",
      "Pass count:  0\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "propagate in video: 100%|██████████| 3/3 [00:00<00:00, 20.20it/s]\n",
      "propagate in video: 0it [00:00, ?it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most dense frame: 1\n",
      "\n",
      "\n",
      "Pass count:  1\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "propagate in video: 100%|██████████| 3/3 [00:00<00:00, 19.25it/s]\n",
      "propagate in video: 0it [00:00, ?it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most dense frame: 2\n",
      "\n",
      "\n",
      "Pass count:  2\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "propagate in video: 100%|██████████| 3/3 [00:00<00:00, 25.92it/s]\n",
      "propagate in video: 0it [00:00, ?it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Most dense frame: -1\n",
      "\n",
      "\n",
      "\n",
      "Results:\n",
      "Summary: {'num_objects_detected': 4, 'num_unary_predictions': 10, 'num_binary_predictions': 3, 'top_categories': [('frisbee', 0.9989640712738037), ('dog', 0.957672655582428), ('dog', 0.957672655582428)], 'top_actions': [('running', 0.8483631610870361), ('running', 0.832377016544342), ('running', 0.8178836107254028)], 'top_relations': [('chasing', 0.9616015553474426), ('chasing', 0.9478002786636353), ('chasing', 0.6380977630615234)]}\n"
     ]
    }
   ],
   "source": [
    "try:\n",
    "    results = vine_pipeline(\n",
    "        demo_video_path,\n",
    "        categorical_keywords=categorical_keywords,\n",
    "        unary_keywords=unary_keywords,\n",
    "        binary_keywords=binary_keywords,\n",
    "        object_pairs=object_pairs,\n",
    "        segmentation_method='grounding_dino_sam2',\n",
    "        return_top_k=3,\n",
    "        include_visualizations=False,\n",
    "        debug_visualizations=False,\n",
    "    )\n",
    "    \n",
    "    print(\"\\nResults:\")\n",
    "    print(f\"Summary: {results['summary']}\")\n",
    "    \n",
    "except Exception as e:\n",
    "    print(f\"Note: Full execution requires segmentation models to be properly set up.\")\n",
    "    print(f\"Error: {e}\")"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 9,
   "id": "414ede9b",
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "Summary: {'num_objects_detected': 4, 'num_unary_predictions': 10, 'num_binary_predictions': 3, 'top_categories': [('frisbee', 0.9989640712738037), ('dog', 0.957672655582428), ('dog', 0.957672655582428)], 'top_actions': [('running', 0.8483631610870361), ('running', 0.832377016544342), ('running', 0.8178836107254028)], 'top_relations': [('chasing', 0.9616015553474426), ('chasing', 0.9478002786636353), ('chasing', 0.6380977630615234)]}\n"
     ]
    }
   ],
   "source": [
    "print(f\"Summary: {results['summary']}\")"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "laser_env",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.10.0"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 5
}