{ "cells": [ { "cell_type": "code", "execution_count": null, "id": "44d53281", "metadata": {}, "outputs": [ { "name": "stderr", "output_type": "stream", "text": [ "/home/kevinx/miniconda3/envs/laser_env/lib/python3.10/site-packages/pydantic/_internal/_config.py:383: UserWarning: Valid config keys have changed in V2:\n", "* 'schema_extra' has been renamed to 'json_schema_extra'\n", " warnings.warn(message, UserWarning)\n", "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n", "Warning : `load_model` does not return WordVectorModel or SupervisedModel any more, but a `FastText` object which is very similar.\n" ] } ], "source": [ "import os\n", "import sys\n", "import torch\n", "from transformers import pipeline, AutoModel\n", "from transformers.pipelines import PIPELINE_REGISTRY\n", "\n", "# Uncomment or set your own\n", "#os.environ['OPENAI_API_KEY'] = 'dummy-key'\n", "from vine_hf import VineConfig, VineModel, VinePipeline" ] }, { "cell_type": "code", "execution_count": 2, "id": "174e479f", "metadata": {}, "outputs": [], "source": [ "PIPELINE_REGISTRY.register_pipeline(\n", " \"vine-video-understanding\",\n", " pipeline_class=VinePipeline,\n", " pt_model=VineModel,\n", " type=\"multimodal\",\n", ")" ] }, { "cell_type": "code", "execution_count": null, "id": "a9af2770", "metadata": {}, "outputs": [], "source": [ "vine_config = VineConfig(\n", " model_name=\"openai/clip-vit-base-patch32\",\n", " # Local file example: set use_hf_repo=False and provide local_dir/local_filename\n", " use_hf_repo=False,\n", " local_dir=os.path.dirname('/path/to/your/pretrained/model.pt'),\n", " local_filename=os.path.basename('/path/to/your/pretrained/model.pt'), # Local file path\n", " segmentation_method=\"grounding_dino_sam2\",\n", " visualize=True,\n", " visualization_dir=\"path/to/visualization/dir\",\n", " debug_visualizations=True,\n", " device=0, # Change to your desired device\n", ")" ] }, { 
"cell_type": "code", "execution_count": null, "id": "274e6515", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Loaded state type: \n" ] } ], "source": [ "vine_pipeline = VinePipeline(\n", " model=VineModel(vine_config), \n", " tokenizer=None,\n", " sam_config_path=\"path/to/sam2/configs/sam2_hiera_base_plus.yaml\",\n", " sam_checkpoint_path=\"path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt\",\n", " gd_config_path=\"path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py\",\n", " gd_checkpoint_path=\"path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth\",\n", ")" ] }, { "cell_type": "code", "execution_count": 6, "id": "123a090d", "metadata": {}, "outputs": [], "source": [ "categorical_keywords = ['human', 'dog', 'frisbee']\n", "unary_keywords = ['running', 'jumping', 'catching', 'throwing']\n", "binary_keywords = ['behind', 'in front of', 'next to', 'chasing']\n", "object_pairs = [(0, 1), (0, 2), (1, 2)] # human-dog, human-frisbee, dog-frisbee relationships " ] }, { "cell_type": "code", "execution_count": 7, "id": "0b42f032", "metadata": {}, "outputs": [], "source": [ "demo_video_path = \"/home/kevinx/LASER/LASER/demo/videos/v1.mp4\" # Replace with your video file path" ] }, { "cell_type": "code", "execution_count": 8, "id": "8202c654", "metadata": {}, "outputs": [ { "name": "stdout", "output_type": "stream", "text": [ "Segmentation method: grounding_dino_sam2\n", "Generating Grounding DINO + SAM2 masks...\n", "\n", "✓ SAM2 models initialized successfully\n", "\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "UserWarning: torch.meshgrid: in an upcoming release, it will be required to pass the indexing argument. 
(Triggered internally at /pytorch/aten/src/ATen/native/TensorShape.cpp:4314.)\n" ] }, { "name": "stdout", "output_type": "stream", "text": [ "final text_encoder_type: bert-base-uncased\n", "✓ GroundingDINO model initialized successfully\n", "Start detecting objects at time 05:08:58.178592\n" ] }, { "name": "stderr", "output_type": "stream", "text": [ "Detecting objects: 0%| | 0/3 [00:00