"""
Example usage of VINE HuggingFace interface with pretrained VINE weights

This script demonstrates how to use the VINE model with your pretrained weights
from the ensemble format or from video-fm/vine_v0.
"""

import os
import sys
import torch
from transformers.pipelines import PIPELINE_REGISTRY

# Set your OpenAI API key here or via the OPENAI_API_KEY environment variable
# os.environ['OPENAI_API_KEY'] = "dummy-key"

# Add the parent directory to the path to import vine_hf
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from vine_hf import VineConfig, VineModel, VinePipeline


def example_with_local_pretrained_weights():
    """Example using pretrained VINE weights from a local checkpoint file."""
    print("=== Using Local Pretrained VINE Weights ===")

    # Download https://huggingface.co/video-fm/vine_v0/tree/main/laser_model_v1.pt
    pretrained_vine_file = "/path/to/your/local/laser_model_v1.pt"  # Replace with your local path

    # Create configuration with your pretrained path (local file)
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        segmentation_method="grounding_dino_sam2",
        target_fps=1,
        visualize=True,
        visualization_dir="path/to/visualization/dir",
        debug_visualizations=True,
        use_hf_repo=False,
        local_dir=os.path.dirname(pretrained_vine_file),
        local_filename=os.path.basename(pretrained_vine_file),
    )
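    # With use_hf_repo=False, VineModel reads the checkpoint from
    # local_dir/local_filename rather than downloading from the Hub.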
    
    # Method 1: Initialize model directly
    print("Method 1: Direct model initialization")
    vine_model = VineModel(config)
    print(f"βœ“ Model initialized with pretrained weights from: {pretrained_vine_file}")
    
    # Method 2: Use the from_pretrained_vine class method
    print("\nMethod 2: Using from_pretrained_vine class method")
    vine_model_2 = VineModel.from_pretrained_vine(
        model_path=pretrained_vine_file,
        config=config,
        epoch=0  # Specify epoch number
    )
    print("βœ“ Model loaded using from_pretrained_vine method")
    
    return vine_model


def example_with_huggingface_hub():
    """Example using VINE weights from HuggingFace Hub."""
    print("\n=== Using HuggingFace Hub Weights ===")
    
    # Create configuration to use HuggingFace Hub weights
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",  # Your HF Hub model
        segmentation_method="grounding_dino_sam2",
        visualize=True,
        visualization_dir="path/to/visualization/dir",
        debug_visualizations=True,
    )
    
    try:
        # Initialize model (will try to load from HF Hub)
        vine_model = VineModel(config)
        print("βœ“ Model loaded from HuggingFace Hub: video-fm/vine_v0")
        return vine_model
    except Exception as e:
        print(f"βœ— Could not load from HuggingFace Hub: {e}")
        print("Make sure your model is pushed to video-fm/vine_v0")
        return None


def example_pipeline_with_pretrained():
    """Example using pipeline with pretrained VINE weights."""
    print("\n=== Pipeline with Pretrained VINE ===")
    
    # Register the pipeline
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )
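    # Once registered, the task can also be constructed through the standard
    # transformers factory, e.g. pipeline("vine-video-understanding", model=vine_model, ...);
    # this is a sketch only, since the direct VinePipeline construction below
    # also passes segmentation model paths that the factory call would need as kwargs.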
    
    # Create configuration with your weights
    pretrained_vine_file = "/path/to/your/local/laser_model_v1.pt"  # Replace with your local path
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        segmentation_method="grounding_dino_sam2",
        visualize=True,
        visualization_dir="path/to/visualization/dir",
        debug_visualizations=True,
        use_hf_repo=False,
        local_dir=os.path.dirname(pretrained_vine_file),
        local_filename=os.path.basename(pretrained_vine_file),
    )
    
    # Create model with pretrained weights
    vine_model = VineModel(config)
    
    # Create pipeline with segmentation model paths
    vine_pipeline = VinePipeline(
        model=vine_model,
        tokenizer=None,
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device=0
    )
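    # device=0 selects the first CUDA GPU; standard transformers pipelines
    # accept device=-1 (or "cpu") to run on CPU instead.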
    
    print("βœ“ Pipeline created with pretrained VINE weights")
    
    # Example usage (requires an actual video file)
    demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")
    
    if os.path.exists(demo_video):
        print(f"Found demo video: {demo_video}")
        print("Example pipeline call:")
        print(f"results = vine_pipeline(")
        print(f"    '{demo_video}',")
        print(f"    categorical_keywords=['human', 'dog', 'frisbee'],")
        print(f"    unary_keywords=['running', 'jumping', 'sitting'],")
        print(f"    binary_keywords=['behind', 'chasing', 'next to']")
        print(f"    debug_visualizations=True")
        print(f")")
        
        # Uncomment to actually run (requires segmentation models)
        # results = vine_pipeline(
        #     demo_video,
        #     categorical_keywords=['human', 'dog', 'frisbee'],
        #     unary_keywords=['running', 'jumping', 'sitting'],
        #     binary_keywords=['behind', 'chasing', 'next to'],
        #     debug_visualizations=True,
        # )
        # print("Results:", results['summary'])
    
    return vine_pipeline



def example_manual_weight_loading():
    """Example of manually loading weights after model creation."""
    print("\n=== Manual Weight Loading ===")
    
    # Create model with base CLIP weights: a default config with no HF repo
    # or local checkpoint configured
    config = VineConfig()
    vine_model = VineModel(config)
    print("βœ“ Model created with base CLIP weights")
    model_dir = "/path/to/your/local/ensemble/model_dir.pt"  # Replace with your model directory
    
    if os.path.exists(model_dir):
        success = vine_model.load_pretrained_vine_weights(model_dir, epoch=0)
        if success:
            print("βœ“ Successfully loaded pretrained VINE weights manually")
        else:
            print("βœ— Failed to load pretrained weights")
    else:
        print(f"βœ— Model directory not found: {model_dir}")
    
    return vine_model


def compare_model_outputs():
    """Compare outputs between base CLIP and pretrained VINE."""
    print("\n=== Comparing Model Outputs ===")
    
    # Create dummy data for testing
    video_frames = torch.rand(3, 224, 224, 3) * 255  # 3 RGB frames of uniform random pixels
    video_frames = video_frames.clamp(0, 255).byte()
    
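    # Masks are keyed by frame index, then by object ID: a single object (ID 1)
    # with a full-frame mask in each of the 3 frames.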
    masks = {
        0: {1: torch.ones(224, 224, 1)},
        1: {1: torch.ones(224, 224, 1)},
        2: {1: torch.ones(224, 224, 1)}
    }
    
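    # Bounding boxes use the same {frame_idx: {object_id: box}} layout; the
    # values here look like [x1, y1, x2, y2] pixel coordinates (assumed).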
    bboxes = {
        0: {1: [50, 50, 150, 150]},
        1: {1: [52, 52, 152, 152]},
        2: {1: [54, 54, 154, 154]}
    }
    
    keywords = ['human', 'dog', 'frisbee']
    
    # Model 1: Base CLIP
    print("Creating model with base CLIP weights...")
    config_base = VineConfig()
    model_base = VineModel(config_base)
    
    # Model 2: Pretrained VINE (if available)
    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
    
    if os.path.exists(model_dir):
        print("Creating model with pretrained VINE weights...")
        config_vine = VineConfig(
            use_hf_repo=False,
            local_dir=model_dir,
            local_filename=None,
        )
        model_vine = VineModel(config_vine)
        
        print("\nComparing predictions...")
        
        # Get predictions from both models
        with torch.no_grad():
            results_base = model_base.predict(
                video_frames=video_frames,
                masks=masks,
                bboxes=bboxes,
                categorical_keywords=keywords,
                return_top_k=3
            )
            
            results_vine = model_vine.predict(
                video_frames=video_frames,
                masks=masks,
                bboxes=bboxes,
                categorical_keywords=keywords,
                return_top_k=3
            )
        
        print("Base CLIP confidence scores:", results_base['confidence_scores'])
        print("Pretrained VINE confidence scores:", results_vine['confidence_scores'])
        
        print("βœ“ Successfully compared both models")
    else:
        print(f"Pretrained model not found at: {model_dir}")
        print("Skipping comparison")


if __name__ == "__main__":
    print("VINE HuggingFace Interface - Pretrained Weights Examples")
    print("=" * 60)
    
    try:
        # Test local pretrained weights
        model1 = example_with_local_pretrained_weights()
    except Exception as e:
        print(f"Local weights example failed: {e}")
    
    try:
        # Test HuggingFace Hub weights
        model2 = example_with_huggingface_hub()
    except Exception as e:
        print(f"HuggingFace Hub example failed: {e}")
    
    try:
        # Test pipeline with pretrained weights
        pipeline = example_pipeline_with_pretrained()
    except Exception as e:
        print(f"Pipeline example failed: {e}")
    
    # Uncomment to test manual weight loading:
    # try:
    #     model3 = example_manual_weight_loading()
    # except Exception as e:
    #     print(f"Manual loading example failed: {e}")

    # Uncomment to compare base CLIP and pretrained VINE outputs:
    # try:
    #     compare_model_outputs()
    # except Exception as e:
    #     print(f"Comparison example failed: {e}")
    
    print("\n" + "=" * 60)
    print("Examples completed!")
    print("\nUsage Summary:")
    print("1. Configure VineConfig with `use_hf_repo` + `model_repo` for Hub models, or `use_hf_repo=False` + `local_dir`/`local_filename` for local weights")
    print("2. Use VineModel.from_pretrained_vine() for direct loading")