File size: 13,168 Bytes
f9a6349
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
270
271
272
273
274
275
276
277
278
279
280
281
282
283
284
285
286
287
288
289
290
291
292
293
294
295
296
297
298
299
300
301
302
303
304
305
306
307
308
309
310
311
312
313
314
315
316
317
318
319
320
321
322
323
324
325
326
327
328
329
330
331
332
333
334
"""
Example demonstrating how to load and use VINE ensemble weights

This script shows the correct way to load your pretrained VINE ensemble weights
and use them with the HuggingFace interface, based on the actual inference.py workflow.
"""

import os
import sys
import torch
import numpy as np
from transformers.pipelines import PIPELINE_REGISTRY

#os.environ["OPENAI_API_KEY"]="dummy-key"  # Set your OpenAI API key here or via environment variable

# Add the parent directory to the path to import vine_hf
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from vine_hf import VineConfig, VineModel, VinePipeline
from laser.loading import load_video


def example_load_ensemble_weights():
    """Example of loading ensemble weights correctly."""
    print("=== Loading Ensemble VINE Weights ===")

    # Resolve the ensemble directory relative to this script
    # (adjust to wherever your checkpoints actually live).
    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")

    print(f"Looking for ensemble weights in: {model_dir}")

    if not os.path.exists(model_dir):
        print(f"βœ— Model directory not found: {model_dir}")
        print("Please adjust the path to point to your ensemble weights")
        return None

    print("βœ“ Model directory found")

    # Collect the serialized ensemble checkpoints (*.model) in the directory.
    model_files = [name for name in os.listdir(model_dir) if name.endswith('.model')]
    print(f"Available model files: {model_files}")

    if not model_files:
        print("βœ— No .model files found in directory")
        return None

    # Point the config at the local directory holding the .model files;
    # VineModel picks the weights up from there.
    config = VineConfig(
        segmentation_method="grounding_dino_sam2",
        use_hf_repo=False,
        local_dir=model_dir,
        local_filename=None,
    )

    print("Creating VINE model with ensemble weights...")
    vine_model = VineModel(config)

    print("βœ“ VINE model created with ensemble weights!")
    return vine_model


def example_direct_ensemble_loading():
    """Example of loading ensemble weights using from_pretrained_vine."""
    print("\n=== Direct Ensemble Loading ===")

    # Resolve the ensemble directory relative to this script.
    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")

    if not os.path.exists(model_dir):
        print(f"βœ— Model directory not found: {model_dir}")
        return None

    try:
        # Class-method loader: resolves the checkpoint for the given epoch
        # inside model_dir and builds the model in one call.
        vine_model = VineModel.from_pretrained_vine(
            model_path=model_dir,
            epoch=0  # Load epoch 0
        )
    except Exception as err:
        print(f"βœ— Error loading with from_pretrained_vine: {err}")
        return None

    print("βœ“ Model loaded using from_pretrained_vine!")
    return vine_model


def example_compare_original_vs_hf():
    """Compare the original inference.py approach with HuggingFace interface.

    Prints the two loading recipes as code samples, then attempts to run
    both against the local ensemble directory and reports which CLIP-head
    attributes each loaded model exposes. Output goes to stdout; returns
    None in every path.
    """
    print("\n=== Comparing Original vs HuggingFace Interface ===")

    # Ensemble checkpoint location, resolved relative to this script.
    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
    model_name = "ensemble-2025-02-10-14-57-22"  # checkpoint base name (epoch suffix appended later)
    epoch = 0

    if not os.path.exists(model_dir):
        print(f"Model directory not found: {model_dir}")
        return

    # --- Show, as text, how inference.py loads the ensemble directly. ---
    # These printed lines intentionally mirror the inner load_model below.
    print("Original approach (from inference.py):")
    print("```python")
    print("def load_model(model_dir, model_name, epoch, device):")
    print("    model_name = model_name + f'.{epoch}.model'")
    print("    predicate_model = torch.load(os.path.join(model_dir, model_name), map_location=device, weights_only=False)")
    print("    return predicate_model")
    print("")
    print("predicate_model = load_model(model_dir, model_name, epoch, device)")
    print("```")

    # --- Show, as text, the equivalent HuggingFace-style loading. ---
    print("\nNew HuggingFace approach:")
    print("```python")
    print("config = VineConfig(pretrained_vine_path=model_dir)")
    print("vine_model = VineModel(config)")
    print("# or")
    print("vine_model = VineModel.from_pretrained_vine(model_dir, epoch=0)")
    print("```")

    # Try to load with both approaches if possible
    try:
        # Original approach
        # Same as the recipe printed above, plus an existence check so a
        # missing checkpoint reports instead of raising.
        def load_model(model_dir, model_name, epoch, device):
            # NOTE(review): weights_only=False unpickles arbitrary objects;
            # only load checkpoints from a trusted source.
            model_name = model_name + f'.{epoch}.model'
            model_path = os.path.join(model_dir, model_name)
            if os.path.exists(model_path):
                return torch.load(model_path, map_location=device, weights_only=False)
            else:
                print(f"Model file not found: {model_path}")
                return None

        device = "cuda" if torch.cuda.is_available() else "cpu"
        original_model = load_model(model_dir, model_name, epoch, device)

        if original_model:
            print(f"βœ“ Original model loaded: {type(original_model)}")
            # The ensemble PredicateModel is expected to carry three CLIP heads
            # (categorical / unary / binary) — presumably set during training.
            print(f"  Has clip_cate_model: {hasattr(original_model, 'clip_cate_model')}")
            print(f"  Has clip_unary_model: {hasattr(original_model, 'clip_unary_model')}")
            print(f"  Has clip_binary_model: {hasattr(original_model, 'clip_binary_model')}")

        # HuggingFace approach
        vine_model = VineModel.from_pretrained_vine(model_dir, epoch=epoch)

        if vine_model:
            print(f"βœ“ HuggingFace model loaded: {type(vine_model)}")
            # The HF wrapper should expose the same three CLIP heads.
            print(f"  Has clip_cate_model: {hasattr(vine_model, 'clip_cate_model')}")
            print(f"  Has clip_unary_model: {hasattr(vine_model, 'clip_unary_model')}")
            print(f"  Has clip_binary_model: {hasattr(vine_model, 'clip_binary_model')}")

            print("\nβœ“ Both approaches work! HuggingFace interface successfully loads ensemble weights.")

    except Exception as e:
        # Best-effort demo: any failure (missing file, bad checkpoint, device
        # issues) is reported rather than propagated.
        print(f"Error in comparison: {e}")


def example_ensemble_with_pipeline():
    """Example using ensemble weights with the pipeline."""
    print("\n=== Using Ensemble Weights with Pipeline ===")

    # Resolve the ensemble directory relative to this script.
    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")

    if not os.path.exists(model_dir):
        print(f"Model directory not found: {model_dir}")
        return

    # Make the custom task known to the HF pipeline machinery.
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    # Model configured to pull weights from the local ensemble directory.
    config = VineConfig(
        segmentation_method="grounding_dino_sam2",
        use_hf_repo=False,
        local_dir=model_dir,
        local_filename=None,
    )
    vine_model = VineModel(config)

    # Pipeline needs the segmentation backends (SAM2 + GroundingDINO);
    # the placeholder paths below must be replaced with real ones.
    device = "cuda" if torch.cuda.is_available() else "cpu"
    vine_pipeline = VinePipeline(
        model=vine_model,
        tokenizer=None,
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device=device,
    )

    print("βœ“ Pipeline created with ensemble VINE weights")

    # If the demo video is present, show (but do not execute) an invocation.
    demo_video = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")
    if os.path.exists(demo_video):
        print(f"Found demo video: {demo_video}")

        # Keywords mirror the ones used by the original inference.py.
        categorical_keywords = ['human', 'dog', 'frisbee']
        unary_keywords = ['running', 'jumping', 'catching', 'throwing']
        binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left']

        usage_sample = [
            "Example pipeline usage:",
            "```python",
            "results = vine_pipeline(",
            f"    '{demo_video}',",
            f"    categorical_keywords={categorical_keywords},",
            f"    unary_keywords={unary_keywords},",
            f"    binary_keywords={binary_keywords},",
            "    segmentation_method='grounding_dino_sam2'",
            ")",
            "```",
        ]
        for sample_line in usage_sample:
            print(sample_line)

        # Actually running the pipeline requires the SAM2/GroundingDINO
        # checkpoints configured above, so the call is only illustrated here.

    return vine_pipeline



def demonstrate_weight_transfer():
    """Demonstrate how weights are transferred from ensemble to HuggingFace format."""
    print("\n=== Weight Transfer Demonstration ===")

    # Purely informational: walk through the ensemble layout and the
    # transfer steps, one printed line per entry.
    for line in (
        "The ensemble model structure (PredicateModel):",
        "- clip_cate_model: CLIP model for categorical classification",
        "- clip_unary_model: CLIP model for unary predicates",
        "- clip_binary_model: CLIP model for binary relations",
        "- clip_tokenizer: Tokenizer for text processing",
        "- clip_processor: Processor for image processing",
        "\nWeight transfer process:",
        "1. Load ensemble model with torch.load()",
        "2. Initialize base CLIP models in HuggingFace format",
        "3. Transfer state_dict from ensemble to HuggingFace models:",
        "   - ensemble.clip_cate_model β†’ hf.clip_cate_model",
        "   - ensemble.clip_unary_model β†’ hf.clip_unary_model",
        "   - ensemble.clip_binary_model β†’ hf.clip_binary_model",
        "4. Transfer tokenizer and processor",
        "\nThis preserves all your fine-tuned weights while making them HuggingFace compatible!",
    ):
        print(line)


def troubleshooting_guide():
    """Provide troubleshooting guide for common issues."""
    print("\n=== Troubleshooting Guide ===")

    # Static reference text; kept as data so the structure is easy to scan.
    guide_lines = (
        "Common Issues:",
        "1. 'No model file found for epoch X'",
        "   β†’ Check that .model files exist in the directory",
        "   β†’ Verify the epoch number is correct",
        "   β†’ List files: ls /path/to/model/dir/*.model",
        "\n2. 'Error loading VINE weights'",
        "   β†’ Check file permissions",
        "   β†’ Verify the model file is not corrupted",
        "   β†’ Try loading with torch.load() directly first",
        "\n3. 'CLIP model mismatch'",
        "   β†’ Ensure config.model_name matches the base model used in training",
        "\n4. 'Device mismatch errors'",
        "   β†’ Models are loaded to CPU first, then moved to device",
        "   β†’ Check CUDA availability with torch.cuda.is_available()",
        "\nDebugging steps:",
        "1. Test loading ensemble model directly:",
        "   model = torch.load('path/to/model.0.model', map_location='cpu')",
        "2. Check model attributes:",
        "   print(dir(model))",
        "3. Verify state_dict keys:",
        "   print(model.clip_cate_model.state_dict().keys())",
    )
    for guide_line in guide_lines:
        print(guide_line)


if __name__ == "__main__":
    print("VINE Ensemble Weights Loading Examples")
    print("=" * 50)

    def _run_example(example_fn, failure_prefix):
        # Each example is best-effort: report a failure and keep going so
        # the remaining examples (and the educational sections) still run.
        try:
            return example_fn()
        except Exception as err:
            print(f"{failure_prefix}: {err}")
            return None

    _run_example(example_load_ensemble_weights, "Ensemble loading example failed")
    _run_example(example_direct_ensemble_loading, "Direct loading example failed")
    _run_example(example_compare_original_vs_hf, "Comparison example failed")
    _run_example(example_ensemble_with_pipeline, "Pipeline example failed")

    # Educational content (no model loading involved).
    demonstrate_weight_transfer()
    troubleshooting_guide()

    print("\n" + "=" * 50)
    print("Key Points:")
    print("1. AutoModel.from_pretrained() won't work with .pt ensemble weights")
    print("2. Use torch.load() to load the ensemble, then transfer weights")
    print("3. The HuggingFace interface preserves your fine-tuned weights")
    print("4. Specify pretrained_vine_path in VineConfig to auto-load weights")
    print("5. Use VineModel.from_pretrained_vine() for direct loading")