"""
Script to convert existing inference.py workflow to use VINE HuggingFace interface

This script demonstrates how to migrate from the original inference.py approach
to the new HuggingFace-compatible interface.
"""

import os
import sys
import torch

# Add paths for imports
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

from vine_hf import VineConfig, VineModel, VinePipeline
from laser.loading import load_video


def load_pretrained_vine_model(model_dir: str, model_name: str, epoch: int = 0) -> VineModel:
    """
    Load a pretrained VINE model from the original format into HuggingFace format.
    
    Args:
        model_dir: Directory containing the model
        model_name: Name of the model file (without .{epoch}.model extension)
        epoch: Epoch number to load
        
    Returns:
        VineModel instance with loaded weights
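
    Example (the paths mirror the demo layout used further below; adjust to your setup):
        >>> model = load_pretrained_vine_model(
        ...     model_dir="../../data/LLaVA-Video-178K-v2/models/ensemble-02-10",
        ...     model_name="ensemble-2025-02-10-14-57-22",
        ...     epoch=0,
        ... )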
    """
    print(f"Loading pretrained VINE model from {model_dir}")
    
    # Create configuration (adjust parameters as needed)
    # We expect local ensemble weights in `model_dir`, so configure
    # VineConfig to load from local directory/filename.
    model_file = f"{model_name}.{epoch}.model"
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        segmentation_method="grounding_dino_sam2",
        target_fps=1,
        box_threshold=0.35,
        text_threshold=0.25,
        use_hf_repo=False,
        local_dir=model_dir,
        local_filename=model_file,
    )

    # Initialize model (VineModel will consult the config when loading)
    vine_model = VineModel(config)
    
    # Load original weights (reusing the filename computed above for the config)
    model_path = os.path.join(model_dir, model_file)
    
    if os.path.exists(model_path):
        print(f"Loading weights from: {model_path}")
        try:
            # Add safe globals for PyTorch 2.6+
            import torch.serialization
            from laser.models.llava_clip_model_v3 import PredicateModel
            torch.serialization.add_safe_globals([PredicateModel])
            
            # Load the original model
            original_model = torch.load(model_path, map_location='cpu', weights_only=False)
            
            # Transfer weights to HuggingFace model
            # This assumes the original model has the same structure
            # You may need to adjust this based on your specific model structure
            
            if hasattr(original_model, 'clip_cate_model'):
                vine_model.clip_cate_model.load_state_dict(original_model.clip_cate_model.state_dict())
            if hasattr(original_model, 'clip_unary_model'):
                vine_model.clip_unary_model.load_state_dict(original_model.clip_unary_model.state_dict())
            if hasattr(original_model, 'clip_binary_model'):
                vine_model.clip_binary_model.load_state_dict(original_model.clip_binary_model.state_dict())
            if hasattr(original_model, 'clip_tokenizer'):
                vine_model.clip_tokenizer = original_model.clip_tokenizer
            if hasattr(original_model, 'clip_processor'):
                vine_model.clip_processor = original_model.clip_processor
                
            print("βœ“ Weights transferred successfully")
            
        except Exception as e:
            print(f"βœ— Error loading weights: {e}")
            print("You may need to adjust the weight loading logic for your specific model")
            
    else:
        print(f"βœ— Model file not found: {model_path}")
        
    return vine_model


def convert_inference_workflow():
    """
    Convert the original inference.py workflow to use HuggingFace interface.
    
    This function demonstrates how to replicate the original inference workflow
    using the new HuggingFace-compatible components.
    """
    print("=== Converting Inference Workflow ===")
    
    # Original parameters from inference.py
    video_id = 'v1'
    target_fps = 1
    classes = ['human', 'dog', 'frisbee']
    unary_keywords = ['running', 'jumping', 'sitting', 'standing']
    binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left']
    
    # Paths (adjust these to match your setup)
    demo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../demo"))
    video_dir = os.path.join(demo_dir, "videos")
    video_path = os.path.join(video_dir, f"{video_id}.mp4")
    
    # Model paths (adjust these to match your setup)
    data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
    model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
    model_name = "ensemble-2025-02-10-14-57-22"
    
    # Segmentation model paths (adjust these to your actual paths)
    sam_config_path = "/path/to/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml"
    sam_checkpoint_path = "/path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt"
    gd_config_path = "/path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py"
    gd_checkpoint_path = "/path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth"
    
    print(f"Video path: {video_path}")
    print(f"Model dir: {model_dir}")
    print(f"SAM2 config: {sam_config_path}")
    print(f"GroundingDINO config: {gd_config_path}")
    
    # Check if video exists
    if not os.path.exists(video_path):
        print(f"βœ— Video not found: {video_path}")
        print("Please adjust the video path or use your own video file")
        return
    
    # 1. Load video (same as original)
    print(f"Loading video: {video_id}")
    video_tensor = load_video(video_path, target_fps=target_fps)
    print(f"Video shape: {video_tensor.shape}")
    
    # 2. Load VINE model with HuggingFace interface
    print("Loading VINE model...")
    if os.path.exists(model_dir):
        vine_model = load_pretrained_vine_model(model_dir, model_name, epoch=0)
    else:
        print(f"Model directory not found: {model_dir}")
        print("Creating new model with random weights for demonstration")
        config = VineConfig()
        vine_model = VineModel(config)
    
    # 3. Create pipeline for easier use
    print("Creating VINE pipeline...")
    from transformers.pipelines import PIPELINE_REGISTRY
    
    # Register pipeline if not already registered
    try:
        PIPELINE_REGISTRY.register_pipeline(
            "vine-video-understanding",
            pipeline_class=VinePipeline,
            pt_model=VineModel,
            type="multimodal",
        )
    except Exception:
        pass  # Already registered
    
    # Create pipeline instance with segmentation model paths
    vine_pipeline = VinePipeline(
        model=vine_model, 
        tokenizer=None,
        # SAM2 configuration
        sam_config_path=sam_config_path,
        sam_checkpoint_path=sam_checkpoint_path,
        # GroundingDINO configuration
        gd_config_path=gd_config_path,
        gd_checkpoint_path=gd_checkpoint_path
    )
    
    # 4. Process video with new interface
    print("Processing video with VINE HuggingFace interface...")
    
    try:
        # Use the pipeline to process the video
        results = vine_pipeline(
            video_path,
            categorical_keywords=classes,
            unary_keywords=unary_keywords,
            binary_keywords=binary_keywords,
            object_pairs=[(1, 2), (2, 3)],  # Example object pairs
            segmentation_method='grounding_dino_sam2',
            target_fps=target_fps,
            return_top_k=3,
            include_visualizations=False
        )
        
        # 5. Display results (similar to original format)
        print("\n=== VINE Results (HuggingFace Interface) ===")
        
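        # The loops below assume the pipeline output is a dict with this shape
        # (inferred from how the results are consumed here; adjust if your
        # VinePipeline version returns a different structure):
        #   results['categorical_predictions']: {obj_id: [(prob, category), ...]}
        #   results['unary_predictions']:       {(frame_id, obj_id): [(prob, action), ...]}
        #   results['binary_predictions']:      {(frame_id, obj_pair): [(prob, relation), ...]}
        #   results['summary']:                 aggregate stats such as num_objects_detected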
        # Categorical predictions
        print("\nCategorical Predictions:")
        for obj_id, predictions in results['categorical_predictions'].items():
            print(f"  Object {obj_id}:")
            for prob, category in predictions:
                print(f"    {prob:.3f}: {category}")
        
        # Unary predictions  
        print("\nUnary Predictions:")
        for (frame_id, obj_id), predictions in results['unary_predictions'].items():
            print(f"  Frame {frame_id}, Object {obj_id}:")
            for prob, action in predictions:
                print(f"    {prob:.3f}: {action}")
        
        # Binary predictions
        print("\nBinary Predictions:")
        for (frame_id, obj_pair), predictions in results['binary_predictions'].items():
            print(f"  Frame {frame_id}, Objects {obj_pair}:")
            for prob, relation in predictions:
                print(f"    {prob:.3f}: {relation}")
        
        # Summary
        print(f"\nSummary:")
        print(f"  Objects detected: {results['summary']['num_objects_detected']}")
        print(f"  Top categories: {results['summary']['top_categories']}")
        print(f"  Top actions: {results['summary']['top_actions']}")
        print(f"  Top relations: {results['summary']['top_relations']}")
        
        print("\nβœ“ Successfully processed video with VINE HuggingFace interface!")
        
    except Exception as e:
        print(f"βœ— Error processing video: {e}")
        print("This may be due to missing segmentation models or other dependencies")
        print("The interface is set up correctly, but full functionality requires:")
        print("  1. Properly installed Grounding DINO and SAM2")
        print("  2. Correct model weights")
        print("  3. Proper configuration paths")


def compare_interfaces():
    """
    Compare the original inference.py approach with the new HuggingFace interface.
    """
    print("\n=== Interface Comparison ===")
    
    print("\nOriginal inference.py approach:")
    print("βœ“ Direct access to model internals")
    print("βœ“ Full control over segmentation pipeline")
    print("βœ— Complex setup and configuration")
    print("βœ— Not compatible with HuggingFace ecosystem")
    print("βœ— Requires manual handling of all components")
    
    print("\nNew HuggingFace interface:")
    print("βœ“ Easy to use pipeline interface")
    print("βœ“ Compatible with HuggingFace Hub")
    print("βœ“ Standardized configuration")
    print("βœ“ Automatic handling of preprocessing/postprocessing")
    print("βœ“ Easy sharing and distribution")
    print("βœ“ Configurable segmentation model paths")
    print("βœ— Slightly less direct control (can still access model directly)")
    
    print("\nMigration benefits:")
    print("β€’ Share your model easily on HuggingFace Hub")
    print("β€’ Users can load your model with a single line")
    print("β€’ Standardized interface for video understanding")
    print("β€’ Better integration with other HuggingFace tools")
    print("β€’ Simplified deployment and inference")
    print("β€’ Flexible segmentation model configuration")

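# A minimal sketch of the "single line" loading mentioned in compare_interfaces().
# This assumes the model has been pushed to the Hub (e.g. via push_to_hub.py) under
# a repo id such as "your-username/vine-model" (hypothetical), and that VineModel
# follows the standard HuggingFace from_pretrained() convention:
#
#   from vine_hf import VineModel
#   vine_model = VineModel.from_pretrained("your-username/vine-model")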

if __name__ == "__main__":
    print("VINE HuggingFace Interface Conversion")
    print("=" * 50)
    
    # Run conversion demonstration
    convert_inference_workflow()
    
    # Show comparison
    compare_interfaces()
    
    print("\n" + "=" * 50)
    print("Next steps:")
    print("1. Install SAM2 and GroundingDINO dependencies")
    print("2. Download the required model checkpoints")
    print("3. Update the paths in this script to point to your models")
    print("4. Test the interface with your specific model weights")
    print("5. Adjust configuration parameters as needed")
    print("6. Push your model to HuggingFace Hub using push_to_hub.py")
    print("7. Share with the community!")