"""
Example usage of VINE HuggingFace interface

This script demonstrates how to use the VINE model through the HuggingFace interface
for video understanding with categorical, unary, and binary keyword predictions.
"""

import os
import sys
import torch
from transformers import pipeline, AutoModel  # used by the commented-out Hub examples below
from transformers.pipelines import PIPELINE_REGISTRY

# Add the parent directory to the path to import vine_hf
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))

# Uncomment and set your own key, or export OPENAI_API_KEY in your environment
# os.environ['OPENAI_API_KEY'] = 'dummy-key'
from vine_hf import VineConfig, VineModel, VinePipeline

def example_direct_model_usage():
    """Example of using the VINE model directly."""
    print("=== Direct Model Usage ===")
    
    # Create configuration
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        segmentation_method="grounding_dino_sam2",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",  # Your HF Hub model
        debug_visualizations=True,
        debug_visualization_path=os.path.join(os.getcwd(), "debug_masks.png"),
        target_fps=30,
        box_threshold=0.35,   # Grounding DINO box confidence threshold
        text_threshold=0.25,  # Grounding DINO text-match threshold
    )
    
    # Initialize model
    model = VineModel(config)
    
    print(f"Model initialized with CLIP backbone: {config.model_name}")
    print(f"Segmentation method: {config.segmentation_method}")
    print(f"Device: {model.device}")
    
    # Example video data (placeholder - in real usage, load from video file)
    num_frames, height, width = 3, 224, 224
    video_frames = torch.randn(num_frames, height, width, 3) * 255
    video_frames = video_frames.clamp(0, 255).byte()
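
    # Hedged alternative (assumes torchvision is installed): load frames from a
    # real file instead of the random placeholder. read_video returns a
    # (T, H, W, C) uint8 tensor, which matches the layout used below.
    # from torchvision.io import read_video
    # video_frames, _, _ = read_video("path/to/your/video.mp4", pts_unit="sec")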
    
    # Example masks and bboxes (placeholder - in real usage, generated by segmentation)
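    # Layout assumed from the placeholders below: masks[frame_idx][object_id]
    # is an (H, W, 1) mask tensor, and bboxes[frame_idx][object_id] appears to
    # be an [x1, y1, x2, y2] box in pixel coordinates.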
    masks = {
        0: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
        1: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
        2: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)}
    }
    
    bboxes = {
        0: {1: [50, 50, 150, 150], 2: [100, 100, 200, 200]},
        1: {1: [52, 52, 152, 152], 2: [102, 102, 202, 202]},
        2: {1: [54, 54, 154, 154], 2: [104, 104, 204, 204]}
    }
    
    # Define keywords
    categorical_keywords = ["human", "dog", "frisbee"]
    unary_keywords = ["running", "jumping", "sitting", "standing"]
    binary_keywords = ["behind", "in front of", "next to", "throwing to", "catching from"]
    object_pairs = [(1, 2)]  # Object 1 relates to Object 2
    
    # Run prediction
    print("\nRunning prediction...")
    results = model.predict(
        video_frames=video_frames,
        masks=masks,
        bboxes=bboxes,
        categorical_keywords=categorical_keywords,
        unary_keywords=unary_keywords,
        binary_keywords=binary_keywords,
        object_pairs=object_pairs,
        return_top_k=3
    )
    
    print("\nResults:")
    print(f"Categorical predictions: {len(results['categorical_predictions'])} objects")
    print(f"Unary predictions: {len(results['unary_predictions'])} actions")
    print(f"Binary predictions: {len(results['binary_predictions'])} relations")
    print(f"Confidence scores: {results['confidence_scores']}")


def example_pipeline_usage():
    """Example of using the VINE pipeline."""
    print("\n=== Pipeline Usage ===")
    
    # Register the pipeline
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )
    vine_config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",  # Your HF Hub model
        segmentation_method="grounding_dino_sam2",
        debug_visualizations=True,
    )

    vine_pipe = VinePipeline(
        model=VineModel(vine_config),
        tokenizer=None,
        trust_remote_code=True,
        # SAM2 configuration
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        # Grounding DINO configuration
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device=0,
    )

    print("Pipeline created successfully!")
    
    # Example usage with video path
    video_path = "path/to/your/video.mp4"  # Replace with actual video path
    
    # For demonstration, we'll show the expected usage format
    print(f"\nExample pipeline call (replace with actual video path):")
    print(f"results = vine_pipeline(")
    print(f"    '{video_path}',")
    print(f"    categorical_keywords=['human', 'dog', 'frisbee'],")
    print(f"    unary_keywords=['running', 'jumping', 'sitting'],")
    print(f"    binary_keywords=['behind', 'in front of', 'next to'],")
    print(f"    object_pairs=[(1, 2)],")
    print(f"    segmentation_method='grounding_dino_sam2',")
    print(f"    return_top_k=3,")
    print(f"    return_flattened_segments=True,")
    print(f"    return_valid_pairs=True,")
    print(f"    include_visualizations=True,")
    print(f"    debug_visualizations=True")
    print(f")")
    
    # Note: actual execution would require a proper video file and configured segmentation models


def example_huggingface_hub_usage():
    """Example of how to push and load from HuggingFace Hub."""
    print("\n=== HuggingFace Hub Usage ===")
    
    # Example of preparing model for Hub
    config = VineConfig()
    model = VineModel(config)
    
    # Register for auto classes
    config.register_for_auto_class()
    model.register_for_auto_class("AutoModel")
    
    print("Model registered for auto classes")
    
    # Example push to hub (commented out - requires actual model weights and credentials)
    # config.push_to_hub('your-username/vine-model')
    # model.push_to_hub('your-username/vine-model')
    
    # Example load from hub (commented out - requires actual model on hub)
    # model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)
    # pipeline = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)
    
    print("To push to Hub:")
    print("1. config.push_to_hub('your-username/vine-model')")
    print("2. model.push_to_hub('your-username/vine-model')")
    print("\nTo load from Hub:")
    print("model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)")
    print("pipe = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)")


def example_with_real_video():
    """Example showing how to use with a real video file."""
    print("\n=== Real Video Usage Example ===")
    
    # Check if demo video exists
    demo_video_path = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")
    
    if os.path.exists(demo_video_path):
        print(f"Found demo video: {demo_video_path}")
        
        # Create pipeline with segmentation model paths
        PIPELINE_REGISTRY.register_pipeline(
            "vine-video-understanding",
            pipeline_class=VinePipeline,
            pt_model=VineModel,
            type="multimodal",
        )
        
        vine_config = VineConfig(
            model_name="openai/clip-vit-base-patch32",
            use_hf_repo=True,
            model_repo="video-fm/vine_v0",  # Your HF Hub model
            segmentation_method="grounding_dino_sam2",
            debug_visualizations=True,
            debug_visualization_path=os.path.join(os.getcwd(), "real_video_debug_masks.png"),
        )
        
        vine_pipeline = VinePipeline(
            model=VineModel(vine_config),
            tokenizer=None,
            trust_remote_code=True,
            # SAM2 configuration
            sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
            sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
            # Grounding DINO configuration
            gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
            gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        )
        
        # Define keywords based on the demo
        categorical_keywords = ['human', 'dog', 'frisbee']
        unary_keywords = ['running', 'jumping', 'catching', 'throwing']
        binary_keywords = ['behind', 'in front of', 'next to', 'chasing']
        object_pairs = [(0, 1), (0, 2), (1, 2)]  # human-dog, human-frisbee, dog-frisbee relationships
        
        print("\nProcessing video with VINE...")
        print("Keywords:")
        print(f"  Categorical: {categorical_keywords}")
        print(f"  Unary: {unary_keywords}")
        print(f"  Binary: {binary_keywords}")
        print(f"  Object pairs: {object_pairs}")
        
        # Note: This would require proper segmentation models to be set up
        try:
            results = vine_pipeline(
                demo_video_path,
                categorical_keywords=categorical_keywords,
                unary_keywords=unary_keywords,
                binary_keywords=binary_keywords,
                object_pairs=object_pairs,
                segmentation_method='grounding_dino_sam2',
                return_top_k=3,
                include_visualizations=False,
                debug_visualizations=True,
            )
            
            print("\nResults:")
            print(f"Summary: {results['summary']}")
            
        except Exception as e:
            print(f"Note: Full execution requires segmentation models to be properly set up.")
            print(f"Error: {e}")
            
    else:
        print(f"Demo video not found at: {demo_video_path}")
        print("To use with a real video, provide the path to your video file.")


if __name__ == "__main__":
    print("VINE HuggingFace Interface Examples")
    print("=" * 50)
    
    # Run examples
    try:
        example_direct_model_usage()
    except Exception as e:
        print(f"Direct model usage failed: {e}")
    
    try:
        example_pipeline_usage()
    except Exception as e:
        print(f"Pipeline usage failed: {e}")
    
    try:
        example_huggingface_hub_usage()
    except Exception as e:
        print(f"Hub usage example failed: {e}")
    
    try:
        example_with_real_video()
    except Exception as e:
        print(f"Real video example failed: {e}")
    
    print("\n" + "=" * 50)
    print("Examples completed!")
    print("\nNext steps:")
    print("1. Set up Grounding DINO and SAM2 models for segmentation")
    print("2. Load your pretrained VINE model weights")
    print("3. Test with your own videos")
    print("4. Push to HuggingFace Hub for sharing")