"""
Example usage of VINE HuggingFace interface
This script demonstrates how to use the VINE model through the HuggingFace interface
for video understanding with categorical, unary, and binary keyword predictions.
"""
import os
import sys
import torch
from transformers import pipeline, AutoModel
from transformers.pipelines import PIPELINE_REGISTRY
# Add the parent directory to the path to import vine_hf
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
# Uncomment and set your own key, or export OPENAI_API_KEY in your environment
# os.environ['OPENAI_API_KEY'] = 'dummy-key'
from vine_hf import VineConfig, VineModel, VinePipeline
def example_direct_model_usage():
    """Example of using the VINE model directly."""
    print("=== Direct Model Usage ===")

    # Create configuration
    config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        segmentation_method="grounding_dino_sam2",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",  # Your HF Hub model
        debug_visualizations=True,
        debug_visualization_path=os.path.join(os.getcwd(), "debug_masks.png"),
        target_fps=30,
        box_threshold=0.35,
        text_threshold=0.25
    )

    # Initialize model
    model = VineModel(config)
    print(f"Model initialized with CLIP backbone: {config.model_name}")
    print(f"Segmentation method: {config.segmentation_method}")
    print(f"Device: {model.device}")

    # Example video data (placeholder - in real usage, load from video file)
    num_frames, height, width = 3, 224, 224
    video_frames = torch.randn(num_frames, height, width, 3) * 255
    video_frames = video_frames.clamp(0, 255).byte()
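    # In real usage you would load frames from disk instead of random data.
    # A minimal sketch, assuming torchvision is installed (the path is a placeholder):
    #
    #   from torchvision.io import read_video
    #   video_frames, _, _ = read_video("path/to/your/video.mp4", pts_unit="sec")
    #   # read_video returns a (num_frames, height, width, 3) uint8 tensor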
    # Example masks and bboxes (placeholder - in real usage, generated by segmentation)
    masks = {
        0: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
        1: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)},
        2: {1: torch.ones(height, width, 1), 2: torch.ones(height, width, 1)}
    }
    bboxes = {
        0: {1: [50, 50, 150, 150], 2: [100, 100, 200, 200]},
        1: {1: [52, 52, 152, 152], 2: [102, 102, 202, 202]},
        2: {1: [54, 54, 154, 154], 2: [104, 104, 204, 204]}
    }
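    # Both dicts are keyed by frame index, then by object ID. When the pipeline is
    # used end to end, these are produced by the Grounding DINO + SAM2 segmentation
    # step rather than constructed by hand.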
    # Define keywords
    categorical_keywords = ["human", "dog", "frisbee"]
    unary_keywords = ["running", "jumping", "sitting", "standing"]
    binary_keywords = ["behind", "in front of", "next to", "throwing to", "catching from"]
    object_pairs = [(1, 2)]  # Object 1 relates to Object 2

    # Run prediction
    print("\nRunning prediction...")
    results = model.predict(
        video_frames=video_frames,
        masks=masks,
        bboxes=bboxes,
        categorical_keywords=categorical_keywords,
        unary_keywords=unary_keywords,
        binary_keywords=binary_keywords,
        object_pairs=object_pairs,
        return_top_k=3
    )

    print("\nResults:")
    print(f"Categorical predictions: {len(results['categorical_predictions'])} objects")
    print(f"Unary predictions: {len(results['unary_predictions'])} actions")
    print(f"Binary predictions: {len(results['binary_predictions'])} relations")
    print(f"Confidence scores: {results['confidence_scores']}")

def example_pipeline_usage():
    """Example of using the VINE pipeline."""
    print("\n=== Pipeline Usage ===")

    # Register the pipeline
    PIPELINE_REGISTRY.register_pipeline(
        "vine-video-understanding",
        pipeline_class=VinePipeline,
        pt_model=VineModel,
        type="multimodal",
    )

    vine_config = VineConfig(
        model_name="openai/clip-vit-base-patch32",
        use_hf_repo=True,
        model_repo="video-fm/vine_v0",  # Your HF Hub model
        segmentation_method="grounding_dino_sam2",
        debug_visualizations=True,
    )

    vine_pipe = VinePipeline(
        model=VineModel(vine_config),
        tokenizer=None,
        trust_remote_code=True,
        # SAM2 configuration
        sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
        sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
        gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
        gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        device=0,
    )
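    # The SAM2 and Grounding DINO paths above are placeholders; point them at your
    # local config and checkpoint files before running segmentation for real.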
print("Pipeline created successfully!")
# Example usage with video path
video_path = "path/to/your/video.mp4" # Replace with actual video path
# For demonstration, we'll show the expected usage format
print(f"\nExample pipeline call (replace with actual video path):")
print(f"results = vine_pipeline(")
print(f" '{video_path}',")
print(f" categorical_keywords=['human', 'dog', 'frisbee'],")
print(f" unary_keywords=['running', 'jumping', 'sitting'],")
print(f" binary_keywords=['behind', 'in front of', 'next to'],")
print(f" object_pairs=[(1, 2)],")
print(f" segmentation_method='grounding_dino_sam2',")
print(f" return_top_k=3,")
print(f" return_flattened_segments=True,")
print(f" return_valid_pairs=True,")
print(f" include_visualizations=True,")
print(f" debug_visualizations=True")
print(f")")
# Note: Actual execution would require proper video file and segmentation models
def example_huggingface_hub_usage():
    """Example of how to push and load from HuggingFace Hub."""
    print("\n=== HuggingFace Hub Usage ===")

    # Example of preparing model for Hub
    config = VineConfig()
    model = VineModel(config)

    # Register for auto classes
    config.register_for_auto_class()
    model.register_for_auto_class("AutoModel")
    print("Model registered for auto classes")

    # Example push to hub (commented out - requires actual model weights and credentials)
    # config.push_to_hub('your-username/vine-model')
    # model.push_to_hub('your-username/vine-model')

    # Example load from hub (commented out - requires actual model on hub)
    # model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)
    # pipeline = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)

    print("To push to Hub:")
    print("1. config.push_to_hub('your-username/vine-model')")
    print("2. model.push_to_hub('your-username/vine-model')")
    print("\nTo load from Hub:")
    print("model = AutoModel.from_pretrained('your-username/vine-model', trust_remote_code=True)")
    print("pipe = pipeline('vine-video-understanding', model='your-username/vine-model', trust_remote_code=True)")
def example_with_real_video():
    """Example showing how to use with a real video file."""
    print("\n=== Real Video Usage Example ===")

    # Check if demo video exists
    demo_video_path = os.path.join(os.path.dirname(__file__), "../demo/videos/v1.mp4")

    if os.path.exists(demo_video_path):
        print(f"Found demo video: {demo_video_path}")

        # Create pipeline with segmentation model paths
        PIPELINE_REGISTRY.register_pipeline(
            "vine-video-understanding",
            pipeline_class=VinePipeline,
            pt_model=VineModel,
            type="multimodal",
        )

        vine_config = VineConfig(
            model_name="openai/clip-vit-base-patch32",
            use_hf_repo=True,
            model_repo="video-fm/vine_v0",  # Your HF Hub model
            segmentation_method="grounding_dino_sam2",
            debug_visualizations=True,
            debug_visualization_path=os.path.join(os.getcwd(), "real_video_debug_masks.png"),
        )

        vine_pipeline = VinePipeline(
            model=VineModel(vine_config),
            tokenizer=None,
            trust_remote_code=True,
            # SAM2 configuration
            sam_config_path="path/to/sam2/configs/sam2.1_hiera_b+.yaml",
            sam_checkpoint_path="path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt",
            gd_config_path="path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py",
            gd_checkpoint_path="path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth",
        )
        # Define keywords based on the demo
        categorical_keywords = ['human', 'dog', 'frisbee']
        unary_keywords = ['running', 'jumping', 'catching', 'throwing']
        binary_keywords = ['behind', 'in front of', 'next to', 'chasing']
        object_pairs = [(0, 1), (0, 2), (1, 2)]  # human-dog, human-frisbee, dog-frisbee relationships

        print("\nProcessing video with VINE...")
        print("Keywords:")
        print(f"  Categorical: {categorical_keywords}")
        print(f"  Unary: {unary_keywords}")
        print(f"  Binary: {binary_keywords}")
        print(f"  Object pairs: {object_pairs}")

        # Note: This would require proper segmentation models to be set up
        try:
            results = vine_pipeline(
                demo_video_path,
                categorical_keywords=categorical_keywords,
                unary_keywords=unary_keywords,
                binary_keywords=binary_keywords,
                object_pairs=object_pairs,
                segmentation_method='grounding_dino_sam2',
                return_top_k=3,
                include_visualizations=False,
                debug_visualizations=True,
            )
            print("\nResults:")
            print(f"Summary: {results['summary']}")
        except Exception as e:
            print("Note: Full execution requires segmentation models to be properly set up.")
            print(f"Error: {e}")
    else:
        print(f"Demo video not found at: {demo_video_path}")
        print("To use with a real video, provide the path to your video file.")

if __name__ == "__main__":
print("VINE HuggingFace Interface Examples")
print("=" * 50)
# Run examples
try:
example_direct_model_usage()
except Exception as e:
print(f"Direct model usage failed: {e}")
try:
example_pipeline_usage()
except Exception as e:
print(f"Pipeline usage failed: {e}")
try:
example_huggingface_hub_usage()
except Exception as e:
print(f"Hub usage example failed: {e}")
try:
example_with_real_video()
except Exception as e:
print(f"Real video example failed: {e}")
print("\n" + "=" * 50)
print("Examples completed!")
print("\nNext steps:")
print("1. Set up Grounding DINO and SAM2 models for segmentation")
print("2. Load your pretrained VINE model weights")
print("3. Test with your own videos")
print("4. Push to HuggingFace Hub for sharing")