"""
Script to convert the existing inference.py workflow to the VINE HuggingFace interface.
This script demonstrates how to migrate from the original inference.py approach
to the new HuggingFace-compatible interface.
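Typical usage, once the paths below have been adjusted to your setup:
    python convert_inference.py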
"""
import os
import sys
import torch
# Add paths for imports
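# (this appends the repository root so that `vine_hf` and `laser` resolve
# when the script is run directly from a checkout)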
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from vine_hf import VineConfig, VineModel, VinePipeline
from laser.loading import load_video
def load_pretrained_vine_model(model_dir: str, model_name: str, epoch: int = 0) -> VineModel:
"""
Load a pretrained VINE model from the original format into HuggingFace format.
Args:
model_dir: Directory containing the model
model_name: Name of the model file (without .{epoch}.model extension)
epoch: Epoch number to load
Returns:
VineModel instance with loaded weights
"""
print(f"Loading pretrained VINE model from {model_dir}")
# Create configuration (adjust parameters as needed)
# We expect local ensemble weights in `model_dir`, so configure
# VineConfig to load from local directory/filename.
model_file = f"{model_name}.{epoch}.model"
config = VineConfig(
model_name="openai/clip-vit-base-patch32",
segmentation_method="grounding_dino_sam2",
target_fps=1,
box_threshold=0.35,
text_threshold=0.25,
use_hf_repo=False,
local_dir=model_dir,
local_filename=model_file,
)
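    # box_threshold / text_threshold are the GroundingDINO detection
    # thresholds (0.35 / 0.25 are the upstream defaults); lower them if
    # objects are being missed during segmentation.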
# Initialize model (VineModel will consult the config when loading)
vine_model = VineModel(config)
    # Load the original weights (the same file referenced in the config)
    model_path = os.path.join(model_dir, model_file)
if os.path.exists(model_path):
print(f"Loading weights from: {model_path}")
try:
# Add safe globals for PyTorch 2.6+
import torch.serialization
from laser.models.llava_clip_model_v3 import PredicateModel
torch.serialization.add_safe_globals([PredicateModel])
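            # add_safe_globals only matters for weights_only=True loading;
            # it is kept here so the torch.load call below can be switched
            # to weights_only=True without further changes.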
# Load the original model
original_model = torch.load(model_path, map_location='cpu', weights_only=False)
# Transfer weights to HuggingFace model
# This assumes the original model has the same structure
# You may need to adjust this based on your specific model structure
if hasattr(original_model, 'clip_cate_model'):
vine_model.clip_cate_model.load_state_dict(original_model.clip_cate_model.state_dict())
if hasattr(original_model, 'clip_unary_model'):
vine_model.clip_unary_model.load_state_dict(original_model.clip_unary_model.state_dict())
if hasattr(original_model, 'clip_binary_model'):
vine_model.clip_binary_model.load_state_dict(original_model.clip_binary_model.state_dict())
if hasattr(original_model, 'clip_tokenizer'):
vine_model.clip_tokenizer = original_model.clip_tokenizer
if hasattr(original_model, 'clip_processor'):
vine_model.clip_processor = original_model.clip_processor
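            # If your checkpoint is a plain state_dict rather than a pickled
            # module, a direct (hypothetical) alternative would be:
            #   state_dict = torch.load(model_path, map_location='cpu',
            #                           weights_only=True)
            #   vine_model.load_state_dict(state_dict, strict=False)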
print("βœ“ Weights transferred successfully")
except Exception as e:
print(f"βœ— Error loading weights: {e}")
print("You may need to adjust the weight loading logic for your specific model")
else:
print(f"βœ— Model file not found: {model_path}")
return vine_model
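
def save_converted_model(vine_model: VineModel, output_dir: str) -> None:
    """
    Persist a converted model in HuggingFace format.
    A minimal sketch, assuming VineModel subclasses
    transformers.PreTrainedModel (as the vine_hf interface implies); in that
    case save_pretrained writes the config and weights to `output_dir`,
    ready for push_to_hub.py.
    """
    os.makedirs(output_dir, exist_ok=True)
    vine_model.save_pretrained(output_dir)
    print(f"βœ“ Saved converted model to {output_dir}")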
def convert_inference_workflow():
"""
Convert the original inference.py workflow to use HuggingFace interface.
This function demonstrates how to replicate the original inference workflow
using the new HuggingFace-compatible components.
"""
print("=== Converting Inference Workflow ===")
# Original parameters from inference.py
video_id = 'v1'
target_fps = 1
classes = ['human', 'dog', 'frisbee']
unary_keywords = ['running', 'jumping', 'sitting', 'standing']
binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left']
# Paths (adjust these to match your setup)
demo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../demo"))
video_dir = os.path.join(demo_dir, "videos")
video_path = os.path.join(video_dir, f"{video_id}.mp4")
# Model paths (adjust these to match your setup)
data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
model_name = "ensemble-2025-02-10-14-57-22"
# Segmentation model paths (adjust these to your actual paths)
sam_config_path = "/path/to/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml"
sam_checkpoint_path = "/path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt"
gd_config_path = "/path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py"
gd_checkpoint_path = "/path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth"
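    # The four paths above are placeholders; the SAM2 and GroundingDINO
    # repositories document where to download the configs and checkpoints.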
print(f"Video path: {video_path}")
print(f"Model dir: {model_dir}")
print(f"SAM2 config: {sam_config_path}")
print(f"GroundingDINO config: {gd_config_path}")
# Check if video exists
if not os.path.exists(video_path):
print(f"βœ— Video not found: {video_path}")
print("Please adjust the video path or use your own video file")
return
# 1. Load video (same as original)
print(f"Loading video: {video_id}")
video_tensor = load_video(video_path, target_fps=target_fps)
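    # load_video samples the clip at target_fps; the exact tensor layout is
    # defined in laser.loading (the shape is printed below for reference).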
print(f"Video shape: {video_tensor.shape}")
# 2. Load VINE model with HuggingFace interface
print("Loading VINE model...")
if os.path.exists(model_dir):
vine_model = load_pretrained_vine_model(model_dir, model_name, epoch=0)
else:
print(f"Model directory not found: {model_dir}")
print("Creating new model with random weights for demonstration")
config = VineConfig()
vine_model = VineModel(config)
# 3. Create pipeline for easier use
print("Creating VINE pipeline...")
from transformers.pipelines import PIPELINE_REGISTRY
# Register pipeline if not already registered
try:
PIPELINE_REGISTRY.register_pipeline(
"vine-video-understanding",
pipeline_class=VinePipeline,
pt_model=VineModel,
type="multimodal",
)
except Exception:
pass # Already registered
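    # Once registered, the task is also reachable through the generic
    # factory, e.g. (assuming the model has been pushed to the Hub):
    #   from transformers import pipeline
    #   vine = pipeline("vine-video-understanding", model="<your-repo-id>")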
# Create pipeline instance with segmentation model paths
vine_pipeline = VinePipeline(
model=vine_model,
tokenizer=None,
# SAM2 configuration
sam_config_path=sam_config_path,
sam_checkpoint_path=sam_checkpoint_path,
# GroundingDINO configuration
gd_config_path=gd_config_path,
gd_checkpoint_path=gd_checkpoint_path
)
# 4. Process video with new interface
print("Processing video with VINE HuggingFace interface...")
try:
# Use the pipeline to process the video
results = vine_pipeline(
video_path,
categorical_keywords=classes,
unary_keywords=unary_keywords,
binary_keywords=binary_keywords,
object_pairs=[(1, 2), (2, 3)], # Example object pairs
segmentation_method='grounding_dino_sam2',
target_fps=target_fps,
return_top_k=3,
include_visualizations=False
)
# 5. Display results (similar to original format)
print("\n=== VINE Results (HuggingFace Interface) ===")
# Categorical predictions
print("\nCategorical Predictions:")
for obj_id, predictions in results['categorical_predictions'].items():
print(f" Object {obj_id}:")
for prob, category in predictions:
print(f" {prob:.3f}: {category}")
# Unary predictions
print("\nUnary Predictions:")
for (frame_id, obj_id), predictions in results['unary_predictions'].items():
print(f" Frame {frame_id}, Object {obj_id}:")
for prob, action in predictions:
print(f" {prob:.3f}: {action}")
# Binary predictions
print("\nBinary Predictions:")
for (frame_id, obj_pair), predictions in results['binary_predictions'].items():
print(f" Frame {frame_id}, Objects {obj_pair}:")
for prob, relation in predictions:
print(f" {prob:.3f}: {relation}")
# Summary
        print("\nSummary:")
print(f" Objects detected: {results['summary']['num_objects_detected']}")
print(f" Top categories: {results['summary']['top_categories']}")
print(f" Top actions: {results['summary']['top_actions']}")
print(f" Top relations: {results['summary']['top_relations']}")
print("\nβœ“ Successfully processed video with VINE HuggingFace interface!")
except Exception as e:
print(f"βœ— Error processing video: {e}")
print("This may be due to missing segmentation models or other dependencies")
print("The interface is set up correctly, but full functionality requires:")
print(" 1. Properly installed Grounding DINO and SAM2")
print(" 2. Correct model weights")
print(" 3. Proper configuration paths")
def compare_interfaces():
"""
Compare the original inference.py approach with the new HuggingFace interface.
"""
print("\n=== Interface Comparison ===")
print("\nOriginal inference.py approach:")
print("βœ“ Direct access to model internals")
print("βœ“ Full control over segmentation pipeline")
print("βœ— Complex setup and configuration")
print("βœ— Not compatible with HuggingFace ecosystem")
print("βœ— Requires manual handling of all components")
print("\nNew HuggingFace interface:")
    print("βœ“ Easy-to-use pipeline interface")
print("βœ“ Compatible with HuggingFace Hub")
print("βœ“ Standardized configuration")
print("βœ“ Automatic handling of preprocessing/postprocessing")
print("βœ“ Easy sharing and distribution")
print("βœ“ Configurable segmentation model paths")
    print("βœ— Slightly less direct control (you can still access the model directly)")
print("\nMigration benefits:")
print("β€’ Share your model easily on HuggingFace Hub")
print("β€’ Users can load your model with a single line")
print("β€’ Standardized interface for video understanding")
print("β€’ Better integration with other HuggingFace tools")
print("β€’ Simplified deployment and inference")
print("β€’ Flexible segmentation model configuration")
if __name__ == "__main__":
print("VINE HuggingFace Interface Conversion")
print("=" * 50)
# Run conversion demonstration
convert_inference_workflow()
# Show comparison
compare_interfaces()
print("\n" + "=" * 50)
print("Next steps:")
print("1. Install SAM2 and GroundingDINO dependencies")
print("2. Download the required model checkpoints")
print("3. Update the paths in this script to point to your models")
print("4. Test the interface with your specific model weights")
print("5. Adjust configuration parameters as needed")
print("6. Push your model to HuggingFace Hub using push_to_hub.py")
print("7. Share with the community!")