"""
Script to convert the existing inference.py workflow to the VINE HuggingFace interface.
This script demonstrates how to migrate from the original inference.py approach
to the new HuggingFace-compatible interface.
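Typical usage, once the paths below have been adjusted to your setup:
    python convert_inference.py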
"""
import os
import sys
import torch
# Add paths for imports
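# (this appends the repository root so that `vine_hf` and `laser` resolve
# when the script is run directly from a checkout)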
sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
from vine_hf import VineConfig, VineModel, VinePipeline
from laser.loading import load_video
def load_pretrained_vine_model(model_dir: str, model_name: str, epoch: int = 0) -> VineModel:
"""
Load a pretrained VINE model from the original format into HuggingFace format.
Args:
model_dir: Directory containing the model
model_name: Name of the model file (without .{epoch}.model extension)
epoch: Epoch number to load
Returns:
VineModel instance with loaded weights
"""
print(f"Loading pretrained VINE model from {model_dir}")
# Create configuration (adjust parameters as needed)
# We expect local ensemble weights in `model_dir`, so configure
# VineConfig to load from local directory/filename.
model_file = f"{model_name}.{epoch}.model"
config = VineConfig(
model_name="openai/clip-vit-base-patch32",
segmentation_method="grounding_dino_sam2",
target_fps=1,
box_threshold=0.35,
text_threshold=0.25,
use_hf_repo=False,
local_dir=model_dir,
local_filename=model_file,
)
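    # box_threshold / text_threshold are the GroundingDINO detection
    # thresholds (0.35 / 0.25 are the upstream defaults); lower them if
    # objects are being missed during segmentation.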
# Initialize model (VineModel will consult the config when loading)
vine_model = VineModel(config)
    # Load the original weights (the same file referenced in the config)
    model_path = os.path.join(model_dir, model_file)
if os.path.exists(model_path):
print(f"Loading weights from: {model_path}")
try:
# Add safe globals for PyTorch 2.6+
import torch.serialization
from laser.models.llava_clip_model_v3 import PredicateModel
torch.serialization.add_safe_globals([PredicateModel])
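            # add_safe_globals only matters for weights_only=True loading;
            # it is kept here so the torch.load call below can be switched
            # to weights_only=True without further changes.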
# Load the original model
original_model = torch.load(model_path, map_location='cpu', weights_only=False)
# Transfer weights to HuggingFace model
# This assumes the original model has the same structure
# You may need to adjust this based on your specific model structure
if hasattr(original_model, 'clip_cate_model'):
vine_model.clip_cate_model.load_state_dict(original_model.clip_cate_model.state_dict())
if hasattr(original_model, 'clip_unary_model'):
vine_model.clip_unary_model.load_state_dict(original_model.clip_unary_model.state_dict())
if hasattr(original_model, 'clip_binary_model'):
vine_model.clip_binary_model.load_state_dict(original_model.clip_binary_model.state_dict())
if hasattr(original_model, 'clip_tokenizer'):
vine_model.clip_tokenizer = original_model.clip_tokenizer
if hasattr(original_model, 'clip_processor'):
vine_model.clip_processor = original_model.clip_processor
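            # If your checkpoint is a plain state_dict rather than a pickled
            # module, a direct (hypothetical) alternative would be:
            #   state_dict = torch.load(model_path, map_location='cpu',
            #                           weights_only=True)
            #   vine_model.load_state_dict(state_dict, strict=False)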
print("βœ“ Weights transferred successfully")
except Exception as e:
print(f"βœ— Error loading weights: {e}")
print("You may need to adjust the weight loading logic for your specific model")
else:
print(f"βœ— Model file not found: {model_path}")
return vine_model
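
def save_converted_model(vine_model: VineModel, output_dir: str) -> None:
    """
    Persist a converted model in HuggingFace format.
    A minimal sketch, assuming VineModel subclasses
    transformers.PreTrainedModel (as the vine_hf interface implies); in that
    case save_pretrained writes the config and weights to `output_dir`,
    ready for push_to_hub.py.
    """
    os.makedirs(output_dir, exist_ok=True)
    vine_model.save_pretrained(output_dir)
    print(f"βœ“ Saved converted model to {output_dir}")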
def convert_inference_workflow():
"""
Convert the original inference.py workflow to use HuggingFace interface.
This function demonstrates how to replicate the original inference workflow
using the new HuggingFace-compatible components.
"""
print("=== Converting Inference Workflow ===")
# Original parameters from inference.py
video_id = 'v1'
target_fps = 1
classes = ['human', 'dog', 'frisbee']
unary_keywords = ['running', 'jumping', 'sitting', 'standing']
binary_keywords = ['behind', 'bite', 'front', 'jump over', 'right', 'left']
# Paths (adjust these to match your setup)
demo_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../demo"))
video_dir = os.path.join(demo_dir, "videos")
video_path = os.path.join(video_dir, f"{video_id}.mp4")
# Model paths (adjust these to match your setup)
data_dir = os.path.abspath(os.path.join(os.path.dirname(__file__), "../../data"))
model_dir = os.path.join(data_dir, "LLaVA-Video-178K-v2/models/ensemble-02-10")
model_name = "ensemble-2025-02-10-14-57-22"
# Segmentation model paths (adjust these to your actual paths)
sam_config_path = "/path/to/sam2/configs/sam2.1/sam2.1_hiera_b+.yaml"
sam_checkpoint_path = "/path/to/sam2/checkpoints/sam2.1_hiera_base_plus.pt"
gd_config_path = "/path/to/GroundingDINO/config/GroundingDINO_SwinT_OGC.py"
gd_checkpoint_path = "/path/to/GroundingDINO/checkpoints/groundingdino_swint_ogc.pth"
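    # The four paths above are placeholders; the SAM2 and GroundingDINO
    # repositories document where to download the configs and checkpoints.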
print(f"Video path: {video_path}")
print(f"Model dir: {model_dir}")
print(f"SAM2 config: {sam_config_path}")
print(f"GroundingDINO config: {gd_config_path}")
# Check if video exists
if not os.path.exists(video_path):
print(f"βœ— Video not found: {video_path}")
print("Please adjust the video path or use your own video file")
return
# 1. Load video (same as original)
print(f"Loading video: {video_id}")
video_tensor = load_video(video_path, target_fps=target_fps)
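    # load_video samples the clip at target_fps; the exact tensor layout is
    # defined in laser.loading (the shape is printed below for reference).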
print(f"Video shape: {video_tensor.shape}")
# 2. Load VINE model with HuggingFace interface
print("Loading VINE model...")
if os.path.exists(model_dir):
vine_model = load_pretrained_vine_model(model_dir, model_name, epoch=0)
else:
print(f"Model directory not found: {model_dir}")
print("Creating new model with random weights for demonstration")
config = VineConfig()
vine_model = VineModel(config)
# 3. Create pipeline for easier use
print("Creating VINE pipeline...")
from transformers.pipelines import PIPELINE_REGISTRY
# Register pipeline if not already registered
try:
PIPELINE_REGISTRY.register_pipeline(
"vine-video-understanding",
pipeline_class=VinePipeline,
pt_model=VineModel,
type="multimodal",
)
except Exception:
pass # Already registered
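    # Once registered, the task is also reachable through the generic
    # factory, e.g. (assuming the model has been pushed to the Hub):
    #   from transformers import pipeline
    #   vine = pipeline("vine-video-understanding", model="<your-repo-id>")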
# Create pipeline instance with segmentation model paths
vine_pipeline = VinePipeline(
model=vine_model,
tokenizer=None,
# SAM2 configuration
sam_config_path=sam_config_path,
sam_checkpoint_path=sam_checkpoint_path,
# GroundingDINO configuration
gd_config_path=gd_config_path,
gd_checkpoint_path=gd_checkpoint_path
)
# 4. Process video with new interface
print("Processing video with VINE HuggingFace interface...")
try:
# Use the pipeline to process the video
results = vine_pipeline(
video_path,
categorical_keywords=classes,
unary_keywords=unary_keywords,
binary_keywords=binary_keywords,
object_pairs=[(1, 2), (2, 3)], # Example object pairs
segmentation_method='grounding_dino_sam2',
target_fps=target_fps,
return_top_k=3,
include_visualizations=False
)
# 5. Display results (similar to original format)
print("\n=== VINE Results (HuggingFace Interface) ===")
# Categorical predictions
print("\nCategorical Predictions:")
for obj_id, predictions in results['categorical_predictions'].items():
print(f" Object {obj_id}:")
for prob, category in predictions:
print(f" {prob:.3f}: {category}")
# Unary predictions
print("\nUnary Predictions:")
for (frame_id, obj_id), predictions in results['unary_predictions'].items():
print(f" Frame {frame_id}, Object {obj_id}:")
for prob, action in predictions:
print(f" {prob:.3f}: {action}")
# Binary predictions
print("\nBinary Predictions:")
for (frame_id, obj_pair), predictions in results['binary_predictions'].items():
print(f" Frame {frame_id}, Objects {obj_pair}:")
for prob, relation in predictions:
print(f" {prob:.3f}: {relation}")
# Summary
        print("\nSummary:")
print(f" Objects detected: {results['summary']['num_objects_detected']}")
print(f" Top categories: {results['summary']['top_categories']}")
print(f" Top actions: {results['summary']['top_actions']}")
print(f" Top relations: {results['summary']['top_relations']}")
print("\nβœ“ Successfully processed video with VINE HuggingFace interface!")
except Exception as e:
print(f"βœ— Error processing video: {e}")
print("This may be due to missing segmentation models or other dependencies")
print("The interface is set up correctly, but full functionality requires:")
print(" 1. Properly installed Grounding DINO and SAM2")
print(" 2. Correct model weights")
print(" 3. Proper configuration paths")
def compare_interfaces():
"""
Compare the original inference.py approach with the new HuggingFace interface.
"""
print("\n=== Interface Comparison ===")
print("\nOriginal inference.py approach:")
print("βœ“ Direct access to model internals")
print("βœ“ Full control over segmentation pipeline")
print("βœ— Complex setup and configuration")
print("βœ— Not compatible with HuggingFace ecosystem")
print("βœ— Requires manual handling of all components")
print("\nNew HuggingFace interface:")
    print("βœ“ Easy-to-use pipeline interface")
print("βœ“ Compatible with HuggingFace Hub")
print("βœ“ Standardized configuration")
print("βœ“ Automatic handling of preprocessing/postprocessing")
print("βœ“ Easy sharing and distribution")
print("βœ“ Configurable segmentation model paths")
    print("βœ— Slightly less direct control (you can still access the model directly)")
print("\nMigration benefits:")
print("β€’ Share your model easily on HuggingFace Hub")
print("β€’ Users can load your model with a single line")
print("β€’ Standardized interface for video understanding")
print("β€’ Better integration with other HuggingFace tools")
print("β€’ Simplified deployment and inference")
print("β€’ Flexible segmentation model configuration")
if __name__ == "__main__":
print("VINE HuggingFace Interface Conversion")
print("=" * 50)
# Run conversion demonstration
convert_inference_workflow()
# Show comparison
compare_interfaces()
print("\n" + "=" * 50)
print("Next steps:")
print("1. Install SAM2 and GroundingDINO dependencies")
print("2. Download the required model checkpoints")
print("3. Update the paths in this script to point to your models")
print("4. Test the interface with your specific model weights")
print("5. Adjust configuration parameters as needed")
print("6. Push your model to HuggingFace Hub using push_to_hub.py")
print("7. Share with the community!")