# Hugging Face Space status: Running on Zero (ZeroGPU)
| from pathlib import Path | |
| from collections.abc import Mapping, Sequence | |
| from functools import lru_cache | |
| import inspect | |
| import shutil | |
| import tempfile | |
| import os | |
| import sys | |
| # Add src/ to sys.path so LASER, video-sam2, GroundingDINO are importable | |
| current_dir = Path(__file__).resolve().parent | |
| src_dir = current_dir / "src" | |
| if src_dir.is_dir() and str(src_dir) not in sys.path: | |
| sys.path.insert(0, str(src_dir)) | |
| import spaces # <-- ZeroGPU integration | |
| import gradio as gr | |
| import torch | |
| from transformers import pipeline # not strictly necessary, but fine | |
# -----------------------------
# Environment / diagnostics
# -----------------------------
# Keep Gradio's temporary files in a folder next to this script.
os.environ["GRADIO_TEMP_DIR"] = str(Path(__file__).parent / "gradio_temp")
# Placeholder key. `setdefault` is used so that a real key already configured
# for the process (e.g. as a Space secret) is not overwritten.
os.environ.setdefault("OPENAI_API_KEY", "test")
os.environ["OMP_NUM_THREADS"] = "4"

print("All imports finished")
print(f"Python version: {sys.version}")
print(f"PyTorch version: {torch.__version__}")
print(f"CUDA available: {torch.cuda.is_available()}")
print(f"CUDA version: {torch.version.cuda}")
print(f"cuDNN version: {torch.backends.cudnn.version()}")
print(f"Number of GPUs: {torch.cuda.device_count()}")
if torch.cuda.is_available():
    for i in range(torch.cuda.device_count()):
        print(f"GPU {i}: {torch.cuda.get_device_name(i)}")
        print(
            f" Memory: {torch.cuda.get_device_properties(i).total_memory / 1e9:.2f} GB"
        )

# Force full float32 math: disable TF32 kernels and pin the default dtype.
torch.backends.cuda.matmul.allow_tf32 = False
torch.backends.cudnn.allow_tf32 = False
os.environ["TORCH_DTYPE"] = "float32"
torch.set_default_dtype(torch.float32)

# Recomputed here so the path constants below do not depend on earlier code.
current_dir = Path(__file__).resolve().parent

# For Spaces, assume checkpoints live alongside app.py or in a "checkpoints" subdir.
# If you keep them next to app.py locally, this still works.
# NOTE: SAM2 config uses Hydra, so we use just the filename (it searches in sam2/configs/)
sam_config_path = "sam2_hiera_t.yaml"  # Hydra will find this in sam2/configs/
sam_checkpoint_path = str(current_dir / "sam2_hiera_tiny.pt")
gd_config_path = str(current_dir / "GroundingDINO_SwinT_OGC.py")
gd_checkpoint_path = str(current_dir / "groundingdino_swint_ogc.pth")
visualization_dir = str(current_dir / "outputs")
print(
    f"Setting up paths: {sam_config_path}, {sam_checkpoint_path}, {gd_config_path}, {gd_checkpoint_path}"
)
def _confidence_of(info):
    """Return a float confidence for one relation entry, or 0.0 if unusable.

    Reads ``info["confidence"]`` and falls back to ``info["score"]``. Entries
    that are not dicts, or whose value is missing, ``None``, non-numeric or
    NaN, are treated as confidence 0.0 so sorting and formatting never fail.
    """
    if not isinstance(info, dict):
        return 0.0
    raw = info.get("confidence", info.get("score", 0))
    try:
        value = float(raw)
    except (TypeError, ValueError):
        return 0.0
    # NaN compares false against everything and would vanish from both groups.
    return 0.0 if value != value else value


def _append_keyword_section(lines, title, entries):
    """Append a plain keyword section; return True if anything was written.

    ``entries`` may be a dict (keyword -> details) or a list of items. Any
    other type is ignored so that no empty heading is emitted.
    """
    if isinstance(entries, dict):
        lines.append(f"## {title}\n")
        for kw, info in entries.items():
            lines.append(f"**{kw}**")
            if isinstance(info, dict):
                lines.extend(f" - {key}: {val}" for key, val in info.items())
            else:
                lines.append(f" - {info}")
            lines.append("")
        return True
    if isinstance(entries, list):
        lines.append(f"## {title}\n")
        lines.extend(f"- {item}" for item in entries)
        lines.append("")
        return True
    return False


def _append_ranked_section(lines, title, noun, entries, threshold):
    """Append a confidence-ranked section; return True if anything was written.

    Dict entries are sorted by confidence (descending) and listed in two
    groups: at/above ``threshold`` and below it. List entries are written as
    plain bullets. ``noun`` is the plural used in the messages.
    """
    if isinstance(entries, list):
        lines.append(f"## {title}\n")
        lines.extend(f"- {item}" for item in entries)
        lines.append("")
        return True
    if not isinstance(entries, dict):
        return False

    lines.append(f"## {title}\n")
    ranked = sorted(
        ((name, info, _confidence_of(info)) for name, info in entries.items()),
        key=lambda item: item[2],
        reverse=True,
    )
    groups = (
        (
            f"### High Confidence (≥ {threshold})\n",
            f"confidence ≥ {threshold}",
            [item for item in ranked if item[2] >= threshold],
        ),
        (
            f"### Lower Confidence (< {threshold})\n",
            f"confidence < {threshold}",
            [item for item in ranked if item[2] < threshold],
        ),
    )
    for heading, description, members in groups:
        lines.append(heading)
        for name, info, confidence in members:
            if isinstance(info, dict):
                lines.append(f"**{name}** (confidence: {confidence:.2f})")
                lines.extend(
                    f" - {key}: {val}"
                    for key, val in info.items()
                    if key not in ("confidence", "score")
                )
            else:
                lines.append(f"**{name}**: {info}")
            lines.append("")
        if not members:
            lines.append(f"*No {noun} found with {description}*\n")
    lines.append(f"**Total {noun} detected: {len(ranked)}**\n")
    return True


def _debug_print_entry(summary, key):
    """Print presence, truthiness, type and value of ``summary[key]`` to stdout."""
    print(f"DEBUG: Checking {key}...")
    print(f" '{key}' in summary: {key in summary}")
    if key in summary:
        print(f" summary['{key}'] truthy: {bool(summary[key])}")
        print(f" summary['{key}'] type: {type(summary[key])}")
        print(f" summary['{key}'] value: {summary[key]}")


def format_summary(summary, binary_confidence_threshold=0.8):
    """
    Format the summary dictionary into a readable markdown string.

    Categorical and unary keywords are listed as-is. Binary keywords and
    object pairs are sorted by confidence and shown in two groups (at/above
    and below ``binary_confidence_threshold``); nothing is dropped.

    Args:
        summary: Dict that may contain "categorical_keywords",
            "unary_keywords", "binary_keywords" and "object_pairs", each a
            dict or a list. Anything falsy or not a dict yields a placeholder.
        binary_confidence_threshold: Boundary between the high and lower
            confidence groups.

    Returns:
        Markdown text. If no known section produced content, the raw summary
        is appended as a JSON code block for debugging.
    """
    if not summary or not isinstance(summary, dict):
        return "# Detection Summary\n\nNo events detected or processing in progress..."

    output_lines = ["# Detection Summary\n"]
    has_content = False

    plain_sections = (
        ("categorical_keywords", "Categorical Keywords"),
        ("unary_keywords", "Unary Keywords"),
    )
    for key, title in plain_sections:
        entries = summary.get(key)
        if entries and _append_keyword_section(output_lines, title, entries):
            has_content = True

    # (summary key, heading, label used in debug output, noun used in messages)
    ranked_sections = (
        ("binary_keywords", "Binary Keywords", "binary keywords", "binary relations"),
        ("object_pairs", "Object Pair Interactions", "object pairs", "object pairs"),
    )
    for key, title, label, noun in ranked_sections:
        _debug_print_entry(summary, key)
        entries = summary.get(key)
        if not entries:
            continue
        print(
            f"DEBUG: Processing {label}, type: {type(entries)}, "
            f"length: {len(entries) if isinstance(entries, (dict, list)) else 'N/A'}"
        )
        if _append_ranked_section(
            output_lines, title, noun, entries, binary_confidence_threshold
        ):
            has_content = True

    # If no content was added, show the raw summary for debugging
    if not has_content:
        import json

        output_lines.append("## Raw Summary Data\n")
        output_lines.append("```json")
        # default=str: stringify values json cannot encode instead of failing.
        output_lines.append(json.dumps(summary, indent=2, default=str))
        output_lines.append("```")
    return "\n".join(output_lines)
@lru_cache(maxsize=1)
def _load_vine_pipeline():
    """
    Lazy-load and cache the Vine pipeline so we don't re-download/rebuild it on every request.

    The first call builds a ``VineConfig``, wraps it in a ``VineModel`` and
    returns a ``VinePipeline`` wired to the module-level SAM2 / GroundingDINO
    config and checkpoint paths. ``lru_cache(maxsize=1)`` makes later calls in
    the same process return that same pipeline object.

    Returns:
        The shared ``VinePipeline`` instance.
    """
    # Imported lazily so the module can be imported before vine_hf is needed.
    from vine_hf import VineConfig, VineModel, VinePipeline

    config = VineConfig(
        segmentation_method="grounding_dino_sam2",
        model_name="openai/clip-vit-base-patch32",
        use_hf_repo=True,
        model_repo="KevinX-Penn28/testing",
        box_threshold=0.35,
        text_threshold=0.25,
        target_fps=1,  # default 1 FPS
        topk_cate=5,
        white_alpha=0.3,
        visualization_dir=visualization_dir,
        visualize=True,
        debug_visualizations=False,
        device="cuda",
        categorical_pool="max",
    )
    model = VineModel(config)
    return VinePipeline(
        model=model,
        tokenizer=None,
        sam_config_path=sam_config_path,
        sam_checkpoint_path=sam_checkpoint_path,
        gd_config_path=gd_config_path,
        gd_checkpoint_path=gd_checkpoint_path,
        device="cuda",
        trust_remote_code=True,
    )
def _parse_keywords(raw):
    """Split a comma-separated string into stripped, non-empty keywords."""
    if not raw:
        return []
    return [kw for kw in (part.strip() for part in raw.split(",")) if kw]


def _debug_dump_summary(summary):
    """Print the structure and contents of the pipeline summary to stdout."""
    import json

    print("=" * 80)
    print("SUMMARY DEBUG OUTPUT:")
    print(f"Summary type: {type(summary)}")
    print(f"Summary keys: {summary.keys() if isinstance(summary, dict) else 'N/A'}")
    if isinstance(summary, dict):
        print("\nFULL SUMMARY JSON:")
        print(json.dumps(summary, indent=2, default=str))
        print("\n" + "=" * 80)
        # Check for any keys that might contain binary relation data
        print("\nLOOKING FOR BINARY RELATION DATA:")
        possible_keys = [
            "binary",
            "binary_keywords",
            "binary_relations",
            "object_pairs",
            "pairs",
            "relations",
            "interactions",
            "pairwise",
        ]
        for pkey in possible_keys:
            if pkey in summary:
                print(f" FOUND: '{pkey}' -> {summary[pkey]}")
        print("\nALL KEYS IN SUMMARY:")
        for key, value in summary.items():
            print(f"\n{key}:")
            print(f" Type: {type(value)}")
            if isinstance(value, dict):
                print(f" Length: {len(value)}")
                print(f" Keys (first 10): {list(value.keys())[:10]}")
                # Print all items for anything that might be binary relations
                if any(
                    term in key.lower()
                    for term in ["binary", "pair", "relation", "interaction"]
                ):
                    print(f" ALL ITEMS:")
                    for k, v in list(value.items())[:20]:  # First 20 items
                        print(f" {k}: {v}")
                else:
                    print(f" Sample: {dict(list(value.items())[:2])}")
            elif isinstance(value, list):
                print(f" Length: {len(value)}")
                print(f" Sample: {value[:2]}")
    print("=" * 80)


@spaces.GPU(duration=300)  # Up to ~5 minutes of H200 ZeroGPU time per call
def process_video(
    video_file,
    categorical_keywords,
    unary_keywords,
    binary_keywords,
    output_fps,
    box_threshold,
    text_threshold,
    binary_confidence_threshold,
):
    """
    Run the Vine pipeline on an uploaded video and prepare the UI outputs.

    Args:
        video_file: Path to the video (``str`` / ``Path``), or a dict carrying
            it under "name", "filepath" or "data".
        categorical_keywords: Comma-separated object names.
        unary_keywords: Comma-separated single-object actions.
        binary_keywords: Comma-separated object-to-object interactions.
        output_fps: Passed to the pipeline as ``target_fps``.
        box_threshold: Passed to the pipeline as ``box_threshold``.
        text_threshold: Passed to the pipeline as ``text_threshold``.
        binary_confidence_threshold: Passed to the pipeline and used to group
            relations in the formatted summary.

    Returns:
        ``(video_path_for_ui, formatted_summary)``: the annotated video path
        (``None`` if no video was produced) and the markdown summary.

    Raises:
        ValueError: If no video was supplied or the input type is unsupported.
    """
    vine_pipe = _load_vine_pipeline()

    # Normalize incoming video input to a file path
    if isinstance(video_file, dict):
        video_file = (
            video_file.get("name")
            or video_file.get("filepath")
            or video_file.get("data")
        )
    if not video_file:
        raise ValueError("No video file provided; please upload an MP4 video.")
    if not isinstance(video_file, (str, Path)):
        raise ValueError(f"Unsupported video input type: {type(video_file)}")

    # Empty fragments (e.g. from "a,,b" or a trailing comma) are dropped so
    # the pipeline never receives "" as a keyword.
    categorical_keywords = _parse_keywords(categorical_keywords)
    unary_keywords = _parse_keywords(unary_keywords)
    binary_keywords = _parse_keywords(binary_keywords)

    # Debug: Print what we're sending to the pipeline
    print("\n" + "=" * 80)
    print("INPUT TO VINE PIPELINE:")
    print(f" categorical_keywords: {categorical_keywords}")
    print(f" unary_keywords: {unary_keywords}")
    print(f" binary_keywords: {binary_keywords}")
    print("=" * 80 + "\n")

    # Object pairs is now optional - empty list will auto-generate all pairs in vine_model.py
    object_pairs = []
    results = vine_pipe(
        inputs=video_file,
        categorical_keywords=categorical_keywords,
        unary_keywords=unary_keywords,
        binary_keywords=binary_keywords,
        object_pairs=object_pairs,
        segmentation_method="grounding_dino_sam2",
        return_top_k=5,
        include_visualizations=True,
        debug_visualizations=False,
        device="cuda",
        box_threshold=box_threshold,
        text_threshold=text_threshold,
        target_fps=output_fps,
        binary_confidence_threshold=binary_confidence_threshold,
    )

    # Debug: Print what the pipeline returned
    print("\n" + "=" * 80)
    print("PIPELINE RESULTS DEBUG:")
    print(f" results type: {type(results)}")
    if isinstance(results, dict):
        print(f" results keys: {list(results.keys())}")
    print("=" * 80 + "\n")

    # NOTE(review): these are assigned after the call, so they cannot affect
    # the run above; presumably they record the last-used settings on the
    # pipeline object. Confirm this ordering is intended.
    vine_pipe.box_threshold = box_threshold
    vine_pipe.text_threshold = text_threshold
    vine_pipe.target_fps = output_fps

    # Accept either a single result mapping or a sequence whose first item is one.
    if isinstance(results, Mapping):
        results_dict = results
    elif isinstance(results, Sequence) and results and isinstance(results[0], Mapping):
        results_dict = results[0]
    else:
        results_dict = {}

    visualizations = results_dict.get("visualizations") or {}
    vine = visualizations.get("vine") or {}
    all_vis = vine.get("all") or {}
    result_video_path = all_vis.get("video_path")
    if not result_video_path:
        # Fall back to the most recently modified MP4 under the output folder.
        # NOTE(review): with overlapping requests this could pick up another
        # request's video; verify whether that can happen in deployment.
        candidates = sorted(
            Path(visualization_dir).rglob("*.mp4"),
            key=lambda p: p.stat().st_mtime,
            reverse=True,
        )
        result_video_path = str(candidates[0]) if candidates else None

    summary = results_dict.get("summary") or {}

    if result_video_path and os.path.exists(result_video_path):
        # Copy the video under the Gradio temp dir before handing it to the UI.
        gradio_tmp = Path(
            os.environ.get("GRADIO_TEMP_DIR", tempfile.gettempdir())
        ) / "vine_outputs"
        gradio_tmp.mkdir(parents=True, exist_ok=True)
        dest_path = gradio_tmp / Path(result_video_path).name
        try:
            shutil.copyfile(result_video_path, dest_path)
            video_path_for_ui = str(dest_path)
        except OSError as e:
            # Best effort: if the copy fails, serve the original file.
            print(f"Warning: failed to copy video to Gradio temp dir: {e}")
            video_path_for_ui = str(result_video_path)
    else:
        video_path_for_ui = None
        print(
            "Warning: annotated video not found or empty; check visualization settings."
        )

    # Debug: Print summary structure
    _debug_dump_summary(summary)

    # Format summary as readable markdown text, grouped by confidence threshold
    formatted_summary = format_summary(summary, binary_confidence_threshold)
    return video_path_for_ui, formatted_summary
def _video_component(label: str, *, is_output: bool = False):
    """
    Create a ``gr.Video`` using only the keyword arguments that the installed
    Gradio version accepts.

    Output components request ``format="mp4"`` and ``autoplay=True``. Input
    components request ``type="filepath"``, ``sources=["upload"]`` and
    ``file_types=[".mp4"]``. Each option is passed only if it appears in the
    signature of ``gr.Video.__init__``.
    """
    supported = inspect.signature(gr.Video.__init__).parameters
    if is_output:
        wanted = {"format": "mp4", "autoplay": True}
    else:
        wanted = {
            "type": "filepath",
            "sources": ["upload"],
            "file_types": [".mp4"],  # restrict uploads to MP4 files
        }
    options = {"label": label}
    options.update(
        {name: value for name, value in wanted.items() if name in supported}
    )
    return gr.Video(**options)
def _create_blocks():
    """
    Return a ``gr.Blocks`` titled "VINE Demo", using the Soft theme when the
    installed Gradio provides it and ``gr.Blocks`` accepts a ``theme`` argument.
    """
    options = {"title": "VINE Demo"}

    theme = None
    soft_factory = getattr(getattr(gr, "themes", None), "Soft", None)
    if soft_factory is not None:
        try:
            theme = soft_factory()
        except Exception:
            # Theme construction is optional; fall back to the default look.
            theme = None

    accepts_theme = "theme" in inspect.signature(gr.Blocks).parameters
    if accepts_theme and theme is not None:
        options["theme"] = theme
    return gr.Blocks(**options)
# Create Gradio interface with two-column layout:
# inputs on the left, annotated video and markdown summary on the right.
with _create_blocks() as demo:
    gr.Markdown(
        """
        # 🎬 VINE: Video-based Interaction and Event Detection

        Upload an MP4 video and specify keywords to detect objects, actions, and interactions in your video.
        """
    )
    with gr.Row():
        # Left column: Inputs
        with gr.Column(scale=1):
            gr.Markdown("### Input Configuration")
            video_input = _video_component("Upload Video (MP4 only)", is_output=False)
            gr.Markdown("*Note: Only MP4 format is currently supported*")

            gr.Markdown("#### Detection Keywords")
            categorical_input = gr.Textbox(
                label="Categorical Keywords",
                placeholder="e.g., person, car, dog",
                value="person, car, dog",
                info="Objects to detect in the video (comma-separated)",
            )
            unary_input = gr.Textbox(
                label="Unary Keywords",
                placeholder="e.g., walking, running, standing",
                value="walking, running, standing",
                info="Single-object actions to detect (comma-separated)",
            )
            binary_input = gr.Textbox(
                label="Binary Keywords",
                placeholder="e.g., chasing, carrying",
                info="Object-to-object interactions to detect (comma-separated)",
            )

            gr.Markdown("#### Processing Settings")
            fps_input = gr.Number(
                label="Output FPS",
                value=1,
                info="Frames per second for processing (lower = faster)",
            )
            with gr.Accordion("Advanced Settings", open=False):
                box_threshold_input = gr.Slider(
                    label="Box Threshold",
                    minimum=0.1,
                    maximum=0.9,
                    value=0.35,
                    step=0.05,
                    info="Confidence threshold for object detection",
                )
                text_threshold_input = gr.Slider(
                    label="Text Threshold",
                    minimum=0.1,
                    maximum=0.9,
                    value=0.25,
                    step=0.05,
                    info="Confidence threshold for text-based detection",
                )
                binary_confidence_input = gr.Slider(
                    label="Binary Relation Confidence Threshold",
                    minimum=0.0,
                    maximum=1.0,
                    value=0.8,
                    step=0.05,
                    info="Minimum confidence to show binary relations and object pairs",
                )
            submit_btn = gr.Button("🚀 Process Video", variant="primary", size="lg")

        # Right column: Outputs
        with gr.Column(scale=1):
            gr.Markdown("### Results")
            video_output = _video_component("Annotated Video Output", is_output=True)
            gr.Markdown("### Detection Summary")
            summary_output = gr.Markdown(
                value="Results will appear here after processing...",
                elem_classes=["summary-output"],
            )

    gr.Markdown(
        """
        ---
        ### How to Use
        1. Upload an MP4 video file
        2. Specify the objects, actions, and interactions you want to detect
        3. Adjust processing settings if needed (including binary relation confidence threshold)
        4. Click "Process Video" to analyze

        The system will automatically detect all binary relations between detected objects
        and show only those with confidence above the threshold (default: 0.8).
        """
    )

    # The order of `inputs` must match the positional parameters of process_video.
    submit_btn.click(
        fn=process_video,
        inputs=[
            video_input,
            categorical_input,
            unary_input,
            binary_input,
            fps_input,
            box_threshold_input,
            text_threshold_input,
            binary_confidence_input,
        ],
        outputs=[video_output, summary_output],
    )

if __name__ == "__main__":
    print("Got to main")
    demo.launch(share=True, debug=True)