Upload app.py

app.py CHANGED
@@ -5,11 +5,10 @@
 # LICENSE file in the root directory of this source tree.

 """
-MapAnything V2
--
--
--
-- DBSCAN clustering for cross-view object matching
 """

 import gc

@@ -18,8 +17,6 @@ import shutil
 import sys
 import time
 from datetime import datetime
-from pathlib import Path
-from collections import defaultdict

 os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

@@ -28,10 +25,8 @@ import gradio as gr
 import numpy as np
 import spaces
 import torch
-import trimesh
 from PIL import Image
 from pillow_heif import register_heif_opener
-from sklearn.cluster import DBSCAN

 register_heif_opener()

@@ -65,10 +60,6 @@ def get_logo_base64():
     return None


-# ============================================================================
-# Configuration
-# ============================================================================
-
 # MapAnything Configuration
 high_level_config = {
     "path": "configs/train.yaml",
@@ -89,846 +80,13 @@ high_level_config = {
     "resolution": 518,
 }

-#
-# Segmentation method options:
-# 1. "segformer" - SegFormer (lightest, ~14MB, fastest)
-# 2. "maskformer" - MaskFormer (medium, ~100MB, instance segmentation)
-# 3. "grounding_sam" - GroundingDINO + SAM (strongest, ~110MB, text prompts)
-
-SEGMENTATION_METHOD = "segformer"  # use the lightest option by default
-
-# SegFormer Configuration (recommended - CPU friendly)
-SEGFORMER_MODEL_ID = "nvidia/segformer-b0-finetuned-ade-512-512"  # 14MB, 150 object classes
-
-# MaskFormer Configuration (alternative)
-MASKFORMER_MODEL_ID = "facebook/maskformer-swin-tiny-ade"  # 100MB, instance segmentation
-
-# GroundingDINO + SAM Configuration (original option - requires text prompts)
-GROUNDING_DINO_MODEL_ID = "IDEA-Research/grounding-dino-tiny"
-GROUNDING_DINO_BOX_THRESHOLD = 0.25
-GROUNDING_DINO_TEXT_THRESHOLD = 0.2
-SAM_MODEL_ID = "dhkim2810/MobileSAM"
-USE_MOBILE_SAM = True
-
-DEFAULT_TEXT_PROMPT = "chair . table . sofa . bed . desk . cabinet"
-
-# Common objects prompt for detection
-COMMON_OBJECTS_PROMPT = (
-    "person . face . hand . "
-    "chair . sofa . couch . bed . table . desk . cabinet . shelf . drawer . "
-    "door . window . wall . floor . ceiling . curtain . "
-    "tv . monitor . screen . computer . laptop . keyboard . mouse . "
-    "phone . tablet . remote . "
-    "lamp . light . chandelier . "
-    "book . magazine . paper . pen . pencil . "
-    "bottle . cup . glass . mug . plate . bowl . fork . knife . spoon . "
-    "vase . plant . flower . pot . "
-    "clock . picture . frame . mirror . "
-    "pillow . cushion . blanket . towel . "
-    "bag . backpack . suitcase . "
-    "box . basket . container . "
-    "shoe . hat . coat . "
-    "toy . ball . "
-    "car . bicycle . motorcycle . bus . truck . "
-    "tree . grass . sky . cloud . sun . "
-    "dog . cat . bird . "
-    "building . house . bridge . road . street . "
-    "sign . pole . bench"
-)
-
-# DBSCAN clustering configuration (eps in meters)
-DBSCAN_EPS_CONFIG = {
-    'sofa': 1.5,
-    'bed': 1.5,
-    'couch': 1.5,
-    'desk': 0.8,
-    'table': 0.8,
-    'chair': 0.6,
-    'cabinet': 0.8,
-    'window': 0.5,
-    'door': 0.6,
-    'tv': 0.6,
-    'default': 1.0
-}
-
-DBSCAN_MIN_SAMPLES = 1
-
-# Quality control
-MIN_DETECTION_CONFIDENCE = 0.35
-MIN_MASK_AREA = 100
-
-# Global model variables
 model = None
-grounding_dino_model = None
-grounding_dino_processor = None
-sam_predictor = None
-
-# SegFormer model (lightweight semantic segmentation)
-segformer_processor = None
-segformer_model = None
-
-# MaskFormer model (instance segmentation)
-maskformer_processor = None
-maskformer_model = None
-
-
-# ============================================================================
-# Model Loading Functions
-# ============================================================================
-
-def load_segformer_model(device="cpu"):
-    """Load the SegFormer model (lightest option, CPU friendly)"""
-    global segformer_processor, segformer_model
-
-    if segformer_model is not None:
-        print("✅ SegFormer already loaded")
-        return
-
-    try:
-        from transformers import SegformerImageProcessor, SegformerForSemanticSegmentation
-        import os
-
-        print(f"📥 Loading SegFormer from HuggingFace: {SEGFORMER_MODEL_ID}")
-        print(f" 💡 SegFormer-B0: ~14MB, 150 object classes, CPU optimized")
-
-        cache_dir = os.getenv("HF_HOME", "./hf_cache")
-
-        print(f" Downloading processor...")
-        segformer_processor = SegformerImageProcessor.from_pretrained(
-            SEGFORMER_MODEL_ID,
-            cache_dir=cache_dir
-        )
-
-        print(f" Downloading model...")
-        segformer_model = SegformerForSemanticSegmentation.from_pretrained(
-            SEGFORMER_MODEL_ID,
-            cache_dir=cache_dir,
-            low_cpu_mem_usage=True
-        ).to(device).eval()
-
-        print(f"✅ SegFormer loaded successfully on {device.upper()}")
-        print(f" Recognizable classes: people, furniture, walls, floors, etc. (150 classes)")
-
-    except Exception as e:
-        print(f"❌ SegFormer loading failed: {type(e).__name__}: {e}")
-        import traceback
-        traceback.print_exc()
-
-
-def load_maskformer_model(device="cpu"):
-    """Load the MaskFormer model (instance segmentation)"""
-    global maskformer_processor, maskformer_model
-
-    if maskformer_model is not None:
-        print("✅ MaskFormer already loaded")
-        return
-
-    try:
-        from transformers import MaskFormerImageProcessor, MaskFormerForInstanceSegmentation
-        import os
-
-        print(f"📥 Loading MaskFormer from HuggingFace: {MASKFORMER_MODEL_ID}")
-        print(f" 💡 MaskFormer: ~100MB, instance segmentation")
-
-        cache_dir = os.getenv("HF_HOME", "./hf_cache")
-
-        print(f" Downloading processor...")
-        maskformer_processor = MaskFormerImageProcessor.from_pretrained(
-            MASKFORMER_MODEL_ID,
-            cache_dir=cache_dir
-        )
-
-        print(f" Downloading model...")
-        maskformer_model = MaskFormerForInstanceSegmentation.from_pretrained(
-            MASKFORMER_MODEL_ID,
-            cache_dir=cache_dir,
-            low_cpu_mem_usage=True
-        ).to(device).eval()
-
-        print(f"✅ MaskFormer loaded successfully on {device.upper()}")
-
-    except Exception as e:
-        print(f"❌ MaskFormer loading failed: {type(e).__name__}: {e}")
-        import traceback
-        traceback.print_exc()
-
-def load_grounding_dino_model(device="cpu"):
-    """Load GroundingDINO model from HuggingFace (CPU optimized)"""
-    global grounding_dino_model, grounding_dino_processor
-
-    if grounding_dino_model is not None:
-        print("✅ GroundingDINO already loaded")
-        return
-
-    try:
-        from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
-        import os
-
-        # Force CPU for segmentation (saves GPU resources)
-        seg_device = "cpu"
-        print(f"📥 Loading GroundingDINO from HuggingFace: {GROUNDING_DINO_MODEL_ID} (using {seg_device.upper()})")
-
-        # Set the cache directory (HuggingFace Spaces friendly)
-        cache_dir = os.getenv("HF_HOME", "./hf_cache")
-
-        # Load the model (with retries and detailed logging)
-        print(f" Downloading processor...")
-        grounding_dino_processor = AutoProcessor.from_pretrained(
-            GROUNDING_DINO_MODEL_ID,
-            cache_dir=cache_dir,
-            trust_remote_code=True  # allow running remote code
-        )
-
-        print(f" Downloading model...")
-        grounding_dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
-            GROUNDING_DINO_MODEL_ID,
-            cache_dir=cache_dir,
-            trust_remote_code=True,
-            low_cpu_mem_usage=True  # reduce CPU memory usage
-        ).to(seg_device).eval()
-
-        print(f"✅ GroundingDINO loaded successfully on {seg_device.upper()}")
-
-    except ImportError as e:
-        print(f"❌ ImportError: {e}")
-        print(f"💡 Check that requirements.txt includes the transformers library")
-        import traceback
-        traceback.print_exc()
-    except OSError as e:
-        print(f"❌ OSError (network/file problem): {e}")
-        print(f"💡 Likely a network issue, or the model repository is unreachable")
-        print(f"💡 Things to try:")
-        print(f" 1. Check the HuggingFace Spaces network connection")
-        print(f" 2. Check that the model ID is correct: {GROUNDING_DINO_MODEL_ID}")
-        print(f" 3. Make sure there is enough disk space")
-        import traceback
-        traceback.print_exc()
-    except Exception as e:
-        print(f"❌ GroundingDINO loading failed: {type(e).__name__}: {e}")
-        import traceback
-        traceback.print_exc()
-
-
-def load_sam_model(device="cpu"):
-    """Load MobileSAM model from HuggingFace (CPU optimized, ~60x faster than SAM)"""
-    global sam_predictor
-
-    if sam_predictor is not None:
-        print("✅ SAM already loaded")
-        return
-
-    try:
-        from transformers import SamModel, SamProcessor
-        import os
-
-        # Force CPU for segmentation (MobileSAM is designed for mobile/CPU use)
-        seg_device = "cpu"
-        print(f"📥 Loading MobileSAM from HuggingFace: {SAM_MODEL_ID} (using {seg_device.upper()})")
-        print(f" 💡 MobileSAM is a lightweight variant: ~60x faster than SAM-huge, only 10MB, suitable for CPU")
-
-        # Set the cache directory
-        cache_dir = os.getenv("HF_HOME", "./hf_cache")
-
-        print(f" Downloading processor...")
-        sam_processor = SamProcessor.from_pretrained(
-            SAM_MODEL_ID,
-            cache_dir=cache_dir
-        )
-
-        print(f" Downloading model...")
-        sam_model = SamModel.from_pretrained(
-            SAM_MODEL_ID,
-            cache_dir=cache_dir,
-            low_cpu_mem_usage=True
-        ).to(seg_device).eval()
-
-        # Wrap in a predictor-like interface
-        class SAMPredictor:
-            def __init__(self, model, processor, device):
-                self.model = model
-                self.processor = processor
-                self.device = device
-                self.image = None
-
-            def set_image(self, image):
-                """Set image for prediction"""
-                if image.dtype == np.uint8:
-                    self.image = Image.fromarray(image)
-                else:
-                    self.image = Image.fromarray((image * 255).astype(np.uint8))
-
-            def predict(self, box, multimask_output=False):
-                """Predict mask from box (CPU optimized)"""
-                inputs = self.processor(
-                    self.image,
-                    input_boxes=[[[box]]],
-                    return_tensors="pt"
-                )
-                # Make sure everything runs on CPU
-                inputs = {k: v.to(self.device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}
-
-                with torch.no_grad():
-                    outputs = self.model(**inputs)
-
-                masks = self.processor.image_processor.post_process_masks(
-                    outputs.pred_masks.cpu(),
-                    inputs["original_sizes"].cpu() if "original_sizes" in inputs else outputs.pred_masks.new_tensor([[self.image.height, self.image.width]]),
-                    inputs["reshaped_input_sizes"].cpu() if "reshaped_input_sizes" in inputs else outputs.pred_masks.new_tensor([[self.image.height, self.image.width]])
-                )[0].squeeze().numpy()
-
-                if len(masks.shape) == 2:
-                    masks = masks[np.newaxis, ...]
-
-                return masks, None, None
-
-        sam_predictor = SAMPredictor(sam_model, sam_processor, seg_device)
-        print(f"✅ MobileSAM loaded successfully on {seg_device.upper()}")
-
-    except ImportError as e:
-        print(f"❌ ImportError: {e}")
-        print(f"💡 Check that requirements.txt includes the transformers library")
-        import traceback
-        traceback.print_exc()
-    except OSError as e:
-        print(f"❌ OSError (network/file problem): {e}")
-        print(f"💡 Likely a network issue, or the model repository is unreachable")
-        print(f"💡 Things to try:")
-        print(f" 1. Check the HuggingFace Spaces network connection")
-        print(f" 2. Check that the model ID is correct: {SAM_MODEL_ID}")
-        print(f" 3. Make sure there is enough disk space")
-        import traceback
-        traceback.print_exc()
-    except Exception as e:
-        print(f"❌ SAM loading failed: {type(e).__name__}: {e}")
-        import traceback
-        traceback.print_exc()
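The removed loader above wraps MobileSAM in a small box-prompted predictor. A minimal usage sketch of that wrapper, assuming the `load_sam_model` and `SAMPredictor` definitions above are present and that `image` is an HxWx3 uint8 NumPy array (the box coordinates are made up for illustration):

    # Sketch only: exercises the SAMPredictor wrapper defined above.
    load_sam_model("cpu")                        # populates the global sam_predictor
    sam_predictor.set_image(image)               # image: HxWx3 uint8 array
    masks, _, _ = sam_predictor.predict(box=[40, 60, 320, 400])  # [x1, y1, x2, y2] pixel box
    object_mask = masks[0]                       # HxW mask for the boxed object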
-
-
-# ============================================================================
-# Segmentation Functions
-# ============================================================================
-
-def generate_distinct_colors(n):
-    """Generate N visually distinct colors (RGB, 0-255)"""
-    import colorsys
-    if n == 0:
-        return []
-
-    colors = []
-    for i in range(n):
-        hue = i / max(n, 1)
-        rgb = colorsys.hsv_to_rgb(hue, 0.9, 0.95)
-        rgb_color = tuple(int(c * 255) for c in rgb)
-        colors.append(rgb_color)
-
-    return colors
-
-
-# ============================================================================
-# SegFormer segmentation (simplified option)
-# ============================================================================
-
-def run_segformer_segmentation(image_np, device="cpu"):
-    """Semantic segmentation with SegFormer (simplest option, CPU friendly)"""
-    if segformer_model is None or segformer_processor is None:
-        print("❌ SegFormer model not loaded")
-        return []
-
-    try:
-        import torch
-        from PIL import Image
-
-        # Prepare the image
-        if image_np.dtype != np.uint8:
-            image_np = (image_np * 255).astype(np.uint8)
-        image_pil = Image.fromarray(image_np)
-
-        # Inference
-        inputs = segformer_processor(images=image_pil, return_tensors="pt")
-        inputs = {k: v.to(device) for k, v in inputs.items()}
-
-        with torch.no_grad():
-            outputs = segformer_model(**inputs)
-
-        # Segmentation result
-        logits = outputs.logits  # (1, num_classes, H, W)
-        predicted_segmentation = logits.argmax(dim=1).squeeze().cpu().numpy()
-
-        # Build instance masks (split contiguous regions of the same class)
-        from scipy import ndimage
-
-        # Common ADE20K class mapping (partial)
-        ade20k_labels = {
-            5: "wall", 7: "floor", 11: "ceiling", 18: "window", 14: "door",
-            19: "table", 20: "chair", 22: "sofa", 23: "bed", 28: "cabinet",
-            34: "desk", 39: "lamp", 65: "television", 89: "shelf"
-        }
-
-        detections = []
-        masks = []
-
-        # Extract instances for each class
-        unique_labels = np.unique(predicted_segmentation)
-        for label_id in unique_labels:
-            if label_id == 0:  # skip background
-                continue
-
-            # Mask for this class
-            class_mask = (predicted_segmentation == label_id)
-
-            # Separate connected components (distinct instances)
-            labeled_mask, num_features = ndimage.label(class_mask)
-
-            for instance_id in range(1, num_features + 1):
-                instance_mask = (labeled_mask == instance_id)
-                mask_area = instance_mask.sum()
-
-                # Filter out small regions
-                if mask_area < MIN_MASK_AREA:
-                    continue
-
-                # Compute the bounding box
-                rows, cols = np.where(instance_mask)
-                if len(rows) == 0:
-                    continue
-
-                y_min, y_max = rows.min(), rows.max()
-                x_min, x_max = cols.min(), cols.max()
-                bbox = [x_min, y_min, x_max, y_max]
-
-                # Class name
-                label_name = ade20k_labels.get(int(label_id), f"object_{label_id}")
-
-                detections.append({
-                    'bbox': bbox,
-                    'label': label_name,
-                    'confidence': 0.9,  # SegFormer gives no confidence; use a fixed value
-                    'class_id': int(label_id)
-                })
-                masks.append(instance_mask)
-
-        return detections, masks
-
-    except Exception as e:
-        print(f"❌ SegFormer segmentation failed: {e}")
-        import traceback
-        traceback.print_exc()
-        return [], []
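The removed function above returns two parallel lists: detection dictionaries and boolean instance masks. A short consumption sketch, assuming an `image` array and the constants defined earlier in this diff:

    # Sketch only: how the (detections, masks) pair above is typically consumed.
    detections, masks = run_segformer_segmentation(image, "cpu")
    for det, mask in zip(detections, masks):
        x1, y1, x2, y2 = det['bbox']                 # pixel bounding box of one instance
        print(det['label'], det['class_id'], int(mask.sum()))  # class name, ADE20K id, area in px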


-
-
-
-        print("⚠️ GroundingDINO not loaded")
-        return []
-
-    try:
-        print(f"🔍 GroundingDINO detection (CPU): {text_prompt}")
-
-        # Convert to PIL Image
-        if image_np.dtype == np.uint8:
-            pil_image = Image.fromarray(image_np)
-        else:
-            pil_image = Image.fromarray((image_np * 255).astype(np.uint8))
-
-        # Preprocess - force CPU
-        seg_device = "cpu"
-        inputs = grounding_dino_processor(images=pil_image, text=text_prompt, return_tensors="pt")
-        inputs = {k: v.to(seg_device) for k, v in inputs.items() if isinstance(v, torch.Tensor)}
-
-        # Inference
-        with torch.no_grad():
-            outputs = grounding_dino_model(**inputs)
-
-        # Post-process
-        results = grounding_dino_processor.post_process_grounded_object_detection(
-            outputs,
-            inputs["input_ids"],
-            threshold=GROUNDING_DINO_BOX_THRESHOLD,
-            text_threshold=GROUNDING_DINO_TEXT_THRESHOLD,
-            target_sizes=[pil_image.size[::-1]]
-        )[0]
-
-        # Convert to unified format
-        detections = []
-        boxes = results["boxes"].cpu().numpy()
-        scores = results["scores"].cpu().numpy()
-        labels = results["labels"]
-
-        print(f"✅ Detected {len(boxes)} objects")
-
-        for box, score, label in zip(boxes, scores, labels):
-            detection = {
-                'bbox': box.tolist(),  # [x1, y1, x2, y2]
-                'label': label,
-                'confidence': float(score)
-            }
-            detections.append(detection)
-            print(f" - {label}: {score:.2f}")
-
-        return detections
-
-    except Exception as e:
-        print(f"❌ GroundingDINO detection failed: {e}")
-        import traceback
-        traceback.print_exc()
-        return []
-
-
-def run_sam_refinement(image_np, boxes):
-    """Run SAM precise segmentation"""
-    if sam_predictor is None:
-        print("⚠️ SAM not loaded, using bbox as mask")
-        # Use bbox to create simple rectangular mask
-        masks = []
-        h, w = image_np.shape[:2]
-        for box in boxes:
-            x1, y1, x2, y2 = map(int, box)
-            mask = np.zeros((h, w), dtype=bool)
-            mask[y1:y2, x1:x2] = True
-            masks.append(mask)
-        return masks
-
-    try:
-        print(f"🎯 SAM precise segmentation for {len(boxes)} regions...")
-        sam_predictor.set_image(image_np)
-
-        masks = []
-        for box in boxes:
-            x1, y1, x2, y2 = map(int, box)
-            box_array = np.array([x1, y1, x2, y2])
-
-            mask_output, _, _ = sam_predictor.predict(
-                box=box_array,
-                multimask_output=False
-            )
-            masks.append(mask_output[0])
-
-        print(f"✅ SAM segmentation complete")
-        return masks
-
-    except Exception as e:
-        print(f"❌ SAM segmentation failed: {e}")
-        # Fallback to bbox masks
-        masks = []
-        h, w = image_np.shape[:2]
-        for box in boxes:
-            x1, y1, x2, y2 = map(int, box)
-            mask = np.zeros((h, w), dtype=bool)
-            mask[y1:y2, x1:x2] = True
-            masks.append(mask)
-        return masks
-
-
-def normalize_label(label):
-    """Normalize label to main category"""
-    label = label.strip().lower()
-
-    priority_labels = ['sofa', 'bed', 'table', 'desk', 'chair', 'cabinet', 'window', 'door']
-
-    for priority in priority_labels:
-        if priority in label:
-            return priority
-
-    first_word = label.split()[0] if label else label
-
-    # Handle plural forms
-    if first_word.endswith('s') and len(first_word) > 1:
-        singular = first_word[:-1]
-        if first_word.endswith('sses'):
-            singular = first_word[:-2]
-        elif first_word.endswith('ies'):
-            singular = first_word[:-3] + 'y'
-        elif first_word.endswith('ves'):
-            singular = first_word[:-3] + 'f'
-        return singular
-
-    return first_word
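The removed normalizer above first maps any label containing a priority keyword to that keyword, then falls back to a rough singularization of the first word. A few illustrative calls on hypothetical label strings (not taken from the app):

    # Sketch only: expected behaviour of normalize_label on sample strings.
    normalize_label("Dining Table")    # -> "table"  (priority keyword match)
    normalize_label("lamps on desk")   # -> "desk"   (priority keyword beats the first word)
    normalize_label("shelves")         # -> "shelf"  ("ves" -> "f" rule)
    normalize_label("boxes")           # -> "boxe"   (naive "s" stripping; a known limitation)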
-
-
-def compute_object_3d_center(points, mask):
-    """Compute 3D center of object"""
-    masked_points = points[mask]
-    if len(masked_points) == 0:
-        return None
-    return np.median(masked_points, axis=0)
-
-
-def compute_adaptive_eps(centers, base_eps):
-    """Adaptively compute eps value based on object distribution"""
-    if len(centers) <= 1:
-        return base_eps
-
-    from scipy.spatial.distance import pdist
-    distances = pdist(centers)
-
-    if len(distances) == 0:
-        return base_eps
-
-    median_dist = np.median(distances)
-
-    if median_dist > base_eps * 2:
-        adaptive_eps = min(median_dist * 0.6, base_eps * 2.5)
-    elif median_dist > base_eps:
-        adaptive_eps = median_dist * 0.5
-    else:
-        adaptive_eps = base_eps
-
-    return adaptive_eps
-
-
-def match_objects_across_views(all_view_detections):
-    """Match objects across views using DBSCAN clustering"""
-    print("\n🔗 Matching objects across views using DBSCAN clustering...")
-
-    objects_by_label = defaultdict(list)
-
-    for view_idx, detections in enumerate(all_view_detections):
-        for det_idx, det in enumerate(detections):
-            if det.get('center_3d') is None:
-                continue
-
-            norm_label = normalize_label(det['label'])
-            objects_by_label[norm_label].append({
-                'view_idx': view_idx,
-                'det_idx': det_idx,
-                'label': det['label'],
-                'norm_label': norm_label,
-                'center_3d': det['center_3d'],
-                'confidence': det['confidence'],
-            })
-
-    if len(objects_by_label) == 0:
-        return {}, []
-
-    object_id_map = defaultdict(dict)
-    unique_objects = []
-    next_global_id = 0
-
-    for norm_label, objects in objects_by_label.items():
-        print(f"\n 📦 Processing {norm_label}: {len(objects)} detections")
-
-        if len(objects) == 1:
-            obj = objects[0]
-            unique_objects.append({
-                'global_id': next_global_id,
-                'label': obj['label'],
-                'views': [(obj['view_idx'], obj['det_idx'])],
-                'center_3d': obj['center_3d'],
-            })
-            object_id_map[obj['view_idx']][obj['det_idx']] = next_global_id
-            next_global_id += 1
-            print(f" → 1 cluster (single detection)")
-            continue
-
-        centers = np.array([obj['center_3d'] for obj in objects])
-
-        base_eps = DBSCAN_EPS_CONFIG.get(norm_label, DBSCAN_EPS_CONFIG.get('default', 1.0))
-        eps = compute_adaptive_eps(centers, base_eps)
-
-        clustering = DBSCAN(eps=eps, min_samples=DBSCAN_MIN_SAMPLES, metric='euclidean')
-        cluster_labels = clustering.fit_predict(centers)
-
-        n_clusters = len(set(cluster_labels)) - (1 if -1 in cluster_labels else 0)
-        n_noise = list(cluster_labels).count(-1)
-
-        if eps != base_eps:
-            print(f" → {n_clusters} clusters (base_eps={base_eps}m → adaptive_eps={eps:.2f}m)")
-        else:
-            print(f" → {n_clusters} clusters (eps={eps}m)")
-        if n_noise > 0:
-            print(f" ⚠️ {n_noise} noise points (isolated detections)")
-
-        for cluster_id in set(cluster_labels):
-            if cluster_id == -1:
-                for i, label in enumerate(cluster_labels):
-                    if label == -1:
-                        obj = objects[i]
-                        unique_objects.append({
-                            'global_id': next_global_id,
-                            'label': obj['label'],
-                            'views': [(obj['view_idx'], obj['det_idx'])],
-                            'center_3d': obj['center_3d'],
-                        })
-                        object_id_map[obj['view_idx']][obj['det_idx']] = next_global_id
-                        next_global_id += 1
-            else:
-                cluster_objects = [objects[i] for i, label in enumerate(cluster_labels) if label == cluster_id]
-
-                total_conf = sum(o['confidence'] for o in cluster_objects)
-                weighted_center = sum(o['center_3d'] * o['confidence'] for o in cluster_objects) / total_conf
-
-                unique_objects.append({
-                    'global_id': next_global_id,
-                    'label': cluster_objects[0]['label'],
-                    'views': [(o['view_idx'], o['det_idx']) for o in cluster_objects],
-                    'center_3d': weighted_center,
-                })
-
-                for obj in cluster_objects:
-                    object_id_map[obj['view_idx']][obj['det_idx']] = next_global_id
-
-                next_global_id += 1
-
-    print(f"\n 📊 Summary:")
-    print(f" Total detections: {sum(len(objs) for objs in objects_by_label.values())}")
-    print(f" Unique objects: {len(unique_objects)}")
-
-    return object_id_map, unique_objects
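The removed matcher above clusters per-class 3D object centers with DBSCAN, using the per-class eps table defined earlier so that large furniture tolerates more cross-view drift than small items. A minimal, self-contained sketch of the same idea, with made-up sample centers (not data from the app):

    # Sketch only: cluster 3D centers of one label ("chair") across views.
    import numpy as np
    from sklearn.cluster import DBSCAN

    centers = np.array([
        [0.52, 0.10, 2.01],   # chair seen from view 0
        [0.49, 0.12, 1.97],   # same chair seen from view 1 (within eps = 0.6 m)
        [2.80, 0.11, 3.40],   # a different chair elsewhere in the room
    ])
    labels = DBSCAN(eps=0.6, min_samples=1).fit_predict(centers)
    print(labels)  # [0 0 1]: the first two detections merge into one object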
-
-
-def create_multi_view_segmented_mesh(processed_data, all_view_detections, all_view_masks,
-                                     object_id_map, unique_objects, target_dir):
-    """Create multi-view fused segmented mesh"""
-    try:
-        print("\n🎨 Generating multi-view segmented mesh...")
-
-        unique_normalized_labels = sorted(set(normalize_label(obj['label']) for obj in unique_objects))
-        label_colors = {}
-        colors = generate_distinct_colors(len(unique_normalized_labels))
-
-        for i, norm_label in enumerate(unique_normalized_labels):
-            label_colors[norm_label] = colors[i]
-
-        for obj in unique_objects:
-            norm_label = normalize_label(obj['label'])
-            obj['color'] = label_colors[norm_label]
-            obj['normalized_label'] = norm_label
-
-        print(f" Object category color mapping:")
-        for norm_label, color in sorted(label_colors.items()):
-            count = sum(1 for obj in unique_objects if normalize_label(obj['label']) == norm_label)
-            print(f" {norm_label} × {count} → RGB{color}")
-
-        import utils3d
-
-        all_meshes = []
-
-        for view_idx in range(len(processed_data)):
-            view_data = processed_data[view_idx]
-            image = view_data["image"]
-            points3d = view_data["points3d"]
-            mask = view_data.get("mask")
-            normal = view_data.get("normal")
-
-            detections = all_view_detections[view_idx]
-            masks = all_view_masks[view_idx]
-
-            if len(detections) == 0:
-                continue
-
-            if image.dtype != np.uint8:
-                if image.max() <= 1.0:
-                    image = (image * 255).astype(np.uint8)
-                else:
-                    image = image.astype(np.uint8)
-
-            colored_image = image.copy()
-            confidence_map = np.zeros((image.shape[0], image.shape[1]), dtype=np.float32)
-
-            detections_info = []
-            filtered_count = 0
-            for det_idx, (det, seg_mask) in enumerate(zip(detections, masks)):
-                if det['confidence'] < MIN_DETECTION_CONFIDENCE:
-                    filtered_count += 1
-                    continue
-
-                mask_area = seg_mask.sum()
-                if mask_area < MIN_MASK_AREA:
-                    filtered_count += 1
-                    continue
-
-                global_id = object_id_map[view_idx].get(det_idx)
-                if global_id is None:
-                    continue
-
-                unique_obj = next((obj for obj in unique_objects if obj['global_id'] == global_id), None)
-                if unique_obj is None:
-                    continue
-
-                detections_info.append({
-                    'mask': seg_mask,
-                    'color': unique_obj['color'],
-                    'confidence': det['confidence'],
-                })
-
-            if filtered_count > 0:
-                print(f" View {view_idx + 1}: filtered {filtered_count} low-quality detections")
-
-            detections_info.sort(key=lambda x: x['confidence'])
-
-            for info in detections_info:
-                seg_mask = info['mask']
-                color = info['color']
-                conf = info['confidence']
-
-                update_mask = seg_mask & (conf > confidence_map)
-                colored_image[update_mask] = color
-                confidence_map[update_mask] = conf
-
-            height, width = image.shape[:2]
-
-            if normal is None:
-                faces, vertices, vertex_colors, vertex_uvs = utils3d.numpy.image_mesh(
-                    points3d,
-                    colored_image.astype(np.float32) / 255,
-                    utils3d.numpy.image_uv(width=width, height=height),
-                    mask=mask if mask is not None else np.ones((height, width), dtype=bool),
-                    tri=True
-                )
-                vertex_normals = None
-            else:
-                faces, vertices, vertex_colors, vertex_uvs, vertex_normals = utils3d.numpy.image_mesh(
-                    points3d,
-                    colored_image.astype(np.float32) / 255,
-                    utils3d.numpy.image_uv(width=width, height=height),
-                    normal,
-                    mask=mask if mask is not None else np.ones((height, width), dtype=bool),
-                    tri=True
-                )
-
-            vertices = vertices * np.array([1, -1, -1], dtype=np.float32)
-            if vertex_normals is not None:
-                vertex_normals = vertex_normals * np.array([1, -1, -1], dtype=np.float32)
-
-            view_mesh = trimesh.Trimesh(
-                vertices=vertices,
-                faces=faces,
-                vertex_normals=vertex_normals,
-                vertex_colors=(vertex_colors * 255).astype(np.uint8),
-                process=False
-            )
-
-            all_meshes.append(view_mesh)
-            print(f" View {view_idx + 1}: {len(vertices):,} vertices, {len(faces):,} faces")
-
-        if len(all_meshes) == 0:
-            print("⚠️ No mesh generated")
-            return None
-
-        print(" Fusing all views...")
-        combined_mesh = trimesh.util.concatenate(all_meshes)
-
-        glb_path = os.path.join(target_dir, 'segmented_mesh.glb')
-        combined_mesh.export(glb_path)
-
-        print(f"✅ Multi-view segmented mesh saved: {glb_path}")
-        print(f" Total: {len(combined_mesh.vertices):,} vertices, {len(combined_mesh.faces):,} faces")
-        print(f" {len(unique_objects)} unique objects")
-
-        return glb_path
-
-    except Exception as e:
-        print(f"❌ Failed to generate multi-view mesh: {e}")
-        import traceback
-        traceback.print_exc()
-        return None
-
-
-# ============================================================================
-# Core Model Inference
-# ============================================================================
-
 @spaces.GPU(duration=120)
 def run_model(
     target_dir,
@@ -936,24 +94,24 @@ def run_model(
     mask_edges=True,
     filter_black_bg=False,
     filter_white_bg=False,
-    enable_segmentation=False,
-    text_prompt=DEFAULT_TEXT_PROMPT,
     progress=gr.Progress(),
 ):
     """
-    Run the MapAnything model
     """
     global model
-    import torch

-
     print(f"Processing images from {target_dir}")

     device = "cuda" if torch.cuda.is_available() else "cpu"
     device = torch.device(device)

-    # Initialize
-    progress(0.05, desc="📥
     if model is None:
         model = initialize_mapanything_model(high_level_config, device)
     else:
@@ -961,46 +119,8 @@ def run_model(

     model.eval()

-    # Load
-
-    progress(0.1, desc="🎯 Loading segmentation models (CPU)...")
-    print(f"\n{'='*70}")
-    print(f"🎯 Starting segmentation model loading... (method: {SEGMENTATION_METHOD})")
-    print(f"{'='*70}")
-
-    if SEGMENTATION_METHOD == "segformer":
-        # Option 1: SegFormer (lightest, ~14MB, fastest)
-        print("📌 Using: SegFormer (lightweight, no text prompt needed)")
-        load_segformer_model("cpu")
-        if segformer_model is None:
-            print("❌ SegFormer model failed to load!")
-            raise RuntimeError("SegFormer model failed to load; check the network connection")
-
-    elif SEGMENTATION_METHOD == "maskformer":
-        # Option 2: MaskFormer (medium, ~100MB)
-        print("📌 Using: MaskFormer (instance segmentation)")
-        load_maskformer_model("cpu")
-        if maskformer_model is None:
-            print("❌ MaskFormer model failed to load!")
-            raise RuntimeError("MaskFormer model failed to load; check the network connection")
-
-    else:  # "grounding_sam"
-        # Option 3: GroundingDINO + SAM (strongest, ~110MB, requires text prompts)
-        print("📌 Using: GroundingDINO + SAM (text-prompt driven)")
-        load_grounding_dino_model("cpu")
-        load_sam_model("cpu")
-        if grounding_dino_model is None:
-            print("❌ GroundingDINO model failed to load!")
-            raise RuntimeError("GroundingDINO model failed to load; check the network connection")
-        if sam_predictor is None:
-            print("❌ SAM model failed to load!")
-            raise RuntimeError("SAM model failed to load; check the network connection")
-
-    print(f"✅ Segmentation models loaded")
-    print(f"{'='*70}\n")
-
-    # Load images
-    progress(0.15, desc="📷 Loading images...")
     print("Loading images...")
     image_folder_path = os.path.join(target_dir, "images")
     views = load_images(image_folder_path)
@@ -1010,15 +130,22 @@ def run_model(
         raise ValueError("No images found. Check your upload.")

     # Run model inference
-
     print("Running inference...")
     outputs = model.infer(
         views, apply_mask=apply_mask, mask_edges=True, memory_efficient_inference=False
     )

-    # Convert predictions
-    progress(0.
     predictions = {}
     extrinsic_list = []
     intrinsic_list = []
     world_points_list = []
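The following hunk keeps the per-view conversion of predicted depth into world-space points via `depthmap_to_world_frame`. As a rough sketch of what such a back-projection does (assumed semantics; the real helper comes from the MapAnything codebase), each pixel (u, v) with depth d is lifted through the camera intrinsics and then transformed by the camera-to-world pose:

    # Sketch only: pinhole back-projection of a depth map to world coordinates.
    import numpy as np

    def backproject_to_world(depth, K, cam_to_world):
        h, w = depth.shape
        u, v = np.meshgrid(np.arange(w), np.arange(h))
        rays = np.stack([u, v, np.ones_like(u)], axis=-1) @ np.linalg.inv(K).T  # pixel -> camera rays
        pts_cam = rays * depth[..., None]                                       # scale rays by depth
        pts_h = np.concatenate([pts_cam, np.ones((h, w, 1))], axis=-1)          # homogeneous coords
        return (pts_h @ cam_to_world.T)[..., :3]                                # camera -> world frame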
@@ -1026,158 +153,81 @@ def run_model(
     images_list = []
     final_mask_list = []

-
-
-
-

         pts3d_computed, valid_mask = depthmap_to_world_frame(
             depthmap_torch, intrinsics_torch, camera_pose_torch
         )

         if "mask" in pred:
             mask = pred["mask"][0].squeeze(-1).cpu().numpy().astype(bool)
         else:
             mask = np.ones_like(depthmap_torch.cpu().numpy(), dtype=bool)

         mask = mask & valid_mask.cpu().numpy()
         image = pred["img_no_norm"][0].cpu().numpy()

         extrinsic_list.append(camera_pose_torch.cpu().numpy())
         intrinsic_list.append(intrinsics_torch.cpu().numpy())
         world_points_list.append(pts3d_computed.cpu().numpy())
         depth_maps_list.append(depthmap_torch.cpu().numpy())
-        images_list.append(image)
-        final_mask_list.append(mask)

     predictions["extrinsic"] = np.stack(extrinsic_list, axis=0)
     predictions["intrinsic"] = np.stack(intrinsic_list, axis=0)
     predictions["world_points"] = np.stack(world_points_list, axis=0)

     depth_maps = np.stack(depth_maps_list, axis=0)
     if len(depth_maps.shape) == 3:
         depth_maps = depth_maps[..., np.newaxis]
     predictions["depth"] = depth_maps

     predictions["images"] = np.stack(images_list, axis=0)
     predictions["final_mask"] = np.stack(final_mask_list, axis=0)

-    # Process visualization
-    progress(0.
     processed_data = process_predictions_for_visualization(
         predictions, views, high_level_config, filter_black_bg, filter_white_bg
     )

-    #
-    segmented_glb = None
-    if enable_segmentation:
-        progress(0.65, desc="🎯 Starting object segmentation...")
-        print(f"\n{'='*70}")
-        print(f"🎯 Starting object segmentation... (method: {SEGMENTATION_METHOD})")
-        print(f"📐 Minimum mask area: {MIN_MASK_AREA} px")
-        if SEGMENTATION_METHOD == "grounding_sam":
-            print(f"🔍 Detection prompt: {text_prompt[:100]}...")
-            print(f"📊 Confidence threshold: {GROUNDING_DINO_BOX_THRESHOLD}")
-        print(f"{'='*70}\n")
-
-        all_view_detections = []
-        all_view_masks = []
-
-        for view_idx, ref_image in enumerate(images_list):
-            progress(0.65 + (view_idx / len(images_list)) * 0.2,
-                     desc=f"🔍 Detecting objects in view {view_idx + 1}/{len(images_list)}...")
-            print(f"\n📸 Processing view {view_idx + 1}/{len(images_list)}...")
-
-            if ref_image.dtype != np.uint8:
-                ref_image_np = (ref_image * 255).astype(np.uint8)
-            else:
-                ref_image_np = ref_image
-
-            # Pick the processing pipeline for the chosen segmentation method
-            if SEGMENTATION_METHOD == "segformer":
-                # SegFormer: direct semantic segmentation, no text prompt needed
-                detections, masks = run_segformer_segmentation(ref_image_np, "cpu")
-                print(f" ✓ Detected {len(detections)} objects")
-
-                if len(detections) > 0:
-                    for i, det in enumerate(detections):
-                        print(f" Object {i+1}: {det['label']}")
-
-                    points3d = world_points_list[view_idx]
-                    for det_idx, (det, mask) in enumerate(zip(detections, masks)):
-                        center_3d = compute_object_3d_center(points3d, mask)
-                        det['center_3d'] = center_3d
-                        det['mask_2d'] = mask
-
-                    all_view_detections.append(detections)
-                    all_view_masks.append(masks)
-                else:
-                    all_view_detections.append([])
-                    all_view_masks.append([])
-
-            elif SEGMENTATION_METHOD == "grounding_sam":
-                # GroundingDINO + SAM: text-prompt driven
-                detections = run_grounding_dino_detection(ref_image_np, text_prompt, "cpu")
-                print(f" ✓ Detected {len(detections)} objects")
-
-                if len(detections) > 0:
-                    for i, det in enumerate(detections):
-                        print(f" Object {i+1}: {det['label']} (confidence: {det['confidence']:.2f})")
-                    boxes = [d['bbox'] for d in detections]
-                    masks = run_sam_refinement(ref_image_np, boxes)
-
-                    points3d = world_points_list[view_idx]
-                    for det_idx, (det, mask) in enumerate(zip(detections, masks)):
-                        center_3d = compute_object_3d_center(points3d, mask)
-                        det['center_3d'] = center_3d
-                        det['mask_2d'] = mask
-
-                    all_view_detections.append(detections)
-                    all_view_masks.append(masks)
-                else:
-                    all_view_detections.append([])
-                    all_view_masks.append([])
-
-        # Match objects across views
-        total_detections = sum(len(dets) for dets in all_view_detections)
-        print(f"\n📊 Total detections: {total_detections}")
-
-        if any(len(dets) > 0 for dets in all_view_detections):
-            progress(0.85, desc="🔗 Matching objects across views...")
-            object_id_map, unique_objects = match_objects_across_views(all_view_detections)
-
-            # Generate segmented mesh
-            progress(0.9, desc="🏗️ Generating segmented 3D model...")
-            segmented_glb = create_multi_view_segmented_mesh(
-                processed_data, all_view_detections, all_view_masks,
-                object_id_map, unique_objects, target_dir
-            )
-
-            if segmented_glb:
-                print(f"✅ Segmented 3D model generated: {segmented_glb}")
-            else:
-                print(f"⚠️ Failed to generate the segmented 3D model")
-        else:
-            print(f"\n{'='*70}")
-            print(f"⚠️ No objects detected; cannot build a segmented model")
-            print(f"\n💡 Debugging tips:")
-            print(f" 1. Check that the detection prompt is accurate (current: {text_prompt[:50]}...)")
-            print(f" 2. Current confidence threshold: {GROUNDING_DINO_BOX_THRESHOLD}")
-            print(f" 3. Try a more general prompt, e.g.: {COMMON_OBJECTS_PROMPT[:80]}...")
-            print(f" 4. Make sure the images contain clearly visible objects")
-            print(f"{'='*70}\n")
-
-    # Cleanup
     progress(0.95, desc="🧹 Cleaning up memory...")
     torch.cuda.empty_cache()

-
-


-# ============================================================================
-# Helper Functions (from app.py)
-# ============================================================================

 def update_view_selectors(processed_data):
     """Update view selector dropdowns based on available views"""
@@ -1188,9 +238,9 @@ def update_view_selectors(processed_data):
     choices = [f"View {i + 1}" for i in range(num_views)]

     return (
-        gr.Dropdown(choices=choices, value=choices[0]),
-        gr.Dropdown(choices=choices, value=choices[0]),
-        gr.Dropdown(choices=choices, value=choices[0]),
     )


|
@@ -1228,24 +278,33 @@ def update_measure_view(processed_data, view_index):
|
|
| 1228 |
"""Update measure view for a specific view index with mask overlay"""
|
| 1229 |
view_data = get_view_data_by_index(processed_data, view_index)
|
| 1230 |
if view_data is None:
|
| 1231 |
-
return None, []
|
| 1232 |
|
|
|
|
| 1233 |
image = view_data["image"].copy()
|
| 1234 |
|
|
|
|
| 1235 |
if image.dtype != np.uint8:
|
| 1236 |
if image.max() <= 1.0:
|
| 1237 |
image = (image * 255).astype(np.uint8)
|
| 1238 |
else:
|
| 1239 |
image = image.astype(np.uint8)
|
| 1240 |
|
|
|
|
| 1241 |
if view_data["mask"] is not None:
|
| 1242 |
mask = view_data["mask"]
|
| 1243 |
-
|
|
|
|
|
|
|
|
|
|
| 1244 |
|
| 1245 |
if invalid_mask.any():
|
|
|
|
| 1246 |
overlay_color = np.array([255, 220, 220], dtype=np.uint8)
|
| 1247 |
-
|
| 1248 |
-
|
|
|
|
|
|
|
| 1249 |
image[:, :, c] = np.where(
|
| 1250 |
invalid_mask,
|
| 1251 |
(1 - alpha) * image[:, :, c] + alpha * overlay_color[c],
|
|
@@ -1260,6 +319,7 @@ def navigate_depth_view(processed_data, current_selector_value, direction):
     if processed_data is None or len(processed_data) == 0:
         return "View 1", None

     try:
         current_view = int(current_selector_value.split()[1]) - 1
     except:

@@ -1279,6 +339,7 @@ def navigate_normal_view(processed_data, current_selector_value, direction):
     if processed_data is None or len(processed_data) == 0:
         return "View 1", None

     try:
         current_view = int(current_selector_value.split()[1]) - 1
     except:

@@ -1298,6 +359,7 @@ def navigate_measure_view(processed_data, current_selector_value, direction):
     if processed_data is None or len(processed_data) == 0:
         return "View 1", None, []

     try:
         current_view = int(current_selector_value.split()[1]) - 1
     except:

@@ -1317,6 +379,7 @@ def populate_visualization_tabs(processed_data):
     if processed_data is None or len(processed_data) == 0:
         return None, None, None, []

     depth_vis = update_depth_view(processed_data, 0)
     normal_vis = update_normal_view(processed_data, 0)
     measure_img, _ = update_measure_view(processed_data, 0)
@@ -1324,6 +387,9 @@ def populate_visualization_tabs(processed_data):
     return depth_vis, normal_vis, measure_img, []


 def handle_uploads(unified_upload, s_time_interval=1.0):
     """
     Create a new 'target_dir' + 'images' subfolder, and place user-uploaded

@@ -1333,10 +399,12 @@ def handle_uploads(unified_upload, s_time_interval=1.0):
     gc.collect()
     torch.cuda.empty_cache()

     timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
     target_dir = f"input_images_{timestamp}"
     target_dir_images = os.path.join(target_dir, "images")

     if os.path.exists(target_dir):
         shutil.rmtree(target_dir)
     os.makedirs(target_dir)

@@ -1344,6 +412,7 @@ def handle_uploads(unified_upload, s_time_interval=1.0):

     image_paths = []

     if unified_upload is not None:
         for file_data in unified_upload:
             if isinstance(file_data, dict) and "name" in file_data:

@@ -1353,13 +422,23 @@ def handle_uploads(unified_upload, s_time_interval=1.0):

                 file_ext = os.path.splitext(file_path)[1].lower()

                 video_extensions = [
-                    ".mp4",
                 ]
                 if file_ext in video_extensions:
                     vs = cv2.VideoCapture(file_path)
                     fps = vs.get(cv2.CAP_PROP_FPS)
-                    frame_interval = int(fps * s_time_interval)

                     count = 0
                     video_frame_num = 0
@@ -1369,6 +448,7 @@ def handle_uploads(unified_upload, s_time_interval=1.0):
                             break
                         count += 1
                         if count % frame_interval == 0:
                             base_name = os.path.splitext(os.path.basename(file_path))[0]
                             image_path = os.path.join(
                                 target_dir_images, f"{base_name}_{video_frame_num:06}.png"

@@ -1377,52 +457,82 @@ def handle_uploads(unified_upload, s_time_interval=1.0):
                             image_paths.append(image_path)
                             video_frame_num += 1
                     vs.release()
-                    print(

                 else:
                     if file_ext in [".heic", ".heif"]:
                         try:
                             with Image.open(file_path) as img:
                                 if img.mode not in ("RGB", "L"):
                                     img = img.convert("RGB")

                                 base_name = os.path.splitext(os.path.basename(file_path))[0]
-                                dst_path = os.path.join(

                                 img.save(dst_path, "JPEG", quality=95)
                                 image_paths.append(dst_path)
-                                print(
                         except Exception as e:
                             print(f"Error converting HEIC file {file_path}: {e}")
-
                             shutil.copy(file_path, dst_path)
                             image_paths.append(dst_path)
                     else:
-
                         shutil.copy(file_path, dst_path)
                         image_paths.append(dst_path)

     image_paths = sorted(image_paths)

     end_time = time.time()
-    print(
     return target_dir, image_paths


 def update_gallery_on_upload(input_video, input_images, s_time_interval=1.0):
-    """
     if not input_video and not input_images:
-        return None, None, None, None
     target_dir, image_paths = handle_uploads(input_video, input_images, s_time_interval)
     return (
-        None,
         None,
         target_dir,
         image_paths,
-        "
     )


 @spaces.GPU(duration=120)
 def gradio_demo(
     target_dir,
@@ -1432,19 +542,20 @@ def gradio_demo(
     filter_white_bg=False,
     apply_mask=True,
     show_mesh=True,
-    enable_segmentation=False,
-    text_prompt=DEFAULT_TEXT_PROMPT,
     progress=gr.Progress(),
 ):
-    """
     if not os.path.isdir(target_dir) or target_dir == "None":
-        return None,

     progress(0, desc="🔄 Preparing reconstruction...")
     start_time = time.time()
     gc.collect()
     torch.cuda.empty_cache()

     target_dir_images = os.path.join(target_dir, "images")
     all_files = (
         sorted(os.listdir(target_dir_images))
@@ -1454,94 +565,92 @@ def gradio_demo(
     all_files = [f"{i}: {filename}" for i, filename in enumerate(all_files)]
     frame_filter_choices = ["All"] + all_files

-    progress(0.05, desc="🚀 Running the MapAnything model...")
-    print("
     with torch.no_grad():
-        predictions, processed_data
-            target_dir, apply_mask, True, filter_black_bg, filter_white_bg,
-            enable_segmentation, text_prompt, progress
         )

-    #
     progress(0.92, desc="💾 Saving predictions...")
     prediction_save_path = os.path.join(target_dir, "predictions.npz")
     np.savez(prediction_save_path, **predictions)

     if frame_filter is None:
         frame_filter = "All"

-    #
-    progress(0.93, desc="🏗️
     glbfile = os.path.join(
         target_dir,
-        f"glbscene_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}_mesh{show_mesh}.glb",
     )

-    #
     glbscene = predictions_to_glb(
         predictions,
         filter_by_frames=frame_filter,
         show_cam=show_cam,
         mask_black_bg=filter_black_bg,
         mask_white_bg=filter_white_bg,
-        as_mesh=show_mesh,
     )
     glbscene.export(file_obj=glbfile)

-    #
     progress(0.96, desc="🧹 Cleaning up memory...")
     del predictions
     gc.collect()
     torch.cuda.empty_cache()

     end_time = time.time()
-
-

-    # Populate visualization tabs
     progress(0.98, desc="🎨 Generating visualizations...")
     depth_vis, normal_vis, measure_img, measure_pts = populate_visualization_tabs(
         processed_data
     )

-    # Update view selectors
     depth_selector, normal_selector, measure_selector = update_view_selectors(
         processed_data
     )

     progress(1.0, desc="✅ All done!")
-
-    # Add segmentation status to the log
-    if enable_segmentation:
-        if segmented_glb:
-            log_msg += f"\n🎨 Segmented model generated"
-        else:
-            log_msg += f"\n⚠️ No objects detected, no segmented model"

     return (
         glbfile,
-        segmented_glb,
         log_msg,
         gr.Dropdown(choices=frame_filter_choices, value=frame_filter, interactive=True),
         processed_data,
         depth_vis,
         normal_vis,
         measure_img,
-        "",
         depth_selector,
         normal_selector,
         measure_selector,
     )


 def colorize_depth(depth_map, mask=None):
     """Convert depth map to colorized visualization with optional mask"""
     if depth_map is None:
         return None

     depth_normalized = depth_map.copy()
     valid_mask = depth_normalized > 0

     if mask is not None:
         valid_mask = valid_mask & mask

@@ -1552,12 +661,14 @@ def colorize_depth(depth_map, mask=None):

     depth_normalized[valid_mask] = (depth_normalized[valid_mask] - p5) / (p95 - p5)

     import matplotlib.pyplot as plt

     colormap = plt.cm.turbo_r
     colored = colormap(depth_normalized)
     colored = (colored[:, :, :3] * 255).astype(np.uint8)

     colored[~valid_mask] = [255, 255, 255]

     return colored
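The kept colorizer normalizes depth between its 5th and 95th percentiles before applying the colormap, which keeps a few very near or very far pixels from washing out the rest of the range. A small self-contained sketch of that normalization step, on made-up values rather than app data:

    # Sketch only: percentile-based depth normalization for visualization.
    import numpy as np

    depth = np.array([0.8, 1.0, 1.2, 1.5, 9.0])        # one far outlier at 9.0 m
    p5, p95 = np.percentile(depth, [5, 95])
    normalized = np.clip((depth - p5) / (p95 - p5), 0, 1)
    print(normalized.round(2))                          # the outlier saturates at 1.0 instead of stretching the scale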
@@ -1568,12 +679,15 @@ def colorize_normal(normal_map, mask=None):
|
|
| 1568 |
if normal_map is None:
|
| 1569 |
return None
|
| 1570 |
|
|
|
|
| 1571 |
normal_vis = normal_map.copy()
|
| 1572 |
|
|
|
|
| 1573 |
if mask is not None:
|
| 1574 |
invalid_mask = ~mask
|
| 1575 |
-
normal_vis[invalid_mask] = [0, 0, 0]
|
| 1576 |
|
|
|
|
| 1577 |
normal_vis = (normal_vis + 1.0) / 2.0
|
| 1578 |
normal_vis = (normal_vis * 255).astype(np.uint8)
|
| 1579 |
|
|
@@ -1586,11 +700,15 @@ def process_predictions_for_visualization(
|
|
| 1586 |
"""Extract depth, normal, and 3D points from predictions for visualization"""
|
| 1587 |
processed_data = {}
|
| 1588 |
|
|
|
|
| 1589 |
for view_idx, view in enumerate(views):
|
|
|
|
| 1590 |
image = rgb(view["img"], norm_type=high_level_config["data_norm_type"])
|
| 1591 |
|
|
|
|
| 1592 |
pred_pts3d = predictions["world_points"][view_idx]
|
| 1593 |
|
|
|
|
| 1594 |
view_data = {
|
| 1595 |
"image": image[0],
|
| 1596 |
"points3d": pred_pts3d,
|
|
@@ -1599,15 +717,22 @@ def process_predictions_for_visualization(
|
|
| 1599 |
"mask": None,
|
| 1600 |
}
|
| 1601 |
|
|
|
|
| 1602 |
mask = predictions["final_mask"][view_idx].copy()
|
| 1603 |
|
|
|
|
| 1604 |
if filter_black_bg:
|
|
|
|
| 1605 |
view_colors = image[0] * 255 if image[0].max() <= 1.0 else image[0]
|
|
|
|
| 1606 |
black_bg_mask = view_colors.sum(axis=2) >= 16
|
| 1607 |
mask = mask & black_bg_mask
|
| 1608 |
|
|
|
|
| 1609 |
if filter_white_bg:
|
|
|
|
| 1610 |
view_colors = image[0] * 255 if image[0].max() <= 1.0 else image[0]
|
|
|
|
| 1611 |
white_bg_mask = ~(
|
| 1612 |
(view_colors[:, :, 0] > 240)
|
| 1613 |
& (view_colors[:, :, 1] > 240)
|
|
@@ -1631,6 +756,7 @@ def reset_measure(processed_data):
|
|
| 1631 |
if processed_data is None or len(processed_data) == 0:
|
| 1632 |
return None, [], ""
|
| 1633 |
|
|
|
|
| 1634 |
first_view = list(processed_data.values())[0]
|
| 1635 |
return first_view["image"], [], ""
|
| 1636 |
|
|
@@ -1640,18 +766,20 @@ def measure(
|
|
| 1640 |
):
|
| 1641 |
"""Handle measurement on images"""
|
| 1642 |
try:
|
| 1643 |
-
print(f"
|
| 1644 |
|
| 1645 |
if processed_data is None or len(processed_data) == 0:
|
| 1646 |
-
return None, [], "
|
| 1647 |
|
|
|
|
| 1648 |
try:
|
| 1649 |
current_view_index = int(current_view_selector.split()[1]) - 1
|
| 1650 |
except:
|
| 1651 |
current_view_index = 0
|
| 1652 |
|
| 1653 |
-
print(f"
|
| 1654 |
|
|
|
|
| 1655 |
if current_view_index < 0 or current_view_index >= len(processed_data):
|
| 1656 |
current_view_index = 0
|
| 1657 |
|
|
@@ -1659,46 +787,54 @@ def measure(
|
|
| 1659 |
current_view = processed_data[view_keys[current_view_index]]
|
| 1660 |
|
| 1661 |
if current_view is None:
|
| 1662 |
-
return None, [], "
|
| 1663 |
|
| 1664 |
point2d = event.index[0], event.index[1]
|
| 1665 |
-
print(f"
|
| 1666 |
|
|
|
|
| 1667 |
if (
|
| 1668 |
current_view["mask"] is not None
|
| 1669 |
and 0 <= point2d[1] < current_view["mask"].shape[0]
|
| 1670 |
and 0 <= point2d[0] < current_view["mask"].shape[1]
|
| 1671 |
):
|
|
|
|
| 1672 |
if not current_view["mask"][point2d[1], point2d[0]]:
|
| 1673 |
-
print(f"
|
|
|
|
| 1674 |
masked_image, _ = update_measure_view(
|
| 1675 |
processed_data, current_view_index
|
| 1676 |
)
|
| 1677 |
return (
|
| 1678 |
masked_image,
|
| 1679 |
measure_points,
|
| 1680 |
-
'<span style="color: red; font-weight: bold;"
|
| 1681 |
)
|
| 1682 |
|
| 1683 |
measure_points.append(point2d)
|
| 1684 |
|
|
|
|
| 1685 |
image, _ = update_measure_view(processed_data, current_view_index)
|
| 1686 |
if image is None:
|
| 1687 |
-
return None, [], "
|
| 1688 |
|
| 1689 |
image = image.copy()
|
| 1690 |
points3d = current_view["points3d"]
|
| 1691 |
|
|
|
|
| 1692 |
try:
|
| 1693 |
if image.dtype != np.uint8:
|
| 1694 |
if image.max() <= 1.0:
|
|
|
|
| 1695 |
image = (image * 255).astype(np.uint8)
|
| 1696 |
else:
|
|
|
|
| 1697 |
image = image.astype(np.uint8)
|
| 1698 |
except Exception as e:
|
| 1699 |
-
print(f"
|
| 1700 |
-
return None, [], f"
|
| 1701 |
|
|
|
|
| 1702 |
try:
|
| 1703 |
for p in measure_points:
|
| 1704 |
if 0 <= p[0] < image.shape[1] and 0 <= p[1] < image.shape[0]:
|
|
@@ -1706,8 +842,8 @@ def measure(
|
|
| 1706 |
image, p, radius=5, color=(255, 0, 0), thickness=2
|
| 1707 |
)
|
| 1708 |
except Exception as e:
|
| 1709 |
-
print(f"
|
| 1710 |
-
return None, [], f"
|
| 1711 |
|
| 1712 |
depth_text = ""
|
| 1713 |
try:
|
|
@@ -1718,22 +854,24 @@ def measure(
|
|
| 1718 |
and 0 <= p[0] < current_view["depth"].shape[1]
|
| 1719 |
):
|
| 1720 |
d = current_view["depth"][p[1], p[0]]
|
| 1721 |
-
depth_text += f"- **P{i + 1}
|
| 1722 |
else:
|
|
|
|
| 1723 |
if (
|
| 1724 |
points3d is not None
|
| 1725 |
and 0 <= p[1] < points3d.shape[0]
|
| 1726 |
and 0 <= p[0] < points3d.shape[1]
|
| 1727 |
):
|
| 1728 |
z = points3d[p[1], p[0], 2]
|
| 1729 |
-
depth_text += f"- **P{i + 1} Z
|
| 1730 |
except Exception as e:
|
| 1731 |
-
print(f"
|
| 1732 |
-
depth_text = f"
|
| 1733 |
|
| 1734 |
if len(measure_points) == 2:
|
| 1735 |
try:
|
| 1736 |
point1, point2 = measure_points
|
|
|
|
| 1737 |
if (
|
| 1738 |
0 <= point1[0] < image.shape[1]
|
| 1739 |
and 0 <= point1[1] < image.shape[0]
|
|
@@ -1744,7 +882,8 @@ def measure(
|
|
| 1744 |
image, point1, point2, color=(255, 0, 0), thickness=2
|
| 1745 |
)
|
| 1746 |
|
| 1747 |
-
|
|
|
|
| 1748 |
if (
|
| 1749 |
points3d is not None
|
| 1750 |
and 0 <= point1[1] < points3d.shape[0]
|
|
@@ -1756,35 +895,39 @@ def measure(
|
|
| 1756 |
p1_3d = points3d[point1[1], point1[0]]
|
| 1757 |
p2_3d = points3d[point2[1], point2[0]]
|
| 1758 |
distance = np.linalg.norm(p1_3d - p2_3d)
|
| 1759 |
-
distance_text = f"-
|
| 1760 |
except Exception as e:
|
| 1761 |
-
print(f"
|
| 1762 |
-
distance_text = f"-
|
| 1763 |
|
| 1764 |
measure_points = []
|
| 1765 |
text = depth_text + distance_text
|
| 1766 |
-
print(f"
|
| 1767 |
return [image, measure_points, text]
|
| 1768 |
except Exception as e:
|
| 1769 |
-
print(f"
|
| 1770 |
-
return None, [], f"
|
| 1771 |
else:
|
| 1772 |
-
print(f"
|
| 1773 |
return [image, measure_points, depth_text]
|
| 1774 |
|
| 1775 |
except Exception as e:
|
| 1776 |
-
print(f"
|
| 1777 |
-
return None, [], f"
|
| 1778 |
|
| 1779 |
|
| 1780 |
def clear_fields():
|
| 1781 |
-
"""
|
| 1782 |
-
|
|
|
|
|
|
|
| 1783 |
|
| 1784 |
|
| 1785 |
def update_log():
|
| 1786 |
-
"""
|
| 1787 |
-
|
|
|
|
|
|
|
| 1788 |
|
| 1789 |
|
| 1790 |
def update_visualization(
|
|
@@ -1796,16 +939,30 @@ def update_visualization(
|
|
| 1796 |
filter_white_bg=False,
|
| 1797 |
show_mesh=True,
|
| 1798 |
):
|
| 1799 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1800 |
if is_example == "True":
|
| 1801 |
-
return
|
|
|
|
|
|
|
|
|
|
| 1802 |
|
| 1803 |
if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
|
| 1804 |
-
return
|
|
|
|
|
|
|
|
|
|
| 1805 |
|
| 1806 |
predictions_path = os.path.join(target_dir, "predictions.npz")
|
| 1807 |
if not os.path.exists(predictions_path):
|
| 1808 |
-
return
|
|
|
|
|
|
|
|
|
|
| 1809 |
|
| 1810 |
loaded = np.load(predictions_path, allow_pickle=True)
|
| 1811 |
predictions = {key: loaded[key] for key in loaded.keys()}
|
|
@@ -1815,17 +972,21 @@ def update_visualization(
|
|
| 1815 |
f"glbscene_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}_mesh{show_mesh}_black{filter_black_bg}_white{filter_white_bg}.glb",
|
| 1816 |
)
|
| 1817 |
|
| 1818 |
-
|
| 1819 |
-
|
| 1820 |
-
|
| 1821 |
-
|
| 1822 |
-
|
| 1823 |
-
|
| 1824 |
-
|
| 1825 |
-
|
| 1826 |
-
|
|
|
|
| 1827 |
|
| 1828 |
-
return
|
|
|
|
|
|
|
|
|
|
| 1829 |
|
| 1830 |
|
| 1831 |
def update_all_views_on_filter_change(
|
|
@@ -1837,7 +998,11 @@ def update_all_views_on_filter_change(
|
|
| 1837 |
normal_view_selector,
|
| 1838 |
measure_view_selector,
|
| 1839 |
):
|
| 1840 |
-
"""
|
|
|
|
|
|
|
|
|
|
|
|
|
| 1841 |
if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
|
| 1842 |
return processed_data, None, None, None, []
|
| 1843 |
|
|
@@ -1846,16 +1011,20 @@ def update_all_views_on_filter_change(
|
|
| 1846 |
return processed_data, None, None, None, []
|
| 1847 |
|
| 1848 |
try:
|
|
|
|
| 1849 |
loaded = np.load(predictions_path, allow_pickle=True)
|
| 1850 |
predictions = {key: loaded[key] for key in loaded.keys()}
|
| 1851 |
|
|
|
|
| 1852 |
image_folder_path = os.path.join(target_dir, "images")
|
| 1853 |
views = load_images(image_folder_path)
|
| 1854 |
|
|
|
|
| 1855 |
new_processed_data = process_predictions_for_visualization(
|
| 1856 |
predictions, views, high_level_config, filter_black_bg, filter_white_bg
|
| 1857 |
)
|
| 1858 |
|
|
|
|
| 1859 |
try:
|
| 1860 |
depth_view_idx = (
|
| 1861 |
int(depth_view_selector.split()[1]) - 1 if depth_view_selector else 0
|
|
@@ -1879,6 +1048,7 @@ def update_all_views_on_filter_change(
|
|
| 1879 |
except:
|
| 1880 |
measure_view_idx = 0
|
| 1881 |
|
|
|
|
| 1882 |
depth_vis = update_depth_view(new_processed_data, depth_view_idx)
|
| 1883 |
normal_vis = update_normal_view(new_processed_data, normal_view_idx)
|
| 1884 |
measure_img, _ = update_measure_view(new_processed_data, measure_view_idx)
|
|
@@ -1890,10 +1060,9 @@ def update_all_views_on_filter_change(
|
|
| 1890 |
return processed_data, None, None, None, []
|
| 1891 |
|
| 1892 |
|
| 1893 |
-
#
|
| 1894 |
-
# Example
|
| 1895 |
-
#
|
| 1896 |
-
|
| 1897 |
def get_scene_info(examples_dir):
|
| 1898 |
"""Get information about scenes in the examples directory"""
|
| 1899 |
import glob
|
|
@@ -1905,6 +1074,7 @@ def get_scene_info(examples_dir):
|
|
| 1905 |
for scene_folder in sorted(os.listdir(examples_dir)):
|
| 1906 |
scene_path = os.path.join(examples_dir, scene_folder)
|
| 1907 |
if os.path.isdir(scene_path):
|
|
|
|
| 1908 |
image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff", "*.tif"]
|
| 1909 |
image_files = []
|
| 1910 |
for ext in image_extensions:
|
|
@@ -1912,6 +1082,7 @@ def get_scene_info(examples_dir):
|
|
| 1912 |
image_files.extend(glob.glob(os.path.join(scene_path, ext.upper())))
|
| 1913 |
|
| 1914 |
if image_files:
|
|
|
|
| 1915 |
image_files = sorted(image_files)
|
| 1916 |
first_image = image_files[0]
|
| 1917 |
num_images = len(image_files)
|
|
@@ -1930,9 +1101,10 @@ def get_scene_info(examples_dir):
|
|
| 1930 |
|
| 1931 |
|
| 1932 |
def load_example_scene(scene_name, examples_dir="examples"):
|
| 1933 |
-
"""
|
| 1934 |
scenes = get_scene_info(examples_dir)
|
| 1935 |
|
|
|
|
| 1936 |
selected_scene = None
|
| 1937 |
for scene in scenes:
|
| 1938 |
if scene["name"] == scene_name:
|
|
@@ -1940,26 +1112,28 @@ def load_example_scene(scene_name, examples_dir="examples"):
|
|
| 1940 |
break
|
| 1941 |
|
| 1942 |
if selected_scene is None:
|
| 1943 |
-
return None, None, None, "
|
| 1944 |
|
|
|
|
|
|
|
| 1945 |
file_objects = []
|
| 1946 |
for image_path in selected_scene["image_files"]:
|
| 1947 |
file_objects.append(image_path)
|
| 1948 |
|
|
|
|
| 1949 |
target_dir, image_paths = handle_uploads(file_objects, 1.0)
|
| 1950 |
|
| 1951 |
return (
|
| 1952 |
-
None,
|
| 1953 |
-
target_dir,
|
| 1954 |
-
image_paths,
|
| 1955 |
-
f"
|
| 1956 |
)
|
| 1957 |
|
| 1958 |
|
| 1959 |
-
#
|
| 1960 |
-
# Gradio UI
|
| 1961 |
-
#
|
| 1962 |
-
|
| 1963 |
theme = get_gradio_theme()
|
| 1964 |
|
| 1965 |
# 自定义CSS防止UI抖动
|
|
@@ -2022,45 +1196,44 @@ CUSTOM_CSS = GRADIO_CSS + """
|
|
| 2022 |
}
|
| 2023 |
"""
|
| 2024 |
|
| 2025 |
-
|
| 2026 |
-
|
| 2027 |
-
<script>
|
| 2028 |
-
// 添加粘贴板支持
|
| 2029 |
-
document.addEventListener('paste', function(e) {
|
| 2030 |
-
const items = e.clipboardData.items;
|
| 2031 |
-
for (let i = 0; i < items.length; i++) {
|
| 2032 |
-
if (items[i].type.indexOf('image') !== -1) {
|
| 2033 |
-
const blob = items[i].getAsFile();
|
| 2034 |
-
const fileInput = document.querySelector('input[type="file"][multiple]');
|
| 2035 |
-
if (fileInput) {
|
| 2036 |
-
const dataTransfer = new DataTransfer();
|
| 2037 |
-
dataTransfer.items.add(blob);
|
| 2038 |
-
fileInput.files = dataTransfer.files;
|
| 2039 |
-
fileInput.dispatchEvent(new Event('change', { bubbles: true }));
|
| 2040 |
-
console.log('✅ 图片已从剪贴板粘贴');
|
| 2041 |
-
}
|
| 2042 |
-
}
|
| 2043 |
-
}
|
| 2044 |
-
});
|
| 2045 |
-
|
| 2046 |
-
// 添加提示信息
|
| 2047 |
-
console.log('💡 粘贴板功能已启用:使用 Ctrl+V 可直接粘贴截图');
|
| 2048 |
-
</script>
|
| 2049 |
-
"""
|
| 2050 |
-
|
| 2051 |
-
with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="MapAnything V2 - 3D重建与物体分割") as demo:
|
| 2052 |
is_example = gr.Textbox(label="is_example", visible=False, value="None")
|
|
|
|
| 2053 |
processed_data_state = gr.State(value=None)
|
| 2054 |
measure_points_state = gr.State(value=[])
|
|
|
|
| 2055 |
|
| 2056 |
# 添加粘贴板支持的 JavaScript
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 2057 |
gr.HTML(PASTE_JS)
|
| 2058 |
-
|
| 2059 |
-
#
|
| 2060 |
gr.HTML("""
|
| 2061 |
<div style="text-align: center; margin: 20px 0;">
|
| 2062 |
-
<h2 style="color: #1976D2; margin-bottom: 10px;">MapAnything
|
| 2063 |
-
<p style="color: #666; font-size: 16px;"
|
| 2064 |
</div>
|
| 2065 |
""")
|
| 2066 |
|
|
@@ -2133,23 +1306,6 @@ with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="MapAnything V2 - 3D重建与
|
|
| 2133 |
clear_color=[0.0, 0.0, 0.0, 0.0]
|
| 2134 |
)
|
| 2135 |
|
| 2136 |
-
with gr.Tab("🎨 分割3D"):
|
| 2137 |
-
gr.Markdown(
|
| 2138 |
-
"""
|
| 2139 |
-
💡 **使用说明**:
|
| 2140 |
-
1. 在下方「⚙️ 高级选项」中勾选「启用语义分割 (CPU)」
|
| 2141 |
-
2. 点击「开始重建」按钮
|
| 2142 |
-
3. 等待处理完成后,分割结果将显示在此处
|
| 2143 |
-
|
| 2144 |
-
📌 如果没有显示分割结果,请查看控制台日志查找原因
|
| 2145 |
-
""",
|
| 2146 |
-
elem_classes=["info-box"]
|
| 2147 |
-
)
|
| 2148 |
-
segmented_output = gr.Model3D(
|
| 2149 |
-
height=450, zoom_speed=0.5, pan_speed=0.5,
|
| 2150 |
-
clear_color=[0.0, 0.0, 0.0, 0.0]
|
| 2151 |
-
)
|
| 2152 |
-
|
| 2153 |
with gr.Tab("📊 深度图"):
|
| 2154 |
with gr.Row(elem_classes=["navigation-row"]):
|
| 2155 |
prev_depth_btn = gr.Button("◀", size="sm", scale=1)
|
|
@@ -2200,8 +1356,8 @@ with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="MapAnything V2 - 3D重建与
|
|
| 2200 |
max_lines=1
|
| 2201 |
)
|
| 2202 |
|
| 2203 |
-
#
|
| 2204 |
-
with gr.Accordion("⚙️ 高级选项", open=
|
| 2205 |
with gr.Row(equal_height=False):
|
| 2206 |
with gr.Column(scale=1, min_width=300):
|
| 2207 |
gr.Markdown("#### 可视化参数")
|
|
@@ -2218,32 +1374,13 @@ with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="MapAnything V2 - 3D重建与
|
|
| 2218 |
apply_mask_checkbox = gr.Checkbox(
|
| 2219 |
label="应用深度掩码", value=True
|
| 2220 |
)
|
| 2221 |
-
|
| 2222 |
-
gr.Markdown("#### 分割参数")
|
| 2223 |
-
gr.Markdown("💡 **说明**: 分割使用 CPU 运行(MobileSAM轻量级模型),不占用GPU资源")
|
| 2224 |
-
enable_segmentation = gr.Checkbox(
|
| 2225 |
-
label="启用语义分割 (CPU)", value=False
|
| 2226 |
-
)
|
| 2227 |
-
|
| 2228 |
-
text_prompt = gr.Textbox(
|
| 2229 |
-
value=DEFAULT_TEXT_PROMPT,
|
| 2230 |
-
label="检测物体(用 . 分隔)",
|
| 2231 |
-
placeholder="例如: chair . table . sofa",
|
| 2232 |
-
lines=2,
|
| 2233 |
-
max_lines=2
|
| 2234 |
-
)
|
| 2235 |
-
|
| 2236 |
-
with gr.Row():
|
| 2237 |
-
detect_all_btn = gr.Button("🔍 检测所有", size="sm")
|
| 2238 |
-
restore_default_btn = gr.Button("↻ 默认", size="sm")
|
| 2239 |
-
|
| 2240 |
-
gr.Markdown("📌 **提示**: 启用后会在「分割3D」标签页显示彩色分割模型")
|
| 2241 |
-
|
| 2242 |
# 示例场景(可折叠)
|
| 2243 |
with gr.Accordion("🖼️ 示例场景", open=False):
|
|
|
|
| 2244 |
scenes = get_scene_info("examples")
|
|
|
|
| 2245 |
if scenes:
|
| 2246 |
-
for i in range(0, len(scenes), 4):
|
| 2247 |
with gr.Row(equal_height=True):
|
| 2248 |
for j in range(4):
|
| 2249 |
scene_idx = i + j
|
|
@@ -2251,10 +1388,10 @@ with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="MapAnything V2 - 3D重建与
|
|
| 2251 |
scene = scenes[scene_idx]
|
| 2252 |
with gr.Column(scale=1, min_width=150):
|
| 2253 |
scene_img = gr.Image(
|
| 2254 |
-
value=scene["thumbnail"],
|
| 2255 |
height=150,
|
| 2256 |
-
interactive=False,
|
| 2257 |
-
show_label=False,
|
| 2258 |
sources=[],
|
| 2259 |
container=False
|
| 2260 |
)
|
|
@@ -2266,22 +1403,14 @@ with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="MapAnything V2 - 3D重建与
|
|
| 2266 |
fn=lambda name=scene["name"]: load_example_scene(name),
|
| 2267 |
outputs=[
|
| 2268 |
reconstruction_output,
|
| 2269 |
-
target_dir_output,
|
| 2270 |
-
|
|
|
|
|
|
|
| 2271 |
)
|
| 2272 |
|
| 2273 |
# === 事件绑定 ===
|
| 2274 |
|
| 2275 |
-
# 分割选项按钮
|
| 2276 |
-
detect_all_btn.click(
|
| 2277 |
-
fn=lambda: COMMON_OBJECTS_PROMPT,
|
| 2278 |
-
outputs=[text_prompt]
|
| 2279 |
-
)
|
| 2280 |
-
restore_default_btn.click(
|
| 2281 |
-
fn=lambda: DEFAULT_TEXT_PROMPT,
|
| 2282 |
-
outputs=[text_prompt]
|
| 2283 |
-
)
|
| 2284 |
-
|
| 2285 |
# 上传文件自动更新
|
| 2286 |
def update_gallery_on_unified_upload(files, interval):
|
| 2287 |
if not files:
|
|
@@ -2411,7 +1540,7 @@ with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="MapAnything V2 - 3D重建与
|
|
| 2411 |
# 重建按钮
|
| 2412 |
submit_btn.click(
|
| 2413 |
fn=clear_fields,
|
| 2414 |
-
outputs=[reconstruction_output
|
| 2415 |
).then(
|
| 2416 |
fn=update_log,
|
| 2417 |
outputs=[log_output]
|
|
@@ -2420,11 +1549,10 @@ with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="MapAnything V2 - 3D重建与
|
|
| 2420 |
inputs=[
|
| 2421 |
target_dir_output, frame_filter, show_cam,
|
| 2422 |
filter_black_bg, filter_white_bg,
|
| 2423 |
-
apply_mask_checkbox, show_mesh
|
| 2424 |
-
enable_segmentation, text_prompt
|
| 2425 |
],
|
| 2426 |
outputs=[
|
| 2427 |
-
reconstruction_output,
|
| 2428 |
processed_data_state, depth_map, normal_map, measure_image,
|
| 2429 |
measure_text, depth_view_selector, normal_view_selector, measure_view_selector
|
| 2430 |
]
|
|
@@ -2434,8 +1562,8 @@ with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="MapAnything V2 - 3D重建与
|
|
| 2434 |
)
|
| 2435 |
|
| 2436 |
# 清空按钮
|
| 2437 |
-
clear_btn.add([reconstruction_output,
|
| 2438 |
-
|
| 2439 |
# 可视化参数实时更新
|
| 2440 |
for component in [frame_filter, show_cam, show_mesh]:
|
| 2441 |
component.change(
|
|
@@ -2457,7 +1585,7 @@ with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="MapAnything V2 - 3D重建与
|
|
| 2457 |
],
|
| 2458 |
outputs=[processed_data_state, depth_map, normal_map, measure_image, measure_points_state]
|
| 2459 |
)
|
| 2460 |
-
|
| 2461 |
# 深度图导航
|
| 2462 |
prev_depth_btn.click(
|
| 2463 |
fn=lambda pd, cs: navigate_depth_view(pd, cs, -1),
|
|
@@ -2514,17 +1642,4 @@ with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="MapAnything V2 - 3D重建与
|
|
| 2514 |
outputs=[measure_image, measure_points_state]
|
| 2515 |
)
|
| 2516 |
|
| 2517 |
-
# 启动信息
|
| 2518 |
-
print("\n" + "="*70)
|
| 2519 |
-
print("🚀 MapAnything V2 - 3D重建与物体分割")
|
| 2520 |
-
print("="*70)
|
| 2521 |
-
print("📊 核心技术: 自适应DBSCAN聚类 + 多视图融合")
|
| 2522 |
-
print(f"🔧 质量控制: 置信度≥{MIN_DETECTION_CONFIDENCE} | 面积≥{MIN_MASK_AREA}px")
|
| 2523 |
-
print(f"🎯 聚类半径: 沙发{DBSCAN_EPS_CONFIG['sofa']}m | 桌子{DBSCAN_EPS_CONFIG['table']}m | 窗户{DBSCAN_EPS_CONFIG['window']}m | 默认{DBSCAN_EPS_CONFIG['default']}m")
|
| 2524 |
-
print("\n💡 分割配置 (CPU优化):")
|
| 2525 |
-
print(f" - 检测模型: {GROUNDING_DINO_MODEL_ID} (CPU)")
|
| 2526 |
-
print(f" - 分割模型: {SAM_MODEL_ID} (MobileSAM, 10MB, CPU)")
|
| 2527 |
-
print(f" - 运行设备: CPU (不占用GPU资源,适合分离部署)")
|
| 2528 |
-
print("="*70 + "\n")
|
| 2529 |
-
|
| 2530 |
demo.queue(max_size=20).launch(show_error=True, share=True, ssr_mode=False)
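The measure handler above turns two clicked pixels into a metric distance by indexing the per-view world-point map and taking the norm of the difference. A minimal sketch of that step follows; it is illustrative only and not part of the committed file, and the array/argument names are assumptions.

import numpy as np

def pixel_distance(points3d, p1, p2):
    # points3d: (H, W, 3) world-frame points for one view; p1/p2: (x, y) pixel coords.
    a = points3d[p1[1], p1[0]]
    b = points3d[p2[1], p2[0]]
    return float(np.linalg.norm(a - b))  # Euclidean distance in metres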
# LICENSE file in the root directory of this source tree.

"""
+MapAnything V2 - 3D重建系统(中文版)
+- 多视图 3D 重建
+- 深度估计与法线计算
+- 距离测量功能
"""

import gc

import sys
import time
from datetime import datetime

os.environ["PYTORCH_CUDA_ALLOC_CONF"] = "expandable_segments:True"

import numpy as np
import spaces
import torch
from PIL import Image
from pillow_heif import register_heif_opener

register_heif_opener()

    return None


# MapAnything Configuration
high_level_config = {
    "path": "configs/train.yaml",

    "resolution": 518,
}

+# Initialize model - this will be done on GPU when needed
model = None


+# -------------------------------------------------------------------------
+# 1) Core model inference
+# -------------------------------------------------------------------------
@spaces.GPU(duration=120)
def run_model(
    target_dir,

    mask_edges=True,
    filter_black_bg=False,
    filter_white_bg=False,
    progress=gr.Progress(),
):
    """
+   Run the MapAnything model on images in the 'target_dir/images' folder and return predictions.
    """
    global model
+   import torch  # Ensure torch is available in function scope

+   start_time = time.time()
    print(f"Processing images from {target_dir}")

+   # Device check
+   progress(0, desc="🔧 初始化设备...")
    device = "cuda" if torch.cuda.is_available() else "cpu"
    device = torch.device(device)

+   # Initialize model if not already done
+   progress(0.05, desc="📥 加载模型... (~5秒)")
    if model is None:
        model = initialize_mapanything_model(high_level_config, device)
    else:

    model.eval()

+   # Load images using MapAnything's load_images function
+   progress(0.15, desc="📷 加载图片... (~2秒)")
    print("Loading images...")
    image_folder_path = os.path.join(target_dir, "images")
    views = load_images(image_folder_path)

        raise ValueError("No images found. Check your upload.")

    # Run model inference
+   num_images = len(views)
+   estimated_time = num_images * 3  # 估计每张图片3秒
+   progress(0.2, desc=f"🚀 运行3D重建... ({num_images}张图片,预计{estimated_time}秒)")
    print("Running inference...")
+
+   inference_start = time.time()
    outputs = model.infer(
        views, apply_mask=apply_mask, mask_edges=True, memory_efficient_inference=False
    )
+   inference_time = time.time() - inference_start

+   # Convert predictions to format expected by visualization
+   progress(0.6, desc=f"🔄 处理预测结果... (推理耗时: {inference_time:.1f}秒)")
    predictions = {}
+
+   # Initialize lists for the required keys
    extrinsic_list = []
    intrinsic_list = []
    world_points_list = []

    images_list = []
    final_mask_list = []

+   # Loop through the outputs
+   for i, pred in enumerate(outputs):
+       if i % max(1, len(outputs) // 5) == 0:
+           progress(0.6 + (i / len(outputs)) * 0.25, desc=f"🔄 处理视图 {i+1}/{len(outputs)}...")
+       # Extract data from predictions
+       depthmap_torch = pred["depth_z"][0].squeeze(-1)  # (H, W)
+       intrinsics_torch = pred["intrinsics"][0]  # (3, 3)
+       camera_pose_torch = pred["camera_poses"][0]  # (4, 4)

+       # Compute new pts3d using depth, intrinsics, and camera pose
        pts3d_computed, valid_mask = depthmap_to_world_frame(
            depthmap_torch, intrinsics_torch, camera_pose_torch
        )

+       # Convert to numpy arrays for visualization
+       # Check if mask key exists in pred, if not, fill with boolean trues in the size of depthmap_torch
        if "mask" in pred:
            mask = pred["mask"][0].squeeze(-1).cpu().numpy().astype(bool)
        else:
+           # Fill with boolean trues in the size of depthmap_torch
            mask = np.ones_like(depthmap_torch.cpu().numpy(), dtype=bool)

+       # Combine with valid depth mask
        mask = mask & valid_mask.cpu().numpy()
+
        image = pred["img_no_norm"][0].cpu().numpy()

+       # Append to lists
        extrinsic_list.append(camera_pose_torch.cpu().numpy())
        intrinsic_list.append(intrinsics_torch.cpu().numpy())
        world_points_list.append(pts3d_computed.cpu().numpy())
        depth_maps_list.append(depthmap_torch.cpu().numpy())
+       images_list.append(image)  # Add image to list
+       final_mask_list.append(mask)  # Add final_mask to list

+   # Convert lists to numpy arrays with required shapes
+   # extrinsic: (S, 3, 4) - batch of camera extrinsic matrices
    predictions["extrinsic"] = np.stack(extrinsic_list, axis=0)
+
+   # intrinsic: (S, 3, 3) - batch of camera intrinsic matrices
    predictions["intrinsic"] = np.stack(intrinsic_list, axis=0)
+
+   # world_points: (S, H, W, 3) - batch of 3D world points
    predictions["world_points"] = np.stack(world_points_list, axis=0)

+   # depth: (S, H, W, 1) or (S, H, W) - batch of depth maps
    depth_maps = np.stack(depth_maps_list, axis=0)
+   # Add channel dimension if needed to match (S, H, W, 1) format
    if len(depth_maps.shape) == 3:
        depth_maps = depth_maps[..., np.newaxis]
+
    predictions["depth"] = depth_maps

+   # images: (S, H, W, 3) - batch of input images
    predictions["images"] = np.stack(images_list, axis=0)
+
+   # final_mask: (S, H, W) - batch of final masks for filtering
    predictions["final_mask"] = np.stack(final_mask_list, axis=0)

+   # Process data for visualization tabs (depth, normal, measure)
+   progress(0.85, desc="🎨 生成深度图与法线图...")
    processed_data = process_predictions_for_visualization(
        predictions, views, high_level_config, filter_black_bg, filter_white_bg
    )

+   # Clean up
    progress(0.95, desc="🧹 清理内存...")
    torch.cuda.empty_cache()

+   total_time = time.time() - start_time
+   progress(1.0, desc=f"✅ 完成!总耗时: {total_time:.1f}秒")
+   print(f"Total processing time: {total_time:.2f} seconds")

+   return predictions, processed_data
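The loop above rebuilds each view's world-space point map from its depth map, intrinsics, and camera pose via depthmap_to_world_frame. As an illustration only, and not the actual MapAnything implementation, a minimal pinhole back-projection can be sketched as follows; the function and variable names here are assumptions.

import numpy as np

def backproject_depth_to_world(depth, K, cam_to_world):
    # Illustrative sketch: lift an (H, W) z-depth map to world-frame points,
    # assuming a pinhole camera with 3x3 intrinsics K and a 4x4 camera-to-world pose.
    H, W = depth.shape
    u, v = np.meshgrid(np.arange(W), np.arange(H))
    pix = np.stack([u, v, np.ones_like(u)], axis=-1).reshape(-1, 3).T  # (3, H*W)
    rays = np.linalg.inv(K) @ pix                 # camera-frame directions
    pts_cam = rays * depth.reshape(1, -1)         # scale by depth
    pts_cam_h = np.vstack([pts_cam, np.ones((1, pts_cam.shape[1]))])
    pts_world = (cam_to_world @ pts_cam_h)[:3].T.reshape(H, W, 3)
    valid = depth > 0                             # simple validity mask
    return pts_world, valid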
def update_view_selectors(processed_data):
    """Update view selector dropdowns based on available views"""

    choices = [f"View {i + 1}" for i in range(num_views)]

    return (
+       gr.Dropdown(choices=choices, value=choices[0]),  # depth_view_selector
+       gr.Dropdown(choices=choices, value=choices[0]),  # normal_view_selector
+       gr.Dropdown(choices=choices, value=choices[0]),  # measure_view_selector
    )


    """Update measure view for a specific view index with mask overlay"""
    view_data = get_view_data_by_index(processed_data, view_index)
    if view_data is None:
+       return None, []  # image, measure_points

+   # Get the base image
    image = view_data["image"].copy()

+   # Ensure image is in uint8 format
    if image.dtype != np.uint8:
        if image.max() <= 1.0:
            image = (image * 255).astype(np.uint8)
        else:
            image = image.astype(np.uint8)

+   # Apply mask overlay if mask is available
    if view_data["mask"] is not None:
        mask = view_data["mask"]
+
+       # Create light grey overlay for masked areas
+       # Masked areas (False values) will be overlaid with light grey
+       invalid_mask = ~mask  # Areas where mask is False

        if invalid_mask.any():
+           # Create a light grey overlay (RGB: 192, 192, 192)
            overlay_color = np.array([255, 220, 220], dtype=np.uint8)
+
+           # Apply overlay with some transparency
+           alpha = 0.5  # Transparency level
+           for c in range(3):  # RGB channels
                image[:, :, c] = np.where(
                    invalid_mask,
                    (1 - alpha) * image[:, :, c] + alpha * overlay_color[c],

    if processed_data is None or len(processed_data) == 0:
        return "View 1", None

+   # Parse current view number
    try:
        current_view = int(current_selector_value.split()[1]) - 1
    except:

    if processed_data is None or len(processed_data) == 0:
        return "View 1", None

+   # Parse current view number
    try:
        current_view = int(current_selector_value.split()[1]) - 1
    except:

    if processed_data is None or len(processed_data) == 0:
        return "View 1", None, []

+   # Parse current view number
    try:
        current_view = int(current_selector_value.split()[1]) - 1
    except:

    if processed_data is None or len(processed_data) == 0:
        return None, None, None, []

+   # Use update functions to ensure confidence filtering is applied from the start
    depth_vis = update_depth_view(processed_data, 0)
    normal_vis = update_normal_view(processed_data, 0)
    measure_img, _ = update_measure_view(processed_data, 0)

    return depth_vis, normal_vis, measure_img, []


+# -------------------------------------------------------------------------
+# 2) Handle uploaded video/images --> produce target_dir + images
+# -------------------------------------------------------------------------
def handle_uploads(unified_upload, s_time_interval=1.0):
    """
    Create a new 'target_dir' + 'images' subfolder, and place user-uploaded

    gc.collect()
    torch.cuda.empty_cache()

+   # Create a unique folder name
    timestamp = datetime.now().strftime("%Y%m%d_%H%M%S_%f")
    target_dir = f"input_images_{timestamp}"
    target_dir_images = os.path.join(target_dir, "images")

+   # Clean up if somehow that folder already exists
    if os.path.exists(target_dir):
        shutil.rmtree(target_dir)
    os.makedirs(target_dir)

    image_paths = []

+   # --- Handle uploaded files (both images and videos) ---
    if unified_upload is not None:
        for file_data in unified_upload:
            if isinstance(file_data, dict) and "name" in file_data:

            file_ext = os.path.splitext(file_path)[1].lower()

+           # Check if it's a video file
            video_extensions = [
+               ".mp4",
+               ".avi",
+               ".mov",
+               ".mkv",
+               ".wmv",
+               ".flv",
+               ".webm",
+               ".m4v",
+               ".3gp",
            ]
            if file_ext in video_extensions:
+               # Handle as video
                vs = cv2.VideoCapture(file_path)
                fps = vs.get(cv2.CAP_PROP_FPS)
+               frame_interval = int(fps * s_time_interval)  # frames per interval

                count = 0
                video_frame_num = 0

                        break
                    count += 1
                    if count % frame_interval == 0:
+                       # Use original filename as prefix for frames
                        base_name = os.path.splitext(os.path.basename(file_path))[0]
                        image_path = os.path.join(
                            target_dir_images, f"{base_name}_{video_frame_num:06}.png"

                        image_paths.append(image_path)
                        video_frame_num += 1
                vs.release()
+               print(
+                   f"Extracted {video_frame_num} frames from video: {os.path.basename(file_path)}"
+               )

            else:
+               # Handle as image
+               # Check if the file is a HEIC image
                if file_ext in [".heic", ".heif"]:
+                   # Convert HEIC to JPEG for better gallery compatibility
                    try:
                        with Image.open(file_path) as img:
+                           # Convert to RGB if necessary (HEIC can have different color modes)
                            if img.mode not in ("RGB", "L"):
                                img = img.convert("RGB")

+                           # Create JPEG filename
                            base_name = os.path.splitext(os.path.basename(file_path))[0]
+                           dst_path = os.path.join(
+                               target_dir_images, f"{base_name}.jpg"
+                           )

+                           # Save as JPEG with high quality
                            img.save(dst_path, "JPEG", quality=95)
                            image_paths.append(dst_path)
+                           print(
+                               f"Converted HEIC to JPEG: {os.path.basename(file_path)} -> {os.path.basename(dst_path)}"
+                           )
                    except Exception as e:
                        print(f"Error converting HEIC file {file_path}: {e}")
+                       # Fall back to copying as is
+                       dst_path = os.path.join(
+                           target_dir_images, os.path.basename(file_path)
+                       )
                        shutil.copy(file_path, dst_path)
                        image_paths.append(dst_path)
                else:
+                   # Regular image files - copy as is
+                   dst_path = os.path.join(
+                       target_dir_images, os.path.basename(file_path)
+                   )
                    shutil.copy(file_path, dst_path)
                    image_paths.append(dst_path)

+   # Sort final images for gallery
    image_paths = sorted(image_paths)

    end_time = time.time()
+   print(
+       f"Files processed to {target_dir_images}; took {end_time - start_time:.3f} seconds"
+   )
    return target_dir, image_paths
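handle_uploads samples video uploads at roughly one frame per s_time_interval seconds by keeping every int(fps * s_time_interval)-th frame. A standalone sketch of that sampling pattern is shown below; it is illustrative only, the file names and helper name are assumptions, and it is not the app's exact code.

import os
import cv2

def sample_frames(video_path, out_dir, time_interval=1.0):
    # Illustrative sketch of interval-based frame extraction with OpenCV.
    os.makedirs(out_dir, exist_ok=True)
    cap = cv2.VideoCapture(video_path)
    fps = cap.get(cv2.CAP_PROP_FPS) or 30.0
    step = max(1, int(fps * time_interval))  # frames to skip between saves
    count, saved = 0, 0
    while True:
        ok, frame = cap.read()
        if not ok:
            break
        count += 1
        if count % step == 0:
            cv2.imwrite(os.path.join(out_dir, f"frame_{saved:06}.png"), frame)
            saved += 1
    cap.release()
    return saved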
+# -------------------------------------------------------------------------
+# 3) Update gallery on upload
+# -------------------------------------------------------------------------
def update_gallery_on_upload(input_video, input_images, s_time_interval=1.0):
+   """
+   Whenever user uploads or changes files, immediately handle them
+   and show in the gallery. Return (target_dir, image_paths).
+   If nothing is uploaded, returns "None" and empty list.
+   """
    if not input_video and not input_images:
+       return None, None, None, None
    target_dir, image_paths = handle_uploads(input_video, input_images, s_time_interval)
    return (
        None,
        target_dir,
        image_paths,
+       "上传完成。点击「开始重建」进行3D处理",
    )


+# -------------------------------------------------------------------------
+# 4) Reconstruction: uses the target_dir plus any viz parameters
+# -------------------------------------------------------------------------
@spaces.GPU(duration=120)
def gradio_demo(
    target_dir,

    filter_white_bg=False,
    apply_mask=True,
    show_mesh=True,
    progress=gr.Progress(),
):
+   """
+   Perform reconstruction using the already-created target_dir/images.
+   """
    if not os.path.isdir(target_dir) or target_dir == "None":
+       return None, "❌ 未找到有效的目标目录,请先上传文件", None, None, None, None, None, None, None, None, None

    progress(0, desc="🔄 准备重建...")
    start_time = time.time()
    gc.collect()
    torch.cuda.empty_cache()

+   # Prepare frame_filter dropdown
    target_dir_images = os.path.join(target_dir, "images")
    all_files = (
        sorted(os.listdir(target_dir_images))

    all_files = [f"{i}: {filename}" for i, filename in enumerate(all_files)]
    frame_filter_choices = ["All"] + all_files

+   progress(0.05, desc=f"🚀 运行 MapAnything 模型... ({len(all_files)}张图片)")
+   print("Running MapAnything model...")
    with torch.no_grad():
+       predictions, processed_data = run_model(
+           target_dir, apply_mask, True, filter_black_bg, filter_white_bg, progress
        )

+   # Save predictions
    progress(0.92, desc="💾 保存预测结果...")
    prediction_save_path = os.path.join(target_dir, "predictions.npz")
    np.savez(prediction_save_path, **predictions)

+   # Handle None frame_filter
    if frame_filter is None:
        frame_filter = "All"

+   # Build a GLB file name
+   progress(0.93, desc="🏗️ 生成3D模型文件...")
    glbfile = os.path.join(
        target_dir,
+       f"glbscene_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}_mesh{show_mesh}_black{filter_black_bg}_white{filter_white_bg}.glb",
    )

+   # Convert predictions to GLB
    glbscene = predictions_to_glb(
        predictions,
        filter_by_frames=frame_filter,
        show_cam=show_cam,
        mask_black_bg=filter_black_bg,
        mask_white_bg=filter_white_bg,
+       as_mesh=show_mesh,  # Use the show_mesh parameter
    )
    glbscene.export(file_obj=glbfile)

+   # Cleanup
    progress(0.96, desc="🧹 清理内存...")
    del predictions
    gc.collect()
    torch.cuda.empty_cache()

    end_time = time.time()
+   total_time = end_time - start_time
+   print(f"总耗时: {total_time:.2f}秒")
+   log_msg = f"✅ 重建成功 ({len(all_files)} 帧,耗时 {total_time:.1f}秒)"

+   # Populate visualization tabs with processed data
    progress(0.98, desc="🎨 生成可视化...")
    depth_vis, normal_vis, measure_img, measure_pts = populate_visualization_tabs(
        processed_data
    )

+   # Update view selectors based on available views
    depth_selector, normal_selector, measure_selector = update_view_selectors(
        processed_data
    )

    progress(1.0, desc="✅ 全部完成!")

    return (
        glbfile,
        log_msg,
        gr.Dropdown(choices=frame_filter_choices, value=frame_filter, interactive=True),
        processed_data,
        depth_vis,
        normal_vis,
        measure_img,
+       "",  # measure_text (empty initially)
        depth_selector,
        normal_selector,
        measure_selector,
    )


+# -------------------------------------------------------------------------
+# 5) Helper functions for UI resets + re-visualization
+# -------------------------------------------------------------------------
def colorize_depth(depth_map, mask=None):
    """Convert depth map to colorized visualization with optional mask"""
    if depth_map is None:
        return None

+   # Normalize depth to 0-1 range
    depth_normalized = depth_map.copy()
    valid_mask = depth_normalized > 0

+   # Apply additional mask if provided (for background filtering)
    if mask is not None:
        valid_mask = valid_mask & mask

    depth_normalized[valid_mask] = (depth_normalized[valid_mask] - p5) / (p95 - p5)

+   # Apply colormap
    import matplotlib.pyplot as plt

    colormap = plt.cm.turbo_r
    colored = colormap(depth_normalized)
    colored = (colored[:, :, :3] * 255).astype(np.uint8)

+   # Set invalid pixels to white
    colored[~valid_mask] = [255, 255, 255]

    return colored
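colorize_depth rescales the valid depths to a percentile range (p5 to p95 in the surrounding code) before applying matplotlib's reversed turbo colormap, so a few distant outliers do not wash out the visualization, and invalid pixels are painted white. A compact, self-contained sketch of the same idea, illustrative only and not the app's exact function:

import numpy as np
import matplotlib.pyplot as plt

def colorize(depth):
    # Illustrative percentile-normalized depth colorization.
    valid = depth > 0
    out = np.full(depth.shape + (3,), 255, dtype=np.uint8)  # invalid pixels -> white
    if valid.any():
        p5, p95 = np.percentile(depth[valid], [5, 95])
        norm = np.clip((depth - p5) / max(p95 - p5, 1e-6), 0.0, 1.0)
        rgb = (plt.cm.turbo_r(norm)[..., :3] * 255).astype(np.uint8)
        out[valid] = rgb[valid]
    return out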
    if normal_map is None:
        return None

+   # Create a copy for modification
    normal_vis = normal_map.copy()

+   # Apply mask if provided (set masked areas to [0, 0, 0] which becomes grey after normalization)
    if mask is not None:
        invalid_mask = ~mask
+       normal_vis[invalid_mask] = [0, 0, 0]  # Set invalid areas to zero

+   # Normalize normals to [0, 1] range for visualization
    normal_vis = (normal_vis + 1.0) / 2.0
    normal_vis = (normal_vis * 255).astype(np.uint8)


    """Extract depth, normal, and 3D points from predictions for visualization"""
    processed_data = {}

+   # Process each view
    for view_idx, view in enumerate(views):
+       # Get image
        image = rgb(view["img"], norm_type=high_level_config["data_norm_type"])

+       # Get predicted points
        pred_pts3d = predictions["world_points"][view_idx]

+       # Initialize data for this view
        view_data = {
            "image": image[0],
            "points3d": pred_pts3d,

            "mask": None,
        }

+       # Start with the final mask from predictions
        mask = predictions["final_mask"][view_idx].copy()

+       # Apply black background filtering if enabled
        if filter_black_bg:
+           # Get the image colors (ensure they're in 0-255 range)
            view_colors = image[0] * 255 if image[0].max() <= 1.0 else image[0]
+           # Filter out black background pixels (sum of RGB < 16)
            black_bg_mask = view_colors.sum(axis=2) >= 16
            mask = mask & black_bg_mask

+       # Apply white background filtering if enabled
        if filter_white_bg:
+           # Get the image colors (ensure they're in 0-255 range)
            view_colors = image[0] * 255 if image[0].max() <= 1.0 else image[0]
+           # Filter out white background pixels (all RGB > 240)
            white_bg_mask = ~(
                (view_colors[:, :, 0] > 240)
                & (view_colors[:, :, 1] > 240)

    if processed_data is None or len(processed_data) == 0:
        return None, [], ""

+   # Return the first view image
    first_view = list(processed_data.values())[0]
    return first_view["image"], [], ""


):
    """Handle measurement on images"""
    try:
+       print(f"Measure function called with selector: {current_view_selector}")

        if processed_data is None or len(processed_data) == 0:
+           return None, [], "No data available"

+       # Use the currently selected view instead of always using the first view
        try:
            current_view_index = int(current_view_selector.split()[1]) - 1
        except:
            current_view_index = 0

+       print(f"Using view index: {current_view_index}")

+       # Get view data safely
        if current_view_index < 0 or current_view_index >= len(processed_data):
            current_view_index = 0

        current_view = processed_data[view_keys[current_view_index]]

        if current_view is None:
+           return None, [], "No view data available"

        point2d = event.index[0], event.index[1]
+       print(f"Clicked point: {point2d}")

+       # Check if the clicked point is in a masked area (prevent interaction)
        if (
            current_view["mask"] is not None
            and 0 <= point2d[1] < current_view["mask"].shape[0]
            and 0 <= point2d[0] < current_view["mask"].shape[1]
        ):
+           # Check if the point is in a masked (invalid) area
            if not current_view["mask"][point2d[1], point2d[0]]:
+               print(f"Clicked point {point2d} is in masked area, ignoring click")
+               # Always return image with mask overlay
                masked_image, _ = update_measure_view(
                    processed_data, current_view_index
                )
                return (
                    masked_image,
                    measure_points,
+                   '<span style="color: red; font-weight: bold;">Cannot measure on masked areas (shown in grey)</span>',
                )

        measure_points.append(point2d)

+       # Get image with mask overlay and ensure it's valid
        image, _ = update_measure_view(processed_data, current_view_index)
        if image is None:
+           return None, [], "No image available"

        image = image.copy()
        points3d = current_view["points3d"]

+       # Ensure image is in uint8 format for proper cv2 operations
        try:
            if image.dtype != np.uint8:
                if image.max() <= 1.0:
+                   # Image is in [0, 1] range, convert to [0, 255]
                    image = (image * 255).astype(np.uint8)
                else:
+                   # Image is already in [0, 255] range
                    image = image.astype(np.uint8)
        except Exception as e:
+           print(f"Image conversion error: {e}")
+           return None, [], f"Image conversion error: {e}"

+       # Draw circles for points
        try:
            for p in measure_points:
                if 0 <= p[0] < image.shape[1] and 0 <= p[1] < image.shape[0]:

                        image, p, radius=5, color=(255, 0, 0), thickness=2
                    )
        except Exception as e:
+           print(f"Drawing error: {e}")
+           return None, [], f"Drawing error: {e}"

        depth_text = ""
        try:

                    and 0 <= p[0] < current_view["depth"].shape[1]
                ):
                    d = current_view["depth"][p[1], p[0]]
+                   depth_text += f"- **P{i + 1} depth: {d:.2f}m.**\n"
                else:
+                   # Use Z coordinate of 3D points if depth not available
                    if (
                        points3d is not None
                        and 0 <= p[1] < points3d.shape[0]
                        and 0 <= p[0] < points3d.shape[1]
                    ):
                        z = points3d[p[1], p[0], 2]
+                       depth_text += f"- **P{i + 1} Z-coord: {z:.2f}m.**\n"
        except Exception as e:
+           print(f"Depth text error: {e}")
+           depth_text = f"Error computing depth: {e}\n"

        if len(measure_points) == 2:
            try:
                point1, point2 = measure_points
+               # Draw line
                if (
                    0 <= point1[0] < image.shape[1]
                    and 0 <= point1[1] < image.shape[0]

                        image, point1, point2, color=(255, 0, 0), thickness=2
                    )

+               # Compute 3D distance
+               distance_text = "- **Distance: Unable to compute**"
                if (
                    points3d is not None
                    and 0 <= point1[1] < points3d.shape[0]

                    p1_3d = points3d[point1[1], point1[0]]
                    p2_3d = points3d[point2[1], point2[0]]
                    distance = np.linalg.norm(p1_3d - p2_3d)
+                   distance_text = f"- **Distance: {distance:.2f}m**"
            except Exception as e:
+               print(f"Distance computation error: {e}")
+               distance_text = f"- **Distance computation error: {e}**"

            measure_points = []
            text = depth_text + distance_text
+           print(f"Measurement complete: {text}")
            return [image, measure_points, text]
        except Exception as e:
+           print(f"Final measurement error: {e}")
+           return None, [], f"Measurement error: {e}"
        else:
+           print(f"Single point measurement: {depth_text}")
            return [image, measure_points, depth_text]

    except Exception as e:
+       print(f"Overall measure function error: {e}")
+       return None, [], f"Measure function error: {e}"
return None, [], f"Measure function error: {e}"
|
| 917 |
|
| 918 |
|
| 919 |
def clear_fields():
|
| 920 |
+
"""
|
| 921 |
+
Clears the 3D viewer, the stored target_dir, and empties the gallery.
|
| 922 |
+
"""
|
| 923 |
+
return None
|
| 924 |
|
| 925 |
|
| 926 |
def update_log():
|
| 927 |
+
"""
|
| 928 |
+
Display a quick log message while waiting.
|
| 929 |
+
"""
|
| 930 |
+
return "加载和重建中..."
|
| 931 |
|
| 932 |
|
| 933 |
def update_visualization(
|
|
|
|
| 939 |
filter_white_bg=False,
|
| 940 |
show_mesh=True,
|
| 941 |
):
|
| 942 |
+
"""
|
| 943 |
+
Reload saved predictions from npz, create (or reuse) the GLB for new parameters,
|
| 944 |
+
and return it for the 3D viewer. If is_example == "True", skip.
|
| 945 |
+
"""
|
| 946 |
+
|
| 947 |
+
# If it's an example click, skip as requested
|
| 948 |
if is_example == "True":
|
| 949 |
+
return (
|
| 950 |
+
gr.update(),
|
| 951 |
+
"没有可用的重建。请先点击重建按钮。",
|
| 952 |
+
)
|
| 953 |
|
| 954 |
if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
|
| 955 |
+
return (
|
| 956 |
+
gr.update(),
|
| 957 |
+
"没有可用的重建。请先点击重建按钮。",
|
| 958 |
+
)
|
| 959 |
|
| 960 |
predictions_path = os.path.join(target_dir, "predictions.npz")
|
| 961 |
if not os.path.exists(predictions_path):
|
| 962 |
+
return (
|
| 963 |
+
gr.update(),
|
| 964 |
+
f"No reconstruction available at {predictions_path}. Please run 'Reconstruct' first.",
|
| 965 |
+
)
|
| 966 |
|
| 967 |
loaded = np.load(predictions_path, allow_pickle=True)
|
| 968 |
predictions = {key: loaded[key] for key in loaded.keys()}
|
|
|
|
| 972 |
f"glbscene_{frame_filter.replace('.', '_').replace(':', '').replace(' ', '_')}_cam{show_cam}_mesh{show_mesh}_black{filter_black_bg}_white{filter_white_bg}.glb",
|
| 973 |
)
|
| 974 |
|
| 975 |
+
if not os.path.exists(glbfile):
|
| 976 |
+
glbscene = predictions_to_glb(
|
| 977 |
+
predictions,
|
| 978 |
+
filter_by_frames=frame_filter,
|
| 979 |
+
show_cam=show_cam,
|
| 980 |
+
mask_black_bg=filter_black_bg,
|
| 981 |
+
mask_white_bg=filter_white_bg,
|
| 982 |
+
as_mesh=show_mesh,
|
| 983 |
+
)
|
| 984 |
+
glbscene.export(file_obj=glbfile)
|
| 985 |
|
| 986 |
+
return (
|
| 987 |
+
glbfile,
|
| 988 |
+
"可视化已更新",
|
| 989 |
+
)
|
| 990 |
|
| 991 |
|
| 992 |
def update_all_views_on_filter_change(
|
|
|
|
| 998 |
normal_view_selector,
|
| 999 |
measure_view_selector,
|
| 1000 |
):
|
| 1001 |
+
"""
|
| 1002 |
+
Update all individual view tabs when background filtering checkboxes change.
|
| 1003 |
+
This regenerates the processed data with new filtering and updates all views.
|
| 1004 |
+
"""
|
| 1005 |
+
# Check if we have a valid target directory and predictions
|
| 1006 |
if not target_dir or target_dir == "None" or not os.path.isdir(target_dir):
|
| 1007 |
return processed_data, None, None, None, []
|
| 1008 |
|
|
|
|
| 1011 |
return processed_data, None, None, None, []
|
| 1012 |
|
| 1013 |
try:
|
| 1014 |
+
# Load the original predictions and views
|
| 1015 |
loaded = np.load(predictions_path, allow_pickle=True)
|
| 1016 |
predictions = {key: loaded[key] for key in loaded.keys()}
|
| 1017 |
|
| 1018 |
+
# Load images using MapAnything's load_images function
|
| 1019 |
image_folder_path = os.path.join(target_dir, "images")
|
| 1020 |
views = load_images(image_folder_path)
|
| 1021 |
|
| 1022 |
+
# Regenerate processed data with new filtering settings
|
| 1023 |
new_processed_data = process_predictions_for_visualization(
|
| 1024 |
predictions, views, high_level_config, filter_black_bg, filter_white_bg
|
| 1025 |
)
|
| 1026 |
|
| 1027 |
+
# Get current view indices
|
| 1028 |
try:
|
| 1029 |
depth_view_idx = (
|
| 1030 |
int(depth_view_selector.split()[1]) - 1 if depth_view_selector else 0
|
|
|
|
| 1048 |
except:
|
| 1049 |
measure_view_idx = 0
|
| 1050 |
|
| 1051 |
+
# Update all views with new filtered data
|
| 1052 |
depth_vis = update_depth_view(new_processed_data, depth_view_idx)
|
| 1053 |
normal_vis = update_normal_view(new_processed_data, normal_view_idx)
|
| 1054 |
measure_img, _ = update_measure_view(new_processed_data, measure_view_idx)
|
|
|
|
| 1060 |
return processed_data, None, None, None, []
|
| 1061 |
|
| 1062 |
|
| 1063 |
+
# -------------------------------------------------------------------------
|
| 1064 |
+
# Example scene functions
|
| 1065 |
+
# -------------------------------------------------------------------------
|
|
|
|
| 1066 |
def get_scene_info(examples_dir):
|
| 1067 |
"""Get information about scenes in the examples directory"""
|
| 1068 |
import glob
|
|
|
|
| 1074 |
for scene_folder in sorted(os.listdir(examples_dir)):
|
| 1075 |
scene_path = os.path.join(examples_dir, scene_folder)
|
| 1076 |
if os.path.isdir(scene_path):
|
| 1077 |
+
# Find all image files in the scene folder
|
| 1078 |
image_extensions = ["*.jpg", "*.jpeg", "*.png", "*.bmp", "*.tiff", "*.tif"]
|
| 1079 |
image_files = []
|
| 1080 |
for ext in image_extensions:
|
|
|
|
| 1082 |
image_files.extend(glob.glob(os.path.join(scene_path, ext.upper())))
|
| 1083 |
|
| 1084 |
if image_files:
|
| 1085 |
+
# Sort images and get the first one for thumbnail
|
| 1086 |
image_files = sorted(image_files)
|
| 1087 |
first_image = image_files[0]
|
| 1088 |
num_images = len(image_files)
|
|
|
|
| 1101 |
|
| 1102 |
|
| 1103 |
def load_example_scene(scene_name, examples_dir="examples"):
|
| 1104 |
+
"""Load a scene from examples directory"""
|
| 1105 |
scenes = get_scene_info(examples_dir)
|
| 1106 |
|
| 1107 |
+
# Find the selected scene
|
| 1108 |
selected_scene = None
|
| 1109 |
for scene in scenes:
|
| 1110 |
if scene["name"] == scene_name:
|
|
|
|
| 1112 |
break
|
| 1113 |
|
| 1114 |
if selected_scene is None:
|
| 1115 |
+
return None, None, None, "Scene not found"
|
| 1116 |
|
| 1117 |
+
# Create file-like objects for the unified upload system
|
| 1118 |
+
# Convert image file paths to the format expected by unified_upload
|
| 1119 |
file_objects = []
|
| 1120 |
for image_path in selected_scene["image_files"]:
|
| 1121 |
file_objects.append(image_path)
|
| 1122 |
|
| 1123 |
+
# Create target directory and copy images using the unified upload system
|
| 1124 |
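# handle_uploads copies the files into a fresh target directory; the second
# argument is assumed to be the frame-sampling interval used for video uploads
# and should have no effect for a plain list of image paths like this one.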
target_dir, image_paths = handle_uploads(file_objects, 1.0)
|
| 1125 |
|
| 1126 |
return (
|
| 1127 |
+
None, # Clear reconstruction output
|
| 1128 |
+
target_dir, # Set target directory
|
| 1129 |
+
image_paths, # Set gallery
|
| 1130 |
+
f"已加载场景 '{scene_name}'({selected_scene['num_images']} 张图片)。点击「开始重建」进行3D处理。",
|
| 1131 |
)
|
| 1132 |
|
| 1133 |
|
| 1134 |
+
# -------------------------------------------------------------------------
|
| 1135 |
+
# 6) Build Gradio UI
|
| 1136 |
+
# -------------------------------------------------------------------------
|
|
|
|
| 1137 |
theme = get_gradio_theme()
|
| 1138 |
|
| 1139 |
# Custom CSS to prevent UI jitter
|
|
|
|
| 1196 |
}
|
| 1197 |
"""
|
| 1198 |
|
| 1199 |
+
with gr.Blocks(theme=theme, css=CUSTOM_CSS, title="MapAnything - 3D重建系统") as demo:
|
| 1200 |
+
# State variables for the tabbed interface
|
| 1201 |
is_example = gr.Textbox(label="is_example", visible=False, value="None")
|
| 1202 |
+
num_images = gr.Textbox(label="num_images", visible=False, value="None")
|
| 1203 |
processed_data_state = gr.State(value=None)
|
| 1204 |
measure_points_state = gr.State(value=[])
|
| 1205 |
+
current_view_index = gr.State(value=0) # Track current view index for navigation
|
| 1206 |
|
| 1207 |
# JavaScript for clipboard paste support
|
| 1208 |
+
PASTE_JS = """
|
| 1209 |
+
<script>
|
| 1210 |
+
// Clipboard paste support
|
| 1211 |
+
document.addEventListener('paste', function(e) {
|
| 1212 |
+
const items = e.clipboardData.items;
|
| 1213 |
+
for (let i = 0; i < items.length; i++) {
|
| 1214 |
+
if (items[i].type.indexOf('image') !== -1) {
|
| 1215 |
+
const blob = items[i].getAsFile();
|
| 1216 |
+
const fileInput = document.querySelector('input[type="file"][multiple]');
|
| 1217 |
+
if (fileInput) {
|
| 1218 |
+
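// A DataTransfer object lets us build a FileList programmatically: add the
// pasted blob, assign the resulting list to the hidden file input, then fire a
// 'change' event so Gradio treats the image as if it had been picked manually.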
const dataTransfer = new DataTransfer();
|
| 1219 |
+
dataTransfer.items.add(blob);
|
| 1220 |
+
fileInput.files = dataTransfer.files;
|
| 1221 |
+
fileInput.dispatchEvent(new Event('change', { bubbles: true }));
|
| 1222 |
+
console.log('✅ Image pasted from clipboard');
|
| 1223 |
+
}
|
| 1224 |
+
}
|
| 1225 |
+
}
|
| 1226 |
+
});
|
| 1227 |
+
console.log('💡 Clipboard paste enabled: press Ctrl+V to paste a screenshot directly');
|
| 1228 |
+
</script>
|
| 1229 |
+
"""
|
| 1230 |
gr.HTML(PASTE_JS)
|
| 1231 |
+
|
| 1232 |
+
# Styled page header
|
| 1233 |
gr.HTML("""
|
| 1234 |
<div style="text-align: center; margin: 20px 0;">
|
| 1235 |
+
<h2 style="color: #1976D2; margin-bottom: 10px;">MapAnything - 3D重建系统</h2>
|
| 1236 |
+
<p style="color: #666; font-size: 16px;">多视图3D重建 | 深度估计 | 法线计算 | 距离测量</p>
|
| 1237 |
</div>
|
| 1238 |
""")
|
| 1239 |
|
|
|
|
| 1306 |
clear_color=[0.0, 0.0, 0.0, 0.0]
|
| 1307 |
)
|
| 1309 |
with gr.Tab("📊 深度图"):
|
| 1310 |
with gr.Row(elem_classes=["navigation-row"]):
|
| 1311 |
prev_depth_btn = gr.Button("◀", size="sm", scale=1)
|
|
|
|
| 1356 |
max_lines=1
|
| 1357 |
)
|
| 1358 |
|
| 1359 |
+
# 高级选项(默认折叠)
|
| 1360 |
+
with gr.Accordion("⚙️ 高级选项", open=False):
|
| 1361 |
with gr.Row(equal_height=False):
|
| 1362 |
with gr.Column(scale=1, min_width=300):
|
| 1363 |
gr.Markdown("#### 可视化参数")
|
|
|
|
| 1374 |
apply_mask_checkbox = gr.Checkbox(
|
| 1375 |
label="应用深度掩码", value=True
|
| 1376 |
)
|
| 1377 |
# Example scenes (collapsible)
|
| 1378 |
with gr.Accordion("🖼️ 示例场景", open=False):
|
| 1379 |
+
gr.Markdown("点击缩略图加载场景进行重建")
|
| 1380 |
scenes = get_scene_info("examples")
|
| 1381 |
+
|
| 1382 |
if scenes:
|
| 1383 |
+
for i in range(0, len(scenes), 4): # Process 4 scenes per row
|
| 1384 |
with gr.Row(equal_height=True):
|
| 1385 |
for j in range(4):
|
| 1386 |
scene_idx = i + j
|
|
|
|
| 1388 |
scene = scenes[scene_idx]
|
| 1389 |
with gr.Column(scale=1, min_width=150):
|
| 1390 |
scene_img = gr.Image(
|
| 1391 |
+
value=scene["thumbnail"],
|
| 1392 |
height=150,
|
| 1393 |
+
interactive=False,
|
| 1394 |
+
show_label=False,
|
| 1395 |
sources=[],
|
| 1396 |
container=False
|
| 1397 |
)
|
|
|
|
| 1403 |
fn=lambda name=scene["name"]: load_example_scene(name),
|
| 1404 |
outputs=[
|
| 1405 |
reconstruction_output,
|
| 1406 |
+
target_dir_output,
|
| 1407 |
+
image_gallery,
|
| 1408 |
+
log_output,
|
| 1409 |
+
],
|
| 1410 |
)
|
| 1411 |
|
| 1412 |
# === Event bindings ===
|
| 1414 |
# Auto-update the gallery when files are uploaded
|
| 1415 |
def update_gallery_on_unified_upload(files, interval):
|
| 1416 |
if not files:
|
|
|
|
| 1540 |
# Reconstruction button
|
| 1541 |
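# The handler is a .then() chain: clear_fields resets the 3D viewer first,
# update_log then posts a status message, and only after that does the actual
# reconstruction step run, so the UI responds before the heavy processing starts.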
submit_btn.click(
|
| 1542 |
fn=clear_fields,
|
| 1543 |
+
outputs=[reconstruction_output]
|
| 1544 |
).then(
|
| 1545 |
fn=update_log,
|
| 1546 |
outputs=[log_output]
|
|
|
|
| 1549 |
inputs=[
|
| 1550 |
target_dir_output, frame_filter, show_cam,
|
| 1551 |
filter_black_bg, filter_white_bg,
|
| 1552 |
+
apply_mask_checkbox, show_mesh
|
|
|
|
| 1553 |
],
|
| 1554 |
outputs=[
|
| 1555 |
+
reconstruction_output, log_output, frame_filter,
|
| 1556 |
processed_data_state, depth_map, normal_map, measure_image,
|
| 1557 |
measure_text, depth_view_selector, normal_view_selector, measure_view_selector
|
| 1558 |
]
|
|
|
|
| 1562 |
)
|
| 1563 |
|
| 1564 |
# Clear button
|
| 1565 |
+
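# gr.ClearButton.add registers extra components to reset on click, here the
# 3D reconstruction view and the log box.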
clear_btn.add([reconstruction_output, log_output])
|
| 1566 |
+
|
| 1567 |
# Live updates for visualization parameters
|
| 1568 |
for component in [frame_filter, show_cam, show_mesh]:
|
| 1569 |
component.change(
|
|
|
|
| 1585 |
],
|
| 1586 |
outputs=[processed_data_state, depth_map, normal_map, measure_image, measure_points_state]
|
| 1587 |
)
|
| 1588 |
+
|
| 1589 |
# Depth map navigation
|
| 1590 |
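# navigate_depth_view is assumed to take (processed_data, current selection, step):
# the previous button passes a step of -1, and the matching next button is
# expected to pass +1.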
prev_depth_btn.click(
|
| 1591 |
fn=lambda pd, cs: navigate_depth_view(pd, cs, -1),
|
|
|
|
| 1642 |
outputs=[measure_image, measure_points_state]
|
| 1643 |
)
|
| 1645 |
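# queue(max_size=20) caps how many requests may wait in the event queue,
# share=True requests a temporary public Gradio link, and ssr_mode=False
# disables Gradio's server-side rendering.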
demo.queue(max_size=20).launch(show_error=True, share=True, ssr_mode=False)
|