Spaces:

ColamanAI
/

Colaman-segmap

Running

App Files Files Community

ColamanAI committed on Oct 14

Commit

3e7f3f4

verified ·

1 Parent(s): 5fcf7e1

Upload app.py

Browse files

Files changed (1) hide show

app.py +334 -47

app.py CHANGED Viewed

@@ -89,14 +89,26 @@ high_level_config = {
     "resolution": 518,
 }
-# GroundingDINO and SAM Configuration (CPU-friendly versions)
-GROUNDING_DINO_MODEL_ID = "IDEA-Research/grounding-dino-tiny"  # 已经是tiny版本
 GROUNDING_DINO_BOX_THRESHOLD = 0.25
 GROUNDING_DINO_TEXT_THRESHOLD = 0.2
-# 使用 MobileSAM (CPU友好，比SAM-huge快60倍，只有10MB)
-SAM_MODEL_ID = "dhkim2810/MobileSAM"  # 轻量级SAM，适合CPU
-USE_MOBILE_SAM = True  # 标记使用MobileSAM
 DEFAULT_TEXT_PROMPT = "chair . table . sofa . bed . desk . cabinet"
@@ -151,11 +163,95 @@ grounding_dino_model = None
 grounding_dino_processor = None
 sam_predictor = None
 # ============================================================================
 # Model Loading Functions
 # ============================================================================
 def load_grounding_dino_model(device="cpu"):
     """Load GroundingDINO model from HuggingFace (CPU优化)"""
     global grounding_dino_model, grounding_dino_processor
@@ -166,19 +262,49 @@ def load_grounding_dino_model(device="cpu"):
     try:
         from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
         # 强制使用 CPU 进行分割（节省 GPU 资源）
         seg_device = "cpu"
         print(f"📥 Loading GroundingDINO from HuggingFace: {GROUNDING_DINO_MODEL_ID} (使用 {seg_device.upper()})")
-        grounding_dino_processor = AutoProcessor.from_pretrained(GROUNDING_DINO_MODEL_ID)
         grounding_dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
-            GROUNDING_DINO_MODEL_ID
         ).to(seg_device).eval()
         print(f"✅ GroundingDINO loaded successfully on {seg_device.upper()}")
     except Exception as e:
-        print(f"❌ GroundingDINO loading failed: {e}")
         import traceback
         traceback.print_exc()
@@ -193,14 +319,28 @@ def load_sam_model(device="cpu"):
     try:
         from transformers import SamModel, SamProcessor
         # 强制使用 CPU 进行分割（MobileSAM 专为移动设备/CPU优化）
         seg_device = "cpu"
         print(f"📥 Loading MobileSAM from HuggingFace: {SAM_MODEL_ID} (使用 {seg_device.upper()})")
         print(f"   💡 MobileSAM 是轻量级版本，比 SAM-huge 快60倍，只有10MB，适合CPU运行")
-        sam_model = SamModel.from_pretrained(SAM_MODEL_ID).to(seg_device).eval()
-        sam_processor = SamProcessor.from_pretrained(SAM_MODEL_ID)
         # Wrap in a predictor-like interface
         class SAMPredictor:
@@ -244,9 +384,22 @@ def load_sam_model(device="cpu"):
         sam_predictor = SAMPredictor(sam_model, sam_processor, seg_device)
         print(f"✅ MobileSAM loaded successfully on {seg_device.upper()}")
     except Exception as e:
-        print(f"❌ SAM loading failed: {e}")
-        print("   Falling back to bbox-based masks")
         import traceback
         traceback.print_exc()
@@ -271,6 +424,98 @@ def generate_distinct_colors(n):
     return colors
 def run_grounding_dino_detection(image_np, text_prompt, device="cpu"):
     """Run GroundingDINO detection (CPU优化)"""
     if grounding_dino_model is None or grounding_dino_processor is None:
@@ -720,20 +965,38 @@ def run_model(
     if enable_segmentation:
         progress(0.1, desc="🎯 加载分割模型 (CPU)...")
         print(f"\n{'='*70}")
-        print("🎯 分割模型加载开始...")
         print(f"{'='*70}")
-        load_grounding_dino_model("cpu")  # 分割使用CPU
-        load_sam_model("cpu")  # MobileSAM在CPU上运行良好
-        # 验证模型是否成功加载
-        if grounding_dino_model is None:
-            print("❌ GroundingDINO 模型加载失败！")
-            raise RuntimeError("GroundingDINO 模型加载失败，请检查网络连接或模型配置")
-        if sam_predictor is None:
-            print("❌ SAM 模型加载失败！")
-            raise RuntimeError("SAM 模型加载失败，请检查网络连接或模型配置")
-        print(f"✅ 所有分割模型加载成功")
         print(f"{'='*70}\n")
     # Load images
@@ -807,13 +1070,14 @@ def run_model(
     # Segmentation processing
     segmented_glb = None
-    if enable_segmentation and grounding_dino_model is not None:
         progress(0.65, desc="🎯 开始物体分割...")
         print(f"\n{'='*70}")
-        print("🎯 开始物体分割...")
-        print(f"🔍 检测提示词: {text_prompt[:100]}...")
-        print(f"📊 置信度阈值: {GROUNDING_DINO_BOX_THRESHOLD}")
         print(f"📐 最小掩码面积: {MIN_MASK_AREA} px")
         print(f"{'='*70}\n")
         all_view_detections = []
@@ -829,23 +1093,46 @@ def run_model(
             else:
                 ref_image_np = ref_image
-            detections = run_grounding_dino_detection(ref_image_np, text_prompt, "cpu")  # 使用CPU进行检测
-            print(f"   ✓ 检测到 {len(detections)} 个物体")
-            if len(detections) > 0:
-                for i, det in enumerate(detections):
-                    print(f"      物体 {i+1}: {det['label']} (置信度: {det['confidence']:.2f})")
-                boxes = [d['bbox'] for d in detections]
-                masks = run_sam_refinement(ref_image_np, boxes)
-                points3d = world_points_list[view_idx]
-                for det_idx, (det, mask) in enumerate(zip(detections, masks)):
-                    center_3d = compute_object_3d_center(points3d, mask)
-                    det['center_3d'] = center_3d
-                    det['mask_2d'] = mask
-                all_view_detections.append(detections)
                 all_view_masks.append(masks)
             else:
                 all_view_detections.append([])

     "resolution": 518,
 }
+# ============ Segmentation model configuration ============
+# Available back-ends (sizes are the author's approximate figures):
+# 1. "segformer" - SegFormer (lightest, ~14MB, fastest)
+# 2. "maskformer" - MaskFormer (medium, ~100MB, instance segmentation)
+# 3. "grounding_sam" - GroundingDINO + SAM (strongest, ~110MB, text-prompt driven)
+SEGMENTATION_METHOD = "segformer"  # default to the lightest back-end
+# SegFormer configuration (recommended - CPU friendly)
+SEGFORMER_MODEL_ID = "nvidia/segformer-b0-finetuned-ade-512-512"  # ~14MB, 150 classes
+# MaskFormer configuration (alternative)
+MASKFORMER_MODEL_ID = "facebook/maskformer-swin-tiny-ade"  # ~100MB, instance segmentation
+# GroundingDINO + SAM configuration (original pipeline - requires a text prompt)
+GROUNDING_DINO_MODEL_ID = "IDEA-Research/grounding-dino-tiny"
 GROUNDING_DINO_BOX_THRESHOLD = 0.25
 GROUNDING_DINO_TEXT_THRESHOLD = 0.2
+SAM_MODEL_ID = "dhkim2810/MobileSAM"
+USE_MOBILE_SAM = True
 DEFAULT_TEXT_PROMPT = "chair . table . sofa . bed . desk . cabinet"
 grounding_dino_processor = None
 sam_predictor = None
+# SegFormer handles (lightweight semantic segmentation); set by load_segformer_model()
+segformer_processor = None
+segformer_model = None
+# MaskFormer handles (instance segmentation); set by load_maskformer_model()
+maskformer_processor = None
+maskformer_model = None
 # ============================================================================
 # Model Loading Functions
 # ============================================================================
+def load_segformer_model(device="cpu"):
+    """Load the SegFormer processor and model into the module-level globals.
+
+    Returns immediately (after a log line) when ``segformer_model`` is already
+    set. Any exception raised while importing or loading is printed together
+    with its traceback and swallowed, so callers must check
+    ``segformer_model`` for ``None`` afterwards.
+
+    Args:
+        device: Device string handed to ``.to()`` for the loaded model.
+    """
+    global segformer_processor, segformer_model
+    if segformer_model is not None:
+        print("✅ SegFormer already loaded")
+        return
+    try:
+        import os
+        from transformers import SegformerForSemanticSegmentation, SegformerImageProcessor
+        print(f"📥 Loading SegFormer from HuggingFace: {SEGFORMER_MODEL_ID}")
+        print("   💡 SegFormer-B0: ~14MB, 150类物体, CPU优化")
+        # Reuse HF_HOME when it is set; otherwise cache next to the app.
+        hf_cache = os.getenv("HF_HOME", "./hf_cache")
+        print("   正在下载 processor...")
+        segformer_processor = SegformerImageProcessor.from_pretrained(
+            SEGFORMER_MODEL_ID, cache_dir=hf_cache
+        )
+        print("   正在下载 model...")
+        net = SegformerForSemanticSegmentation.from_pretrained(
+            SEGFORMER_MODEL_ID, cache_dir=hf_cache, low_cpu_mem_usage=True
+        )
+        # Publish the global only once the model is on the device and in eval mode.
+        segformer_model = net.to(device).eval()
+        print(f"✅ SegFormer loaded successfully on {device.upper()}")
+        print("   可识别类别: 人、家具、墙壁、地板等150类")
+    except Exception as e:
+        print(f"❌ SegFormer loading failed: {type(e).__name__}: {e}")
+        import traceback
+        traceback.print_exc()
+def load_maskformer_model(device="cpu"):
+    """Load the MaskFormer processor and model into the module-level globals.
+
+    Returns immediately (after a log line) when ``maskformer_model`` is already
+    set. Any exception raised while importing or loading is printed together
+    with its traceback and swallowed, so callers must check
+    ``maskformer_model`` for ``None`` afterwards.
+
+    Args:
+        device: Device string handed to ``.to()`` for the loaded model.
+    """
+    global maskformer_processor, maskformer_model
+    if maskformer_model is not None:
+        print("✅ MaskFormer already loaded")
+        return
+    try:
+        import os
+        from transformers import MaskFormerForInstanceSegmentation, MaskFormerImageProcessor
+        print(f"📥 Loading MaskFormer from HuggingFace: {MASKFORMER_MODEL_ID}")
+        print("   💡 MaskFormer: ~100MB, 实例分割")
+        # Reuse HF_HOME when it is set; otherwise cache next to the app.
+        hf_cache = os.getenv("HF_HOME", "./hf_cache")
+        print("   正在下载 processor...")
+        maskformer_processor = MaskFormerImageProcessor.from_pretrained(
+            MASKFORMER_MODEL_ID, cache_dir=hf_cache
+        )
+        print("   正在下载 model...")
+        net = MaskFormerForInstanceSegmentation.from_pretrained(
+            MASKFORMER_MODEL_ID, cache_dir=hf_cache, low_cpu_mem_usage=True
+        )
+        # Publish the global only once the model is on the device and in eval mode.
+        maskformer_model = net.to(device).eval()
+        print(f"✅ MaskFormer loaded successfully on {device.upper()}")
+    except Exception as e:
+        print(f"❌ MaskFormer loading failed: {type(e).__name__}: {e}")
+        import traceback
+        traceback.print_exc()
 def load_grounding_dino_model(device="cpu"):
     """Load GroundingDINO model from HuggingFace (CPU优化)"""
     global grounding_dino_model, grounding_dino_processor
     try:
         from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
+        import os
         # 强制使用 CPU 进行分割（节省 GPU 资源）
         seg_device = "cpu"
         print(f"📥 Loading GroundingDINO from HuggingFace: {GROUNDING_DINO_MODEL_ID} (使用 {seg_device.upper()})")
+        # 设置缓存目录（HuggingFace Spaces友好）
+        cache_dir = os.getenv("HF_HOME", "./hf_cache")
+        # 加载模型（带重试和详细日志）
+        print(f"   正在下载 processor...")
+        grounding_dino_processor = AutoProcessor.from_pretrained(
+            GROUNDING_DINO_MODEL_ID,
+            cache_dir=cache_dir,
+            trust_remote_code=True  # 允许运行远程代码
+        )
+        print(f"   正在下载 model...")
         grounding_dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
+            GROUNDING_DINO_MODEL_ID,
+            cache_dir=cache_dir,
+            trust_remote_code=True,
+            low_cpu_mem_usage=True  # 降低CPU内存使用
         ).to(seg_device).eval()
         print(f"✅ GroundingDINO loaded successfully on {seg_device.upper()}")
+    except ImportError as e:
+        print(f"❌ ImportError: {e}")
+        print(f"💡 请检查 requirements.txt 是否包含 transformers 库")
+        import traceback
+        traceback.print_exc()
+    except OSError as e:
+        print(f"❌ OSError (网络/文件问题): {e}")
+        print(f"💡 可能是网络连接问题或模型仓库不可访问")
+        print(f"💡 尝试解决方案:")
+        print(f"   1. 检查 HuggingFace Spaces 的网络连接")
+        print(f"   2. 检查模型ID是否正确: {GROUNDING_DINO_MODEL_ID}")
+        print(f"   3. 确保有足够的磁盘空间")
+        import traceback
+        traceback.print_exc()
     except Exception as e:
+        print(f"❌ GroundingDINO loading failed: {type(e).__name__}: {e}")
         import traceback
         traceback.print_exc()
     try:
         from transformers import SamModel, SamProcessor
+        import os
         # 强制使用 CPU 进行分割（MobileSAM 专为移动设备/CPU优化）
         seg_device = "cpu"
         print(f"📥 Loading MobileSAM from HuggingFace: {SAM_MODEL_ID} (使用 {seg_device.upper()})")
         print(f"   💡 MobileSAM 是轻量级版本，比 SAM-huge 快60倍，只有10MB，适合CPU运行")
+        # 设置缓存目录
+        cache_dir = os.getenv("HF_HOME", "./hf_cache")
+        print(f"   正在下载 processor...")
+        sam_processor = SamProcessor.from_pretrained(
+            SAM_MODEL_ID,
+            cache_dir=cache_dir
+        )
+        print(f"   正在下载 model...")
+        sam_model = SamModel.from_pretrained(
+            SAM_MODEL_ID,
+            cache_dir=cache_dir,
+            low_cpu_mem_usage=True
+        ).to(seg_device).eval()
         # Wrap in a predictor-like interface
         class SAMPredictor:
         sam_predictor = SAMPredictor(sam_model, sam_processor, seg_device)
         print(f"✅ MobileSAM loaded successfully on {seg_device.upper()}")
+    except ImportError as e:
+        print(f"❌ ImportError: {e}")
+        print(f"💡 请检查 requirements.txt 是否包含 transformers 库")
+        import traceback
+        traceback.print_exc()
+    except OSError as e:
+        print(f"❌ OSError (网络/文件问题): {e}")
+        print(f"💡 可能是网络连接问题或模型仓库不可访问")
+        print(f"💡 尝试解决方案:")
+        print(f"   1. 检查 HuggingFace Spaces 的网络连接")
+        print(f"   2. 检查模型ID是否正确: {SAM_MODEL_ID}")
+        print(f"   3. 确保有足够的磁盘空间")
+        import traceback
+        traceback.print_exc()
     except Exception as e:
+        print(f"❌ SAM loading failed: {type(e).__name__}: {e}")
         import traceback
         traceback.print_exc()
     return colors
+# ============================================================================
+# SegFormer 分割函数（简化方案）
+# ============================================================================
+def run_segformer_segmentation(image_np, device="cpu"):
+    """Run SegFormer semantic segmentation and split it into per-instance masks.
+
+    Each predicted class map is broken into connected components; components
+    smaller than ``MIN_MASK_AREA`` pixels are dropped.
+
+    Args:
+        image_np: Image array. Non-``uint8`` input is scaled by 255 and cast,
+            i.e. it is assumed to hold values in [0, 1] — TODO confirm with callers.
+        device: Device string the processor outputs are moved to.
+
+    Returns:
+        A ``(detections, masks)`` tuple of equal-length lists. Each detection
+        is a dict with ``bbox`` ([x_min, y_min, x_max, y_max], inclusive pixel
+        coordinates), ``label``, ``confidence`` (fixed 0.9) and ``class_id``;
+        each mask is a boolean array with the image's height and width.
+        ``([], [])`` is returned when the model is not loaded or on any error.
+    """
+    if segformer_model is None or segformer_processor is None:
+        print("❌ SegFormer model not loaded")
+        # Callers unpack two values, so the "not loaded" path must return a pair too.
+        return [], []
+    try:
+        import torch
+        from PIL import Image
+        from scipy import ndimage
+        # Prepare the image
+        if image_np.dtype != np.uint8:
+            image_np = (image_np * 255).astype(np.uint8)
+        image_pil = Image.fromarray(image_np)
+        # Inference
+        inputs = segformer_processor(images=image_pil, return_tensors="pt")
+        inputs = {k: v.to(device) for k, v in inputs.items()}
+        with torch.no_grad():
+            outputs = segformer_model(**inputs)
+        # SegFormer emits logits at a lower resolution than the input, so
+        # upsample to the image size before argmax. Otherwise masks and boxes
+        # would not line up with the image (or with a pixel-based area threshold).
+        logits = torch.nn.functional.interpolate(
+            outputs.logits,
+            size=tuple(image_np.shape[:2]),
+            mode="bilinear",
+            align_corners=False,
+        )
+        predicted_segmentation = logits.argmax(dim=1)[0].cpu().numpy()
+        # Prefer the label map shipped with the checkpoint; the hand-written
+        # table below is kept only as a fallback for ids the config lacks.
+        config = getattr(segformer_model, "config", None)
+        id2label = getattr(config, "id2label", None) or {}
+        ade20k_labels = {
+            5: "wall", 7: "floor", 11: "ceiling", 18: "window", 14: "door",
+            19: "table", 20: "chair", 22: "sofa", 23: "bed", 28: "cabinet",
+            34: "desk", 39: "lamp", 65: "television", 89: "shelf"
+        }
+        detections = []
+        masks = []
+        # Extract instances class by class
+        for label_id in np.unique(predicted_segmentation):
+            # Class id 0 is skipped as "background", as before.
+            # NOTE(review): in the checkpoint's ADE20K label map id 0 is
+            # presumably "wall" — confirm that skipping it is intended.
+            if label_id == 0:
+                continue
+            class_id = int(label_id)
+            label_name = id2label.get(class_id) or ade20k_labels.get(
+                class_id, f"object_{label_id}"
+            )
+            label_name = str(label_name).strip()
+            # Split the class mask into connected regions (one per instance)
+            class_mask = (predicted_segmentation == label_id)
+            labeled_mask, num_features = ndimage.label(class_mask)
+            for instance_id in range(1, num_features + 1):
+                instance_mask = (labeled_mask == instance_id)
+                # Drop tiny regions
+                if instance_mask.sum() < MIN_MASK_AREA:
+                    continue
+                # Bounding box of the region
+                rows, cols = np.where(instance_mask)
+                if len(rows) == 0:
+                    continue
+                y_min, y_max = rows.min(), rows.max()
+                x_min, x_max = cols.min(), cols.max()
+                detections.append({
+                    'bbox': [x_min, y_min, x_max, y_max],
+                    'label': label_name,
+                    'confidence': 0.9,  # no per-instance score is computed; fixed value
+                    'class_id': class_id
+                })
+                masks.append(instance_mask)
+        return detections, masks
+    except Exception as e:
+        print(f"❌ SegFormer segmentation failed: {e}")
+        import traceback
+        traceback.print_exc()
+        return [], []
 def run_grounding_dino_detection(image_np, text_prompt, device="cpu"):
     """Run GroundingDINO detection (CPU优化)"""
     if grounding_dino_model is None or grounding_dino_processor is None:
     if enable_segmentation:
         progress(0.1, desc="🎯 加载分割模型 (CPU)...")
         print(f"\n{'='*70}")
+        print(f"🎯 分割模型加载开始... (方案: {SEGMENTATION_METHOD})")
         print(f"{'='*70}")
+        if SEGMENTATION_METHOD == "segformer":
+            # 方案1: SegFormer (最轻量，~14MB，最快)
+            print("📌 使用方案: SegFormer (轻量级，无需文本提示)")
+            load_segformer_model("cpu")
+            if segformer_model is None:
+                print("❌ SegFormer 模型加载失败！")
+                raise RuntimeError("SegFormer 模型加载失败，请检查网络连接")
+        elif SEGMENTATION_METHOD == "maskformer":
+            # 方案2: MaskFormer (中等，~100MB)
+            print("📌 使用方案: MaskFormer (实例分割)")
+            load_maskformer_model("cpu")
+            if maskformer_model is None:
+                print("❌ MaskFormer 模型加载失败！")
+                raise RuntimeError("MaskFormer 模型加载失败，请检查网络连接")
+        else:  # "grounding_sam"
+            # 方案3: GroundingDINO + SAM (最强，~110MB，需要文本提示)
+            print("📌 使用方案: GroundingDINO + SAM (文本提示驱动)")
+            load_grounding_dino_model("cpu")
+            load_sam_model("cpu")
+            if grounding_dino_model is None:
+                print("❌ GroundingDINO 模型加载失败！")
+                raise RuntimeError("GroundingDINO 模型加载失败，请检查网络连接")
+            if sam_predictor is None:
+                print("❌ SAM 模型加载失败！")
+                raise RuntimeError("SAM 模型加载失败，请检查网络连接")
+        print(f"✅ 分割模型加载成功")
         print(f"{'='*70}\n")
     # Load images
     # Segmentation processing
     segmented_glb = None
+    if enable_segmentation:
         progress(0.65, desc="🎯 开始物体分割...")
         print(f"\n{'='*70}")
+        print(f"🎯 开始物体分割... (方案: {SEGMENTATION_METHOD})")
         print(f"📐 最小掩码面积: {MIN_MASK_AREA} px")
+        if SEGMENTATION_METHOD == "grounding_sam":
+            print(f"🔍 检测提示词: {text_prompt[:100]}...")
+            print(f"📊 置信度阈值: {GROUNDING_DINO_BOX_THRESHOLD}")
         print(f"{'='*70}\n")
         all_view_detections = []
             else:
                 ref_image_np = ref_image
+            # 根据分割方法选择不同的处理流程
+            if SEGMENTATION_METHOD == "segformer":
+                # SegFormer: 直接语义分割，无需文本提示
+                detections, masks = run_segformer_segmentation(ref_image_np, "cpu")
+                print(f"   ✓ 检测到 {len(detections)} 个物体")
+                if len(detections) > 0:
+                    for i, det in enumerate(detections):
+                        print(f"      物体 {i+1}: {det['label']}")
+                    points3d = world_points_list[view_idx]
+                    for det_idx, (det, mask) in enumerate(zip(detections, masks)):
+                        center_3d = compute_object_3d_center(points3d, mask)
+                        det['center_3d'] = center_3d
+                        det['mask_2d'] = mask
+                    all_view_detections.append(detections)
+                    all_view_masks.append(masks)
+                else:
+                    all_view_detections.append([])
+                    all_view_masks.append([])
+            elif SEGMENTATION_METHOD == "grounding_sam":
+                # GroundingDINO + SAM: 文本提示驱动
+                detections = run_grounding_dino_detection(ref_image_np, text_prompt, "cpu")
+                print(f"   ✓ 检测到 {len(detections)} 个物体")
+                if len(detections) > 0:
+                    for i, det in enumerate(detections):
+                        print(f"      物体 {i+1}: {det['label']} (置信度: {det['confidence']:.2f})")
+                    boxes = [d['bbox'] for d in detections]
+                    masks = run_sam_refinement(ref_image_np, boxes)
+                    points3d = world_points_list[view_idx]
+                    for det_idx, (det, mask) in enumerate(zip(detections, masks)):
+                        center_3d = compute_object_3d_center(points3d, mask)
+                        det['center_3d'] = center_3d
+                        det['mask_2d'] = mask
+                    all_view_detections.append(detections)
                 all_view_masks.append(masks)
             else:
                 all_view_detections.append([])