Spaces:
Running
Running
Upload app.py
Browse files
app.py
CHANGED
|
@@ -89,14 +89,26 @@ high_level_config = {
|
|
| 89 |
"resolution": 518,
|
| 90 |
}
|
| 91 |
|
| 92 |
-
#
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
GROUNDING_DINO_BOX_THRESHOLD = 0.25
|
| 95 |
GROUNDING_DINO_TEXT_THRESHOLD = 0.2
|
| 96 |
-
|
| 97 |
-
|
| 98 |
-
SAM_MODEL_ID = "dhkim2810/MobileSAM" # 轻量级SAM,适合CPU
|
| 99 |
-
USE_MOBILE_SAM = True # 标记使用MobileSAM
|
| 100 |
|
| 101 |
DEFAULT_TEXT_PROMPT = "chair . table . sofa . bed . desk . cabinet"
|
| 102 |
|
|
@@ -151,11 +163,95 @@ grounding_dino_model = None
|
|
| 151 |
grounding_dino_processor = None
|
| 152 |
sam_predictor = None
|
| 153 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 154 |
|
| 155 |
# ============================================================================
|
| 156 |
# Model Loading Functions
|
| 157 |
# ============================================================================
|
| 158 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 159 |
def load_grounding_dino_model(device="cpu"):
|
| 160 |
"""Load GroundingDINO model from HuggingFace (CPU优化)"""
|
| 161 |
global grounding_dino_model, grounding_dino_processor
|
|
@@ -166,19 +262,49 @@ def load_grounding_dino_model(device="cpu"):
|
|
| 166 |
|
| 167 |
try:
|
| 168 |
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
|
|
|
|
| 169 |
|
| 170 |
# 强制使用 CPU 进行分割(节省 GPU 资源)
|
| 171 |
seg_device = "cpu"
|
| 172 |
print(f"📥 Loading GroundingDINO from HuggingFace: {GROUNDING_DINO_MODEL_ID} (使用 {seg_device.upper()})")
|
| 173 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 174 |
grounding_dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
|
| 175 |
-
GROUNDING_DINO_MODEL_ID
|
|
|
|
|
|
|
|
|
|
| 176 |
).to(seg_device).eval()
|
| 177 |
|
| 178 |
print(f"✅ GroundingDINO loaded successfully on {seg_device.upper()}")
|
| 179 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 180 |
except Exception as e:
|
| 181 |
-
print(f"❌ GroundingDINO loading failed: {e}")
|
| 182 |
import traceback
|
| 183 |
traceback.print_exc()
|
| 184 |
|
|
@@ -193,14 +319,28 @@ def load_sam_model(device="cpu"):
|
|
| 193 |
|
| 194 |
try:
|
| 195 |
from transformers import SamModel, SamProcessor
|
|
|
|
| 196 |
|
| 197 |
# 强制使用 CPU 进行分割(MobileSAM 专为移动设备/CPU优化)
|
| 198 |
seg_device = "cpu"
|
| 199 |
print(f"📥 Loading MobileSAM from HuggingFace: {SAM_MODEL_ID} (使用 {seg_device.upper()})")
|
| 200 |
print(f" 💡 MobileSAM 是轻量级版本,比 SAM-huge 快60倍,只有10MB,适合CPU运行")
|
| 201 |
|
| 202 |
-
|
| 203 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 204 |
|
| 205 |
# Wrap in a predictor-like interface
|
| 206 |
class SAMPredictor:
|
|
@@ -244,9 +384,22 @@ def load_sam_model(device="cpu"):
|
|
| 244 |
sam_predictor = SAMPredictor(sam_model, sam_processor, seg_device)
|
| 245 |
print(f"✅ MobileSAM loaded successfully on {seg_device.upper()}")
|
| 246 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 247 |
except Exception as e:
|
| 248 |
-
print(f"❌ SAM loading failed: {e}")
|
| 249 |
-
print(" Falling back to bbox-based masks")
|
| 250 |
import traceback
|
| 251 |
traceback.print_exc()
|
| 252 |
|
|
@@ -271,6 +424,98 @@ def generate_distinct_colors(n):
|
|
| 271 |
return colors
|
| 272 |
|
| 273 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 274 |
def run_grounding_dino_detection(image_np, text_prompt, device="cpu"):
|
| 275 |
"""Run GroundingDINO detection (CPU优化)"""
|
| 276 |
if grounding_dino_model is None or grounding_dino_processor is None:
|
|
@@ -720,20 +965,38 @@ def run_model(
|
|
| 720 |
if enable_segmentation:
|
| 721 |
progress(0.1, desc="🎯 加载分割模型 (CPU)...")
|
| 722 |
print(f"\n{'='*70}")
|
| 723 |
-
print("🎯 分割模型加载开始...")
|
| 724 |
print(f"{'='*70}")
|
| 725 |
-
|
| 726 |
-
|
| 727 |
-
|
| 728 |
-
|
| 729 |
-
|
| 730 |
-
|
| 731 |
-
|
| 732 |
-
|
| 733 |
-
|
| 734 |
-
|
| 735 |
-
|
| 736 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 737 |
print(f"{'='*70}\n")
|
| 738 |
|
| 739 |
# Load images
|
|
@@ -807,13 +1070,14 @@ def run_model(
|
|
| 807 |
|
| 808 |
# Segmentation processing
|
| 809 |
segmented_glb = None
|
| 810 |
-
if enable_segmentation
|
| 811 |
progress(0.65, desc="🎯 开始物体分割...")
|
| 812 |
print(f"\n{'='*70}")
|
| 813 |
-
print("🎯 开始物体分割...")
|
| 814 |
-
print(f"🔍 检测提示词: {text_prompt[:100]}...")
|
| 815 |
-
print(f"📊 置信度阈值: {GROUNDING_DINO_BOX_THRESHOLD}")
|
| 816 |
print(f"📐 最小掩码面积: {MIN_MASK_AREA} px")
|
|
|
|
|
|
|
|
|
|
| 817 |
print(f"{'='*70}\n")
|
| 818 |
|
| 819 |
all_view_detections = []
|
|
@@ -829,23 +1093,46 @@ def run_model(
|
|
| 829 |
else:
|
| 830 |
ref_image_np = ref_image
|
| 831 |
|
| 832 |
-
|
| 833 |
-
|
| 834 |
-
|
| 835 |
-
|
| 836 |
-
|
| 837 |
-
|
| 838 |
-
|
| 839 |
-
|
| 840 |
-
|
| 841 |
-
|
| 842 |
-
|
| 843 |
-
|
| 844 |
-
|
| 845 |
-
|
| 846 |
-
|
| 847 |
-
|
| 848 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 849 |
all_view_masks.append(masks)
|
| 850 |
else:
|
| 851 |
all_view_detections.append([])
|
|
|
|
| 89 |
"resolution": 518,
|
| 90 |
}
|
| 91 |
|
| 92 |
+
# ============ 分割模型配置 ============
|
| 93 |
+
# 方案选择:
|
| 94 |
+
# 1. "segformer" - SegFormer (最轻量,~14MB,最快)
|
| 95 |
+
# 2. "maskformer" - MaskFormer (中等,~100MB,实例分割)
|
| 96 |
+
# 3. "grounding_sam" - GroundingDINO + SAM (最强,~110MB,文本提示)
|
| 97 |
+
|
| 98 |
+
SEGMENTATION_METHOD = "segformer" # 默认使用最轻量的方案
|
| 99 |
+
|
| 100 |
+
# SegFormer Configuration (推荐 - CPU友好)
|
| 101 |
+
SEGFORMER_MODEL_ID = "nvidia/segformer-b0-finetuned-ade-512-512" # 14MB,150类物体
|
| 102 |
+
|
| 103 |
+
# MaskFormer Configuration (备选)
|
| 104 |
+
MASKFORMER_MODEL_ID = "facebook/maskformer-swin-tiny-ade" # 100MB,实例分割
|
| 105 |
+
|
| 106 |
+
# GroundingDINO + SAM Configuration (原方案 - 需要文本提示)
|
| 107 |
+
GROUNDING_DINO_MODEL_ID = "IDEA-Research/grounding-dino-tiny"
|
| 108 |
GROUNDING_DINO_BOX_THRESHOLD = 0.25
|
| 109 |
GROUNDING_DINO_TEXT_THRESHOLD = 0.2
|
| 110 |
+
SAM_MODEL_ID = "dhkim2810/MobileSAM"
|
| 111 |
+
USE_MOBILE_SAM = True
|
|
|
|
|
|
|
| 112 |
|
| 113 |
DEFAULT_TEXT_PROMPT = "chair . table . sofa . bed . desk . cabinet"
|
| 114 |
|
|
|
|
| 163 |
grounding_dino_processor = None
|
| 164 |
sam_predictor = None
|
| 165 |
|
| 166 |
+
# SegFormer 模型(轻量级语义分割)
|
| 167 |
+
segformer_processor = None
|
| 168 |
+
segformer_model = None
|
| 169 |
+
|
| 170 |
+
# MaskFormer 模型(实例分割)
|
| 171 |
+
maskformer_processor = None
|
| 172 |
+
maskformer_model = None
|
| 173 |
+
|
| 174 |
|
| 175 |
# ============================================================================
|
| 176 |
# Model Loading Functions
|
| 177 |
# ============================================================================
|
| 178 |
|
| 179 |
+
def load_segformer_model(device="cpu"):
    """Populate the module-level SegFormer processor and model.

    When ``segformer_model`` is already set the function only logs that fact
    and returns.  Otherwise it fetches the image processor and the
    semantic-segmentation model for ``SEGFORMER_MODEL_ID``, moves the model
    to ``device`` and switches it to eval mode.

    Failures are not propagated: the exception is printed together with a
    traceback, so callers detect a failed load by checking whether
    ``segformer_model`` is still ``None``.

    Args:
        device: Device the model is moved to (default ``"cpu"``).
    """
    global segformer_processor, segformer_model

    if segformer_model is not None:
        print("✅ SegFormer already loaded")
        return

    try:
        import os
        from transformers import (
            SegformerForSemanticSegmentation,
            SegformerImageProcessor,
        )

        print(f"📥 Loading SegFormer from HuggingFace: {SEGFORMER_MODEL_ID}")
        print(" 💡 SegFormer-B0: ~14MB, 150类物体, CPU优化")

        # Download location: $HF_HOME when set, otherwise a local folder.
        hf_cache = os.getenv("HF_HOME", "./hf_cache")

        print(" 正在下载 processor...")
        segformer_processor = SegformerImageProcessor.from_pretrained(
            SEGFORMER_MODEL_ID,
            cache_dir=hf_cache,
        )

        print(" 正在下载 model...")
        loaded = SegformerForSemanticSegmentation.from_pretrained(
            SEGFORMER_MODEL_ID,
            cache_dir=hf_cache,
            low_cpu_mem_usage=True,
        )
        # The global is only bound once the model is on the device and in
        # eval mode, so a failure on the way leaves it as None.
        segformer_model = loaded.to(device).eval()

        print(f"✅ SegFormer loaded successfully on {device.upper()}")
        print(" 可识别类别: 人、家具、墙壁、地板等150类")

    except Exception as err:
        print(f"❌ SegFormer loading failed: {type(err).__name__}: {err}")
        import traceback

        traceback.print_exc()
|
| 216 |
+
|
| 217 |
+
|
| 218 |
+
def load_maskformer_model(device="cpu"):
    """Populate the module-level MaskFormer processor and model.

    When ``maskformer_model`` is already set the function only logs that fact
    and returns.  Otherwise it fetches the image processor and the
    instance-segmentation model for ``MASKFORMER_MODEL_ID``, moves the model
    to ``device`` and switches it to eval mode.

    Failures are not propagated: the exception is printed together with a
    traceback, so callers detect a failed load by checking whether
    ``maskformer_model`` is still ``None``.

    Args:
        device: Device the model is moved to (default ``"cpu"``).
    """
    global maskformer_processor, maskformer_model

    if maskformer_model is not None:
        print("✅ MaskFormer already loaded")
        return

    try:
        import os
        from transformers import (
            MaskFormerForInstanceSegmentation,
            MaskFormerImageProcessor,
        )

        print(f"📥 Loading MaskFormer from HuggingFace: {MASKFORMER_MODEL_ID}")
        print(" 💡 MaskFormer: ~100MB, 实例分割")

        # Download location: $HF_HOME when set, otherwise a local folder.
        hf_cache = os.getenv("HF_HOME", "./hf_cache")

        print(" 正在下载 processor...")
        maskformer_processor = MaskFormerImageProcessor.from_pretrained(
            MASKFORMER_MODEL_ID,
            cache_dir=hf_cache,
        )

        print(" 正在下载 model...")
        loaded = MaskFormerForInstanceSegmentation.from_pretrained(
            MASKFORMER_MODEL_ID,
            cache_dir=hf_cache,
            low_cpu_mem_usage=True,
        )
        # The global is only bound once the model is on the device and in
        # eval mode, so a failure on the way leaves it as None.
        maskformer_model = loaded.to(device).eval()

        print(f"✅ MaskFormer loaded successfully on {device.upper()}")

    except Exception as err:
        print(f"❌ MaskFormer loading failed: {type(err).__name__}: {err}")
        import traceback

        traceback.print_exc()
|
| 254 |
+
|
| 255 |
def load_grounding_dino_model(device="cpu"):
|
| 256 |
"""Load GroundingDINO model from HuggingFace (CPU优化)"""
|
| 257 |
global grounding_dino_model, grounding_dino_processor
|
|
|
|
| 262 |
|
| 263 |
try:
|
| 264 |
from transformers import AutoProcessor, AutoModelForZeroShotObjectDetection
|
| 265 |
+
import os
|
| 266 |
|
| 267 |
# 强制使用 CPU 进行分割(节省 GPU 资源)
|
| 268 |
seg_device = "cpu"
|
| 269 |
print(f"📥 Loading GroundingDINO from HuggingFace: {GROUNDING_DINO_MODEL_ID} (使用 {seg_device.upper()})")
|
| 270 |
+
|
| 271 |
+
# 设置缓存目录(HuggingFace Spaces友好)
|
| 272 |
+
cache_dir = os.getenv("HF_HOME", "./hf_cache")
|
| 273 |
+
|
| 274 |
+
# 加载模型(带重试和详细日志)
|
| 275 |
+
print(f" 正在下载 processor...")
|
| 276 |
+
grounding_dino_processor = AutoProcessor.from_pretrained(
|
| 277 |
+
GROUNDING_DINO_MODEL_ID,
|
| 278 |
+
cache_dir=cache_dir,
|
| 279 |
+
trust_remote_code=True # 允许运行远程代码
|
| 280 |
+
)
|
| 281 |
+
|
| 282 |
+
print(f" 正在下载 model...")
|
| 283 |
grounding_dino_model = AutoModelForZeroShotObjectDetection.from_pretrained(
|
| 284 |
+
GROUNDING_DINO_MODEL_ID,
|
| 285 |
+
cache_dir=cache_dir,
|
| 286 |
+
trust_remote_code=True,
|
| 287 |
+
low_cpu_mem_usage=True # 降低CPU内存使用
|
| 288 |
).to(seg_device).eval()
|
| 289 |
|
| 290 |
print(f"✅ GroundingDINO loaded successfully on {seg_device.upper()}")
|
| 291 |
|
| 292 |
+
except ImportError as e:
|
| 293 |
+
print(f"❌ ImportError: {e}")
|
| 294 |
+
print(f"💡 请检查 requirements.txt 是否包含 transformers 库")
|
| 295 |
+
import traceback
|
| 296 |
+
traceback.print_exc()
|
| 297 |
+
except OSError as e:
|
| 298 |
+
print(f"❌ OSError (网络/文件问题): {e}")
|
| 299 |
+
print(f"💡 可能是网络连接问题或模型仓库不可访问")
|
| 300 |
+
print(f"💡 尝试解决方案:")
|
| 301 |
+
print(f" 1. 检查 HuggingFace Spaces 的网络连接")
|
| 302 |
+
print(f" 2. 检查模型ID是否正确: {GROUNDING_DINO_MODEL_ID}")
|
| 303 |
+
print(f" 3. 确保有足够的磁盘空间")
|
| 304 |
+
import traceback
|
| 305 |
+
traceback.print_exc()
|
| 306 |
except Exception as e:
|
| 307 |
+
print(f"❌ GroundingDINO loading failed: {type(e).__name__}: {e}")
|
| 308 |
import traceback
|
| 309 |
traceback.print_exc()
|
| 310 |
|
|
|
|
| 319 |
|
| 320 |
try:
|
| 321 |
from transformers import SamModel, SamProcessor
|
| 322 |
+
import os
|
| 323 |
|
| 324 |
# 强制使用 CPU 进行分割(MobileSAM 专为移动设备/CPU优化)
|
| 325 |
seg_device = "cpu"
|
| 326 |
print(f"📥 Loading MobileSAM from HuggingFace: {SAM_MODEL_ID} (使用 {seg_device.upper()})")
|
| 327 |
print(f" 💡 MobileSAM 是轻量级版本,比 SAM-huge 快60倍,只有10MB,适合CPU运行")
|
| 328 |
|
| 329 |
+
# 设置缓存目录
|
| 330 |
+
cache_dir = os.getenv("HF_HOME", "./hf_cache")
|
| 331 |
+
|
| 332 |
+
print(f" 正在下载 processor...")
|
| 333 |
+
sam_processor = SamProcessor.from_pretrained(
|
| 334 |
+
SAM_MODEL_ID,
|
| 335 |
+
cache_dir=cache_dir
|
| 336 |
+
)
|
| 337 |
+
|
| 338 |
+
print(f" 正在下载 model...")
|
| 339 |
+
sam_model = SamModel.from_pretrained(
|
| 340 |
+
SAM_MODEL_ID,
|
| 341 |
+
cache_dir=cache_dir,
|
| 342 |
+
low_cpu_mem_usage=True
|
| 343 |
+
).to(seg_device).eval()
|
| 344 |
|
| 345 |
# Wrap in a predictor-like interface
|
| 346 |
class SAMPredictor:
|
|
|
|
| 384 |
sam_predictor = SAMPredictor(sam_model, sam_processor, seg_device)
|
| 385 |
print(f"✅ MobileSAM loaded successfully on {seg_device.upper()}")
|
| 386 |
|
| 387 |
+
except ImportError as e:
|
| 388 |
+
print(f"❌ ImportError: {e}")
|
| 389 |
+
print(f"💡 请检查 requirements.txt 是否包含 transformers 库")
|
| 390 |
+
import traceback
|
| 391 |
+
traceback.print_exc()
|
| 392 |
+
except OSError as e:
|
| 393 |
+
print(f"❌ OSError (网络/文件问题): {e}")
|
| 394 |
+
print(f"💡 可能是网络连接问题或模型仓库不可访问")
|
| 395 |
+
print(f"💡 尝试解决方案:")
|
| 396 |
+
print(f" 1. 检查 HuggingFace Spaces 的网络连接")
|
| 397 |
+
print(f" 2. 检查模型ID是否正确: {SAM_MODEL_ID}")
|
| 398 |
+
print(f" 3. 确保有足够的磁盘空间")
|
| 399 |
+
import traceback
|
| 400 |
+
traceback.print_exc()
|
| 401 |
except Exception as e:
|
| 402 |
+
print(f"❌ SAM loading failed: {type(e).__name__}: {e}")
|
|
|
|
| 403 |
import traceback
|
| 404 |
traceback.print_exc()
|
| 405 |
|
|
|
|
| 424 |
return colors
|
| 425 |
|
| 426 |
|
| 427 |
+
# ============================================================================
|
| 428 |
+
# SegFormer 分割函数(简化方案)
|
| 429 |
+
# ============================================================================
|
| 430 |
+
|
| 431 |
+
def run_segformer_segmentation(image_np, device="cpu"):
    """Segment an image with SegFormer and split every class into instances.

    The image is run through the module-level ``segformer_processor`` and
    ``segformer_model``.  The class logits are upsampled to the input image
    size, the per-pixel arg-max gives a class map, and each class is split
    into connected components.  Components smaller than ``MIN_MASK_AREA``
    pixels are dropped.

    Args:
        image_np: Image as a NumPy array.  A non-``uint8`` array is scaled by
            255 and cast to ``uint8`` (i.e. it is assumed to hold values in
            [0, 1] -- TODO confirm against callers).
        device: Device the input tensors are moved to; it has to match the
            device the model was loaded on.

    Returns:
        A ``(detections, masks)`` pair of equal-length lists.  Each detection
        is a dict with ``'bbox'`` (``[x_min, y_min, x_max, y_max]``, inclusive
        pixel coordinates), ``'label'``, ``'confidence'`` (fixed 0.9, since no
        per-instance score is computed) and ``'class_id'``.  Each mask is a
        boolean array with the height and width of ``image_np``.  Both lists
        are empty when the model is not loaded or when inference fails; the
        error is printed, never raised.
    """
    if segformer_model is None or segformer_processor is None:
        print("❌ SegFormer model not loaded")
        # Callers unpack two values, so the "not loaded" result must have the
        # same shape as every other return of this function.
        return [], []

    try:
        import torch
        from PIL import Image
        from scipy import ndimage

        # Prepare the image.
        if image_np.dtype != np.uint8:
            image_np = (image_np * 255).astype(np.uint8)
        image_pil = Image.fromarray(image_np)
        target_size = tuple(int(side) for side in image_np.shape[:2])

        # Inference.
        inputs = segformer_processor(images=image_pil, return_tensors="pt")
        inputs = {k: v.to(device) for k, v in inputs.items()}

        with torch.no_grad():
            outputs = segformer_model(**inputs)
            # SegFormer's logits are produced at a lower resolution than the
            # image.  Upsample them first so that masks, areas and bounding
            # boxes are all expressed in pixels of the input image.
            logits = torch.nn.functional.interpolate(
                outputs.logits,
                size=target_size,
                mode="bilinear",
                align_corners=False,
            )

        predicted_segmentation = logits.argmax(dim=1)[0].cpu().numpy()

        # Take class names from the loaded checkpoint instead of a hand-written
        # table, so ids and names cannot drift apart.
        model_config = getattr(segformer_model, "config", None)
        id2label = getattr(model_config, "id2label", None) or {}

        detections = []
        masks = []

        for label_id in np.unique(predicted_segmentation):
            # NOTE(review): class 0 was skipped as "background" in the original
            # code and still is, to keep the set of returned classes unchanged.
            # In the ADE20K label map of the configured checkpoint id 0 appears
            # to be "wall" rather than a background class -- confirm intent.
            if label_id == 0:
                continue

            # Connected components of this class are treated as instances.
            labeled_mask, num_features = ndimage.label(
                predicted_segmentation == label_id
            )
            if num_features == 0:
                continue

            # One pass each for all areas and all bounding slices, instead of
            # scanning the whole image once per instance.
            areas = np.bincount(labeled_mask.ravel(), minlength=num_features + 1)
            spans = ndimage.find_objects(labeled_mask)

            class_id = int(label_id)
            raw_name = id2label.get(class_id)
            label_name = (
                str(raw_name).strip() if raw_name is not None else f"object_{class_id}"
            )

            for instance_id, span in enumerate(spans, start=1):
                # Filter small regions before building their full-size mask.
                if span is None or areas[instance_id] < MIN_MASK_AREA:
                    continue

                row_span, col_span = span
                # Slices are half-open; the bbox uses inclusive max coordinates.
                bbox = [
                    int(col_span.start),
                    int(row_span.start),
                    int(col_span.stop) - 1,
                    int(row_span.stop) - 1,
                ]

                detections.append({
                    'bbox': bbox,
                    'label': label_name,
                    'confidence': 0.9,  # no per-instance score is available
                    'class_id': class_id,
                })
                masks.append(labeled_mask == instance_id)

        return detections, masks

    except Exception as e:
        print(f"❌ SegFormer segmentation failed: {e}")
        import traceback

        traceback.print_exc()
        return [], []
|
| 517 |
+
|
| 518 |
+
|
| 519 |
def run_grounding_dino_detection(image_np, text_prompt, device="cpu"):
|
| 520 |
"""Run GroundingDINO detection (CPU优化)"""
|
| 521 |
if grounding_dino_model is None or grounding_dino_processor is None:
|
|
|
|
| 965 |
if enable_segmentation:
|
| 966 |
progress(0.1, desc="🎯 加载分割模型 (CPU)...")
|
| 967 |
print(f"\n{'='*70}")
|
| 968 |
+
print(f"🎯 分割模型加载开始... (方案: {SEGMENTATION_METHOD})")
|
| 969 |
print(f"{'='*70}")
|
| 970 |
+
|
| 971 |
+
if SEGMENTATION_METHOD == "segformer":
|
| 972 |
+
# 方案1: SegFormer (最轻量,~14MB,最快)
|
| 973 |
+
print("📌 使用方案: SegFormer (轻量级,无需文本提示)")
|
| 974 |
+
load_segformer_model("cpu")
|
| 975 |
+
if segformer_model is None:
|
| 976 |
+
print("❌ SegFormer 模型加载失败!")
|
| 977 |
+
raise RuntimeError("SegFormer 模型加载失败,请检查网络连接")
|
| 978 |
+
|
| 979 |
+
elif SEGMENTATION_METHOD == "maskformer":
|
| 980 |
+
# 方案2: MaskFormer (中等,~100MB)
|
| 981 |
+
print("📌 使用方案: MaskFormer (实例分割)")
|
| 982 |
+
load_maskformer_model("cpu")
|
| 983 |
+
if maskformer_model is None:
|
| 984 |
+
print("❌ MaskFormer 模型加载失败!")
|
| 985 |
+
raise RuntimeError("MaskFormer 模型加载失败,请检查网络连接")
|
| 986 |
+
|
| 987 |
+
else: # "grounding_sam"
|
| 988 |
+
# 方案3: GroundingDINO + SAM (最强,~110MB,需要文本提示)
|
| 989 |
+
print("📌 使用方案: GroundingDINO + SAM (文本提示驱动)")
|
| 990 |
+
load_grounding_dino_model("cpu")
|
| 991 |
+
load_sam_model("cpu")
|
| 992 |
+
if grounding_dino_model is None:
|
| 993 |
+
print("❌ GroundingDINO 模型加载失败!")
|
| 994 |
+
raise RuntimeError("GroundingDINO 模型加载失败,请检查网络连接")
|
| 995 |
+
if sam_predictor is None:
|
| 996 |
+
print("❌ SAM 模型加载失败!")
|
| 997 |
+
raise RuntimeError("SAM 模型加载失败,请检查网络连接")
|
| 998 |
+
|
| 999 |
+
print(f"✅ 分割模型加载成功")
|
| 1000 |
print(f"{'='*70}\n")
|
| 1001 |
|
| 1002 |
# Load images
|
|
|
|
| 1070 |
|
| 1071 |
# Segmentation processing
|
| 1072 |
segmented_glb = None
|
| 1073 |
+
if enable_segmentation:
|
| 1074 |
progress(0.65, desc="🎯 开始物体分割...")
|
| 1075 |
print(f"\n{'='*70}")
|
| 1076 |
+
print(f"🎯 开始物体分割... (方案: {SEGMENTATION_METHOD})")
|
|
|
|
|
|
|
| 1077 |
print(f"📐 最小掩码面积: {MIN_MASK_AREA} px")
|
| 1078 |
+
if SEGMENTATION_METHOD == "grounding_sam":
|
| 1079 |
+
print(f"🔍 检测提示词: {text_prompt[:100]}...")
|
| 1080 |
+
print(f"📊 置信度阈值: {GROUNDING_DINO_BOX_THRESHOLD}")
|
| 1081 |
print(f"{'='*70}\n")
|
| 1082 |
|
| 1083 |
all_view_detections = []
|
|
|
|
| 1093 |
else:
|
| 1094 |
ref_image_np = ref_image
|
| 1095 |
|
| 1096 |
+
# 根据分割方法选择不同的处理流程
|
| 1097 |
+
if SEGMENTATION_METHOD == "segformer":
|
| 1098 |
+
# SegFormer: 直接语义分割,无需文本提示
|
| 1099 |
+
detections, masks = run_segformer_segmentation(ref_image_np, "cpu")
|
| 1100 |
+
print(f" ✓ 检测到 {len(detections)} 个物体")
|
| 1101 |
+
|
| 1102 |
+
if len(detections) > 0:
|
| 1103 |
+
for i, det in enumerate(detections):
|
| 1104 |
+
print(f" 物体 {i+1}: {det['label']}")
|
| 1105 |
+
|
| 1106 |
+
points3d = world_points_list[view_idx]
|
| 1107 |
+
for det_idx, (det, mask) in enumerate(zip(detections, masks)):
|
| 1108 |
+
center_3d = compute_object_3d_center(points3d, mask)
|
| 1109 |
+
det['center_3d'] = center_3d
|
| 1110 |
+
det['mask_2d'] = mask
|
| 1111 |
+
|
| 1112 |
+
all_view_detections.append(detections)
|
| 1113 |
+
all_view_masks.append(masks)
|
| 1114 |
+
else:
|
| 1115 |
+
all_view_detections.append([])
|
| 1116 |
+
all_view_masks.append([])
|
| 1117 |
+
|
| 1118 |
+
elif SEGMENTATION_METHOD == "grounding_sam":
|
| 1119 |
+
# GroundingDINO + SAM: 文本提示驱动
|
| 1120 |
+
detections = run_grounding_dino_detection(ref_image_np, text_prompt, "cpu")
|
| 1121 |
+
print(f" ✓ 检测到 {len(detections)} 个物体")
|
| 1122 |
+
|
| 1123 |
+
if len(detections) > 0:
|
| 1124 |
+
for i, det in enumerate(detections):
|
| 1125 |
+
print(f" 物体 {i+1}: {det['label']} (置信度: {det['confidence']:.2f})")
|
| 1126 |
+
boxes = [d['bbox'] for d in detections]
|
| 1127 |
+
masks = run_sam_refinement(ref_image_np, boxes)
|
| 1128 |
+
|
| 1129 |
+
points3d = world_points_list[view_idx]
|
| 1130 |
+
for det_idx, (det, mask) in enumerate(zip(detections, masks)):
|
| 1131 |
+
center_3d = compute_object_3d_center(points3d, mask)
|
| 1132 |
+
det['center_3d'] = center_3d
|
| 1133 |
+
det['mask_2d'] = mask
|
| 1134 |
+
|
| 1135 |
+
all_view_detections.append(detections)
|
| 1136 |
all_view_masks.append(masks)
|
| 1137 |
else:
|
| 1138 |
all_view_detections.append([])
|