chris-propeller committed on
Commit
a45e44e
·
1 Parent(s): acd640e

attempt to fix multiple text prompts

Browse files
Files changed (1) hide show
  1. app.py +82 -58
app.py CHANGED
@@ -279,61 +279,74 @@ def sam2_compatible_api(data):
279
  if has_points or has_boxes:
280
  prompt_types.append("visual")
281
 
282
- # Prepare inputs for combined SAM3 inference call
283
- combined_text_prompt = None
284
- combined_boxes = None
285
- combined_box_labels = None
286
- combined_points = None
287
- combined_point_labels = None
288
-
289
- # Handle text prompts - combine multiple text prompts into one
290
  if has_text:
291
- # For multiple text prompts, join them (SAM3 can handle combined descriptions)
292
- combined_text_prompt = ", ".join(text_prompts)
293
-
294
- # Handle box prompts
295
- if has_boxes:
296
- combined_boxes = input_boxes
297
- # Create box labels (default to positive boxes if not provided)
298
- combined_box_labels = inputs_data.get("box_labels", [1] * len(input_boxes))
299
-
300
- # Handle point prompts
301
- if has_points:
302
- combined_points = input_points
303
- combined_point_labels = input_labels
304
-
305
- # Make single combined inference call with all prompt types
306
- results = sam3_inference(
307
- image=image,
308
- text_prompt=combined_text_prompt,
309
- boxes=combined_boxes,
310
- box_labels=combined_box_labels,
311
- points=combined_points,
312
- point_labels=combined_point_labels,
313
- confidence_threshold=confidence_threshold
314
- )
315
-
316
- # Process results
317
- if results and len(results["masks"]) > 0:
318
- for i in range(len(results["masks"])):
319
- mask_np = results["masks"][i].cpu().numpy().astype(np.uint8) * 255
320
- score = results["scores"][i].item()
321
-
322
- if score >= confidence_threshold:
323
- # Convert mask to base64
324
- mask_image = Image.fromarray(mask_np, mode='L')
325
- buffer = io.BytesIO()
326
- mask_image.save(buffer, format='PNG')
327
- mask_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
328
-
329
- all_masks.append(mask_b64)
330
- all_scores.append(score)
331
-
332
- # Extract polygons if vectorize is enabled
333
- if vectorize:
334
- binary_mask = (mask_np > 0).astype(np.uint8)
335
- polygons = _mask_to_polygons_original_size(binary_mask, simplify_epsilon)
336
- all_polygons.append(polygons)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
337
 
338
  # Build SAM2-compatible response
339
  response = {
@@ -472,13 +485,24 @@ response = requests.post(
472
  }
473
  )
474
 
475
- # SAM3 Combined Prompts (Text + Visual) - NEW CAPABILITY!
 
 
 
 
 
 
 
 
 
 
 
 
476
  response = requests.post(
477
  "https://your-username-sam3-api.hf.space/api/sam2_compatible",
478
  json={
479
  "inputs": {
480
  "image": image_b64,
481
- "text_prompts": ["cat"], # Text description
482
  "boxes": [[50, 50, 150, 150]], # Bounding box
483
  "box_labels": [0], # 0=negative (exclude this area)
484
  "points": [[200, 200]], # Point prompt
@@ -512,8 +536,8 @@ result = response.json()
512
  "inputs": {
513
  "image": "base64_encoded_image_string",
514
 
515
- // SAM3 NEW: Text-based prompts (can be combined with visual prompts)
516
- "text_prompts": ["person", "car"], // List of text descriptions
517
 
518
  // SAM2 COMPATIBLE: Point-based prompts (can be combined with text/boxes)
519
  "points": [[x1, y1], [x2, y2]], // Individual points (not nested arrays)
 
279
  if has_points or has_boxes:
280
  prompt_types.append("visual")
281
 
282
+ # Process text prompts individually (SAM3 works best with individual text prompts)
 
 
 
 
 
 
 
283
  if has_text:
284
+ for text_prompt in text_prompts:
285
+ if text_prompt.strip(): # Skip empty prompts
286
+ results = sam3_inference(
287
+ image=image,
288
+ text_prompt=text_prompt.strip(),
289
+ confidence_threshold=confidence_threshold
290
+ )
291
+
292
+ if results and len(results["masks"]) > 0:
293
+ for i in range(len(results["masks"])):
294
+ mask_np = results["masks"][i].cpu().numpy().astype(np.uint8) * 255
295
+ score = results["scores"][i].item()
296
+
297
+ if score >= confidence_threshold:
298
+ # Convert mask to base64
299
+ mask_image = Image.fromarray(mask_np, mode='L')
300
+ buffer = io.BytesIO()
301
+ mask_image.save(buffer, format='PNG')
302
+ mask_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
303
+
304
+ all_masks.append(mask_b64)
305
+ all_scores.append(score)
306
+
307
+ # Extract polygons if vectorize is enabled
308
+ if vectorize:
309
+ binary_mask = (mask_np > 0).astype(np.uint8)
310
+ polygons = _mask_to_polygons_original_size(binary_mask, simplify_epsilon)
311
+ all_polygons.append(polygons)
312
+
313
+ # Process visual prompts (boxes and/or points) - can be combined in a single call
314
+ if has_boxes or has_points:
315
+ combined_boxes = input_boxes if has_boxes else None
316
+ combined_box_labels = inputs_data.get("box_labels", [1] * len(input_boxes)) if has_boxes else None
317
+ combined_points = input_points if has_points else None
318
+ combined_point_labels = input_labels if has_points else None
319
+
320
+ results = sam3_inference(
321
+ image=image,
322
+ text_prompt=None,
323
+ boxes=combined_boxes,
324
+ box_labels=combined_box_labels,
325
+ points=combined_points,
326
+ point_labels=combined_point_labels,
327
+ confidence_threshold=confidence_threshold
328
+ )
329
+
330
+ if results and len(results["masks"]) > 0:
331
+ for i in range(len(results["masks"])):
332
+ mask_np = results["masks"][i].cpu().numpy().astype(np.uint8) * 255
333
+ score = results["scores"][i].item()
334
+
335
+ if score >= confidence_threshold:
336
+ # Convert mask to base64
337
+ mask_image = Image.fromarray(mask_np, mode='L')
338
+ buffer = io.BytesIO()
339
+ mask_image.save(buffer, format='PNG')
340
+ mask_b64 = base64.b64encode(buffer.getvalue()).decode('utf-8')
341
+
342
+ all_masks.append(mask_b64)
343
+ all_scores.append(score)
344
+
345
+ # Extract polygons if vectorize is enabled
346
+ if vectorize:
347
+ binary_mask = (mask_np > 0).astype(np.uint8)
348
+ polygons = _mask_to_polygons_original_size(binary_mask, simplify_epsilon)
349
+ all_polygons.append(polygons)
350
 
351
  # Build SAM2-compatible response
352
  response = {
 
485
  }
486
  )
487
 
488
+ # SAM3 with Multiple Text Prompts (processed individually)
489
+ response = requests.post(
490
+ "https://your-username-sam3-api.hf.space/api/sam2_compatible",
491
+ json={
492
+ "inputs": {
493
+ "image": image_b64,
494
+ "text_prompts": ["cat", "dog"], # Each prompt processed separately
495
+ "confidence_threshold": 0.5
496
+ }
497
+ }
498
+ )
499
+
500
+ # SAM3 Combined Visual Prompts (boxes + points in single call)
501
  response = requests.post(
502
  "https://your-username-sam3-api.hf.space/api/sam2_compatible",
503
  json={
504
  "inputs": {
505
  "image": image_b64,
 
506
  "boxes": [[50, 50, 150, 150]], # Bounding box
507
  "box_labels": [0], # 0=negative (exclude this area)
508
  "points": [[200, 200]], # Point prompt
 
536
  "inputs": {
537
  "image": "base64_encoded_image_string",
538
 
539
+ // SAM3 NEW: Text-based prompts (each processed individually for best results)
540
+ "text_prompts": ["person", "car"], // List of text descriptions - each processed separately
541
 
542
  // SAM2 COMPATIBLE: Point-based prompts (can be combined with text/boxes)
543
  "points": [[x1, y1], [x2, y2]], // Individual points (not nested arrays)