Update moondream.py

fix: sdpa dimension mismatch.

moondream.py  (+49 -49)  CHANGED
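Background on the fix, as a minimal sketch (the mask shape mirrors the diff below, but the tensor names and sizes here are illustrative, not code from moondream.py): torch.nn.functional.scaled_dot_product_attention expects attn_mask to broadcast against (batch, heads, q_len, kv_len). The decode loop keeps its per-row visibility mask as (B, 1, max_ctx), so for a single-token step (q_len = 1) it needs an extra q_len axis, mask.unsqueeze(2) giving (B, 1, 1, max_ctx), before being passed to attention.

# Minimal sketch of the mask-shape issue; names and sizes are illustrative.
import torch
import torch.nn.functional as F

B, n_heads, head_dim, max_ctx = 2, 4, 16, 32

q = torch.randn(B, n_heads, 1, head_dim)        # q_len = 1: one decode step
k = torch.randn(B, n_heads, max_ctx, head_dim)  # KV cache, kv_len = max_ctx
v = torch.randn(B, n_heads, max_ctx, head_dim)

mask = torch.zeros(B, 1, max_ctx, dtype=torch.bool)  # per-row visibility
mask[:, :, :8] = True                                # pretend 8 positions are prefilled

# A (B, 1, max_ctx) mask lines its batch dim up against n_heads and fails to
# broadcast; adding the q_len axis gives (B, 1, 1, max_ctx), which broadcasts
# cleanly to (B, n_heads, 1, max_ctx).
out = F.scaled_dot_product_attention(q, k, v, attn_mask=mask.unsqueeze(2))
print(out.shape)  # torch.Size([2, 4, 1, 16])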
@@ -900,17 +900,18 @@ class MoondreamModel(nn.Module):
         return last_hidden, next_token, pos_vec  # (B,1,C), (B,1), (B,)

     def _generate_points_batched(
+        self,
+        hidden: torch.Tensor,      # (B, 1, C) last hidden per row from prefill
+        next_token: torch.Tensor,  # (B, 1) unused here; kept for parity
+        pos_vec: torch.Tensor,     # (B,) next write pos per row after prefill
+        include_size: bool = True,
+        max_objects: int = 50,
+        lora=None
+    ):
         """
         Batched decode loop for multi-label detection.
+
+        - Uses a shared scalar position id per step (q_len = 1), as expected by RoPE.
         - Maintains a per-row attention mask and 'alive' flags.
         - Feeds coord encoders with (B,1) tensors; size encoder with (B,2).
         Returns: list-of-lists of dicts, length B.

@@ -920,35 +921,35 @@ class MoondreamModel(nn.Module):
         out = [[] for _ in range(B)]
         eos_id = self.config.tokenizer.eos_id

-        # We align rows by padding; using the maximum ensures all KV rows can decode in lockstep.
-        pos = int(pos_vec.max().item())
-        # Per-row attention mask (1 = visible). Mark everything up to 'pos' as visible.
+        # Per-row initial visibility up to each row's individual prefill pos
         max_ctx = self.config.text.max_context
         mask = torch.zeros(B, 1, max_ctx, device=device, dtype=torch.bool)
+        for i in range(B):
+            mask[i, :, : int(pos_vec[i].item())] = 1
+
+        # Shared write index so RoPE sees a scalar q_len=1 position id
+        pos = int(pos_vec.max().item())

+        alive = torch.ones(B, dtype=torch.bool, device=device)
         counts = torch.zeros(B, dtype=torch.int32, device=device)

         with torch.inference_mode():
             while alive.any() and (counts < max_objects).any():
                 # --- x coordinate ---
+                x_logits = decode_coordinate(hidden, self.region)  # (B,1,1024) or (B,1024)
                 if x_logits.dim() == 3:
+                    x_logits = x_logits.squeeze(1)  # (B,1024)
+                x_bin = x_logits.argmax(dim=-1).to(torch.float32)  # (B,)
+                x_center = x_bin / float(x_logits.size(-1))  # (B,)
+                x_input = x_center.to(dtype=x_logits.dtype).unsqueeze(-1)  # (B,1)
+                x_emb = encode_coordinate(x_input, self.region).unsqueeze(1)  # (B,1,C)

+                # Advance visibility at shared 'pos' and decode (q_len=1)
+                mask[alive, :, pos] = 1
                 logits, hidden = self._decode_one_tok(
                     x_emb,
-                    mask,
+                    mask.unsqueeze(2),  # (B,1,1,max_ctx)
+                    torch.tensor([pos], device=device, dtype=torch.long),
                     lora,
                 )
                 pos += 1

@@ -956,17 +957,16 @@ class MoondreamModel(nn.Module):
                 # --- y coordinate ---
                 y_logits = decode_coordinate(hidden, self.region)
                 if y_logits.dim() == 3:
+                    y_logits = y_logits.squeeze(1)  # (B,1024)
+                y_bin = y_logits.argmax(dim=-1).to(torch.float32)
+                y_center = y_bin / float(y_logits.size(-1))  # (B,)
+                y_input = y_center.to(dtype=y_logits.dtype).unsqueeze(-1)  # (B,1)
+                y_emb = encode_coordinate(y_input, self.region).unsqueeze(1)  # (B,1,C)

-                mask[:, :, pos] = 1
+                mask[alive, :, pos] = 1
                 logits, hidden = self._decode_one_tok(
                     y_emb,
-                    mask,
+                    mask.unsqueeze(2),  # (B,1,1,max_ctx)
                     torch.tensor([pos], device=device, dtype=torch.long),
                     lora,
                 )

@@ -974,17 +974,17 @@ class MoondreamModel(nn.Module):

                 if include_size:
                     # --- size (batched) ---
+                    size_logits = decode_size(hidden, self.region)
+                    w_logits, h_logits = size_logits[0].squeeze(1), size_logits[1].squeeze(1)
                     w_bin = w_logits.argmax(dim=-1).to(torch.float32)
                     h_bin = h_logits.argmax(dim=-1).to(torch.float32)
+                    # log-scale bin → actual size in [0,1]
+                    w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
+                    h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)
                     size_input = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype)  # (B,2)
+                    size_emb = encode_size(size_input, self.region).unsqueeze(1)  # (B,1,C)

+                    # Commit boxes for alive rows
                     for i in range(B):
                         if not alive[i]:
                             continue

@@ -995,32 +995,31 @@ class MoondreamModel(nn.Module):
                             "y_max": (y_center[i] + h[i] / 2).item(),
                         })

-                    mask[:, :, pos] = 1
+                    mask[alive, :, pos] = 1
                     logits, hidden = self._decode_one_tok(
                         size_emb,
-                        mask,
+                        mask.unsqueeze(2),  # (B,1,1,max_ctx) ✅
                         torch.tensor([pos], device=device, dtype=torch.long),
                         lora,
                     )
                     pos += 1
+                    next_tok = logits.argmax(dim=-1).squeeze(-1)  # (B,)
                 else:
                     # Points mode (no size)
                     for i in range(B):
                         if alive[i]:
                             out[i].append({"x": x_center[i].item(), "y": y_center[i].item()})
+                    mask[alive, :, pos] = 1
                     logits, hidden = self._decode_one_tok(
                         y_emb,
-                        mask,
+                        mask.unsqueeze(2),  # (B,1,1,max_ctx) ✅
                         torch.tensor([pos], device=device, dtype=torch.long),
                         lora,
                     )
                     pos += 1
+                    next_tok = logits.argmax(dim=-1).squeeze(-1)

+                # Finish rows that emitted EOS or hit object cap
                 finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
                 counts = counts + (~finished_now & alive).to(counts.dtype)
                 alive &= ~finished_now

@@ -1028,6 +1027,7 @@ class MoondreamModel(nn.Module):
         return out


+
     def detect_multi(self, image, objects, settings=None):
         """
         Parallel multi-label detection.
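One more note on the size branch above: the "log-scale bin → actual size in [0,1]" lines map a 1024-bin argmax index back to a normalized width/height via 2 ** ((bin / 1023) * 10 - 10), so bin 0 decodes to roughly 1/1024 of the image extent and bin 1023 to the full extent. A tiny standalone check of that mapping (plain PyTorch, independent of the model code):

import torch

bins = torch.tensor([0.0, 512.0, 1023.0])              # smallest, middle, largest bin
size = torch.pow(2.0, (bins / 1023.0) * 10.0 - 10.0)
print(size)  # ≈ [0.00098, 0.0313, 1.0000]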