HV-Khurdula committed
Commit f9b6e6a · verified · 1 Parent(s): 50407fb

Update moondream.py

fix: decode batched (B>1) labels
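
The core of this change: all label prompts are padded to a common length T, prefilled as one (B, T) batch, and each row's last real (non-padding) position is gathered before decoding starts. A minimal toy sketch of that gather step, with stand-in tensors rather than the model's own code:

    import torch

    # Toy stand-ins: three label prompts of different lengths, padded to T = 7.
    lens = [5, 3, 7]                      # per-row prompt lengths (example values)
    B, T, C = len(lens), max(lens), 4     # C is a toy hidden size
    hidden_BTC = torch.randn(B, T, C)     # stands in for the prefill output

    # Last *real* token index per row, ignoring the eos padding.
    idx = (torch.tensor(lens) - 1).clamp_min(0)
    last_hidden = hidden_BTC[torch.arange(B), idx][:, None, :]  # (B, 1, C)
    print(last_hidden.shape)              # torch.Size([3, 1, 4])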

Files changed (1)
  1. moondream.py +96 -112
moondream.py CHANGED
@@ -850,84 +850,89 @@ class MoondreamModel(nn.Module):
             b.kv_cache.k_cache[:, :, :T, :] = k.expand(batch_size, -1, -1, -1)
             b.kv_cache.v_cache[:, :, :T, :] = v.expand(batch_size, -1, -1, -1)
 
-    def _prefill_prompt_batched(self, labels, pos: int, lora=None, temperature: float = 0.0, top_p: float = 0.0):
+    def _prefill_prompt_batched(
+        self,
+        labels,
+        pos: int,
+        lora=None,
+        temperature: float = 0.0,
+        top_p: float = 0.0,
+    ):
         """
-        Build detect prompts for many labels, pad to same length, prefill once as a batch,
-        then return (last_hidden per row, next_token per row, pos per row).
+        Build detect prompts for many labels, pad to the same length, prefill once as a batch.
+        Returns (last_hidden per row, next_token per row, shared_pos_end scalar).
         """
         tpl = self.config.tokenizer.templates["detect"]
         if tpl is None:
             raise NotImplementedError("Model does not support object detection (no detect template).")
-
+
         rows, lens = [], []
         for lab in labels:
            ids = tpl["prefix"] + self.tokenizer.encode(" " + lab).ids + tpl["suffix"]
            rows.append(torch.tensor(ids, device=self.device, dtype=torch.long))
            lens.append(len(ids))
-        B = len(rows); T = max(lens)
+        B = len(rows)
+        T = max(lens)
         eos = self.config.tokenizer.eos_id
-
-        # Pad with eos so we can prefill as a single batch
+
+        # Pad to T with eos, so we can prefill with a single shared position range
         prompt_ids = torch.full((B, T), eos, device=self.device, dtype=torch.long)
         for i, ids in enumerate(rows):
            prompt_ids[i, : ids.numel()] = ids
-
+
         # Embed & prefill once
-        prompt_emb = text_encoder(prompt_ids, self.text) # (B, T, C)
-        torch._dynamo.mark_dynamic(prompt_emb, 1) # allow variable T
-
-        attn_mask = self.attn_mask
-        mask = attn_mask[:, :, pos : pos + T, :].expand(B, -1, -1, -1).contiguous()
+        prompt_emb = text_encoder(prompt_ids, self.text) # (B, T, C)
+        torch._dynamo.mark_dynamic(prompt_emb, 1) # allow variable T
+
+        # 4-D mask form makes head broadcasting unambiguous later
+        attn = self.attn_mask
+        mask = attn[:, :, pos : pos + T, :].expand(B, -1, -1, -1).contiguous() # (B,1,T,K)
         pos_ids = torch.arange(pos, pos + T, device=self.device, dtype=torch.long)
-
+
         hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora) # (B, T, C)
         logits_BTV = lm_head(hidden_BTC, self.text) # (B, T, V)
-
-        # Take the last *real* token per row (ignore padding positions)
+
+        # Take the last *real* token per row
         idx = (torch.tensor(lens, device=self.device, dtype=torch.long) - 1).clamp_min(0)
-        last_hidden = hidden_BTC[torch.arange(B, device=self.device), idx][:, None, :] # (B, 1, C)
-        last_logits = logits_BTV[torch.arange(B, device=self.device), idx] # (B, V)
-
+        last_hidden = hidden_BTC[torch.arange(B, device=self.device), idx][:, None, :] # (B,1,C)
+        last_logits = logits_BTV[torch.arange(B, device=self.device), idx] # (B,V)
+
         if temperature == 0.0:
-            next_token = last_logits.argmax(dim=-1, keepdim=True) # (B, 1)
+            next_token = last_logits.argmax(dim=-1, keepdim=True) # (B,1)
         else:
            probs = torch.softmax(last_logits / temperature, dim=-1)
            probs = self._apply_top_p(probs, top_p)
-            next_token = torch.multinomial(probs, num_samples=1) # (B, 1)
-
-        pos_vec = torch.full((B,), pos + T, device=self.device, dtype=torch.long)
-
-        return last_hidden, next_token, pos_vec # (B,1,C), (B,1), (B,)
+            next_token = torch.multinomial(probs, num_samples=1) # (B,1)
+
+        # Shared "next decode position" for all rows (we prefilled up to pos+T-1)
+        pos_end = pos + T
+        return last_hidden, next_token, pos_end # (B,1,C), (B,1), int
 
     def _generate_points_batched(
         self,
-        hidden: torch.Tensor, # (B, 1, C) last hidden per row from prefill
-        next_token: torch.Tensor, # (B, 1) unused here; kept for parity
-        pos_vec: torch.Tensor, # (B,) next write pos per row after prefill
+        hidden, # (B,1,C)
+        next_token, # (B,1)
+        pos: int, # shared scalar next position
         include_size: bool = True,
         max_objects: int = 50,
-        lora=None):
+        lora=None,
+    ):
         """
-        Batched decode loop for multi-label detection.
-
-        - Uses a shared scalar position id per step (q_len = 1), as expected by RoPE.
-        - Maintains a per-row attention mask and 'alive' flags.
-        - Feeds coord encoders with (B,1) tensors; size encoder with (B,2).
-        Returns: list-of-lists of dicts, length B.
+        Vectorized version of _generate_points() that decodes x -> y -> size -> next-token
+        for all rows in the batch simultaneously. Returns list-of-lists of dicts, len B.
         """
         B = hidden.size(0)
         device = self.device
         out = [[] for _ in range(B)]
         eos_id = self.config.tokenizer.eos_id
-
-        # Per-row initial visibility up to each row's individual prefill pos
         max_ctx = self.config.text.max_context
-        mask = torch.zeros(B, 1, max_ctx, device=device, dtype=torch.bool)
-        for i in range(B):
-            mask[i, :, : int(pos_vec[i].item())] = 1
 
-        # Shared write index so RoPE sees a scalar q_len=1 position id
-        pos = int(pos_vec.max().item())
+        # 4-D mask: (B, 1, q_len=1, kv_len)
+        mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
+        if pos > 0:
+            mask[:, :, :, :pos] = True
+        pos_id = torch.tensor([pos], device=device, dtype=torch.long) # (1,)
 
         alive = torch.ones(B, dtype=torch.bool, device=device)
         counts = torch.zeros(B, dtype=torch.int32, device=device)
@@ -935,90 +940,72 @@ class MoondreamModel(nn.Module):
         with torch.inference_mode():
             while alive.any() and (counts < max_objects).any():
                 # --- x coordinate ---
-                x_logits = decode_coordinate(hidden, self.region) # (B,1,1024) or (B,1024)
+                x_logits = decode_coordinate(hidden, self.region) # (B,1,1024) or (B,1024)
                 if x_logits.dim() == 3:
-                    x_logits = x_logits.squeeze(1) # (B,1024)
-                x_bin = x_logits.argmax(dim=-1).to(torch.float32) # (B,)
-                x_center = x_bin / float(x_logits.size(-1)) # (B,)
-                x_input = x_center.to(dtype=x_logits.dtype).unsqueeze(-1) # (B,1)
-                x_emb = encode_coordinate(x_input, self.region).unsqueeze(1) # (B,1,C)
+                    x_logits = x_logits.squeeze(1)
+                x_bin = x_logits.argmax(dim=-1).to(torch.float32) # (B,)
+                x_center = x_bin / float(x_logits.size(-1)) # (B,)
+                x_in = x_center.to(dtype=x_logits.dtype).unsqueeze(-1) # (B,1)
+                x_emb = encode_coordinate(x_in, self.region).unsqueeze(1) # (B,1,C)
 
-                # Advance visibility at shared 'pos' and decode (q_len=1)
-                mask[alive, :, pos] = 1
-                logits, hidden = self._decode_one_tok(
-                    x_emb,
-                    mask.unsqueeze(2), # (B,1,1,max_ctx) ✅
-                    torch.tensor([pos], device=device, dtype=torch.long),
-                    lora,
-                )
+                # advance attention one step
+                mask[:, :, :, pos] = True
+                logits, hidden = self._decode_one_tok(x_emb, mask, pos_id, lora)
                 pos += 1
+                pos_id[0] = pos
 
                 # --- y coordinate ---
                 y_logits = decode_coordinate(hidden, self.region)
                 if y_logits.dim() == 3:
-                    y_logits = y_logits.squeeze(1) # (B,1024)
+                    y_logits = y_logits.squeeze(1)
                 y_bin = y_logits.argmax(dim=-1).to(torch.float32)
-                y_center = y_bin / float(y_logits.size(-1)) # (B,)
-                y_input = y_center.to(dtype=y_logits.dtype).unsqueeze(-1) # (B,1)
-                y_emb = encode_coordinate(y_input, self.region).unsqueeze(1) # (B,1,C)
+                y_center = y_bin / float(y_logits.size(-1)) # (B,)
+                y_in = y_center.to(dtype=y_logits.dtype).unsqueeze(-1) # (B,1)
+                y_emb = encode_coordinate(y_in, self.region).unsqueeze(1)
 
-                mask[alive, :, pos] = 1
-                logits, hidden = self._decode_one_tok(
-                    y_emb,
-                    mask.unsqueeze(2), # (B,1,1,max_ctx) ✅
-                    torch.tensor([pos], device=device, dtype=torch.long),
-                    lora,
-                )
+                mask[:, :, :, pos] = True
+                logits, hidden = self._decode_one_tok(y_emb, mask, pos_id, lora)
                 pos += 1
+                pos_id[0] = pos
 
                 if include_size:
-                    # --- size (batched) ---
+                    # --- size ---
                     size_logits = decode_size(hidden, self.region)
                     w_logits, h_logits = size_logits[0].squeeze(1), size_logits[1].squeeze(1)
                     w_bin = w_logits.argmax(dim=-1).to(torch.float32)
                     h_bin = h_logits.argmax(dim=-1).to(torch.float32)
-                    # log-scale bin → actual size in [0,1]
+                    # bins -> size in [0,1] (inverse of log-scale mapping)
                     w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
                     h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)
-                    size_input = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype) # (B,2)
-                    size_emb = encode_size(size_input, self.region).unsqueeze(1) # (B,1,C)
+                    size_in = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype) # (B,2)
+                    size_emb = encode_size(size_in, self.region).unsqueeze(1) # (B,1,C)
 
-                    # Commit boxes for alive rows
+                    # record boxes
                     for i in range(B):
-                        if not alive[i]:
-                            continue
-                        out[i].append({
-                            "x_min": (x_center[i] - w[i] / 2).item(),
-                            "y_min": (y_center[i] - h[i] / 2).item(),
-                            "x_max": (x_center[i] + w[i] / 2).item(),
-                            "y_max": (y_center[i] + h[i] / 2).item(),
-                        })
+                        if alive[i]:
+                            out[i].append({
+                                "x_min": (x_center[i] - w[i] / 2).item(),
+                                "y_min": (y_center[i] - h[i] / 2).item(),
+                                "x_max": (x_center[i] + w[i] / 2).item(),
+                                "y_max": (y_center[i] + h[i] / 2).item(),
+                            })
 
-                    mask[alive, :, pos] = 1
-                    logits, hidden = self._decode_one_tok(
-                        size_emb,
-                        mask.unsqueeze(2), # (B,1,1,max_ctx) ✅
-                        torch.tensor([pos], device=device, dtype=torch.long),
-                        lora,
-                    )
+                    mask[:, :, :, pos] = True
+                    logits, hidden = self._decode_one_tok(size_emb, mask, pos_id, lora)
                     pos += 1
-                    next_tok = logits.argmax(dim=-1).squeeze(-1) # (B,)
+                    pos_id[0] = pos
+                    next_tok = logits.argmax(dim=-1).squeeze(-1) # (B,)
                 else:
-                    # Points mode (no size)
                     for i in range(B):
                         if alive[i]:
                             out[i].append({"x": x_center[i].item(), "y": y_center[i].item()})
-                    mask[alive, :, pos] = 1
-                    logits, hidden = self._decode_one_tok(
-                        y_emb,
-                        mask.unsqueeze(2), # (B,1,1,max_ctx) ✅
-                        torch.tensor([pos], device=device, dtype=torch.long),
-                        lora,
-                    )
+
+                    mask[:, :, :, pos] = True
+                    logits, hidden = self._decode_one_tok(y_emb, mask, pos_id, lora)
                     pos += 1
-                    next_tok = logits.argmax(dim=-1).squeeze(-1)
+                    pos_id[0] = pos
+                    next_tok = logits.argmax(dim=-1).squeeze(-1) # (B,)
 
-                # Finish rows that emitted EOS or hit object cap
                 finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
                 counts = counts + (~finished_now & alive).to(counts.dtype)
                 alive &= ~finished_now
@@ -1026,8 +1013,6 @@ class MoondreamModel(nn.Module):
         return out
 
 
-
-
     def detect_multi(self, image, objects, settings=None):
         """
         Parallel multi-label detection.
@@ -1038,35 +1023,33 @@ class MoondreamModel(nn.Module):
         Returns:
            {"objects": {label: [box_dict, ...]}}
         """
-
-
         if self.config.tokenizer.templates["detect"] is None:
            raise NotImplementedError("Model does not support object detection.")
         settings = settings or {}
-
-        # Encode once; reuse caches
+
+        # Encode once; reuse caches for B rows
         image = self.encode_image(image, settings)
         B = len(objects)
         self._load_encoded_image_batched(image, B)
-
-        # Optional LoRA variant (same as detect())
+
+        # Optional LoRA variant
         lora = None
         if "variant" in settings:
            lora = variant_state_dict(settings["variant"], device=self.device)
-
-        # Prefill all prompts at once
-        last_hidden, next_token, pos_vec = self._prefill_prompt_batched(
+
+        # Prefill all prompts as a batch; shared next position
+        last_hidden, next_token, pos_end = self._prefill_prompt_batched(
            objects, image.pos, lora=lora, temperature=0.0, top_p=0.0
         )
-
+
         # Batched decode loop
         max_objects = settings.get("max_objects", 50)
         det_lists = self._generate_points_batched(
-            last_hidden, next_token, pos_vec,
+            last_hidden, next_token, pos_end,
            include_size=True, max_objects=max_objects, lora=lora
         )
-
-        # Map back to labels and add "label" tags
+
+        # Map back to labels and tag
         res = {}
         for lab, lst in zip(objects, det_lists):
            for d in lst:
@@ -1074,6 +1057,7 @@ class MoondreamModel(nn.Module):
            res[lab] = lst
         return {"objects": res}
 
+
     def _detect_gaze(
         self,
         image: EncodedImage,
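
Note on the size decoding in _generate_points_batched above: a bin index in [0, 1023] maps to a width/height in (0, 1] on a log2 scale, and each box is assembled as the decoded center plus/minus half that size. A small self-contained check of the mapping (illustrative values only, not model outputs):

    import torch

    # Inverse of the log-scale size binning: bin 0 -> 2**-10, bin 1023 -> 1.0.
    w_bin = torch.tensor([0.0, 512.0, 1023.0])
    w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
    print(w)  # approximately [0.0010, 0.0313, 1.0000]

    # Box corners are the decoded center offset by half the decoded size.
    x_center, y_center = 0.5, 0.4
    width, height = 0.2, 0.1
    box = {
        "x_min": x_center - width / 2,
        "y_min": y_center - height / 2,
        "x_max": x_center + width / 2,
        "y_max": y_center + height / 2,
    }
    print(box)  # {'x_min': 0.4, 'y_min': 0.35, 'x_max': 0.6, 'y_max': 0.45}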