Update moondream.py

fix: corrupted kv cache

moondream.py: +150 -79
@@ -64,33 +64,33 @@ class EncodedImage:
     pos: int
     caches: List[Tuple[torch.Tensor, torch.Tensor]]
 
+# -------------------- KVCache --------------------
 class KVCache(nn.Module):
     def __init__(self, n_heads, n_kv_heads, max_context, dim, device, dtype):
         super().__init__()
+        head_dim = dim // n_heads
+        cache_shape = (1, n_kv_heads, max_context, head_dim)
         self.register_buffer("k_cache", torch.zeros(*cache_shape, device=device, dtype=dtype))
         self.register_buffer("v_cache", torch.zeros(*cache_shape, device=device, dtype=dtype))
 
     def update(self, pos_ids, k, v):
         """
         Supports:
-          • Prefill:
-          • 1-step:
-          • Legacy: k,v = (B, n_kv_heads, 1, d), pos_ids = scalar
-        Writes into self.k_cache/self.v_cache shaped (B, n_kv_heads, T_max, d).
+          • Prefill: k,v = (B, n_kv, q_len, d), pos_ids = (q_len,)
+          • 1-step : k,v = (B, n_kv, 1, d),     pos_ids = (B,) or scalar
         """
+        kout, vout = self.k_cache, self.v_cache
+
        if not torch.is_tensor(pos_ids):
            pos_ids = torch.tensor(pos_ids, device=k.device, dtype=torch.long)
        else:
-            pos_ids = pos_ids.to(
+            pos_ids = pos_ids.to(k.device, dtype=torch.long)
+
        if k.dim() != 4 or v.dim() != 4:
            raise RuntimeError(f"KV update expects k,v 4D. Got k={tuple(k.shape)} v={tuple(v.shape)}")
        B, Hkv, q_len, D = k.shape
+
-        # Expand caches from B=1 lazily if needed
+        # Expand caches' batch dim if needed
        if kout.size(0) != B:
            if kout.size(0) == 1:
                self.k_cache = kout.expand(B, -1, -1, -1).clone()
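For context, a quick sketch of what the repaired buffer shape implies. The config numbers below are invented purely to illustrate the shape math, not Moondream's actual values:

import torch

# Hypothetical config values, chosen only to illustrate the shape math.
n_heads, n_kv_heads, max_context, dim = 32, 32, 2048, 2048
head_dim = dim // n_heads                        # 64
cache_shape = (1, n_kv_heads, max_context, head_dim)

k_cache = torch.zeros(*cache_shape, dtype=torch.float16)
print(k_cache.shape)                             # torch.Size([1, 32, 2048, 64])
print(k_cache.numel() * k_cache.element_size() / 2**20, "MiB per layer for K")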
@@ -98,30 +98,28 @@ class KVCache(nn.Module):
                kout, vout = self.k_cache, self.v_cache
            else:
                raise RuntimeError(f"KV cache batch mismatch: cache.B={kout.size(0)} vs k.B={B}")
+
+        # Prefill (vector of positions shared across the batch)
        if pos_ids.dim() == 1 and pos_ids.numel() == q_len:
            for i in range(B):
                kout[i, :, pos_ids, :] = k[i]
                vout[i, :, pos_ids, :] = v[i]
            return kout, vout
-
-        if q_len == 1 and pos_ids.numel()
-            pi =
-            kout[:, :, pi, :] = k[:, :, 0, :]
-            vout[:, :, pi, :] = v[:, :, 0, :]
+
+        # 1-step with per-row positions
+        if q_len == 1 and pos_ids.numel() in {1, B}:
+            if pos_ids.numel() == 1:
+                pi = int(pos_ids.item())
+                kout[:, :, pi, :] = k[:, :, 0, :]
+                vout[:, :, pi, :] = v[:, :, 0, :]
+            else:
+                pos_ids = pos_ids.view(B)
+                for i in range(B):
+                    pi = int(pos_ids[i].item())
+                    kout[i, :, pi, :] = k[i, :, 0, :]
+                    vout[i, :, pi, :] = v[i, :, 0, :]
            return kout, vout
+
        raise RuntimeError(f"Unsupported KV update combo: k={tuple(k.shape)}, pos_ids={tuple(pos_ids.shape)}")
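A minimal sketch of the two calling conventions `update` now accepts, using a toy cache with invented shapes (the real cache lives in the registered buffers above):

import torch

B, n_kv, T_max, d = 2, 4, 16, 8
k_cache = torch.zeros(B, n_kv, T_max, d)

# Prefill: one shared position per query token, written for every batch row.
q_len = 5
pos_ids = torch.arange(q_len)                    # (q_len,)
k_new = torch.randn(B, n_kv, q_len, d)
for i in range(B):
    k_cache[i, :, pos_ids, :] = k_new[i]

# 1-step decode: each row may sit at a different position.
row_pos = torch.tensor([5, 7])                   # (B,)
k_step = torch.randn(B, n_kv, 1, d)
for i in range(B):
    k_cache[i, :, int(row_pos[i]), :] = k_step[i, :, 0, :]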
@@ -129,6 +127,7 @@ class KVCache(nn.Module):
 
 
+
 class MoondreamModel(nn.Module):
 
     def __init__(
@@ -211,6 +210,7 @@ class MoondreamModel(nn.Module):
            blk.kv_cache.v_cache = torch.zeros(shape, device=device, dtype=dtype)
 
 
+
 
 
    def _setup_caches(self):
@@ -567,13 +567,13 @@ class MoondreamModel(nn.Module):
        image: Union[Image.Image, EncodedImage],
        settings: Optional[ImageEncodingSettings] = None,
    ) -> EncodedImage:
-        #
-        self._setup_caches()  # re-create caches
-        for blk in self.text.blocks:
+        # Always start from single-row caches; avoids leftovers from batched runs
+        self._setup_caches()  # re-create caches (B=1)
+        for blk in self.text.blocks:  # make absolutely sure batch dim == 1
            if blk.kv_cache.k_cache.size(0) != 1:
                blk.kv_cache.k_cache = blk.kv_cache.k_cache[:1].contiguous()
                blk.kv_cache.v_cache = blk.kv_cache.v_cache[:1].contiguous()
+
        if isinstance(image, EncodedImage):
            return image
        if not isinstance(image, Image.Image):
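The `[:1].contiguous()` trim matters because a batched run can leave the registered buffers replaced by expanded-and-cloned B>1 tensors. A sketch of the invariant it restores, with toy shapes:

import torch

# Leftover from a hypothetical B=3 batched run: buffers were expanded and cloned.
cache = torch.zeros(1, 4, 16, 8).expand(3, -1, -1, -1).clone()
assert cache.size(0) == 3

cache = cache[:1].contiguous()    # back to one batch row, freshly laid out
assert cache.size(0) == 1 and cache.is_contiguous()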
@@ -602,6 +602,7 @@ class MoondreamModel(nn.Module):
        )
 
 
+
    def query(
        self,
        image: Optional[Union[Image.Image, EncodedImage]] = None,
@@ -893,6 +894,7 @@ class MoondreamModel(nn.Module):
        return {"points": objects}
 
 
+    # -------------------- batched helpers --------------------
    def _load_encoded_image_batched(self, encoded_image, batch_size: int):
        for b, (k, v) in zip(self.text.blocks, encoded_image.caches):
            T = k.size(2)
@@ -904,11 +906,7 @@ class MoondreamModel(nn.Module):
            b.kv_cache.k_cache[:, :, :T, :] = k.expand(batch_size, -1, -1, -1)
            b.kv_cache.v_cache[:, :, :T, :] = v.expand(batch_size, -1, -1, -1)
 
-
-
-
-    def _prefill_prompt_batched(self, labels, pos: int, lora=None,
-                 temperature: float = 0.0, top_p: float = 0.0):
+    def _prefill_prompt_batched(self, labels, pos: int, lora=None, temperature: float = 0.0, top_p: float = 0.0):
        tpl = self.config.tokenizer.templates["detect"]
        if tpl is None:
            raise NotImplementedError("Model does not support object detection (no detect template).")
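How `_load_encoded_image_batched` reuses one image encoding for B prompts: `expand` broadcasts the cached K/V across rows without copying, and a single assignment materializes the shared prefix into the batched cache. A toy version with invented shapes:

import torch

batch_size, n_kv, T, d = 3, 4, 10, 8
k_img = torch.randn(1, n_kv, T, d)               # cached K for one encoded image
k_cache = torch.zeros(batch_size, n_kv, 16, d)

# One write fans the shared image prefix out to every batch row.
k_cache[:, :, :T, :] = k_img.expand(batch_size, -1, -1, -1)
assert torch.equal(k_cache[0, :, :T], k_cache[2, :, :T])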
@@ -925,39 +923,43 @@ class MoondreamModel(nn.Module):
        for i, ids in enumerate(rows):
            prompt_ids[i, : ids.numel()] = ids
 
-        prompt_emb = text_encoder(prompt_ids, self.text)
+        prompt_emb = text_encoder(prompt_ids, self.text)  # (B,T,C)
        torch._dynamo.mark_dynamic(prompt_emb, 1)
 
+        # mask: (B,1,T,kv_len)
+        base = self.attn_mask[:, :, pos:pos+T, :]       # (1,1,T,K)
+        mask = base.expand(B, -1, -1, -1).contiguous()  # (B,1,T,K)
 
+        # IMPORTANT: for prefill pass a 1-D vector of length T (matches upstream)
+        pos_ids = torch.arange(pos, pos + T, device=self.device, dtype=torch.long)  # (T,)
+        hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora)  # (B,T,C)
        logits_BTV = lm_head(hidden_BTC, self.text)
 
        idx = (torch.tensor(lens, device=self.device) - 1).clamp_min(0)
        last_hidden = hidden_BTC[torch.arange(B, device=self.device), idx][:, None, :]  # (B,1,C)
        last_logits = logits_BTV[torch.arange(B, device=self.device), idx]              # (B,V)
 
        if temperature == 0.0:
-            next_token = last_logits.argmax(dim=-1, keepdim=True)
+            next_token = last_logits.argmax(dim=-1, keepdim=True)  # (B,1)
        else:
            probs = torch.softmax(last_logits / temperature, dim=-1)
            probs = self._apply_top_p(probs, top_p)
-            next_token = torch.multinomial(probs, num_samples=1)
+            next_token = torch.multinomial(probs, num_samples=1)  # (B,1)
 
+        # shared scalar end position
        return last_hidden, next_token, int(pos + T)
 
 
    def _generate_points_batched(
        self,
        hidden,        # (B,1,C)
-        next_token,    # (B,1)
-        pos,           # int
+        next_token,    # (B,1) (ignored in greedy)
+        pos,           # int
        include_size: bool = True,
        max_objects: int = 50,
        lora=None,
-        use_soft_argmax: bool =
+        use_soft_argmax: bool = False,  # default OFF to match upstream numerics
    ):
        B = hidden.size(0)
        device = self.device
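The `idx = lens - 1` gather is the standard trick for right-padded batches: select each row's last real token rather than the padded tail. A sketch with toy shapes:

import torch

B, T, C = 3, 7, 16
hidden_BTC = torch.randn(B, T, C)
lens = [7, 4, 6]                                 # real (unpadded) prompt lengths

idx = (torch.tensor(lens) - 1).clamp_min(0)      # (B,) index of each row's last token
rows = torch.arange(B)
last_hidden = hidden_BTC[rows, idx][:, None, :]  # (B,1,C)
assert last_hidden.shape == (B, 1, C)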
@@ -965,40 +967,110 @@ class MoondreamModel(nn.Module):
        eos_id = self.config.tokenizer.eos_id
        max_ctx = self.config.text.max_context
 
-        # mask
+        # 4-D mask: (B,1,1,kv_len)
        mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
        if pos > 0:
            mask[:, :, :, :pos] = True
+
+        # rotary & KV path are happiest with a 1-D scalar position vector (like upstream)
+        pos_id_vec = torch.tensor([pos], device=device, dtype=torch.long)  # (1,)
 
        alive = torch.ones(B, dtype=torch.bool, device=device)
        counts = torch.zeros(B, dtype=torch.int32, device=device)
 
-        def
+        def _center01(logits_2d):
            # logits_2d: (B, bins)
+            if logits_2d.dim() == 3:  # (B,1,bins) -> (B,bins)
+                logits_2d = logits_2d.squeeze(1)
+            bins = logits_2d.size(-1)
            if use_soft_argmax:
-                return idx / float(logits_2d.size(-1) - 1)
+                p = torch.softmax(logits_2d, dim=-1)
+                idx = (p * torch.arange(bins, device=logits_2d.device, dtype=torch.float32)).sum(dim=-1)
+                return idx / float(bins)  # match upstream scale
+            else:
+                return logits_2d.argmax(dim=-1).to(torch.float32) / float(bins)
 
        with torch.inference_mode():
            while alive.any() and (counts < max_objects).any():
-                # x
-                x_logits = decode_coordinate(hidden, self.region)
-                    self.region).unsqueeze(1)  # (B,1,C)
+                # --- x ---
+                x_logits = decode_coordinate(hidden, self.region)  # (B,1,1024) or (B,1024)
+                x_center = _center01(x_logits)                     # (B,) in [0,1]
+                x_emb = encode_coordinate(x_center.to(x_logits.dtype).unsqueeze(-1), self.region).unsqueeze(1)  # (B,1,C)
+
                mask[alive, :, :, pos] = True
-                _, hidden = self._decode_one_tok(x_emb, mask,
+                _, hidden = self._decode_one_tok(x_emb, mask, pos_id_vec, lora)
+                pos += 1
+                pos_id_vec[0] = pos
 
-                # y
+                # --- y ---
                y_logits = decode_coordinate(hidden, self.region)
+                y_center = _center01(y_logits)
+                y_emb = encode_coordinate(y_center.to(y_logits.dtype).unsqueeze(-1), self.region).unsqueeze(1)
+
+                mask[alive, :, :, pos] = True
+                _, hidden = self._decode_one_tok(y_emb, mask, pos_id_vec, lora)
+                pos += 1
+                pos_id_vec[0] = pos
+
+                if include_size:
+                    # --- size ---
+                    size_ret = decode_size(hidden, self.region)
+                    # Works for tuple or stacked tensor
+                    if isinstance(size_ret, (tuple, list)):
+                        w_logits, h_logits = size_ret
+                    else:
+                        # expected shapes: (B,2,1024) or (B,1,2,1024)
+                        if size_ret.dim() == 3:    # (B,2,1024)
+                            w_logits, h_logits = size_ret[:, 0], size_ret[:, 1]
+                        else:                      # (B,1,2,1024)
+                            w_logits, h_logits = size_ret[:, 0, 0], size_ret[:, 0, 1]
+                    if w_logits.dim() == 3: w_logits = w_logits.squeeze(1)
+                    if h_logits.dim() == 3: h_logits = h_logits.squeeze(1)
+
+                    # bins -> size via the same inverse log2 scale as upstream
+                    w_bin = w_logits.argmax(dim=-1).to(torch.float32)
+                    h_bin = h_logits.argmax(dim=-1).to(torch.float32)
+                    w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
+                    h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)
+
+                    size_emb = encode_size(torch.stack([w, h], dim=1).to(w_logits.dtype), self.region).unsqueeze(1)
+
+                    # record boxes (clamped)
+                    for i in range(B):
+                        if not alive[i]:
+                            continue
+                        xl = (x_center[i] - w[i] / 2).item()
+                        xr = (x_center[i] + w[i] / 2).item()
+                        yt = (y_center[i] - h[i] / 2).item()
+                        yb = (y_center[i] + h[i] / 2).item()
+                        out[i].append({
+                            "x_min": max(0.0, min(1.0, xl)),
+                            "y_min": max(0.0, min(1.0, yt)),
+                            "x_max": max(0.0, min(1.0, xr)),
+                            "y_max": max(0.0, min(1.0, yb)),
+                        })
+
+                    mask[alive, :, :, pos] = True
+                    logits, hidden = self._decode_one_tok(size_emb, mask, pos_id_vec, lora)
+                    pos += 1
+                    pos_id_vec[0] = pos
+                    next_tok = logits.argmax(dim=-1).squeeze(-1)  # (B,)
+                else:
+                    for i in range(B):
+                        if alive[i]:
+                            out[i].append({"x": x_center[i].item(), "y": y_center[i].item()})
+                    mask[alive, :, :, pos] = True
+                    logits, hidden = self._decode_one_tok(y_emb, mask, pos_id_vec, lora)
+                    pos += 1
+                    pos_id_vec[0] = pos
+                    next_tok = logits.argmax(dim=-1).squeeze(-1)
+
+                finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
+                counts = counts + ((~finished_now) & alive).to(counts.dtype)
+                alive &= ~finished_now
+
+        return out
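Two decode details from the hunk above, sketched in isolation. Soft argmax is a probability-weighted bin index (smoother but numerically different from hard argmax, hence the default-off flag), and the size head maps a bin index to a fraction of the image side on an inverse log2 scale; the 1024-bin count comes from the shape comments in the diff:

import torch

logits = torch.randn(2, 1024)                    # (B, bins)

# Hard argmax: a single winning bin.
hard = logits.argmax(dim=-1).float() / 1024.0

# Soft argmax: probability-weighted bin index.
p = torch.softmax(logits, dim=-1)
soft = (p * torch.arange(1024.0)).sum(dim=-1) / 1024.0

# Size head: bin -> size on an inverse log2 scale, so bin 0 maps to 2**-10
# and bin 1023 maps to 2**0 = 1.0 (the full image side).
w_bin = torch.tensor([0.0, 511.5, 1023.0])
w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
print(w)                                         # ~[0.001, 0.031, 1.0]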
@@ -1008,38 +1080,37 @@ class MoondreamModel(nn.Module):
            raise NotImplementedError("Model does not support object detection.")
        settings = settings or {}
 
-        image = self.encode_image(image, settings)
+        enc = self.encode_image(image, settings)
        B = len(objects)
-        self._load_encoded_image_batched(
+        self._load_encoded_image_batched(enc, B)
 
-        lora = None
-        if "variant" in settings:
-            lora = variant_state_dict(settings["variant"], device=self.device)
+        lora = variant_state_dict(settings["variant"], device=self.device) if "variant" in settings else None
 
        last_hidden, next_token, pos_end = self._prefill_prompt_batched(
-            objects,
+            objects, enc.pos, lora=lora, temperature=0.0, top_p=0.0
        )
 
-        max_objects = settings.get("max_objects", 50)
        det_lists = self._generate_points_batched(
            last_hidden, next_token, pos_end,
            include_size=True,
+            max_objects=settings.get("max_objects", 50),
+            lora=lora,
        )
 
-        # Map back to labels and tag
        res = {}
        for lab, lst in zip(objects, det_lists):
            for d in lst:
                d["label"] = lab
            res[lab] = lst
 
+        # make subsequent single-image calls stable
        self._reset_kv_caches(1)
        return {"objects": res}
 
 
+
+
    def _detect_gaze(
        self,
        image: EncodedImage,
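End to end, the batched path means one detection call can prefill several labels against the same encoded image. A usage sketch, assuming the surrounding method is a public `detect(image, objects, settings)` wrapper (the hunk shows only its body) and using a placeholder image path:

from PIL import Image

# model = MoondreamModel(...)  # assumed constructed and loaded elsewhere
img = Image.open("example.jpg")  # placeholder path

result = model.detect(img, ["cat", "dog"], settings={"max_objects": 10})
for label, boxes in result["objects"].items():
    for box in boxes:
        print(label, box["x_min"], box["y_min"], box["x_max"], box["y_max"])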