Update moondream.py
feat: update KV caching and support for batching, from encoding to prefill to decode.
- moondream.py +74 -91
moondream.py
CHANGED
@@ -64,26 +64,18 @@ class EncodedImage:
     pos: int
     caches: List[Tuple[torch.Tensor, torch.Tensor]]
 
-# -------------------- KVCache --------------------
 class KVCache(nn.Module):
     def __init__(self, n_heads, n_kv_heads, max_context, dim, device, dtype):
         super().__init__()
         head_dim = dim // n_heads
-
-        self.register_buffer("k_cache", torch.zeros(*…
-        self.register_buffer("v_cache", torch.zeros(*…
+        shape = (1, n_kv_heads, max_context, head_dim)
+        self.register_buffer("k_cache", torch.zeros(*shape, device=device, dtype=dtype))
+        self.register_buffer("v_cache", torch.zeros(*shape, device=device, dtype=dtype))
 
     def update(self, pos_ids, k, v):
-        """
-        …
-        • Prefill: k,v = (B, n_kv_heads, q_len, d), pos_ids = (q_len,)
-        • Step: k,v = (B, n_kv_heads, 1, d), pos_ids = (B,) or (B,1)
-        • Legacy: k,v = (1, n_kv_heads, q_len, d), pos_ids = scalar
-        Writes into self.k_cache/self.v_cache shaped (B, n_kv_heads, T_max, d).
-        """
-        kout, vout = self.kv_cache if hasattr(self, "kv_cache") else (self.k_cache, self.v_cache)
+        # k,v: (B, n_kv_heads, q_len, head_dim)
+        kout, vout = self.k_cache, self.v_cache
 
-        # normalize pos_ids
         if not torch.is_tensor(pos_ids):
            pos_ids = torch.tensor(pos_ids, device=k.device, dtype=torch.long)
        else:
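The constructor now registers both caches as buffers with a fixed batch dimension of 1; batching is handled later by expanding on demand. A minimal standalone sketch of that layout (class name and sizes invented for the demo): buffers travel with `.to()` and `state_dict()` but add no trainable parameters.

```python
import torch
import torch.nn as nn

class TinyKV(nn.Module):
    """Toy stand-in for the KVCache buffer layout above."""
    def __init__(self, n_kv_heads=4, max_context=16, head_dim=8):
        super().__init__()
        shape = (1, n_kv_heads, max_context, head_dim)  # batch dim starts at 1
        self.register_buffer("k_cache", torch.zeros(*shape))
        self.register_buffer("v_cache", torch.zeros(*shape))

kv = TinyKV()
assert "k_cache" in dict(kv.named_buffers())   # moves with .to()/state_dict()
assert len(list(kv.parameters())) == 0         # but is not a trainable parameter
```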
@@ -91,26 +83,25 @@ class KVCache(nn.Module):
 
         if k.dim() != 4 or v.dim() != 4:
             raise RuntimeError(f"KV update expects k,v 4D. Got k={tuple(k.shape)} v={tuple(v.shape)}")
-
         B, Hkv, q_len, D = k.shape
 
-        # …
-        if …
-            if …
-                self.k_cache = …
-                self.v_cache = …
+        # expand caches from B=1 -> B if needed
+        if kout.size(0) != B:
+            if kout.size(0) == 1:
+                self.k_cache = kout.expand(B, -1, -1, -1).clone()
+                self.v_cache = vout.expand(B, -1, -1, -1).clone()
                 kout, vout = self.k_cache, self.v_cache
             else:
-                raise RuntimeError(f"KV cache batch mismatch: cache.B={…
+                raise RuntimeError(f"KV cache batch mismatch: cache.B={kout.size(0)} vs k.B={B}")
 
-        # …
+        # prefill: pos_ids = (q_len,)
         if pos_ids.dim() == 1 and pos_ids.numel() == q_len:
             for i in range(B):
                 kout[i, :, pos_ids, :] = k[i]
                 vout[i, :, pos_ids, :] = v[i]
             return kout, vout
 
-        # …
+        # one step: q_len==1 & pos_ids per row
         if q_len == 1 and pos_ids.numel() == B:
             pos_ids = pos_ids.view(B)
             for i in range(B):
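The `expand(...).clone()` pair here is load-bearing: `expand` alone returns a stride-0 view whose batch rows alias one storage, so the per-row writes later in `update` would bleed across rows (and in-place writes to such views are rejected by PyTorch anyway). A small self-contained check, with shapes invented for the demo:

```python
import torch

cache = torch.zeros(1, 2, 8, 4)        # (B=1, n_kv_heads, T_max, head_dim)
view = cache.expand(3, -1, -1, -1)     # (3, 2, 8, 4) view; rows share storage
owned = view.clone()                   # materialize one copy per batch row

owned[0, :, 0, :] = 1.0                # write to row 0 only
assert owned[1].sum() == 0             # row 1 untouched: rows are independent
```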
@@ -119,7 +110,7 @@ class KVCache(nn.Module):
                 vout[i, :, pi, :] = v[i, :, 0, :]
             return kout, vout
 
-        # …
+        # scalar for everyone & q_len==1
         if pos_ids.dim() == 0 and q_len == 1:
             pi = int(pos_ids.item())
             kout[:, :, pi, :] = k[:, :, 0, :]
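For the prefill path, a 1-D `pos_ids` of length `q_len` scatters the new keys and values along the time axis of the preallocated cache; the per-row loop is plain advanced indexing. A standalone sketch of that write (all sizes invented for the demo):

```python
import torch

B, Hkv, q_len, D, T_max = 2, 4, 3, 16, 32
cache_k = torch.zeros(B, Hkv, T_max, D)
k = torch.randn(B, Hkv, q_len, D)
pos_ids = torch.arange(5, 5 + q_len)       # write time slots 5..7

for i in range(B):
    cache_k[i, :, pos_ids, :] = k[i]       # same indexing as the update() loop

assert torch.equal(cache_k[:, :, 5:8, :], k)
```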
@@ -129,11 +120,6 @@ class KVCache(nn.Module):
         raise RuntimeError(f"Unsupported KV update combo: k={tuple(k.shape)}, pos_ids={tuple(pos_ids.shape)}")
 
 
-
-
-
-
-
 class MoondreamModel(nn.Module):
 
     def __init__(
@@ -570,29 +556,29 @@ class MoondreamModel(nn.Module):
         return generator(next_token, pos)
 
     def encode_image(self, image, settings=None) -> EncodedImage:
-        # …
-        self._setup_caches()
+        # start clean: recreate caches as B=1 every time
+        self._setup_caches()
 
         if isinstance(image, EncodedImage):
             return image
         if not isinstance(image, Image.Image):
             raise ValueError("image must be a PIL Image or EncodedImage")
 
-        # hard-trim to B=1
+        # hard-trim to B=1 in case something changed it
         for blk in self.text.blocks:
             if blk.kv_cache.k_cache.size(0) != 1:
                 blk.kv_cache.k_cache = blk.kv_cache.k_cache[:1].contiguous()
                 blk.kv_cache.v_cache = blk.kv_cache.v_cache[:1].contiguous()
 
-        lora = variant_state_dict(settings["variant"], device=self.device)
-            if settings and "variant" in settings else None
+        lora = variant_state_dict(settings["variant"], device=self.device) if settings and "variant" in settings else None
 
         with torch.inference_mode():
-            img_emb = self._run_vision_encoder(image)
-            …
-            …
+            img_emb = self._run_vision_encoder(image)  # (T_img, C)
+            bos = torch.tensor([[self.config.tokenizer.bos_id]], device=self.device)
+            bos_emb = text_encoder(bos, self.text)  # (1,1,C)
+            inputs_embeds = torch.cat([bos_emb, img_emb[None]], dim=1)  # (1,T0,C)
 
-            mask = self.attn_mask[:, :, :inputs_embeds.size(1), :]
+            mask = self.attn_mask[:, :, :inputs_embeds.size(1), :]  # (1,1,T0,K)
             pos_ids = torch.arange(inputs_embeds.size(1), device=self.device, dtype=torch.long)  # (T0,)
             self._prefill(inputs_embeds, mask, pos_ids, lora)
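The prefill input assembled in `encode_image` is one BOS embedding prepended to the image embeddings along the time axis, giving a (1, T0, C) sequence for a batch of one. A shape-only sketch (dimensions invented for the demo):

```python
import torch

C, T_img = 8, 5
bos_emb = torch.randn(1, 1, C)                        # (1, 1, C) BOS token
img_emb = torch.randn(T_img, C)                       # (T_img, C) vision output
inputs_embeds = torch.cat([bos_emb, img_emb[None]], dim=1)
assert inputs_embeds.shape == (1, 1 + T_img, C)       # (1, T0, C)
```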
@@ -608,7 +594,6 @@ class MoondreamModel(nn.Module):
 
 
 
-
     def query(
         self,
         image: Optional[Union[Image.Image, EncodedImage]] = None,
@@ -941,8 +926,7 @@ class MoondreamModel(nn.Module):
             b.kv_cache.v_cache[:, :, :T, :] = v.expand(batch_size, -1, -1, -1)
 
 
-    def _prefill_prompt_batched(self, labels, pos: int, lora=None,
-                                temperature: float = 0.0, top_p: float = 0.0):
+    def _prefill_prompt_batched(self, labels, pos: int, lora=None, temperature: float = 0.0, top_p: float = 0.0):
         tpl = self.config.tokenizer.templates["detect"]
         if tpl is None:
             raise NotImplementedError("Model does not support object detection.")
@@ -959,62 +943,52 @@ class MoondreamModel(nn.Module):
         for i, ids in enumerate(rows):
             prompt_ids[i, : ids.numel()] = ids
 
-        prompt_emb = text_encoder(prompt_ids, self.text)
-        torch._dynamo.mark_dynamic(prompt_emb, 1)
+        prompt_emb = text_encoder(prompt_ids, self.text)  # (B,T,C)
+        torch._dynamo.mark_dynamic(prompt_emb, 1)
 
-        base = self.attn_mask[:, :, pos:pos+T, :]
-        mask = base.expand(B, -1, -1, -1).contiguous()
+        base = self.attn_mask[:, :, pos:pos+T, :]  # (1,1,T,K)
+        mask = base.expand(B, -1, -1, -1).contiguous()  # (B,1,T,K)
 
         pos_ids = torch.arange(pos, pos + T, device=self.device, dtype=torch.long)  # (T,)
-        hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora)
-        logits_BTV = lm_head(hidden_BTC, self.text)
+        hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora)  # (B,T,C)
+        logits_BTV = lm_head(hidden_BTC, self.text)  # (B,T,V)
 
-        idx = (torch.tensor(lens, device=self.device) - 1).clamp_min(0)
+        idx = (torch.tensor(lens, device=self.device) - 1).clamp_min(0)  # (B,)
         last_hidden = hidden_BTC[torch.arange(B, device=self.device), idx][:, None, :]  # (B,1,C)
         last_logits = logits_BTV[torch.arange(B, device=self.device), idx]  # (B,V)
 
         if temperature == 0.0:
-            next_token = last_logits.argmax(dim=-1, keepdim=True)
+            next_token = last_logits.argmax(dim=-1, keepdim=True)  # (B,1)
         else:
             probs = torch.softmax(last_logits / temperature, dim=-1)
             probs = self._apply_top_p(probs, top_p)
-            next_token = torch.multinomial(probs, num_samples=1)
+            next_token = torch.multinomial(probs, num_samples=1)  # (B,1)
 
-        # shared next-free position
-        pos_end = int(pos + T)
+        pos_end = int(pos + T)  # shared next-free position
         return last_hidden, next_token, pos_end
 
 
-
-
     def _generate_points_batched(
-        self,
-        hidden,
-        next_token,  # (B,1) (unused in greedy)
-        pos,  # int
-        include_size: bool = True,
-        max_objects: int = 50,
-        lora=None,
-        use_soft_argmax: bool = True,  # reduces jitter
-    ):
+            self, hidden, next_token, pos, include_size: bool = True,
+            max_objects: int = 50, lora=None, use_soft_argmax: bool = False):
         B = hidden.size(0)
         device = self.device
         out = [[] for _ in range(B)]
         eos_id = self.config.tokenizer.eos_id
         max_ctx = self.config.text.max_context
 
-        # 4-D mask: (B, …
+        # 4-D mask: (B,1,1,kv_len); advance with a 1-D position vector
         mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
         if int(pos) > 0:
             mask[:, :, :, :int(pos)] = True
-
+        pos_id_vec = torch.full((1,), int(pos), device=device, dtype=torch.long)
 
-
-
+        def _center01(logits):
+            # logits: (B, bins) → (B,) in [0,1]
             if use_soft_argmax:
-
-                bins …
-                return (…
+                p = torch.softmax(logits, dim=-1)
+                bins = torch.arange(p.size(-1), device=logits.device, dtype=torch.float32)
+                return (p * bins).sum(dim=-1) / float(p.size(-1) - 1)
             idx = logits.argmax(dim=-1).to(torch.float32)
             return idx / float(logits.size(-1) - 1)
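Because the batched prompts are right-padded to a common length `T`, the logits that matter for row `i` live at index `lens[i] - 1`, not `T - 1`; that is what the `idx` gather above selects. A standalone sketch with random logits and invented sizes:

```python
import torch

B, T, V = 3, 6, 10
lens = [4, 6, 2]                                   # true prompt length per row
logits_BTV = torch.randn(B, T, V)

idx = (torch.tensor(lens) - 1).clamp_min(0)        # (B,) last valid index per row
last_logits = logits_BTV[torch.arange(B), idx]     # (B, V)
assert last_logits.shape == (B, V)
```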
@@ -1023,29 +997,38 @@ class MoondreamModel(nn.Module):
 
         with torch.inference_mode():
             while alive.any() and (counts < max_objects).any():
-                # …
+                # x
                 x_logits = decode_coordinate(hidden, self.region)  # (B,1,1024) or (B,1024)
                 if x_logits.dim() == 3: x_logits = x_logits.squeeze(1)
-                x_center = …
+                x_center = _center01(x_logits)
                 x_emb = encode_coordinate(x_center.to(dtype=x_logits.dtype).unsqueeze(-1), self.region).unsqueeze(1)
 
-                mask[…
-                logits, hidden = self._decode_one_tok(x_emb, mask, …
-                …
+                mask[:, :, :, pos_id_vec] = True
+                logits, hidden = self._decode_one_tok(x_emb, mask, pos_id_vec, lora)
+                pos_id_vec += 1
 
-                # …
+                # y
                 y_logits = decode_coordinate(hidden, self.region)
                 if y_logits.dim() == 3: y_logits = y_logits.squeeze(1)
-                y_center = …
+                y_center = _center01(y_logits)
                 y_emb = encode_coordinate(y_center.to(dtype=y_logits.dtype).unsqueeze(-1), self.region).unsqueeze(1)
 
-                mask[…
-                logits, hidden = self._decode_one_tok(y_emb, mask, …
-                …
+                mask[:, :, :, pos_id_vec] = True
+                logits, hidden = self._decode_one_tok(y_emb, mask, pos_id_vec, lora)
+                pos_id_vec += 1
 
                 if include_size:
                     size_ret = decode_size(hidden, self.region)
-                    …
+                    # Robust parse: accept (w,h) tuple OR Tensor (B,2,C)/(B,1,2,C)
+                    if isinstance(size_ret, (tuple, list)):
+                        w_logits, h_logits = size_ret
+                    else:
+                        t = size_ret
+                        if t.dim() == 4:  # (B,1,2,C)
+                            t = t.squeeze(1)  # → (B,2,C)
+                        if t.dim() != 3 or t.size(1) != 2:
+                            raise RuntimeError(f"Unexpected decode_size shape {tuple(t.shape)}")
+                        w_logits, h_logits = t[:, 0, :], t[:, 1, :]
 
                     if use_soft_argmax:
                         bins = torch.arange(w_logits.size(-1), device=device, dtype=torch.float32)
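`_center01` maps coordinate logits to a position in [0, 1]; with `use_soft_argmax` it takes the probability-weighted mean of bin indices, which moves smoothly between bins instead of jumping a whole bin at a time. A self-contained comparison on toy logits:

```python
import torch

logits = torch.tensor([[0.0, 2.0, 4.0, 2.0, 0.0]])   # (B=1, bins=5), peak at bin 2

p = torch.softmax(logits, dim=-1)
bins = torch.arange(p.size(-1), dtype=torch.float32)
soft = (p * bins).sum(dim=-1) / float(p.size(-1) - 1)   # weighted mean of bins

hard = logits.argmax(dim=-1).float() / float(logits.size(-1) - 1)
print(soft.item(), hard.item())   # both 0.5 here; they diverge for skewed logits
```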
@@ -1055,13 +1038,12 @@ class MoondreamModel(nn.Module):
                         w_bin = w_logits.argmax(dim=-1).to(torch.float32)
                         h_bin = h_logits.argmax(dim=-1).to(torch.float32)
 
-                    # inverse log …
+                    # inverse log-scale mapping used by md2
                     w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
                     h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)
 
                     size_emb = encode_size(torch.stack([w, h], dim=1).to(dtype=w_logits.dtype), self.region).unsqueeze(1)
 
-                    # write boxes only for alive rows
                     for i in range(B):
                         if not alive[i]: continue
                         xl = (x_center[i] - w[i] / 2).item()
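The size mapping sends bin b in [0, 1023] to 2^((b/1023)*10 - 10), a log2-spaced range from roughly 1/1024 of the image dimension up to the full dimension. A quick endpoint check:

```python
import torch

b = torch.tensor([0.0, 1023.0])
w = torch.pow(2.0, (b / 1023.0) * 10.0 - 10.0)
print(w)   # tensor([0.0010, 1.0000]): min ≈ 2**-10, max = full width/height
```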
@@ -1075,17 +1057,17 @@ class MoondreamModel(nn.Module):
                             "y_max": max(0.0, min(1.0, yb)),
                         })
 
-                    mask[…
-                    logits, hidden = self._decode_one_tok(size_emb, mask, …
-                    …
-                    next_tok = logits.argmax(dim=-1).squeeze(-1)
+                    mask[:, :, :, pos_id_vec] = True
+                    logits, hidden = self._decode_one_tok(size_emb, mask, pos_id_vec, lora)
+                    pos_id_vec += 1
+                    next_tok = logits.argmax(dim=-1).squeeze(-1)
                 else:
                     for i in range(B):
                         if alive[i]:
                             out[i].append({"x": x_center[i].item(), "y": y_center[i].item()})
-                    mask[…
-                    logits, hidden = self._decode_one_tok(y_emb, mask, …
-                    …
+                    mask[:, :, :, pos_id_vec] = True
+                    logits, hidden = self._decode_one_tok(y_emb, mask, pos_id_vec, lora)
+                    pos_id_vec += 1
                     next_tok = logits.argmax(dim=-1).squeeze(-1)
 
                 finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
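Each single-token decode in this loop first opens the boolean mask column at the current position, runs the decode, then advances `pos_id_vec`. A minimal sketch of that bookkeeping with the decode call stubbed out (sizes invented for the demo):

```python
import torch

B, max_ctx = 2, 8
mask = torch.zeros(B, 1, 1, max_ctx, dtype=torch.bool)
pos_id_vec = torch.full((1,), 3, dtype=torch.long)   # prompt filled slots 0..2
mask[:, :, :, :3] = True

for _ in range(2):                                   # two decode steps
    mask[:, :, :, pos_id_vec] = True                 # token may attend to itself
    # logits, hidden = self._decode_one_tok(emb, mask, pos_id_vec, lora)  # stubbed
    pos_id_vec += 1

assert mask[0, 0, 0, :5].all() and not mask[0, 0, 0, 5:].any()
```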
@@ -1094,6 +1076,7 @@ class MoondreamModel(nn.Module):
 
         return out
 
+
     def detect_multi(self, image, objects, settings=None):
         if self.config.tokenizer.templates["detect"] is None:
             raise NotImplementedError("Model does not support object detection.")
@@ -1122,8 +1105,7 @@ class MoondreamModel(nn.Module):
             d["label"] = lab
             res[lab] = lst
 
-        # restore B=1
-        self._reset_kv_caches(1)
+        self._reset_kv_caches(1)  # restore B=1
         return {"objects": res}
 
 
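With detection done, `_reset_kv_caches(1)` shrinks the caches back to batch size 1 so later single-image calls start clean. A hypothetical end-to-end use of the new multi-label path; how the model and `config` are constructed is outside this diff and assumed:

```python
from PIL import Image

# Assumed setup: MoondreamModel construction is not shown in this diff.
model = MoondreamModel(config)
img = Image.open("street.jpg")

result = model.detect_multi(img, ["car", "person", "bicycle"])
for label, boxes in result["objects"].items():   # res maps label -> list of boxes
    print(label, len(boxes))
```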
@@ -1131,6 +1113,7 @@ class MoondreamModel(nn.Module):
 
 
 
+
     def _detect_gaze(
         self,
         image: EncodedImage,