HV-Khurdula committed · verified
Commit a1d497d · 1 Parent(s): 75468c3

Update moondream.py


fix:
1. Rerun crash in encode_image(...): after a batched call, the KV caches are still shaped (B, ...). On the next encode, Q has B=1 while K/V broadcast to B, so attention returns B rows and the reshape to (1, q_len, d_model) fails. Forcing a fresh B=1 cache avoids this. (Moondream's attention expects the Q/K/V batch dimensions to match when reshaping back to (bsz, q_len, d_model).)
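
For reference, a minimal shape-only sketch of this failure mode (generic attention math, not Moondream's actual attention code; B, H, T, D are made-up sizes):

```python
import torch

B, H, T, D = 4, 8, 16, 32
k = torch.randn(B, H, T, D)   # leftover K cache from a batched detect call
v = torch.randn(B, H, T, D)   # leftover V cache
q = torch.randn(1, H, 1, D)   # next encode_image decodes with batch size 1

scores = (q @ k.transpose(-1, -2)).softmax(dim=-1)  # broadcasts to (B, H, 1, T)
out = scores @ v                                    # (B, H, 1, D): B rows, not 1
try:
    out.transpose(1, 2).reshape(1, 1, H * D)        # (bsz, q_len, d_model)
except RuntimeError as e:
    print(e)  # shape '[1, 1, 256]' is invalid for input of size 1024
```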

2. decode_size IndexError on .squeeze(1): upstream decode_size returns mlp(...).view(2, -1), which flattens the batch/time dims, so the result is not always a (B, 1, 1024) pair. We reshape it back to (2, B, -1) when needed, so both variants work.
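
A small sketch of the normalization this patch applies in _generate_points_batched (the split_size_logits helper and the bin count are illustrative, not part of the repo):

```python
import torch

def split_size_logits(size_logits, B):
    if isinstance(size_logits, (tuple, list)):        # (w, h) pair variant
        w, h = size_logits
        if w.dim() == 3:                              # (B, 1, bins)
            w, h = w.squeeze(1), h.squeeze(1)
    else:                                             # flattened mlp(...).view(2, -1)
        w, h = size_logits.reshape(2, B, -1).unbind(0)
    return w, h                                       # each (B, bins)

flat = torch.randn(2, 3 * 1024)                       # what upstream returns for B=3
w, h = split_size_logits(flat, B=3)
print(w.shape, h.shape)                               # torch.Size([3, 1024]) twice
```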

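Taken together, the calling pattern that used to crash and is now safe looks roughly like this (model, img_a, and img_b are placeholders for a loaded MoondreamModel and two PIL images):

```python
enc = model.encode_image(img_a)                              # B=1 caches
model.detect_multi(enc, ["cat", "dog", "car"], settings={})  # caches cloned to B=3
model.encode_image(img_b)                                    # crashed before this fix;
                                                             # now _setup_caches() and
                                                             # _reset_kv_caches(1) keep B=1
```
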
Files changed (1)
  1. moondream.py +83 -79
moondream.py CHANGED
@@ -155,17 +155,33 @@ class MoondreamModel(nn.Module):
         if setup_caches:
             self._setup_caches()
 
-    def _setup_caches(self):
+
+    def _reset_kv_caches(self, batch_size: int = 1):
+        """
+        Recreate KV caches with the requested batch size so subsequent calls
+        (e.g., encode_image) start from a consistent shape.
+        """
         c = self.config.text
-        for b in self.text.blocks:
-            b.kv_cache = KVCache(
-                c.n_heads,
-                c.n_kv_heads,
-                c.max_context,
-                c.dim,
-                device=self.device,
-                dtype=self.vision.pos_emb.dtype,
-            )
+        head_dim = c.dim // c.n_heads
+        for blk in self.text.blocks:
+            device = blk.kv_cache.k_cache.device
+            dtype = blk.kv_cache.k_cache.dtype
+            shape = (batch_size, c.n_kv_heads, c.max_context, head_dim)
+            blk.kv_cache.k_cache = torch.zeros(shape, device=device, dtype=dtype)
+            blk.kv_cache.v_cache = torch.zeros(shape, device=device, dtype=dtype)
+
+
+    def _setup_caches(self):
+        c = self.config.text
+        for b in self.text.blocks:
+            b.kv_cache = KVCache(
+                c.n_heads,
+                c.n_kv_heads,
+                c.max_context,
+                c.dim,
+                device=self.device,
+                dtype=self.vision.pos_emb.dtype,
+            )
 
     @property
     def device(self):
@@ -238,30 +254,30 @@ class MoondreamModel(nn.Module):
         image: Union[Image.Image, EncodedImage],
         settings: Optional[ImageEncodingSettings] = None,
     ) -> EncodedImage:
+        # Always start from single-row caches; avoids leftovers from batched runs.
+        self._setup_caches()
+
         if isinstance(image, EncodedImage):
             return image
         elif not isinstance(image, Image.Image):
             raise ValueError("image must be a PIL Image or EncodedImage")
-
+
         lora = (
            variant_state_dict(settings["variant"], device=self.device)
            if settings is not None and "variant" in settings
            else None
        )
-
-        # Run through text model in addition to the vision encoder, to minimize
-        # re-computation if multiple queries are performed on this image.
+
         with torch.inference_mode():
             img_emb = self._run_vision_encoder(image)
             bos_emb = text_encoder(
-                torch.tensor([[self.config.tokenizer.bos_id]], device=self.device),
-                self.text,
+                torch.tensor([[self.config.tokenizer.bos_id]], device=self.device), self.text
             )
             inputs_embeds = torch.cat([bos_emb, img_emb[None]], dim=1)
             mask = self.attn_mask[:, :, 0 : inputs_embeds.size(1), :]
             pos_ids = torch.arange(inputs_embeds.size(1), dtype=torch.long)
             self._prefill(inputs_embeds, mask, pos_ids, lora)
-
+
             return EncodedImage(
                 pos=inputs_embeds.size(1),
                 caches=[
@@ -273,6 +289,7 @@ class MoondreamModel(nn.Module):
                 ],
             )
 
+
     def _apply_top_p(self, probs: torch.Tensor, top_p: float):
         probs_sort, probs_idx = torch.sort(probs, dim=-1, descending=True)
         probs_sum = torch.cumsum(probs_sort, dim=-1)
@@ -835,7 +852,6 @@ class MoondreamModel(nn.Module):
         return {"points": objects}
 
 
-    # === BEGIN: Batched multi-label detection additions ===
     def _load_encoded_image_batched(self, encoded_image, batch_size: int):
         """
         Clone single-image KV caches into a batch-B cache so we can decode B labels in parallel.
@@ -860,10 +876,6 @@ class MoondreamModel(nn.Module):
         temperature: float = 0.0,
         top_p: float = 0.0,
     ):
-        """
-        Build detect prompts for many labels, pad to the same length, prefill once as a batch.
-        Returns (last_hidden per row, next_token per row, shared_pos_end scalar).
-        """
         tpl = self.config.tokenizer.templates["detect"]
         if tpl is None:
             raise NotImplementedError("Model does not support object detection (no detect template).")
@@ -877,24 +889,21 @@ class MoondreamModel(nn.Module):
         T = max(lens)
         eos = self.config.tokenizer.eos_id
 
-        # Pad to T with eos, so we can prefill with a single shared position range
         prompt_ids = torch.full((B, T), eos, device=self.device, dtype=torch.long)
         for i, ids in enumerate(rows):
             prompt_ids[i, : ids.numel()] = ids
 
-        # Embed & prefill once
-        prompt_emb = text_encoder(prompt_ids, self.text)  # (B, T, C)
-        torch._dynamo.mark_dynamic(prompt_emb, 1)  # allow variable T
+        prompt_emb = text_encoder(prompt_ids, self.text)  # (B, T, C)
+        torch._dynamo.mark_dynamic(prompt_emb, 1)
 
-        # 4-D mask form makes head broadcasting unambiguous later
+        # 4-D mask is broadcastable to (B, n_heads, T, K)
         attn = self.attn_mask
-        mask = attn[:, :, pos : pos + T, :].expand(B, -1, -1, -1).contiguous()  # (B,1,T,K)
+        mask = attn[:, :, pos : pos + T, :].expand(B, -1, -1, -1).contiguous()
         pos_ids = torch.arange(pos, pos + T, device=self.device, dtype=torch.long)
 
         hidden_BTC = self._prefill(prompt_emb, mask, pos_ids, lora)  # (B, T, C)
         logits_BTV = lm_head(hidden_BTC, self.text)  # (B, T, V)
 
-        # Take the last *real* token per row
         idx = (torch.tensor(lens, device=self.device, dtype=torch.long) - 1).clamp_min(0)
         last_hidden = hidden_BTC[torch.arange(B, device=self.device), idx][:, None, :]  # (B,1,C)
         last_logits = logits_BTV[torch.arange(B, device=self.device), idx]  # (B,V)
@@ -906,34 +915,27 @@ class MoondreamModel(nn.Module):
         probs = self._apply_top_p(probs, top_p)
         next_token = torch.multinomial(probs, num_samples=1)  # (B,1)
 
-        # Shared "next decode position" for all rows (we prefilled up to pos+T-1)
         pos_end = pos + T
         return last_hidden, next_token, pos_end  # (B,1,C), (B,1), int
 
 
+
     def _generate_points_batched(
         self,
-        hidden,      # (B,1,C) last hidden after prefill (per label row)
-        next_token,  # (B,1)  (kept for parity; not used when temperature=0)
-        pos: int,    # shared scalar next position for all rows
+        hidden,      # (B,1,C)
+        next_token,  # (B,1)
+        pos: int,    # shared scalar next position
         include_size: bool = True,
         max_objects: int = 50,
         lora=None,
     ):
-        """
-        Vectorized version of _generate_points() that decodes x -> y -> size -> next-token
-        for all rows in the batch simultaneously. Returns list-of-lists of dicts (len B).
-        Batch-safe: uses 4-D masks and avoids region.decode_size() (which flattens batch).
-        """
-        import torch
-
         B = hidden.size(0)
         device = self.device
         out = [[] for _ in range(B)]
         eos_id = self.config.tokenizer.eos_id
         max_ctx = self.config.text.max_context
 
-        # 4-D mask: (B, 1, q_len=1, kv_len), True means "visible" to match model's convention
+        # 4-D mask: (B, 1, q_len=1, kv_len)
         mask = torch.zeros(B, 1, 1, max_ctx, device=device, dtype=torch.bool)
         if pos > 0:
             mask[:, :, :, :pos] = True
@@ -944,53 +946,55 @@ class MoondreamModel(nn.Module):
 
         with torch.inference_mode():
             while alive.any() and (counts < max_objects).any():
-                # --- x coordinate (batched) ---
+                # --- x coordinate ---
                 x_logits = decode_coordinate(hidden, self.region)  # (B,1,1024) or (B,1024)
                 if x_logits.dim() == 3:
-                    x_logits = x_logits.squeeze(1)  # (B,1024)
-                x_bin = x_logits.argmax(dim=-1).to(torch.float32)
+                    x_logits = x_logits.squeeze(1)
+                x_bin = x_logits.argmax(dim=-1).to(torch.float32)
                 x_center = x_bin / float(x_logits.size(-1))  # (B,)
-                x_in = x_center.to(dtype=x_logits.dtype).unsqueeze(-1)  # (B,1)
-                x_emb = encode_coordinate(x_in, self.region).unsqueeze(1)  # (B,1,C)
+                x_in = x_center.to(dtype=x_logits.dtype).unsqueeze(-1)  # (B,1)
+                x_emb = encode_coordinate(x_in, self.region).unsqueeze(1)  # (B,1,C)
 
-                # advance one token
-                mask[:, :, :, pos] = True
+                mask[:, :, :, pos_id[0].item()] = True
                 logits, hidden = self._decode_one_tok(x_emb, mask, pos_id, lora)
-                pos += 1
-                pos_id[0] = pos
+                pos_id += 1
 
-                # --- y coordinate (batched) ---
-                y_logits = decode_coordinate(hidden, self.region)  # (B,1,1024) or (B,1024)
+                # --- y coordinate ---
+                y_logits = decode_coordinate(hidden, self.region)
                 if y_logits.dim() == 3:
                     y_logits = y_logits.squeeze(1)
-                y_bin = y_logits.argmax(dim=-1).to(torch.float32)
-                y_center = y_bin / float(y_logits.size(-1))
-                y_in = y_center.to(dtype=y_logits.dtype).unsqueeze(-1)  # (B,1)
-                y_emb = encode_coordinate(y_in, self.region).unsqueeze(1)
+                y_bin = y_logits.argmax(dim=-1).to(torch.float32)
+                y_center = y_bin / float(y_logits.size(-1))  # (B,)
+                y_in = y_center.to(dtype=y_logits.dtype).unsqueeze(-1)
+                y_emb = encode_coordinate(y_in, self.region).unsqueeze(1)
 
-                mask[:, :, :, pos] = True
+                mask[:, :, :, pos_id[0].item()] = True
                 logits, hidden = self._decode_one_tok(y_emb, mask, pos_id, lora)
-                pos += 1
-                pos_id[0] = pos
+                pos_id += 1
 
                 if include_size:
-                    # ---- size (batched, *without* region.decode_size which flattens batch) ----
-                    # size_out_dim is 2*1024 (W then H). mlp() preserves (B,1).
-                    size_logits = mlp(hidden, self.region["size_decoder"]).squeeze(1)  # (B, 2048)
-                    half = size_logits.size(-1) // 2
-                    w_logits, h_logits = size_logits[:, :half], size_logits[:, half:]  # (B,1024),(B,1024)
+                    size_logits = decode_size(hidden, self.region)
+                    # Support both tuple-of-tensors and flattened (2, -1) forms
+                    if isinstance(size_logits, (tuple, list)):
+                        w_logits = size_logits[0]
+                        h_logits = size_logits[1]
+                        if w_logits.dim() == 3:  # (B,1,1024)
+                            w_logits = w_logits.squeeze(1)
+                            h_logits = h_logits.squeeze(1)
+                    else:
+                        # size_logits shape: (2, B * size_bins) — reshape it back.
+                        size_logits = size_logits.reshape(2, B, -1)
+                        w_logits, h_logits = size_logits[0], size_logits[1]  # (B, size_bins)
 
                     w_bin = w_logits.argmax(dim=-1).to(torch.float32)
                     h_bin = h_logits.argmax(dim=-1).to(torch.float32)
-
-                    # inverse log-scale mapping used by the repo
+                    # inverse of log-scale mapping used by Moondream
                     w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
                     h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)
 
-                    size_in = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype)  # (B,2)
+                    size_in = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype)  # (B,2)
                     size_emb = encode_size(size_in, self.region).unsqueeze(1)  # (B,1,C)
 
-                    # commit boxes
                     for i in range(B):
                         if alive[i]:
                             out[i].append({
@@ -1000,21 +1004,18 @@ class MoondreamModel(nn.Module):
                                 "y_max": (y_center[i] + h[i] / 2).item(),
                             })
 
-                    # decide continuation
-                    mask[:, :, :, pos] = True
+                    mask[:, :, :, pos_id[0].item()] = True
                     logits, hidden = self._decode_one_tok(size_emb, mask, pos_id, lora)
-                    pos += 1
-                    pos_id[0] = pos
+                    pos_id += 1
                     next_tok = logits.argmax(dim=-1).squeeze(-1)  # (B,)
                 else:
-                    # points mode
                     for i in range(B):
                         if alive[i]:
                             out[i].append({"x": x_center[i].item(), "y": y_center[i].item()})
-                    mask[:, :, :, pos] = True
+
+                    mask[:, :, :, pos_id[0].item()] = True
                     logits, hidden = self._decode_one_tok(y_emb, mask, pos_id, lora)
-                    pos += 1
-                    pos_id[0] = pos
+                    pos_id += 1
                     next_tok = logits.argmax(dim=-1).squeeze(-1)
 
                 finished_now = (next_tok == eos_id) | (counts >= max_objects - 1)
@@ -1024,6 +1025,7 @@ class MoondreamModel(nn.Module):
         return out
 
 
+
     def detect_multi(self, image, objects, settings=None):
         """
         Parallel multi-label detection.
@@ -1043,17 +1045,14 @@ class MoondreamModel(nn.Module):
         B = len(objects)
         self._load_encoded_image_batched(image, B)
 
-        # Optional LoRA variant
         lora = None
         if "variant" in settings:
             lora = variant_state_dict(settings["variant"], device=self.device)
 
-        # Prefill all prompts as a batch; shared next position
         last_hidden, next_token, pos_end = self._prefill_prompt_batched(
             objects, image.pos, lora=lora, temperature=0.0, top_p=0.0
         )
 
-        # Batched decode loop
        max_objects = settings.get("max_objects", 50)
        det_lists = self._generate_points_batched(
            last_hidden, next_token, pos_end,
@@ -1066,9 +1065,14 @@ class MoondreamModel(nn.Module):
             for d in lst:
                 d["label"] = lab
             res[lab] = lst
+
+        # IMPORTANT: restore caches to B=1 so future calls (e.g., encode_image) are safe.
+        self._reset_kv_caches(1)
+
         return {"objects": res}
 
 
+
     def _detect_gaze(
         self,
         image: EncodedImage,