HV-Khurdula committed · verified · Commit cdfd7db · 1 Parent(s): a2fbee5

Update moondream.py


fix: dimension mismatch for batched point generation

Files changed (1):
  1. moondream.py (+15 -14)
moondream.py CHANGED

@@ -22,6 +22,10 @@ from .region import (
 from .layers import QuantizedLinear
 from .lora import variant_state_dict
 from .utils import remove_outlier_points
+from .region import decode_coordinate, encode_coordinate, decode_size, encode_size
+from .text import text_encoder, lm_head
+from typing import Optional, List, Union
+from .lora import variant_state_dict
 
 ImageEncodingSettings = TypedDict(
     "ImageEncodingSettings",
@@ -851,9 +855,6 @@ class MoondreamModel(nn.Module):
         Build detect prompts for many labels, pad to same length, prefill once as a batch,
         then return (last_hidden per row, next_token per row, pos per row).
         """
-        import torch
-        from .text import text_encoder, lm_head
-
         tpl = self.config.tokenizer.templates["detect"]
         if tpl is None:
             raise NotImplementedError("Model does not support object detection (no detect template).")
@@ -873,7 +874,6 @@
 
         # Embed & prefill once
         prompt_emb = text_encoder(prompt_ids, self.text)  # (B, T, C)
-        import torch
         torch._dynamo.mark_dynamic(prompt_emb, 1)  # allow variable T
 
         attn_mask = self.attn_mask
@@ -905,8 +905,6 @@
         for all rows in the batch simultaneously.
         Returns: list-of-lists of dicts, length B.
         """
-        import torch
-        from .region import decode_coordinate, encode_coordinate, decode_size, encode_size
 
         B = hidden.size(0)
         device = self.device
@@ -930,8 +928,10 @@
         if x_logits.dim() == 3:
             x_logits = x_logits.squeeze(1)  # (B, 1024)
         x_bin = x_logits.argmax(dim=-1).to(torch.float32)  # (B,)
-        x_center = x_bin / float(x_logits.size(-1))  # normalize to [0,1]
-        x_emb = encode_coordinate(x_center.to(dtype=x_logits.dtype), self.region).unsqueeze(1)  # (B,1,C)
+        x_center = x_bin / float(x_logits.size(-1))  # (B,)
+        x_input = x_center.to(dtype=x_logits.dtype).unsqueeze(-1)  # (B, 1)
+        x_emb = encode_coordinate(x_input, self.region).unsqueeze(1)  # (B,1,C)
+
 
         # step: decode to get hidden for y
         for i in range(B):
@@ -945,8 +945,10 @@
         if y_logits.dim() == 3:
             y_logits = y_logits.squeeze(1)  # (B, 1024)
         y_bin = y_logits.argmax(dim=-1).to(torch.float32)
-        y_center = y_bin / float(y_logits.size(-1))
-        y_emb = encode_coordinate(y_center.to(dtype=y_logits.dtype), self.region).unsqueeze(1)
+        y_center = y_bin / float(y_logits.size(-1))  # (B,)
+        y_input = y_center.to(dtype=y_logits.dtype).unsqueeze(-1)  # (B, 1) ✅
+        y_emb = encode_coordinate(y_input, self.region).unsqueeze(1)
+
 
         # step: decode to get hidden for size (or eos)
         for i in range(B):
@@ -964,7 +966,8 @@
         # Convert from log-scale bin to size in [0,1]
         w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
         h = torch.pow(2.0, (h_bin / 1023.0) * 10.0 - 10.0)
-        size_emb = encode_size(torch.stack([w, h], dim=0), self.region).transpose(0,1).unsqueeze(1)  # (B,1,C)
+        size_input = torch.stack([w, h], dim=1).to(dtype=w_logits.dtype)  # (B, 2)
+        size_emb = encode_size(size_input, self.region).unsqueeze(1)
 
         # Commit boxes for alive rows
         for i in range(B):
@@ -1015,8 +1018,7 @@
         Returns:
             {"objects": {label: [box_dict, ...]}}
         """
-        import torch
-        from typing import Optional, List, Union
+
 
         if self.config.tokenizer.templates["detect"] is None:
             raise NotImplementedError("Model does not support object detection.")
@@ -1030,7 +1032,6 @@
         # Optional LoRA variant (same as detect())
         lora = None
         if "variant" in settings:
-            from .lora import variant_state_dict
             lora = variant_state_dict(settings["variant"], device=self.device)
 
         # Prefill all prompts at once
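The heart of the fix is the trailing feature dimension on the coordinate inputs: the old code handed encode_coordinate a flat (B,) vector, while a per-row scalar encoder needs (B, 1). Below is a minimal sketch of that failure mode; the nn.Linear(1, C) is a hypothetical stand-in for moondream's actual coordinate encoder, not a reproduction of it:

    import torch
    import torch.nn as nn

    B, C = 4, 8
    coord_encoder = nn.Linear(1, C)  # stand-in: expects a trailing feature dim of size 1

    x_bin = torch.randint(0, 1024, (B,)).to(torch.float32)  # argmax'd bins, one per row
    x_center = x_bin / 1024.0                                # (B,) -- no feature dim

    try:
        coord_encoder(x_center)      # pre-fix shape: last dim is B, not 1
    except RuntimeError as err:
        print("dimension mismatch:", err)

    x_input = x_center.unsqueeze(-1)                         # (B, 1) -- the fix
    x_emb = coord_encoder(x_input).unsqueeze(1)              # (B, 1, C)
    print(x_emb.shape)               # torch.Size([4, 1, 8])

The same unsqueeze(-1) is applied to the y coordinate, which is why the x and y hunks are symmetric.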
 
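The size hunk enforces the same batch-first discipline. Stacking w and h on dim=0 yields (2, B), which the old code then had to transpose back; stacking on dim=1 produces (B, 2) directly. A quick shape check in plain PyTorch, independent of any moondream code:

    import torch

    B = 4
    w = torch.rand(B)  # one width per batch row
    h = torch.rand(B)  # one height per batch row

    print(torch.stack([w, h], dim=0).shape)  # torch.Size([2, 4]) -- batch axis second (pre-fix)
    print(torch.stack([w, h], dim=1).shape)  # torch.Size([4, 2]) -- batch axis first (post-fix)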
 
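For reference, the unchanged log-scale conversion feeding that stack maps bin 0 to 2**-10 ≈ 0.001 and bin 1023 to 2**0 = 1.0, so equal bin steps correspond to equal size ratios. Checking the endpoints:

    import torch

    w_bin = torch.tensor([0.0, 511.5, 1023.0])
    w = torch.pow(2.0, (w_bin / 1023.0) * 10.0 - 10.0)
    print(w)  # ~[0.0010, 0.0312, 1.0000]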