Update region.py
fix: update internal decode size.
region.py
CHANGED
|
@@ -71,26 +71,20 @@ def encode_size(size: torch.Tensor, w: nn.Module) -> torch.Tensor:
|
|
| 71 |
return w.size_encoder(fourier_features(size, w.size_features))
|
| 72 |
|
| 73 |
|
|
|
|
| 74 |
def decode_size(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
|
| 75 |
"""
|
| 76 |
-
|
| 77 |
-
for 1024 bins representing width and height in log-scale.
|
| 78 |
|
| 79 |
-
|
| 80 |
-
|
| 81 |
-
where size values are clamped to be at least 1/1024.
|
| 82 |
-
|
| 83 |
-
To convert from bin back to size:
|
| 84 |
-
size = 2^((bin / 1023.0) * 10.0 - 10.0)
|
| 85 |
-
|
| 86 |
-
Args:
|
| 87 |
-
hidden_state: The final hidden state tensor from the text model.
|
| 88 |
-
|
| 89 |
-
Returns:
|
| 90 |
-
A tensor containing logits for 1024 bins for width and height.
|
| 91 |
-
Shape is (2, 1024) where the first dimension corresponds to width and height.
|
| 92 |
"""
|
| 93 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
| 94 |
|
| 95 |
|
| 96 |
def encode_spatial_refs(spatial_refs: SpatialRefs, w: nn.Module) -> torch.Tensor:
|
|
|
|
| 71 |
return w.size_encoder(fourier_features(size, w.size_features))
|
| 72 |
|
| 73 |
|
| 74 |
+
# region.py
|
| 75 |
def decode_size(hidden_state: torch.Tensor, w: nn.Module) -> torch.Tensor:
    """Decode width/height bin logits from the text model's hidden state.

    Projects the hidden state through the size-decoder MLP and splits the
    last dimension into separate width and height halves, keeping every
    leading (batch/sequence) dimension intact.

    Args:
        hidden_state: Hidden-state tensor of shape (..., C).
        w: Module providing the ``size_decoder`` MLP weights.

    Returns:
        Tensor of shape (..., 2, bins), where index 0 of the penultimate
        dimension holds width logits and index 1 holds height logits.
        # NOTE(review): width-before-height ordering is inferred from the
        # prior docstring — confirm against the training targets.

    Raises:
        RuntimeError: If the MLP output dimension is odd and cannot be
            split evenly into the two width/height halves.
    """
    logits = mlp(hidden_state, w.size_decoder)  # (..., size_out_dim)
    out_dim = logits.shape[-1]
    if out_dim % 2 != 0:
        raise RuntimeError(f"size_out_dim must be even, got {out_dim}")
    # reshape (not view): view raises on non-contiguous MLP output,
    # while reshape is a no-cost alias in the contiguous case.
    return logits.reshape(*logits.shape[:-1], 2, out_dim // 2)  # (..., 2, bins)
|
| 88 |
|
| 89 |
|
| 90 |
def encode_spatial_refs(spatial_refs: SpatialRefs, w: nn.Module) -> torch.Tensor:
|