finhdev
/

testmobileclip

mobileclip

Model card Files Files and versions

xet

Community

finhdev commited on Aug 4, 2025

Commit

147df04

verified ·

1 Parent(s): d46ebda

Update handler.py

Browse files

Files changed (1) hide show

handler.py +32 -37

handler.py CHANGED Viewed

@@ -1,68 +1,63 @@
-import io, base64, torch, open_clip
 from PIL import Image
 class EndpointHandler:
     """
-    MobileCLIP‑B ('datacompdr') zero‑shot classifier with a per‑process
-    text‑embedding cache.
-    Client JSON must look like:
-      {
-        "inputs": {
-          "image": "<base64 PNG/JPEG>",
-          "candidate_labels": ["a photo of a cat", ...]
-        }
       }
     """
-    # ---------- init (runs once per container) ----------
-    def __init__(self, path=""):
         self.model, _, self.preprocess = open_clip.create_model_and_transforms(
-            "mobileclip_b", pretrained="datacompdr"
         )
         self.model.eval()
-        self.tokenizer = open_clip.get_tokenizer("mobileclip_b")
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model.to(self.device)
-        self.cache: dict[str, torch.Tensor] = {}   # prompt → embedding
-    # ----------------- inference ------------------------
     def __call__(self, data):
         payload = data.get("inputs", data)
         img_b64 = payload["image"]
         labels  = payload.get("candidate_labels", [])
         if not labels:
             return {"error": "candidate_labels list is empty"}
-        # Image → tensor
-        img = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
-        img_t = self.preprocess(img).unsqueeze(0).to(self.device)
-        # Text embeddings (cached)
-        new = [l for l in labels if l not in self.cache]
-        if new:
-            tok = self.tokenizer(new).to(self.device)
-            with torch.no_grad():
-                emb = self.model.encode_text(tok)
-                emb = emb / emb.norm(dim=-1, keepdim=True)
-            for l, e in zip(new, emb):
-                self.cache[l] = e
-        txt_t = torch.stack([self.cache[l] for l in labels])
-        # Forward
         with torch.no_grad(), torch.cuda.amp.autocast():
-            img_f = self.model.encode_image(img_t)
-            img_f = img_f / img_f.norm(dim=-1, keepdim=True)
-            probs = (100 * img_f @ txt_t.T).softmax(dim=-1)[0].tolist()
         return [
             {"label": l, "score": float(p)}
             for l, p in sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)
         ]
 # import io, base64, torch
 # from PIL import Image
 # import open_clip

+# handler.py  (repo root)
+import io, base64, torch
 from PIL import Image
+import open_clip
 class EndpointHandler:
     """
+    Zero‑shot classifier for MobileCLIP‑B (OpenCLIP).
+    Expected client JSON *to the endpoint*:
+    {
+      "inputs": {
+        "image": "<base64 PNG/JPEG>",
+        "candidate_labels": ["cat", "dog", ...]
       }
+    }
     """
+    def __init__(self, path: str = ""):
+        weights = f"{path}/mobileclip_b.pt"
         self.model, _, self.preprocess = open_clip.create_model_and_transforms(
+            "MobileCLIP-B", pretrained=weights
         )
         self.model.eval()
+        self.tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
         self.model.to(self.device)
     def __call__(self, data):
+        # ── unwrap Hugging Face's `inputs` envelope ───────────
         payload = data.get("inputs", data)
         img_b64 = payload["image"]
         labels  = payload.get("candidate_labels", [])
         if not labels:
             return {"error": "candidate_labels list is empty"}
+        # Decode & preprocess image
+        image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
+        img_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
+        # Tokenise labels
+        text_tokens = self.tokenizer(labels).to(self.device)
+        # Forward pass
         with torch.no_grad(), torch.cuda.amp.autocast():
+            img_feat = self.model.encode_image(img_tensor)
+            txt_feat = self.model.encode_text(text_tokens)
+            img_feat = img_feat / img_feat.norm(dim=-1, keepdim=True)
+            txt_feat = txt_feat / txt_feat.norm(dim=-1, keepdim=True)
+            probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].tolist()
+        # Sorted output
         return [
             {"label": l, "score": float(p)}
             for l, p in sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)
         ]
 # import io, base64, torch
 # from PIL import Image
 # import open_clip