finhdev
/

testmobileclip

mobileclip

Model card Files Files and versions

xet

Community

finhdev committed on Aug 4, 2025

Commit

2f51b26

verified ·

1 Parent(s): 9bde8e9

Update handler.py

Browse files

Files changed (1) hide show

handler.py +64 -39

handler.py CHANGED Viewed

@@ -1,4 +1,4 @@
-import contextlib, io, base64, torch
 from PIL import Image
 import open_clip
 from reparam import reparameterize_model
@@ -7,71 +7,92 @@ class EndpointHandler:
     def __init__(self, path: str = ""):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-        # Fix 1: Load weights directly from the web, just like local script
-        # This guarantees the weights are identical.
         model, _, self.preprocess = open_clip.create_model_and_transforms(
             "MobileCLIP-B", pretrained='datacompdr'
         )
         model.eval()
-        self.model = reparameterize_model(model)  # fuse branches
-        self.tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
         self.model.to(self.device)
-        # Fix 2: Explicitly set model to half-precision if on CUDA
-        # This matches the behavior of torch.set_default_dtype(torch.float16)
         if self.device == "cuda":
             self.model.to(torch.float16)
     def __call__(self, data):
         payload = data.get("inputs", data)
         img_b64 = payload["image"]
-        labels  = payload.get("candidate_labels", [])
-        if not labels:
-            return {"error": "candidate_labels list is empty"}
-        # ---------------- decode inputs ----------------
         image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
         img_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
-        # The preprocessor might output float32, so ensure tensor matches model dtype
         if self.device == "cuda":
             img_tensor = img_tensor.to(torch.float16)
-        text_tokens = self.tokenizer(labels).to(self.device)
-        # ---------------- forward pass -----------------
-        # No need for autocast if everything is already float16
         with torch.no_grad():
             img_feat = self.model.encode_image(img_tensor)
-            txt_feat = self.model.encode_text(text_tokens)
             img_feat /= img_feat.norm(dim=-1, keepdim=True)
-            txt_feat /= txt_feat.norm(dim=-1, keepdim=True)
-            probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].cpu().tolist()
-        return [
-            {"label": l, "score": float(p)}
-            for l, p in sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)
-        ]
 # import contextlib, io, base64, torch
 # from PIL import Image
 # import open_clip
 # from reparam import reparameterize_model
 # class EndpointHandler:
 #     def __init__(self, path: str = ""):
-#         # You can also pass pretrained='datacompdr' to let OpenCLIP download
-#         weights = f"{path}/mobileclip_b.pt"
-#         self.model, _, self.preprocess = open_clip.create_model_and_transforms(
-#             "MobileCLIP-B", pretrained=weights
 #         )
-#         self.model.eval()
-#         self.model = reparameterize_model(self.model)   # *** fuse branches ***
-#         self.device = "cuda" if torch.cuda.is_available() else "cpu"
-#         self.model.to(self.device)
 #         self.tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
 #     def __call__(self, data):
 #         payload = data.get("inputs", data)
@@ -82,23 +103,27 @@ class EndpointHandler:
 #         # ---------------- decode inputs ----------------
 #         image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
-#         img_tensor  = self.preprocess(image).unsqueeze(0).to(self.device)
 #         text_tokens = self.tokenizer(labels).to(self.device)
 #         # ---------------- forward pass -----------------
-#         autocast_ctx = (
-#             torch.cuda.amp.autocast if self.device.startswith("cuda") else contextlib.nullcontext
-#         )
-#         with torch.no_grad(), autocast_ctx():
 #             img_feat = self.model.encode_image(img_tensor)
 #             txt_feat = self.model.encode_text(text_tokens)
 #             img_feat /= img_feat.norm(dim=-1, keepdim=True)
 #             txt_feat /= txt_feat.norm(dim=-1, keepdim=True)
-#             probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].tolist()
 #         return [
 #             {"label": l, "score": float(p)}
 #             for l, p in sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)
 #         ]

+import contextlib, io, base64, torch, json
 from PIL import Image
 import open_clip
 from reparam import reparameterize_model
     def __init__(self, path: str = ""):
         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+        # 1. Load the model (happens only once at startup)
         model, _, self.preprocess = open_clip.create_model_and_transforms(
             "MobileCLIP-B", pretrained='datacompdr'
         )
         model.eval()
+        self.model = reparameterize_model(model)
+        tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
         self.model.to(self.device)
         if self.device == "cuda":
             self.model.to(torch.float16)
+        # --- OPTIMIZATION: Pre-compute text features from your JSON ---
+        # 2. Load your rich class definitions from the file
+        with open(f"{path}/classes.json", "r", encoding="utf-8") as f:
+            class_definitions = json.load(f)
+        # 3. Prepare the data for encoding and for the final response
+        #    - Use the 'prompt' field for creating the embeddings
+        #    - Keep 'name' and 'id' to structure the response later
+        prompts = [item['prompt'] for item in class_definitions]
+        self.class_ids = [item['id'] for item in class_definitions]
+        self.class_names = [item['name'] for item in class_definitions]
+        # 4. Tokenize and encode all prompts at once
+        with torch.no_grad():
+            text_tokens = tokenizer(prompts).to(self.device)
+            self.text_features = self.model.encode_text(text_tokens)
+            self.text_features /= self.text_features.norm(dim=-1, keepdim=True)
     def __call__(self, data):
+        # The payload only needs the image now
         payload = data.get("inputs", data)
         img_b64 = payload["image"]
+        # ---------------- decode image ----------------
         image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
         img_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
         if self.device == "cuda":
             img_tensor = img_tensor.to(torch.float16)
+        # ---------------- forward pass (very fast) -----------------
         with torch.no_grad():
+            # 1. Encode only the image
             img_feat = self.model.encode_image(img_tensor)
             img_feat /= img_feat.norm(dim=-1, keepdim=True)
+            # 2. Compute similarity against the pre-computed text features
+            probs = (100 * img_feat @ self.text_features.T).softmax(dim=-1)[0]
+        # 3. Combine the results with your stored class IDs and names
+        #    and convert the tensor of probabilities to a list of floats
+        results = zip(self.class_ids, self.class_names, probs.cpu().tolist())
+        # 4. Create a sorted list of dictionaries for a clean JSON response
+        return sorted(
+            [{"id": i, "label": name, "score": float(p)} for i, name, p in results],
+            key=lambda x: x["score"],
+            reverse=True
+        )
 # import contextlib, io, base64, torch
 # from PIL import Image
 # import open_clip
 # from reparam import reparameterize_model
 # class EndpointHandler:
 #     def __init__(self, path: str = ""):
+#         self.device = "cuda" if torch.cuda.is_available() else "cpu"
+#         # Fix 1: Load weights directly from the web, just like local script
+#         # This guarantees the weights are identical.
+#         model, _, self.preprocess = open_clip.create_model_and_transforms(
+#             "MobileCLIP-B", pretrained='datacompdr'
 #         )
+#         model.eval()
+#         self.model = reparameterize_model(model)  # fuse branches
 #         self.tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
+#         self.model.to(self.device)
+#         # Fix 2: Explicitly set model to half-precision if on CUDA
+#         # This matches the behavior of torch.set_default_dtype(torch.float16)
+#         if self.device == "cuda":
+#             self.model.to(torch.float16)
 #     def __call__(self, data):
 #         payload = data.get("inputs", data)
 #         # ---------------- decode inputs ----------------
 #         image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
+#         img_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
+#         # The preprocessor might output float32, so ensure tensor matches model dtype
+#         if self.device == "cuda":
+#             img_tensor = img_tensor.to(torch.float16)
 #         text_tokens = self.tokenizer(labels).to(self.device)
 #         # ---------------- forward pass -----------------
+#         # No need for autocast if everything is already float16
+#         with torch.no_grad():
 #             img_feat = self.model.encode_image(img_tensor)
 #             txt_feat = self.model.encode_text(text_tokens)
 #             img_feat /= img_feat.norm(dim=-1, keepdim=True)
 #             txt_feat /= txt_feat.norm(dim=-1, keepdim=True)
+#             probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].cpu().tolist()
 #         return [
 #             {"label": l, "score": float(p)}
 #             for l, p in sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)
 #         ]