import contextlib
import base64
import io
import json

import torch
from PIL import Image

import open_clip
from reparam import reparameterize_model


class EndpointHandler:
    """Zero-shot image-classification endpoint built on MobileCLIP-B.

    Text embeddings for every candidate class are pre-computed once at
    startup from ``items.json``, so each request only pays for a single
    image forward pass plus a matrix multiply.
    """

    def __init__(self, path: str = ""):
        """Load the model and pre-compute normalized text features.

        Args:
            path: Directory containing ``items.json``, a list of objects
                with ``"id"``, ``"name"``, and ``"prompt"`` keys.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # 1. Load the model (happens only once at startup).
        model, _, self.preprocess = open_clip.create_model_and_transforms(
            "MobileCLIP-B", pretrained='datacompdr'
        )
        model.eval()
        # Fuse re-parameterizable branches for faster inference.
        self.model = reparameterize_model(model)
        tokenizer = open_clip.get_tokenizer("MobileCLIP-B")

        self.model.to(self.device)
        # Run in half precision on GPU to match the optimized local script.
        if self.device == "cuda":
            self.model.to(torch.float16)

        # --- OPTIMIZATION: pre-compute text features from the JSON file ---
        # 2. Load the rich class definitions.
        with open(f"{path}/items.json", "r", encoding="utf-8") as f:
            class_definitions = json.load(f)

        # 3. 'prompt' drives the embedding; 'id'/'name' are kept to
        #    structure the response later.
        prompts = [item['prompt'] for item in class_definitions]
        self.class_ids = [item['id'] for item in class_definitions]
        self.class_names = [item['name'] for item in class_definitions]

        # 4. Tokenize and encode all prompts at once; L2-normalize so the
        #    dot product in __call__ is a cosine similarity.
        with torch.no_grad():
            text_tokens = tokenizer(prompts).to(self.device)
            self.text_features = self.model.encode_text(text_tokens)
            self.text_features /= self.text_features.norm(dim=-1, keepdim=True)

    def __call__(self, data):
        """Score one base64-encoded image against the pre-computed classes.

        Args:
            data: Request payload; the image is read from
                ``data["inputs"]["image"]`` (or ``data["image"]`` when no
                ``"inputs"`` wrapper is present).

        Returns:
            A list of ``{"id", "label", "score"}`` dicts sorted by
            descending score.
        """
        # The payload only needs the image now.
        payload = data.get("inputs", data)
        img_b64 = payload["image"]

        # ---------------- decode image ----------------
        image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
        img_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
        # The preprocessor may emit float32; match the model dtype on GPU.
        if self.device == "cuda":
            img_tensor = img_tensor.to(torch.float16)

        # ---------------- forward pass (very fast) ----------------
        with torch.no_grad():
            # 1. Encode only the image; text features are cached.
            img_feat = self.model.encode_image(img_tensor)
            img_feat /= img_feat.norm(dim=-1, keepdim=True)
            # 2. Similarity against the pre-computed text features.
            probs = (100 * img_feat @ self.text_features.T).softmax(dim=-1)[0]

        # 3. Combine results with the stored class IDs and names and turn
        #    the probability tensor into plain Python floats.
        results = zip(self.class_ids, self.class_names, probs.cpu().tolist())

        # 4. Sorted list of dicts for a clean JSON response.
        return sorted(
            [{"id": i, "label": name, "score": float(p)} for i, name, p in results],
            key=lambda x: x["score"],
            reverse=True,
        )