File size: 5,379 Bytes
2f51b26 9188b68 147df04 9106e2d 9188b68 652b877 147df04 c5d457b 2f51b26 c5d457b 35037e4 c5d457b 2f51b26 c5d457b 9188b68 2f51b26 048809c 2f51b26 9188b68 2f51b26 825b375 e1369ab 9188b68 2f51b26 147df04 c5d457b 2f51b26 c5d457b 2f51b26 147df04 652b877 35037e4 2f51b26 c5d457b aa10251 c5d457b aa10251 c5d457b aa10251 2f51b26 aa10251 2f51b26 aa10251 c5d457b 2f51b26 aa10251 c5d457b aa10251 2f51b26 aa10251 c5d457b 2f51b26 aa10251 c5d457b 2f51b26 aa10251 c5d457b 2f51b26 c5d457b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import contextlib, io, base64, torch, json
from PIL import Image
import open_clip
from reparam import reparameterize_model
class EndpointHandler:
    """Zero-shot image classifier backed by MobileCLIP-B.

    The model is loaded once at startup and text embeddings for a fixed
    catalogue of classes (``items.json``) are pre-computed, so each request
    only has to encode the incoming image.
    """

    def __init__(self, path: str = ""):
        """Load the model and pre-compute normalized text features.

        Args:
            path: Directory containing ``items.json`` — a JSON list of
                objects with ``id``, ``name`` and ``prompt`` keys.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # 1. Load the model (happens only once at startup).
        model, _, self.preprocess = open_clip.create_model_and_transforms(
            "MobileCLIP-B", pretrained='datacompdr'
        )
        model.eval()
        # Fuse re-parameterizable branches for faster inference.
        self.model = reparameterize_model(model)
        tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
        self.model.to(self.device)
        # Half precision on GPU: smaller memory footprint, faster inference.
        if self.device == "cuda":
            self.model.to(torch.float16)

        # --- OPTIMIZATION: pre-compute text features from the class file ---
        # 2. Load the rich class definitions: the 'prompt' field drives the
        #    embeddings; 'id' and 'name' structure the response later.
        with open(f"{path}/items.json", "r", encoding="utf-8") as f:
            class_definitions = json.load(f)
        prompts = [item['prompt'] for item in class_definitions]
        self.class_ids = [item['id'] for item in class_definitions]
        self.class_names = [item['name'] for item in class_definitions]

        # 3. Tokenize and encode all prompts at once; the L2-normalized
        #    features are reused unchanged for every request.
        with torch.no_grad():
            text_tokens = tokenizer(prompts).to(self.device)
            self.text_features = self.model.encode_text(text_tokens)
            self.text_features /= self.text_features.norm(dim=-1, keepdim=True)

    def __call__(self, data):
        """Score one base64-encoded image against the pre-computed classes.

        Args:
            data: Request payload; the image is read from
                ``data["inputs"]["image"]`` or directly from ``data["image"]``.

        Returns:
            A list of ``{"id", "label", "score"}`` dicts sorted by
            descending score.
        """
        # The payload only needs the image now (labels are fixed at startup).
        payload = data.get("inputs", data)
        img_b64 = payload["image"]

        # ---------------- decode image ----------------
        image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
        img_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
        # The preprocessor emits float32; match the model's fp16 dtype on GPU.
        if self.device == "cuda":
            img_tensor = img_tensor.to(torch.float16)

        # ---------------- forward pass (very fast) -----------------
        with torch.no_grad():
            # 1. Encode only the image; text features are cached.
            img_feat = self.model.encode_image(img_tensor)
            img_feat /= img_feat.norm(dim=-1, keepdim=True)
            # 2. Scaled cosine similarity -> softmax over all classes.
            probs = (100 * img_feat @ self.text_features.T).softmax(dim=-1)[0]

        # 3. Combine results with the stored class ids/names and convert the
        #    probability tensor to plain floats for JSON serialization.
        results = zip(self.class_ids, self.class_names, probs.cpu().tolist())

        # 4. Sorted list of dicts for a clean JSON response.
        return sorted(
            [{"id": i, "label": name, "score": float(p)} for i, name, p in results],
            key=lambda x: x["score"],
            reverse=True,
        )
# NOTE: The block below is an earlier version of EndpointHandler that
# re-encoded caller-supplied candidate_labels on every request; it is kept
# commented out for reference only and is never executed.
# import contextlib, io, base64, torch
# from PIL import Image
# import open_clip
# from reparam import reparameterize_model
# class EndpointHandler:
# def __init__(self, path: str = ""):
# self.device = "cuda" if torch.cuda.is_available() else "cpu"
# # Fix 1: Load weights directly from the web, just like local script
# # This guarantees the weights are identical.
# model, _, self.preprocess = open_clip.create_model_and_transforms(
# "MobileCLIP-B", pretrained='datacompdr'
# )
# model.eval()
# self.model = reparameterize_model(model) # fuse branches
# self.tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
# self.model.to(self.device)
# # Fix 2: Explicitly set model to half-precision if on CUDA
# # This matches the behavior of torch.set_default_dtype(torch.float16)
# if self.device == "cuda":
# self.model.to(torch.float16)
# def __call__(self, data):
# payload = data.get("inputs", data)
# img_b64 = payload["image"]
# labels = payload.get("candidate_labels", [])
# if not labels:
# return {"error": "candidate_labels list is empty"}
# # ---------------- decode inputs ----------------
# image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
# img_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
# # The preprocessor might output float32, so ensure tensor matches model dtype
# if self.device == "cuda":
# img_tensor = img_tensor.to(torch.float16)
# text_tokens = self.tokenizer(labels).to(self.device)
# # ---------------- forward pass -----------------
# # No need for autocast if everything is already float16
# with torch.no_grad():
# img_feat = self.model.encode_image(img_tensor)
# txt_feat = self.model.encode_text(text_tokens)
# img_feat /= img_feat.norm(dim=-1, keepdim=True)
# txt_feat /= txt_feat.norm(dim=-1, keepdim=True)
# probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].cpu().tolist()
# return [
# {"label": l, "score": float(p)}
# for l, p in sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)
# ]
|