File size: 5,379 Bytes
2f51b26 9188b68 147df04 9106e2d 9188b68 652b877 147df04 c5d457b 2f51b26 c5d457b 35037e4 c5d457b 2f51b26 c5d457b 9188b68 2f51b26 048809c 2f51b26 9188b68 2f51b26 825b375 e1369ab 9188b68 2f51b26 147df04 c5d457b 2f51b26 c5d457b 2f51b26 147df04 652b877 35037e4 2f51b26 c5d457b aa10251 c5d457b aa10251 c5d457b aa10251 2f51b26 aa10251 2f51b26 aa10251 c5d457b 2f51b26 aa10251 c5d457b aa10251 2f51b26 aa10251 c5d457b 2f51b26 aa10251 c5d457b 2f51b26 aa10251 c5d457b 2f51b26 c5d457b |
1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20 21 22 23 24 25 26 27 28 29 30 31 32 33 34 35 36 37 38 39 40 41 42 43 44 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 60 61 62 63 64 65 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112 113 114 115 116 117 118 119 120 121 122 123 124 125 126 127 128 129 |
import contextlib, io, base64, torch, json
from PIL import Image
import open_clip
from reparam import reparameterize_model
class EndpointHandler:
    """Zero-shot image classifier backed by MobileCLIP-B.

    The model is loaded once at startup and text embeddings for a fixed
    catalogue of classes (``items.json``) are pre-computed, so each request
    only has to encode the incoming image.
    """

    def __init__(self, path: str = ""):
        """Load the model and pre-compute normalized text features.

        Args:
            path: Directory containing ``items.json`` — a JSON list of
                objects with ``id``, ``name`` and ``prompt`` keys.
        """
        self.device = "cuda" if torch.cuda.is_available() else "cpu"

        # 1. Load the model (happens only once at startup).
        model, _, self.preprocess = open_clip.create_model_and_transforms(
            "MobileCLIP-B", pretrained='datacompdr'
        )
        model.eval()
        # Fuse re-parameterizable branches for faster inference.
        self.model = reparameterize_model(model)
        tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
        self.model.to(self.device)
        # Half precision on GPU: smaller memory footprint, faster inference.
        if self.device == "cuda":
            self.model.to(torch.float16)

        # --- OPTIMIZATION: pre-compute text features from the class file ---
        # 2. Load the rich class definitions: the 'prompt' field drives the
        #    embeddings; 'id' and 'name' structure the response later.
        with open(f"{path}/items.json", "r", encoding="utf-8") as f:
            class_definitions = json.load(f)
        prompts = [item['prompt'] for item in class_definitions]
        self.class_ids = [item['id'] for item in class_definitions]
        self.class_names = [item['name'] for item in class_definitions]

        # 3. Tokenize and encode all prompts at once; the L2-normalized
        #    features are reused unchanged for every request.
        with torch.no_grad():
            text_tokens = tokenizer(prompts).to(self.device)
            self.text_features = self.model.encode_text(text_tokens)
            self.text_features /= self.text_features.norm(dim=-1, keepdim=True)

    def __call__(self, data):
        """Score one base64-encoded image against the pre-computed classes.

        Args:
            data: Request payload; the image is read from
                ``data["inputs"]["image"]`` or directly from ``data["image"]``.

        Returns:
            A list of ``{"id", "label", "score"}`` dicts sorted by
            descending score.
        """
        # The payload only needs the image now (labels are fixed at startup).
        payload = data.get("inputs", data)
        img_b64 = payload["image"]

        # ---------------- decode image ----------------
        image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
        img_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
        # The preprocessor emits float32; match the model's fp16 dtype on GPU.
        if self.device == "cuda":
            img_tensor = img_tensor.to(torch.float16)

        # ---------------- forward pass (very fast) -----------------
        with torch.no_grad():
            # 1. Encode only the image; text features are cached.
            img_feat = self.model.encode_image(img_tensor)
            img_feat /= img_feat.norm(dim=-1, keepdim=True)
            # 2. Scaled cosine similarity -> softmax over all classes.
            probs = (100 * img_feat @ self.text_features.T).softmax(dim=-1)[0]

        # 3. Combine results with the stored class ids/names and convert the
        #    probability tensor to plain floats for JSON serialization.
        results = zip(self.class_ids, self.class_names, probs.cpu().tolist())

        # 4. Sorted list of dicts for a clean JSON response.
        return sorted(
            [{"id": i, "label": name, "score": float(p)} for i, name, p in results],
            key=lambda x: x["score"],
            reverse=True,
        )
# NOTE: The block below is an earlier version of EndpointHandler that
# re-encoded caller-supplied candidate_labels on every request; it is kept
# commented out for reference only and is never executed.
# import contextlib, io, base64, torch
# from PIL import Image
# import open_clip
# from reparam import reparameterize_model
# class EndpointHandler:
# def __init__(self, path: str = ""):
# self.device = "cuda" if torch.cuda.is_available() else "cpu"
# # Fix 1: Load weights directly from the web, just like local script
# # This guarantees the weights are identical.
# model, _, self.preprocess = open_clip.create_model_and_transforms(
# "MobileCLIP-B", pretrained='datacompdr'
# )
# model.eval()
# self.model = reparameterize_model(model) # fuse branches
# self.tokenizer = open_clip.get_tokenizer("MobileCLIP-B")
# self.model.to(self.device)
# # Fix 2: Explicitly set model to half-precision if on CUDA
# # This matches the behavior of torch.set_default_dtype(torch.float16)
# if self.device == "cuda":
# self.model.to(torch.float16)
# def __call__(self, data):
# payload = data.get("inputs", data)
# img_b64 = payload["image"]
# labels = payload.get("candidate_labels", [])
# if not labels:
# return {"error": "candidate_labels list is empty"}
# # ---------------- decode inputs ----------------
# image = Image.open(io.BytesIO(base64.b64decode(img_b64))).convert("RGB")
# img_tensor = self.preprocess(image).unsqueeze(0).to(self.device)
# # The preprocessor might output float32, so ensure tensor matches model dtype
# if self.device == "cuda":
# img_tensor = img_tensor.to(torch.float16)
# text_tokens = self.tokenizer(labels).to(self.device)
# # ---------------- forward pass -----------------
# # No need for autocast if everything is already float16
# with torch.no_grad():
# img_feat = self.model.encode_image(img_tensor)
# txt_feat = self.model.encode_text(text_tokens)
# img_feat /= img_feat.norm(dim=-1, keepdim=True)
# txt_feat /= txt_feat.norm(dim=-1, keepdim=True)
# probs = (100 * img_feat @ txt_feat.T).softmax(dim=-1)[0].cpu().tolist()
# return [
# {"label": l, "score": float(p)}
# for l, p in sorted(zip(labels, probs), key=lambda x: x[1], reverse=True)
# ]
|