# Inference using Python
# How do I run the ONNX model from Python, and what input and output formats / parameters does the model expect?
from PIL import Image, ImageDraw
import requests
from transformers import AutoProcessor
import torch
import onnxruntime as ort
import numpy as np
# --- Setup: model location, text prompt, input image, and preprocessing ---

# Path to the downloaded ONNX model
onnx_model_path = "model.onnx"  # Ensure this is the correct path
hf_repo = "onnx-community/grounding-dino-tiny-ONNX"

# Text prompt for detection: one entry per object phrase to look for.
text_prompt = ["cat"]
# Grounding DINO expects a single lowercase query in which every phrase ends
# with a period (e.g. "cat. dog."); build that query from the phrases above
# while leaving `text_prompt` itself untouched for later labelling.
text_query = " ".join(
    f"{phrase.strip().lower().rstrip('.')}." for phrase in text_prompt
)

# Load the image
image_url = "http://images.cocodataset.org/val2017/000000039769.jpg"
# A timeout keeps the script from hanging on a dead connection, and
# raise_for_status() surfaces HTTP errors instead of handing an error page
# to PIL. The context manager releases the connection once decoding is done.
with requests.get(image_url, stream=True, timeout=30) as response:
    response.raise_for_status()
    # Let urllib3 undo any Content-Encoding (gzip/deflate) on the raw stream.
    response.raw.decode_content = True
    # convert() forces a full decode, so the image no longer depends on the
    # network stream after this block exits.
    image = Image.open(response.raw).convert("RGB")
img_width, img_height = image.size

# Load the processor
processor = AutoProcessor.from_pretrained(hf_repo)
processor_inputs = processor(images=image, text=text_query, return_tensors="pt")
# --- Inference: pick execution providers, build the session, run the model ---

# Set device and providers
device = torch.device("cuda" if torch.cuda.is_available() else "cpu")
# torch seeing a GPU does not mean this onnxruntime build ships the CUDA
# provider, so ask ONNX Runtime itself and always keep CPU as a fallback.
available_providers = ort.get_available_providers()
if device.type == "cuda" and "CUDAExecutionProvider" in available_providers:
    providers = ["CUDAExecutionProvider", "CPUExecutionProvider"]
else:
    providers = ["CPUExecutionProvider"]

# Create the session first so its declared inputs can be inspected.
ort_session = ort.InferenceSession(onnx_model_path, providers=providers)

# ONNX Runtime consumes host-side NumPy arrays, so the tensors are converted
# directly (no round trip through the GPU). Only the inputs the graph declares
# are forwarded; any extra processor outputs would make run() fail.
expected_inputs = {node.name for node in ort_session.get_inputs()}
onnx_inputs = {
    name: tensor.detach().cpu().numpy()
    for name, tensor in processor_inputs.items()
    if name in expected_inputs
}
missing_inputs = expected_inputs - onnx_inputs.keys()
if missing_inputs:
    raise ValueError(
        f"Processor did not produce required model inputs: {sorted(missing_inputs)}"
    )

# Run inference
outputs = ort_session.run(None, onnx_inputs)
# --- Post-processing: turn raw model outputs into a list of detections ---

# Post-process outputs
# NOTE(review): assumes the model returns exactly two outputs ordered as
# (logits, boxes) — confirm against ort_session.get_outputs().
logits, boxes = outputs
logits = torch.from_numpy(logits).sigmoid().numpy()  # Apply sigmoid to get probabilities
boxes = boxes[0]  # First batch item; one 4-value box per query
# First batch item; one score vector per query.
# NOTE(review): for Grounding DINO the last axis is presumably per text token
# rather than per class — verify against the model's output spec.
logits = logits[0]

# Filter detections with confidence threshold
confidence_threshold = 0.3  # Adjust as needed
detections = []
for score, box in zip(logits, boxes):
    # Best score across the last axis for this query.
    best_score = float(score.max())
    if best_score > confidence_threshold:
        # Plain Python float/list keep the result printable and serialisable.
        detections.append(
            {"score": best_score, "label": text_prompt[0], "box": box.tolist()}
        )
# --- Visualisation: draw each detection on the image, then save and report ---

# Draw bounding boxes on the image
draw = ImageDraw.Draw(image)
for detection in detections:
    score, label, box = detection["score"], detection["label"], detection["box"]
    # Box format: [x_center, y_center, width, height] (normalized)
    x_center, y_center, width, height = box
    # Convert to pixel coordinates (cast so PIL receives plain Python floats)
    x1 = float((x_center - width / 2) * img_width)
    y1 = float((y_center - height / 2) * img_height)
    x2 = float((x_center + width / 2) * img_width)
    y2 = float((y_center + height / 2) * img_height)
    # Draw rectangle and label
    draw.rectangle([x1, y1, x2, y2], outline="red", width=2)
    # Clamp the caption so it stays on the canvas when the box touches the top edge.
    draw.text((x1, max(y1 - 10, 0)), f"{label}: {score:.2f}", fill="red")

# Save or display the image
image.save("output_image_model.jpg")
image.show()  # Opens the default image viewer; comment out on headless machines
print(f"Detections: {detections}")