rtr46 committed · Commit f368b06 · verified · 1 Parent(s): c5ee532

Upload 4 files

inference.small.py ADDED
@@ -0,0 +1,152 @@
+ import cv2
+ import numpy as np
+ import onnxruntime as ort
+
+ # --- CONFIGURATION ---
+ MODEL_PATH = "meiki.text.detect.small.v0.onnx"
+ INPUT_IMAGE_PATH = "input.jpg"
+ OUTPUT_IMAGE_PATH = "output.small.jpg"
+
+ # The model expects a 640x640 RGB image.
+ MODEL_SIZE = 640
+
+ # A threshold to filter out weak detections.
+ # You can adjust this value (e.g., lower to 0.3 for more boxes,
+ # or raise to 0.5 for fewer, more confident boxes).
+ CONFIDENCE_THRESHOLD = 0.4
+
+ def resize_and_pad(image: np.ndarray, size: int):
+     """
+     Resizes a COLOR image to the model's expected size,
+     maintaining aspect ratio and padding.
+
+     Returns:
+     - The padded image ready for the model.
+     - The ratio used to resize the image.
+     - The padding amounts (width, height).
+     """
+     # Get the original image dimensions.
+     original_height, original_width, _ = image.shape
+
+     # Calculate the ratio to resize the image.
+     ratio = min(size / original_width, size / original_height)
+     new_width = int(original_width * ratio)
+     new_height = int(original_height * ratio)
+
+     # Resize the image using the calculated ratio.
+     resized_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
+
+     # Create a new square image (640x640) filled with zeros (black).
+     # Note the `(size, size, 3)` for the 3 color channels (BGR).
+     padded_image = np.zeros((size, size, 3), dtype=np.uint8)
+
+     # Calculate padding to center the resized image.
+     pad_w = (size - new_width) // 2
+     pad_h = (size - new_height) // 2
+
+     # Paste the resized image onto the center of the black square.
+     padded_image[pad_h:pad_h + new_height, pad_w:pad_w + new_width] = resized_image
+
+     return padded_image, ratio, pad_w, pad_h
+
+ def main():
+     """
+     Main function to run the inference process.
+     """
+     # --- 1. Load the Model ---
+     try:
+         # Create an inference session with the ONNX model.
+         session = ort.InferenceSession(MODEL_PATH, providers=['CPUExecutionProvider'])
+         print(f"Successfully loaded model: {MODEL_PATH}")
+     except Exception as e:
+         print(f"Error: Failed to load the ONNX model. Make sure '{MODEL_PATH}' exists.")
+         print(f"Details: {e}")
+         return
+
+     # --- 2. Load and Pre-process the Input Image ---
+     try:
+         # Read the input image from the file. It will be in BGR format by default.
+         original_image = cv2.imread(INPUT_IMAGE_PATH)
+         if original_image is None:
+             raise FileNotFoundError(f"Image not found at '{INPUT_IMAGE_PATH}'")
+         print(f"Successfully loaded image: {INPUT_IMAGE_PATH}")
+     except Exception as e:
+         print(f"Error: {e}")
+         return
+
+     # This model requires a color image, so we don't convert to grayscale.
+     # Resize and pad the image to fit the model's 640x640 input size.
+     padded_image, ratio, pad_w, pad_h = resize_and_pad(original_image, MODEL_SIZE)
+
+     # Normalize the image data to be between 0 and 1.
+     img_normalized = padded_image.astype(np.float32) / 255.0
+
+     # The model expects the channel dimension to be first (Channels, Height, Width).
+     # OpenCV loads images as (Height, Width, Channels), so we transpose the axes.
+     img_transposed = np.transpose(img_normalized, (2, 0, 1))
+
+     # Add a batch dimension to match the model's expected input shape: (1, 3, 640, 640).
+     image_input_tensor = np.expand_dims(img_transposed, axis=0)
+
+     # --- 3. Run Inference ---
+     # The model requires a second input specifying the image size. We provide the padded size.
+     sizes_input_tensor = np.array([[MODEL_SIZE, MODEL_SIZE]], dtype=np.int64)
+
+     # Get the names of the model's input nodes.
+     input_names = [inp.name for inp in session.get_inputs()]
+
+     # Prepare the dictionary of inputs for the model.
+     inputs = {
+         input_names[0]: image_input_tensor,
+         input_names[1]: sizes_input_tensor
+     }
+
+     # Run the model.
+     # This model returns three separate outputs: labels, boxes, and confidence scores.
+     outputs = session.run(None, inputs)
+     labels, boxes, scores = outputs
+
+     # --- 4. Post-process and Draw Bounding Boxes ---
+     # The outputs have an extra batch dimension, so we remove it.
+     boxes = boxes[0]
+     scores = scores[0]
+
+     print(f"Model returned {len(boxes)} boxes. Filtering with confidence > {CONFIDENCE_THRESHOLD}...")
+
+     # Create a copy of the original image to draw on.
+     output_image = original_image.copy()
+
+     # Iterate through the boxes and their corresponding scores.
+     confident_boxes_count = 0
+     for box, score in zip(boxes, scores):
+         # Only process boxes with a confidence score above our threshold.
+         if score > CONFIDENCE_THRESHOLD:
+             confident_boxes_count += 1
+             # The coordinates from the model are relative to the 640x640 padded image.
+             # We need to scale them back to the original image's coordinate space.
+             x_min, y_min, x_max, y_max = box
+
+             # Step 1: Subtract the padding that was added.
+             x_min_unpadded = x_min - pad_w
+             y_min_unpadded = y_min - pad_h
+             x_max_unpadded = x_max - pad_w
+             y_max_unpadded = y_max - pad_h
+
+             # Step 2: Scale the coordinates back up to the original image size by dividing by the ratio.
+             final_x_min = int(x_min_unpadded / ratio)
+             final_y_min = int(y_min_unpadded / ratio)
+             final_x_max = int(x_max_unpadded / ratio)
+             final_y_max = int(y_max_unpadded / ratio)
+
+             # Draw a green rectangle on the output image.
+             cv2.rectangle(output_image, (final_x_min, final_y_min), (final_x_max, final_y_max), (0, 255, 0), 2)
+
+     print(f"Found {confident_boxes_count} confident boxes.")
+
+     # --- 5. Save the Final Image ---
+     cv2.imwrite(OUTPUT_IMAGE_PATH, output_image)
+     print(f"Successfully saved result to: {OUTPUT_IMAGE_PATH}")
+
+
+ if __name__ == "__main__":
+     main()
inference.tiny.py ADDED
@@ -0,0 +1,131 @@
+ import cv2
+ import numpy as np
+ import onnxruntime as ort
+
+ # --- CONFIGURATION ---
+ MODEL_PATH = "meiki.text.detect.tiny.v0.onnx"
+ INPUT_IMAGE_PATH = "input.jpg"
+ OUTPUT_IMAGE_PATH = "output.tiny.jpg"
+
+ # The model expects a 320x320 input image.
+ MODEL_SIZE = 320
+
+ def resize_and_pad(image: np.ndarray, size: int):
+     """
+     Resizes a GRAYSCALE image to the model's expected size,
+     maintaining aspect ratio and padding.
+
+     Returns:
+     - The padded image ready for the model.
+     - The ratio used to resize the image.
+     - The padding amounts (width, height).
+     """
+     # Get the original image dimensions.
+     original_height, original_width = image.shape
+
+     # Calculate the ratio to resize the image.
+     ratio = min(size / original_width, size / original_height)
+     new_width = int(original_width * ratio)
+     new_height = int(original_height * ratio)
+
+     # Resize the image using the calculated ratio.
+     resized_image = cv2.resize(image, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
+
+     # Create a new square image (320x320) filled with zeros (black).
+     padded_image = np.zeros((size, size), dtype=np.uint8)
+
+     # Calculate padding to center the resized image.
+     pad_w = (size - new_width) // 2
+     pad_h = (size - new_height) // 2
+
+     # Paste the resized image onto the center of the black square.
+     padded_image[pad_h:pad_h + new_height, pad_w:pad_w + new_width] = resized_image
+
+     return padded_image, ratio, pad_w, pad_h
+
+ def main():
+     """
+     Main function to run the inference process.
+     """
+     # --- 1. Load the Model ---
+     try:
+         # Create an inference session with the ONNX model.
+         # We use the CPUExecutionProvider for broad compatibility.
+         session = ort.InferenceSession(MODEL_PATH, providers=['CPUExecutionProvider'])
+         print(f"Successfully loaded model: {MODEL_PATH}")
+     except Exception as e:
+         print(f"Error: Failed to load the ONNX model. Make sure '{MODEL_PATH}' exists.")
+         print(f"Details: {e}")
+         return
+
+     # --- 2. Load and Pre-process the Input Image ---
+     try:
+         # Read the input image from the file.
+         original_image = cv2.imread(INPUT_IMAGE_PATH)
+         if original_image is None:
+             raise FileNotFoundError(f"Image not found at '{INPUT_IMAGE_PATH}'")
+         print(f"Successfully loaded image: {INPUT_IMAGE_PATH}")
+     except Exception as e:
+         print(f"Error: {e}")
+         return
+
+     # The model requires a grayscale image.
+     img_gray = cv2.cvtColor(original_image, cv2.COLOR_BGR2GRAY)
+
+     # Resize and pad the image to fit the model's 320x320 input size.
+     padded_image, ratio, pad_w, pad_h = resize_and_pad(img_gray, MODEL_SIZE)
+
+     # Normalize the image data to be between 0 and 1.
+     img_normalized = padded_image.astype(np.float32) / 255.0
+
+     # Add batch and channel dimensions to match the model's expected input shape: (1, 1, 320, 320).
+     image_input_tensor = np.expand_dims(np.expand_dims(img_normalized, axis=0), axis=0)
+
+     # --- 3. Run Inference ---
+     # The model requires a second input specifying the image size. We provide the padded size.
+     sizes_input_tensor = np.array([[MODEL_SIZE, MODEL_SIZE]], dtype=np.int64)
+
+     # Get the names of the model's input nodes.
+     input_names = [inp.name for inp in session.get_inputs()]
+
+     # Prepare the dictionary of inputs for the model.
+     inputs = {
+         input_names[0]: image_input_tensor,
+         input_names[1]: sizes_input_tensor
+     }
+
+     # Run the model. The output will be the detected bounding boxes.
+     outputs = session.run(None, inputs)
+     boxes_from_model = outputs[0]
+
+     print(f"Found {len(boxes_from_model)} potential text boxes.")
+
+     # --- 4. Post-process and Draw Bounding Boxes ---
+     # The coordinates from the model are relative to the 320x320 padded image.
+     # We need to scale them back to the original image's coordinate space.
+     output_image = original_image.copy()
+     for box in boxes_from_model:
+         x_min, y_min, x_max, y_max = box
+
+         # Step 1: Subtract the padding that was added.
+         x_min_unpadded = x_min - pad_w
+         y_min_unpadded = y_min - pad_h
+         x_max_unpadded = x_max - pad_w
+         y_max_unpadded = y_max - pad_h
+
+         # Step 2: Scale the coordinates back up to the original image size by dividing by the ratio.
+         final_x_min = int(x_min_unpadded / ratio)
+         final_y_min = int(y_min_unpadded / ratio)
+         final_x_max = int(x_max_unpadded / ratio)
+         final_y_max = int(y_max_unpadded / ratio)
+
+         # Draw a red rectangle on the output image.
+         cv2.rectangle(output_image, (final_x_min, final_y_min), (final_x_max, final_y_max), (0, 0, 255), 2)
+
+     # --- 5. Save the Final Image ---
+     cv2.imwrite(OUTPUT_IMAGE_PATH, output_image)
+     print(f"Successfully saved result to: {OUTPUT_IMAGE_PATH}")
+
+
+ if __name__ == "__main__":
+     main()
meiki.text.detect.small.v0.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:3b702a7e7c9328ff461ed22ed6efc71505798ec1f44b76564e467d08fd042ce0
+ size 41593485
meiki.text.detect.tiny.v0.onnx ADDED
@@ -0,0 +1,3 @@
+ version https://git-lfs.github.com/spec/v1
+ oid sha256:2e03bb478db6fe3e7baa051adad4edf4f6e704e7b9bffab27d03cb784316b0e2
+ size 10593466