kitsuneb committed
Commit ccd36ee · 1 Parent(s): 9872d37

conversion scripts

conversion/README.md ADDED
@@ -0,0 +1,56 @@
+ # ONNX Model Conversion Notes
+
+ First of all, this was rather fun to do!
+ The `convert.py` script is based on code I wrote on Google Colab in order to have access to a GPU.
+ The `requirements.txt` might not be perfect; I'd much rather have used uv, which I use on a daily basis, but this was put together quickly in Google Colab.
+
+
+ Also note that I compared the output of the converted models against the original:
+ - The FP32 export (the ONNX default) is nearly identical to the original HF model.
+ - The FP16 export, however, is not exactly the same; there is a small margin of error.
+
+ Below is a code snippet that showcases the comparison:
+
+ ```python
+ import torch
+ import numpy as np
+ import onnxruntime as ort
+ from PIL import Image
+ from transformers import ColPaliForRetrieval, ColPaliProcessor
+
+ MODEL_ID = "vidore/colpali-v1.3-hf"
+ # Change this to the FP32 or FP16 export
+ ONNX_PATH = "/content/final_colpali/model.onnx"
+ DEVICE = "cpu"
+
+ hf = (
+     ColPaliForRetrieval
+     .from_pretrained(MODEL_ID, torch_dtype=torch.float16)
+     .to(DEVICE)
+     .eval()
+ )
+ processor = ColPaliProcessor.from_pretrained(MODEL_ID)
+
+ img = Image.new("RGB", (32, 32), color="white")
+ inputs = processor(images=[img], return_tensors="pt").to(DEVICE)
+
+ with torch.no_grad():
+     out = hf(**inputs)
+     hf_emb = out.embeddings.cpu().numpy()
+
+ sess = ort.InferenceSession(ONNX_PATH, providers=["CPUExecutionProvider"])
+ ort_inputs = {k: v.cpu().numpy() for k, v in inputs.items()}
+ [onnx_emb] = sess.run(["embeddings"], ort_inputs)
+ ```
+
+ ```python
+ ## Will output ~0.999 for FP32
+ ## Will output ~0.939 for FP16
+ dot = np.sum(hf_emb * onnx_emb, axis=-1)
+ norms = np.linalg.norm(hf_emb, axis=-1) * np.linalg.norm(onnx_emb, axis=-1)
+ cosim = dot / norms
+ cosim.min()
+ ```
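To get a feel for how large that FP16 "margin of error" is in absolute terms, the same arrays can also be compared elementwise. A minimal sketch (not part of the committed files), assuming `hf_emb`, `onnx_emb`, and `cosim` were produced by the snippets above:

```python
import numpy as np

# Elementwise deviation between the HF and ONNX embeddings
# (hf_emb / onnx_emb / cosim come from the README snippets above).
abs_err = np.abs(hf_emb - onnx_emb)
print("max abs error: ", abs_err.max())
print("mean abs error:", abs_err.mean())
print("min cosine sim:", cosim.min())
```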
conversion/convert.py ADDED
@@ -0,0 +1,207 @@
+ import os
+ import json
+ import torch
+ import onnx
+ import argparse
+ from PIL import Image
+ from torch.onnx._globals import GLOBALS
+ from transformers import ColPaliForRetrieval, ColPaliProcessor
+ from optimum.onnx.graph_transformations import check_and_save_model
+ import onnx_graphsurgeon as gs
+ from onnxconverter_common import float16
+ from onnx.external_data_helper import convert_model_to_external_data
+
+
+ def export_model(model_id, output_dir, device):
+     """Export the HuggingFace ColPaliForRetrieval model to ONNX format"""
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Load HF model & processor
+     model = (
+         ColPaliForRetrieval.from_pretrained(
+             model_id, torch_dtype=torch.float16, device_map="auto"
+         )
+         .to(device)
+         .eval()
+     )
+     processor = ColPaliProcessor.from_pretrained(model_id)
+
+     # Save HF artifacts
+     model.config.save_pretrained(output_dir)
+     processor.save_pretrained(output_dir)
+
+     # Patch forward() so the exported graph returns only the embeddings tensor
+     _orig_forward = model.forward
+
+     def _patched_forward(
+         self, pixel_values=None, input_ids=None, attention_mask=None, **kwargs
+     ):
+         # Call the original .forward
+         out = _orig_forward(
+             pixel_values=pixel_values,
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             **kwargs,
+         )
+         return out.embeddings
+
+     model.forward = _patched_forward.__get__(model, model.__class__)
+
+     # Sanity check with a dummy image batch
+     dummy_img = Image.new("RGB", (32, 32), color="white")
+     vision_pt = processor(images=[dummy_img], return_tensors="pt").to(device)
+
+     pv = vision_pt["pixel_values"]
+     ids = vision_pt["input_ids"]
+     msk = vision_pt["attention_mask"]
+
+     with torch.no_grad():
+         emb = model(pv, ids, msk)
+     print("Sanity-check embedding shape:", emb.shape)
+
+     # Export to ONNX + external data
+     GLOBALS.onnx_shape_inference = False  # Workaround shape bugs
+     onnx_path = os.path.join(output_dir, "model.onnx")
+     external_binfile = os.path.join(output_dir, "model.onnx_data")
+
+     torch.onnx.export(
+         model,
+         (pv, ids, msk),
+         onnx_path,
+         export_params=True,
+         opset_version=14,
+         do_constant_folding=True,
+         use_external_data_format=True,
+         all_tensors_to_one_file=True,
+         size_threshold=0,
+         external_data_filename=os.path.basename(external_binfile),
+         input_names=["pixel_values", "input_ids", "attention_mask"],
+         output_names=["embeddings"],
+         dynamic_axes={
+             "pixel_values": {0: "batch_size"},
+             "input_ids": {0: "batch_size", 1: "seq_len"},
+             "attention_mask": {0: "batch_size", 1: "seq_len"},
+             "embeddings": {0: "batch_size", 1: "seq_len"},
+         },
+     )
+     print("Exported ONNX to", onnx_path)
+
+     # Shape-infer & fix external-data refs
+     onnx.shape_inference.infer_shapes_path(onnx_path)
+     onnx_model = onnx.load(onnx_path)
+     check_and_save_model(onnx_model, onnx_path)
+     print("Shape-inference + external refs fixed")
+
+     # Minify tokenizer.json
+     tok = os.path.join(output_dir, "tokenizer.json")
+     if os.path.isfile(tok):
+         data = json.load(open(tok))
+         with open(tok, "w") as f:
+             json.dump(data, f, separators=(",", ":"))
+         print("✔ Minified tokenizer.json")
+
+     print("✅ ONNX + HF artifacts exported to", output_dir)
+     return onnx_path
+
+
+ def quantize_fp16_and_externalize(
+     input_path,
+     output_path,
+     external_data_filename="model.onnx_data",
+     op_block_list=None,
+ ):
+     """
+     Quantize an ONNX model from FP32 to FP16:
+     1) Load FP32 ONNX (+ its .onnx_data)
+     2) Cast weight tensors to FP16
+     3) Topo-sort / clean up
+     4) Copy opset_import from the original model
+     5) Mark ALL tensors for external data
+     6) Save the new ONNX + .onnx_data
+     """
+     orig = onnx.load(input_path, load_external_data=True)
+     model = onnx.load(input_path, load_external_data=True)
+
+     disable_si = model.ByteSize() >= onnx.checker.MAXIMUM_PROTOBUF
+     blocked = set(float16.DEFAULT_OP_BLOCK_LIST)
+     if op_block_list:
+         blocked.update(op_block_list)
+     blocked.update(["LayerNormalization", "Softmax", "Div"])
+
+     model_fp16 = float16.convert_float_to_float16(
+         model,
+         max_finite_val=65504.0,
+         keep_io_types=True,
+         disable_shape_infer=disable_si,
+         op_block_list=blocked,
+     )
+
+     graph = gs.import_onnx(model_fp16)
+     graph.toposort()
+     model_fp16 = gs.export_onnx(graph)
+
+     model_fp16.ClearField("opset_import")
+     model_fp16.opset_import.extend(orig.opset_import)
+
+     convert_model_to_external_data(
+         model_fp16,
+         all_tensors_to_one_file=True,
+         location=external_data_filename,
+         size_threshold=0,
+     )
+
+     # required for check_and_save_model
+     if not model_fp16.opset_import:
+         model_fp16.opset_import.extend(
+             [
+                 onnx.helper.make_opsetid("", 14),  # Default domain with opset 14
+             ]
+         )
+
+     # Save with shape-infer + final checks
+     check_and_save_model(model_fp16, output_path)
+
+     print("✅ FP16 model quantized and saved:")
+     print(f" ONNX: {output_path}")
+     print(
+         f" DATA: {os.path.join(os.path.dirname(output_path), external_data_filename)}"
+     )
+     return True
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Convert a ColPali model to ONNX format, with optional FP16 quantization"
+     )
+     parser.add_argument(
+         "--model-id", default="vidore/colpali-v1.3-hf", help="HuggingFace model ID"
+     )
+     parser.add_argument("--output-dir", default=None, help="Output directory")
+     parser.add_argument(
+         "--quantize", action="store_true", help="Apply FP16 quantization after export"
+     )
+     parser.add_argument("--device", default=None, help="Device for model (cuda/cpu)")
+     args = parser.parse_args()
+
+     if args.device is None:
+         args.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     if args.output_dir is None:
+         args.output_dir = os.path.join("output", args.model_id.replace("/", "_"))
+
+     # Export model to ONNX
+     onnx_path = export_model(args.model_id, args.output_dir, args.device)
+
+     # Optionally quantize to FP16
+     if args.quantize:
+         print("Starting FP16 quantization")
+         quantize_fp16_and_externalize(
+             input_path=onnx_path,
+             output_path=onnx_path,
+             external_data_filename="model.onnx_data",
+             op_block_list=None,
+         )
+
+
+ if __name__ == "__main__":
+     main()
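After running the script (e.g. `python conversion/convert.py --quantize`), a quick way to confirm that the exported graph exposes the names and dynamic axes declared in `torch.onnx.export` above is to inspect it with onnxruntime. This is a minimal sketch, not part of the commit; the path assumes the default `--output-dir` computed in `main()`:

```python
import onnxruntime as ort

# Default output location from main(): output/<model-id with "/" replaced by "_">
onnx_path = "output/vidore_colpali-v1.3-hf/model.onnx"
sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])

# Expect pixel_values / input_ids / attention_mask inputs with a symbolic batch_size,
# and an "embeddings" output with symbolic batch_size and seq_len.
for inp in sess.get_inputs():
    print("input: ", inp.name, inp.shape)
for out in sess.get_outputs():
    print("output:", out.name, out.shape)
```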
conversion/requirements.txt ADDED
@@ -0,0 +1,6 @@
+ onnx-graphsurgeon==0.5.2
+ onnxconverter-common==1.14.0
+ onnxruntime==1.21.1
+ onnxruntime-tools==1.7.0
+ onnxslim==0.1.51
+ transformers==4.48.3