kitsuneb committed
Commit ccd36ee · 1 Parent(s): 9872d37

conversion scripts

conversion/README.md ADDED
@@ -0,0 +1,56 @@
+ # ONNX Model Conversion Notes
+
+ First of all, this was rather fun to do!
+ The `convert.py` script is based on code I wrote on Google Colab in order to have access to a GPU.
+ The `requirements.txt` might not be perfect; I'd much rather have used uv, which I use on a daily basis, but this was put together quickly in Google Colab.
+
+
+ Also note that I compared the output of the converted models against the original:
+ - The FP32 export (the ONNX default) is nearly identical to the original HF model.
+ - The FP16 export, however, is not exactly the same; there is a small margin of error.
+
+ Below is a code snippet that showcases the comparison:
+
+ ```python
+ import torch
+ import numpy as np
+ import onnxruntime as ort
+ from PIL import Image
+ from transformers import ColPaliForRetrieval, ColPaliProcessor
+
+ MODEL_ID = "vidore/colpali-v1.3-hf"
+ # Change this to the FP32 or FP16 export
+ ONNX_PATH = "/content/final_colpali/model.onnx"
+ DEVICE = "cpu"
+
+ hf = (
+     ColPaliForRetrieval
+     .from_pretrained(MODEL_ID, torch_dtype=torch.float16)
+     .to(DEVICE)
+     .eval()
+ )
+ processor = ColPaliProcessor.from_pretrained(MODEL_ID)
+
+ img = Image.new("RGB", (32, 32), color="white")
+ inputs = processor(images=[img], return_tensors="pt").to(DEVICE)
+
+ with torch.no_grad():
+     out = hf(**inputs)
+     hf_emb = out.embeddings.cpu().numpy()
+
+ sess = ort.InferenceSession(ONNX_PATH, providers=["CPUExecutionProvider"])
+ ort_inputs = {k: v.cpu().numpy() for k, v in inputs.items()}
+ [onnx_emb] = sess.run(["embeddings"], ort_inputs)
+ ```
+
+ ```python
+ ## Will output ~0.999 for FP32
+ ## Will output ~0.939 for FP16
+ dot = np.sum(hf_emb * onnx_emb, axis=-1)
+ norms = np.linalg.norm(hf_emb, axis=-1) * np.linalg.norm(onnx_emb, axis=-1)
+ cosim = dot / norms
+ cosim.min()
+ ```
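To get a feel for how large that FP16 "margin of error" is in absolute terms, the same arrays can also be compared elementwise. A minimal sketch (not part of the committed files), assuming `hf_emb`, `onnx_emb`, and `cosim` were produced by the snippets above:

```python
import numpy as np

# Elementwise deviation between the HF and ONNX embeddings
# (hf_emb / onnx_emb / cosim come from the README snippets above).
abs_err = np.abs(hf_emb - onnx_emb)
print("max abs error: ", abs_err.max())
print("mean abs error:", abs_err.mean())
print("min cosine sim:", cosim.min())
```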
conversion/convert.py ADDED
@@ -0,0 +1,207 @@
+ import os
+ import json
+ import torch
+ import onnx
+ import argparse
+ from PIL import Image
+ from torch.onnx._globals import GLOBALS
+ from transformers import ColPaliForRetrieval, ColPaliProcessor
+ from optimum.onnx.graph_transformations import check_and_save_model
+ import onnx_graphsurgeon as gs
+ from onnxconverter_common import float16
+ from onnx.external_data_helper import convert_model_to_external_data
+
+
+ def export_model(model_id, output_dir, device):
+     """Export the HuggingFace ColPaliForRetrieval model to ONNX format"""
+     os.makedirs(output_dir, exist_ok=True)
+
+     # Load HF model & processor
+     model = (
+         ColPaliForRetrieval.from_pretrained(
+             model_id, torch_dtype=torch.float16, device_map="auto"
+         )
+         .to(device)
+         .eval()
+     )
+     processor = ColPaliProcessor.from_pretrained(model_id)
+
+     # Save HF artifacts
+     model.config.save_pretrained(output_dir)
+     processor.save_pretrained(output_dir)
+
+     # Patch forward() so the exported graph returns only the embeddings tensor
+     _orig_forward = model.forward
+
+     def _patched_forward(
+         self, pixel_values=None, input_ids=None, attention_mask=None, **kwargs
+     ):
+         # Call the original .forward
+         out = _orig_forward(
+             pixel_values=pixel_values,
+             input_ids=input_ids,
+             attention_mask=attention_mask,
+             **kwargs,
+         )
+         return out.embeddings
+
+     model.forward = _patched_forward.__get__(model, model.__class__)
+
+     # Sanity check with a dummy image batch
+     dummy_img = Image.new("RGB", (32, 32), color="white")
+     vision_pt = processor(images=[dummy_img], return_tensors="pt").to(device)
+
+     pv = vision_pt["pixel_values"]
+     ids = vision_pt["input_ids"]
+     msk = vision_pt["attention_mask"]
+
+     with torch.no_grad():
+         emb = model(pv, ids, msk)
+     print("Sanity-check embedding shape:", emb.shape)
+
+     # Export to ONNX + external data
+     GLOBALS.onnx_shape_inference = False  # Workaround shape bugs
+     onnx_path = os.path.join(output_dir, "model.onnx")
+     external_binfile = os.path.join(output_dir, "model.onnx_data")
+
+     torch.onnx.export(
+         model,
+         (pv, ids, msk),
+         onnx_path,
+         export_params=True,
+         opset_version=14,
+         do_constant_folding=True,
+         use_external_data_format=True,
+         all_tensors_to_one_file=True,
+         size_threshold=0,
+         external_data_filename=os.path.basename(external_binfile),
+         input_names=["pixel_values", "input_ids", "attention_mask"],
+         output_names=["embeddings"],
+         dynamic_axes={
+             "pixel_values": {0: "batch_size"},
+             "input_ids": {0: "batch_size", 1: "seq_len"},
+             "attention_mask": {0: "batch_size", 1: "seq_len"},
+             "embeddings": {0: "batch_size", 1: "seq_len"},
+         },
+     )
+     print("Exported ONNX to", onnx_path)
+
+     # Shape-infer & fix external-data refs
+     onnx.shape_inference.infer_shapes_path(onnx_path)
+     onnx_model = onnx.load(onnx_path)
+     check_and_save_model(onnx_model, onnx_path)
+     print("Shape-inference + external refs fixed")
+
+     # Minify tokenizer.json
+     tok = os.path.join(output_dir, "tokenizer.json")
+     if os.path.isfile(tok):
+         data = json.load(open(tok))
+         with open(tok, "w") as f:
+             json.dump(data, f, separators=(",", ":"))
+         print("✔ Minified tokenizer.json")
+
+     print("✅ ONNX + HF artifacts exported to", output_dir)
+     return onnx_path
+
+
+ def quantize_fp16_and_externalize(
+     input_path,
+     output_path,
+     external_data_filename="model.onnx_data",
+     op_block_list=None,
+ ):
+     """
+     Quantize an ONNX model from FP32 to FP16:
+     1) Load FP32 ONNX (+ its .onnx_data)
+     2) Cast weight tensors to FP16
+     3) Topo-sort / clean up
+     4) Copy opset_import from the original model
+     5) Mark ALL tensors for external data
+     6) Save the new ONNX + .onnx_data
+     """
+     orig = onnx.load(input_path, load_external_data=True)
+     model = onnx.load(input_path, load_external_data=True)
+
+     disable_si = model.ByteSize() >= onnx.checker.MAXIMUM_PROTOBUF
+     blocked = set(float16.DEFAULT_OP_BLOCK_LIST)
+     if op_block_list:
+         blocked.update(op_block_list)
+     blocked.update(["LayerNormalization", "Softmax", "Div"])
+
+     model_fp16 = float16.convert_float_to_float16(
+         model,
+         max_finite_val=65504.0,
+         keep_io_types=True,
+         disable_shape_infer=disable_si,
+         op_block_list=blocked,
+     )
+
+     graph = gs.import_onnx(model_fp16)
+     graph.toposort()
+     model_fp16 = gs.export_onnx(graph)
+
+     model_fp16.ClearField("opset_import")
+     model_fp16.opset_import.extend(orig.opset_import)
+
+     convert_model_to_external_data(
+         model_fp16,
+         all_tensors_to_one_file=True,
+         location=external_data_filename,
+         size_threshold=0,
+     )
+
+     # required for check_and_save_model
+     if not model_fp16.opset_import:
+         model_fp16.opset_import.extend(
+             [
+                 onnx.helper.make_opsetid("", 14),  # Default domain with opset 14
+             ]
+         )
+
+     # Save with shape-infer + final checks
+     check_and_save_model(model_fp16, output_path)
+
+     print("✅ FP16 model quantized and saved:")
+     print(f" ONNX: {output_path}")
+     print(
+         f" DATA: {os.path.join(os.path.dirname(output_path), external_data_filename)}"
+     )
+     return True
+
+
+ def main():
+     parser = argparse.ArgumentParser(
+         description="Convert a ColPali model to ONNX format, with optional FP16 quantization"
+     )
+     parser.add_argument(
+         "--model-id", default="vidore/colpali-v1.3-hf", help="HuggingFace model ID"
+     )
+     parser.add_argument("--output-dir", default=None, help="Output directory")
+     parser.add_argument(
+         "--quantize", action="store_true", help="Apply FP16 quantization after export"
+     )
+     parser.add_argument("--device", default=None, help="Device for model (cuda/cpu)")
+     args = parser.parse_args()
+
+     if args.device is None:
+         args.device = "cuda" if torch.cuda.is_available() else "cpu"
+
+     if args.output_dir is None:
+         args.output_dir = os.path.join("output", args.model_id.replace("/", "_"))
+
+     # Export model to ONNX
+     onnx_path = export_model(args.model_id, args.output_dir, args.device)
+
+     # Optionally quantize to FP16
+     if args.quantize:
+         print("Starting FP16 quantization")
+         quantize_fp16_and_externalize(
+             input_path=onnx_path,
+             output_path=onnx_path,
+             external_data_filename="model.onnx_data",
+             op_block_list=None,
+         )
+
+
+ if __name__ == "__main__":
+     main()
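After running the script (e.g. `python conversion/convert.py --quantize`), a quick way to confirm that the exported graph exposes the names and dynamic axes declared in `torch.onnx.export` above is to inspect it with onnxruntime. This is a minimal sketch, not part of the commit; the path assumes the default `--output-dir` computed in `main()`:

```python
import onnxruntime as ort

# Default output location from main(): output/<model-id with "/" replaced by "_">
onnx_path = "output/vidore_colpali-v1.3-hf/model.onnx"
sess = ort.InferenceSession(onnx_path, providers=["CPUExecutionProvider"])

# Expect pixel_values / input_ids / attention_mask inputs with a symbolic batch_size,
# and an "embeddings" output with symbolic batch_size and seq_len.
for inp in sess.get_inputs():
    print("input: ", inp.name, inp.shape)
for out in sess.get_outputs():
    print("output:", out.name, out.shape)
```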
conversion/requirements.txt ADDED
@@ -0,0 +1,6 @@
+ onnx-graphsurgeon==0.5.2
+ onnxconverter-common==1.14.0
+ onnxruntime==1.21.1
+ onnxruntime-tools==1.7.0
+ onnxslim==0.1.51
+ transformers==4.48.3