Intel
/

gpt-j-6B-int8-static-inc

     "print('acc: ', acc)"
    ]
   },
+  {
+   "cell_type": "code",
+   "execution_count": null,
+   "metadata": {
+    "vscode": {
+     "languageId": "plaintext"
+    }
+   },
+   "outputs": [],
+   "source": [
+    "# batch inference\n",
+    "\n",
+    "from transformers import AutoTokenizer\n",
+    "import torch\n",
+    "import numpy as np\n",
+    "from datasets import load_dataset\n",
+    "import onnxruntime as ort\n",
+    "from torch.nn.functional import pad\n",
+    "from torch.utils.data import DataLoader\n",
+    "\n",
+    "# evaluation settings\n",
+    "batch_size = 2    # samples per DataLoader batch\n",
+    "pad_max = 196     # token length each sample is padded or truncated to\n",
+    "\n",
+    "# load tokenizer (the ONNX model itself is loaded further down via onnxruntime)\n",
+    "model_id = \"EleutherAI/gpt-j-6B\"\n",
+    "tokenizer = AutoTokenizer.from_pretrained(model_id)\n",
+    "\n",
+    "def tokenize_function(examples):\n",
+    "    \"\"\"Run the module-level ``tokenizer`` over the 'text' field of ``examples``.\"\"\"\n",
+    "    return tokenizer(examples['text'])\n",
+    "\n",
+    "# create dataloader\n",
+    "class Dataloader:\n",
+    "    \"\"\"Serve a LAMBADA split as fixed-length numpy batches.\n",
+    "\n",
+    "    Iterating yields ``([input_ids, attention_mask], last_ind)``:\n",
+    "    int64 ids of shape (batch, pad_max), an all-ones int64 mask of shape\n",
+    "    (batch, pad_max + 1), and the index of each sample's last non-padding token.\n",
+    "    \"\"\"\n",
+    "\n",
+    "    def __init__(self, pad_max=196, batch_size=1, sub_folder='validation'):\n",
+    "        \"\"\"Load and tokenize the split named by ``sub_folder`` and wrap it in a torch DataLoader.\n",
+    "\n",
+    "        Args:\n",
+    "            pad_max: token length every sample is padded or truncated to.\n",
+    "            batch_size: number of samples per batch.\n",
+    "            sub_folder: dataset split passed to ``load_dataset``.\n",
+    "        \"\"\"\n",
+    "        self.pad_max = pad_max\n",
+    "        self.batch_size = batch_size\n",
+    "        dataset = load_dataset('lambada', split=sub_folder)\n",
+    "        dataset = dataset.map(tokenize_function, batched=True)\n",
+    "        dataset.set_format(type=\"torch\", columns=[\"input_ids\", \"attention_mask\"])\n",
+    "        self.dataloader = DataLoader(\n",
+    "            dataset,\n",
+    "            batch_size=self.batch_size,\n",
+    "            shuffle=False,\n",
+    "            collate_fn=self.collate_batch,\n",
+    "        )\n",
+    "\n",
+    "    def collate_batch(self, batch):\n",
+    "        \"\"\"Pad or truncate each sample to ``pad_max`` tokens and stack the results.\n",
+    "\n",
+    "        Returns:\n",
+    "            ``((input_ids, attention_mask), last_ind)`` as torch tensors.\n",
+    "        \"\"\"\n",
+    "        input_ids_padded = []\n",
+    "        attention_mask_padded = []\n",
+    "        last_ind = []\n",
+    "        for text in batch:\n",
+    "            input_ids = text[\"input_ids\"]\n",
+    "            if input_ids.shape[0] > self.pad_max:\n",
+    "                # Over-long samples keep only their first pad_max - 1 tokens\n",
+    "                # (and therefore still receive one pad token below).\n",
+    "                input_ids = input_ids[0:int(self.pad_max - 1)]\n",
+    "            pad_len = self.pad_max - input_ids.shape[0]\n",
+    "            # Index of the last real token, recorded before padding is appended.\n",
+    "            last_ind.append(input_ids.shape[0] - 1)\n",
+    "            # Right-pad with token id 1 up to pad_max.\n",
+    "            input_ids = pad(input_ids, (0, pad_len), value=1)\n",
+    "            input_ids_padded.append(input_ids)\n",
+    "            # The mask is all ones and one entry longer than the padded ids;\n",
+    "            # presumably the extra entry covers the length-1 dummy past key/value\n",
+    "            # fed at inference time -- TODO confirm against the exported model.\n",
+    "            attention_mask = torch.ones(input_ids.shape[0] + 1)\n",
+    "            attention_mask_padded.append(attention_mask)\n",
+    "        return (torch.vstack(input_ids_padded), torch.vstack(attention_mask_padded)), torch.tensor(last_ind)\n",
+    "\n",
+    "    def __iter__(self):\n",
+    "        \"\"\"Yield ``([input_ids, attention_mask], last_ind)`` with every array converted to numpy.\"\"\"\n",
+    "        # The for loop ends on its own when the underlying loader is exhausted, so no\n",
+    "        # StopIteration handler is needed; catching it here could only hide a stray\n",
+    "        # StopIteration and silently cut the evaluation short.\n",
+    "        for (input_ids, attention_mask), last_ind in self.dataloader:\n",
+    "            data = [\n",
+    "                input_ids.detach().cpu().numpy().astype('int64'),\n",
+    "                attention_mask.detach().cpu().numpy().astype('int64'),\n",
+    "            ]\n",
+    "            yield data, last_ind.detach().cpu().numpy()\n",
+    "\n",
+    "# create session\n",
+    "options = ort.SessionOptions()\n",
+    "options.graph_optimization_level = ort.GraphOptimizationLevel.ORT_ENABLE_ALL\n",
+    "session = ort.InferenceSession('/path/to/model.onnx', options, providers=ort.get_available_providers())\n",
+    "total, hit = 0, 0\n",
+    "\n",
+    "dataloader = Dataloader(pad_max=pad_max, batch_size=batch_size)\n",
+    "\n",
+    "# Dummy past key/value layout fed for every layer: (batch, heads, past_len, head_dim).\n",
+    "# The values are the literals this cell was written with -- confirm them against the exported model.\n",
+    "num_layers, num_heads, past_len, head_dim = 28, 16, 1, 256\n",
+    "\n",
+    "# inference\n",
+    "for batch, last_ind in dataloader:\n",
+    "    # The final batch can hold fewer than batch_size samples, so every input is\n",
+    "    # sized from the batch itself; otherwise input_ids and the past tensors disagree\n",
+    "    # on the batch dimension.\n",
+    "    cur_batch = len(last_ind)\n",
+    "    # Label = the last real (non-padding) token of each sample.\n",
+    "    label = torch.from_numpy(batch[0][torch.arange(cur_batch), last_ind])\n",
+    "    # Number of padded positions after each sample's last real token.\n",
+    "    pad_len = pad_max - last_ind - 1\n",
+    "    ort_inputs = {\n",
+    "        'input_ids': batch[0],\n",
+    "        'attention_mask': batch[1]\n",
+    "    }\n",
+    "    past_shape = (cur_batch, num_heads, past_len, head_dim)\n",
+    "    for i in range(num_layers):\n",
+    "        ort_inputs[\"past_key_values.{}.key\".format(i)] = np.zeros(past_shape, dtype='float32')\n",
+    "        ort_inputs[\"past_key_values.{}.value\".format(i)] = np.zeros(past_shape, dtype='float32')\n",
+    "\n",
+    "    predictions = session.run(None, ort_inputs)\n",
+    "    outputs = torch.from_numpy(predictions[0])\n",
+    "    # Counting back from the end, -2 - pad_len lands on the position just before the\n",
+    "    # last real token (assuming the logits span pad_max positions); its logits are\n",
+    "    # the model's prediction for that last token.\n",
+    "    last_token_logits = outputs[torch.arange(cur_batch), -2 - pad_len, :]\n",
+    "    pred = last_token_logits.argmax(dim=-1)\n",
+    "    total += len(label)\n",
+    "    hit += (pred == label).sum().item()\n",
+    "\n",
+    "# Guard against an empty dataloader so the cell reports 0.0 instead of raising.\n",
+    "acc = hit / total if total else 0.0\n",
+    "print('acc: ', acc)"
+   ]
+  },
   {
    "attachments": {},
    "cell_type": "markdown",