Unverified Commit bbdba0a7 authored by Funtowicz Morgan's avatar Funtowicz Morgan Committed by GitHub
Browse files

Update ONNX notebook to include section on quantization. (#6831)



* Update ONNX notebook to include section on quantization.
Signed-off-by: default avatarMorgan Funtowicz <morgan@huggingface.co>

* Addressing ONNX team comments
parent a59bcefb
......@@ -46,30 +46,220 @@
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"scrolled": false
},
"outputs": [],
"execution_count": 22,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Collecting git+https://github.com/huggingface/transformers\n",
" Cloning https://github.com/huggingface/transformers to /tmp/pip-req-build-9rvbp9p8\n",
" Running command git clone -q https://github.com/huggingface/transformers /tmp/pip-req-build-9rvbp9p8\n",
"Requirement already satisfied, skipping upgrade: numpy in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers==3.0.2) (1.18.1)\n",
"Requirement already satisfied, skipping upgrade: tokenizers==0.8.1.rc2 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers==3.0.2) (0.8.1rc2)\n",
"Requirement already satisfied, skipping upgrade: packaging in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers==3.0.2) (20.4)\n",
"Requirement already satisfied, skipping upgrade: filelock in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers==3.0.2) (3.0.12)\n",
"Requirement already satisfied, skipping upgrade: requests in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers==3.0.2) (2.23.0)\n",
"Requirement already satisfied, skipping upgrade: tqdm>=4.27 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers==3.0.2) (4.46.1)\n",
"Requirement already satisfied, skipping upgrade: regex!=2019.12.17 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers==3.0.2) (2020.6.8)\n",
"Requirement already satisfied, skipping upgrade: sentencepiece!=0.1.92 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers==3.0.2) (0.1.91)\n",
"Requirement already satisfied, skipping upgrade: sacremoses in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers==3.0.2) (0.0.43)\n",
"Requirement already satisfied, skipping upgrade: pyparsing>=2.0.2 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from packaging->transformers==3.0.2) (2.4.7)\n",
"Requirement already satisfied, skipping upgrade: six in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from packaging->transformers==3.0.2) (1.15.0)\n",
"Requirement already satisfied, skipping upgrade: chardet<4,>=3.0.2 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from requests->transformers==3.0.2) (3.0.4)\n",
"Requirement already satisfied, skipping upgrade: idna<3,>=2.5 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from requests->transformers==3.0.2) (2.9)\n",
"Requirement already satisfied, skipping upgrade: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from requests->transformers==3.0.2) (1.25.9)\n",
"Requirement already satisfied, skipping upgrade: certifi>=2017.4.17 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from requests->transformers==3.0.2) (2020.6.20)\n",
"Requirement already satisfied, skipping upgrade: click in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from sacremoses->transformers==3.0.2) (7.1.2)\n",
"Requirement already satisfied, skipping upgrade: joblib in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from sacremoses->transformers==3.0.2) (0.15.1)\n",
"Building wheels for collected packages: transformers\n",
" Building wheel for transformers (setup.py) ... \u001b[?25ldone\n",
"\u001b[?25h Created wheel for transformers: filename=transformers-3.0.2-py3-none-any.whl size=883063 sha256=5f2caef76450921ae2e5b10abbbaab436e9c87c83486114fa08d305e4396d4cd\n",
" Stored in directory: /tmp/pip-ephem-wheel-cache-kftypcjz/wheels/42/68/45/c63edff61c292f2dfd4df4ef6522dcbecc603e7af82813c1d7\n",
"Successfully built transformers\n",
"Installing collected packages: transformers\n",
" Attempting uninstall: transformers\n",
" Found existing installation: transformers 3.0.2\n",
" Uninstalling transformers-3.0.2:\n",
" Successfully uninstalled transformers-3.0.2\n",
"Successfully installed transformers-3.0.2\n",
"Looking in links: https://download.pytorch.org/whl/torch_stable.html\n",
"Requirement already up-to-date: torch==1.6.0+cpu in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (1.6.0+cpu)\n",
"Requirement already up-to-date: torchvision==0.7.0+cpu in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (0.7.0+cpu)\n",
"Requirement already satisfied, skipping upgrade: numpy in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from torch==1.6.0+cpu) (1.18.1)\n",
"Requirement already satisfied, skipping upgrade: future in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from torch==1.6.0+cpu) (0.18.2)\n",
"Requirement already satisfied, skipping upgrade: pillow>=4.1.1 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from torchvision==0.7.0+cpu) (7.2.0)\n",
"Requirement already up-to-date: onnxruntime==1.4.0 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (1.4.0)\n",
"Requirement already satisfied, skipping upgrade: protobuf in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from onnxruntime==1.4.0) (3.12.2)\n",
"Requirement already satisfied, skipping upgrade: numpy>=1.16.6 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from onnxruntime==1.4.0) (1.18.1)\n",
"Requirement already satisfied, skipping upgrade: setuptools in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from protobuf->onnxruntime==1.4.0) (47.1.1.post20200604)\n",
"Requirement already satisfied, skipping upgrade: six>=1.9 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from protobuf->onnxruntime==1.4.0) (1.15.0)\n",
"Looking in indexes: https://test.pypi.org/simple/\n",
"Requirement already satisfied: ort-nightly in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (1.4.0.dev202008262)\n",
"Requirement already satisfied: protobuf in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from ort-nightly) (3.12.2)\n",
"Requirement already satisfied: numpy>=1.16.6 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from ort-nightly) (1.18.1)\n",
"Requirement already satisfied: setuptools in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from protobuf->ort-nightly) (47.1.1.post20200604)\n",
"Requirement already satisfied: six>=1.9 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from protobuf->ort-nightly) (1.15.0)\n",
"Requirement already up-to-date: onnxruntime-tools in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (1.4.2)\n",
"Requirement already satisfied, skipping upgrade: numpy in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from onnxruntime-tools) (1.18.1)\n",
"Requirement already satisfied, skipping upgrade: coloredlogs in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from onnxruntime-tools) (14.0)\n",
"Requirement already satisfied, skipping upgrade: py3nvml in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from onnxruntime-tools) (0.2.6)\n",
"Requirement already satisfied, skipping upgrade: psutil in /home/mfuntowicz/.local/lib/python3.8/site-packages/psutil-5.7.0-py3.8-linux-x86_64.egg (from onnxruntime-tools) (5.7.0)\n",
"Requirement already satisfied, skipping upgrade: packaging in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from onnxruntime-tools) (20.4)\n",
"Requirement already satisfied, skipping upgrade: py-cpuinfo in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from onnxruntime-tools) (5.0.0)\n",
"Requirement already satisfied, skipping upgrade: onnx in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from onnxruntime-tools) (1.7.0)\n",
"Requirement already satisfied, skipping upgrade: humanfriendly>=7.1 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from coloredlogs->onnxruntime-tools) (8.2)\n",
"Requirement already satisfied, skipping upgrade: xmltodict in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from py3nvml->onnxruntime-tools) (0.12.0)\n",
"Requirement already satisfied, skipping upgrade: six in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from packaging->onnxruntime-tools) (1.15.0)\n",
"Requirement already satisfied, skipping upgrade: pyparsing>=2.0.2 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from packaging->onnxruntime-tools) (2.4.7)\n",
"Requirement already satisfied, skipping upgrade: typing-extensions>=3.6.2.1 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from onnx->onnxruntime-tools) (3.7.4.2)\n",
"Requirement already satisfied, skipping upgrade: protobuf in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from onnx->onnxruntime-tools) (3.12.2)\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied, skipping upgrade: setuptools in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from protobuf->onnx->onnxruntime-tools) (47.1.1.post20200604)\r\n"
]
}
],
"source": [
"!pip install --upgrade git+https://github.com/huggingface/transformers"
"import sys\n",
"!{sys.executable} -m pip install --upgrade git+https://github.com/huggingface/transformers\n",
"!{sys.executable} -m pip install --upgrade torch==1.6.0+cpu torchvision==0.7.0+cpu -f https://download.pytorch.org/whl/torch_stable.html\n",
"!{sys.executable} -m pip install --upgrade onnxruntime==1.4.0\n",
"!{sys.executable} -m pip install -i https://test.pypi.org/simple/ ort-nightly\n",
"!{sys.executable} -m pip install --upgrade onnxruntime-tools"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 23,
"metadata": {
"colab": {},
"colab_type": "code",
"id": "PwAaOchY4N2-"
},
"outputs": [],
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json from cache at /home/mfuntowicz/.cache/torch/transformers/b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.9da767be51e1327499df13488672789394e2ca38b877837e52618a67d7002391\n",
"Model config BertConfig {\n",
" \"architectures\": [\n",
" \"BertForMaskedLM\"\n",
" ],\n",
" \"attention_probs_dropout_prob\": 0.1,\n",
" \"gradient_checkpointing\": false,\n",
" \"hidden_act\": \"gelu\",\n",
" \"hidden_dropout_prob\": 0.1,\n",
" \"hidden_size\": 768,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 3072,\n",
" \"layer_norm_eps\": 1e-12,\n",
" \"max_position_embeddings\": 512,\n",
" \"model_type\": \"bert\",\n",
" \"num_attention_heads\": 12,\n",
" \"num_hidden_layers\": 12,\n",
" \"pad_token_id\": 0,\n",
" \"type_vocab_size\": 2,\n",
" \"vocab_size\": 28996\n",
"}\n",
"\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"ONNX opset version set to: 11\n",
"Loading pipeline (model: bert-base-cased, tokenizer: bert-base-cased)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /home/mfuntowicz/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1\n",
"loading model card file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-modelcard.json from cache at /home/mfuntowicz/.cache/torch/transformers/72b46f187c40a666d54782e06684c2870e109350a3efe9aa5027253dec2e671d.455d944f3d1572ab55ed579849f751cf37f303e3388980a42d94f7cd57a4e331\n",
"Model card: {\n",
" \"caveats_and_recommendations\": {},\n",
" \"ethical_considerations\": {},\n",
" \"evaluation_data\": {},\n",
" \"factors\": {},\n",
" \"intended_use\": {},\n",
" \"metrics\": {},\n",
" \"model_details\": {},\n",
" \"quantitative_analyses\": {},\n",
" \"training_data\": {}\n",
"}\n",
"\n",
"loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json from cache at /home/mfuntowicz/.cache/torch/transformers/b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.9da767be51e1327499df13488672789394e2ca38b877837e52618a67d7002391\n",
"Model config BertConfig {\n",
" \"architectures\": [\n",
" \"BertForMaskedLM\"\n",
" ],\n",
" \"attention_probs_dropout_prob\": 0.1,\n",
" \"gradient_checkpointing\": false,\n",
" \"hidden_act\": \"gelu\",\n",
" \"hidden_dropout_prob\": 0.1,\n",
" \"hidden_size\": 768,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 3072,\n",
" \"layer_norm_eps\": 1e-12,\n",
" \"max_position_embeddings\": 512,\n",
" \"model_type\": \"bert\",\n",
" \"num_attention_heads\": 12,\n",
" \"num_hidden_layers\": 12,\n",
" \"pad_token_id\": 0,\n",
" \"type_vocab_size\": 2,\n",
" \"vocab_size\": 28996\n",
"}\n",
"\n",
"loading weights file https://cdn.huggingface.co/bert-base-cased-pytorch_model.bin from cache at /home/mfuntowicz/.cache/torch/transformers/d8f11f061e407be64c4d5d7867ee61d1465263e24085cfa26abf183fdc830569.3fadbea36527ae472139fe84cddaa65454d7429f12d543d80bfc3ad70de55ac2\n",
"All model checkpoint weights were used when initializing BertModel.\n",
"\n",
"All the weights of BertModel were initialized from the model checkpoint at bert-base-cased.\n",
"If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.\n",
"/home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages/transformers/modeling_bert.py:201: TracerWarning: Converting a tensor to a Python index might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
" position_ids = self.position_ids[:, :seq_length]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"Creating folder onnx\n",
"Using framework PyTorch: 1.6.0\n",
"Found input input_ids with shape: {0: 'batch', 1: 'sequence'}\n",
"Found input token_type_ids with shape: {0: 'batch', 1: 'sequence'}\n",
"Found input attention_mask with shape: {0: 'batch', 1: 'sequence'}\n",
"Found output output_0 with shape: {0: 'batch', 1: 'sequence'}\n",
"Found output output_1 with shape: {0: 'batch'}\n",
"Ensuring inputs are in correct order\n",
"position_ids is not present in the generated input list.\n",
"Generated inputs order: ['input_ids', 'attention_mask', 'token_type_ids']\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages/transformers/modeling_utils.py:1570: TracerWarning: Converting a tensor to a Python boolean might cause the trace to be incorrect. We can't record the data flow of Python values, so this value will be treated as a constant in the future. This means that the trace might not generalize to other inputs!\n",
" input_tensor.shape == tensor_shape for input_tensor in input_tensors\n"
]
}
],
"source": [
"!rm -rf onnx/\n",
"from pathlib import Path\n",
"from transformers.convert_graph_to_onnx import convert\n",
"\n",
"# Handles all the above steps for you\n",
"convert(framework=\"pt\", model=\"bert-base-cased\", output=\"onnx/bert-base-cased.onnx\", opset=11)\n",
"convert(framework=\"pt\", model=\"bert-base-cased\", output=Path(\"onnx/bert-base-cased.onnx\"), opset=11)\n",
"\n",
"# Tensorflow \n",
"# convert(framework=\"tf\", model=\"bert-base-cased\", output=\"onnx/bert-base-cased.onnx\", opset=11)"
......@@ -95,13 +285,49 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 24,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Requirement already satisfied: transformers in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (3.0.2)\n",
"Requirement already satisfied: onnxruntime-gpu in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (1.3.0)\n",
"Requirement already satisfied: onnx in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (1.7.0)\n",
"Requirement already satisfied: psutil in /home/mfuntowicz/.local/lib/python3.8/site-packages/psutil-5.7.0-py3.8-linux-x86_64.egg (5.7.0)\n",
"Requirement already satisfied: matplotlib in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (3.3.1)\n",
"Requirement already satisfied: tqdm>=4.27 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers) (4.46.1)\n",
"Requirement already satisfied: numpy in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers) (1.18.1)\n",
"Requirement already satisfied: sacremoses in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers) (0.0.43)\n",
"Requirement already satisfied: regex!=2019.12.17 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers) (2020.6.8)\n",
"Requirement already satisfied: filelock in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers) (3.0.12)\n",
"Requirement already satisfied: sentencepiece!=0.1.92 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers) (0.1.91)\n",
"Requirement already satisfied: requests in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers) (2.23.0)\n",
"Requirement already satisfied: packaging in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers) (20.4)\n",
"Requirement already satisfied: tokenizers==0.8.1.rc2 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from transformers) (0.8.1rc2)\n",
"Requirement already satisfied: protobuf in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from onnxruntime-gpu) (3.12.2)\n",
"Requirement already satisfied: six in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from onnx) (1.15.0)\n",
"Requirement already satisfied: typing-extensions>=3.6.2.1 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from onnx) (3.7.4.2)\n",
"Requirement already satisfied: pyparsing!=2.0.4,!=2.1.2,!=2.1.6,>=2.0.3 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from matplotlib) (2.4.7)\n",
"Requirement already satisfied: kiwisolver>=1.0.1 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from matplotlib) (1.2.0)\n",
"Requirement already satisfied: python-dateutil>=2.1 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from matplotlib) (2.8.1)\n",
"Requirement already satisfied: cycler>=0.10 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from matplotlib) (0.10.0)\n",
"Requirement already satisfied: pillow>=6.2.0 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from matplotlib) (7.2.0)\n",
"Requirement already satisfied: certifi>=2020.06.20 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from matplotlib) (2020.6.20)\n",
"Requirement already satisfied: click in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from sacremoses->transformers) (7.1.2)\n",
"Requirement already satisfied: joblib in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from sacremoses->transformers) (0.15.1)\n",
"Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from requests->transformers) (1.25.9)\n",
"Requirement already satisfied: chardet<4,>=3.0.2 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from requests->transformers) (3.0.4)\n",
"Requirement already satisfied: idna<3,>=2.5 in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from requests->transformers) (2.9)\n",
"Requirement already satisfied: setuptools in /home/mfuntowicz/miniconda3/envs/pytorch/lib/python3.8/site-packages (from protobuf->onnxruntime-gpu) (47.1.1.post20200604)\n"
]
}
],
"source": [
"!pip install transformers onnxruntime-gpu onnx psutil matplotlib"
]
......@@ -132,7 +358,12 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 25,
"metadata": {
"pycharm": {
"name": "#%%\n"
}
},
"outputs": [],
"source": [
"# # An optional step unless\n",
......@@ -149,18 +380,29 @@
"\n",
"# # optimizations for bert-base-cased model converted from Tensorflow(tf.keras)\n",
"# optimized_model = optimizer.optimize_model(\"bert-base-cased.onnx\", model_type='bert_keras', num_heads=12, hidden_size=768)\n",
"# optimized_model.save_model_to_file(\"bert-base-cased.onnx\")\n"
],
"metadata": {
"collapsed": false,
"pycharm": {
"name": "#%%\n"
}
}
"# optimized_model.save_model_to_file(\"bert-base-cased.onnx\")\n",
"\n",
"\n",
"# optimize transformer-based models with onnxruntime-tools\n",
"from onnxruntime_tools import optimizer\n",
"from onnxruntime_tools.transformers.onnx_model_bert import BertOptimizationOptions\n",
"\n",
"# disable embedding layer norm optimization for better model size reduction\n",
"opt_options = BertOptimizationOptions('bert')\n",
"opt_options.enable_embed_layer_norm = False\n",
"\n",
"opt_model = optimizer.optimize_model(\n",
" 'onnx/bert-base-cased.onnx',\n",
" 'bert', \n",
" num_heads=12,\n",
" hidden_size=768,\n",
" optimization_options=opt_options)\n",
"opt_model.save_model_to_file('bert.opt.onnx')\n"
]
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": 26,
"metadata": {
"pycharm": {
"name": "#%%\n"
......@@ -176,12 +418,12 @@
"environ[\"OMP_NUM_THREADS\"] = str(cpu_count(logical=True))\n",
"environ[\"OMP_WAIT_POLICY\"] = 'ACTIVE'\n",
"\n",
"from onnxruntime import InferenceSession, SessionOptions, get_all_providers"
"from onnxruntime import GraphOptimizationLevel, InferenceSession, SessionOptions, get_all_providers"
]
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": 27,
"metadata": {
"colab": {},
"colab_type": "code",
......@@ -189,6 +431,11 @@
},
"outputs": [],
"source": [
"from contextlib import contextmanager\n",
"from dataclasses import dataclass\n",
"from time import perf_counter, time\n",
"from typing import List, Optional\n",
"\n",
"from tqdm import trange\n",
"\n",
"def create_model_for_provider(model_path: str, provider: str) -> InferenceSession: \n",
" \n",
" assert provider in get_all_providers(), f\"provider {provider} not found, {get_all_providers()}\"\n",
......@@ -196,9 +443,28 @@
" # Few properties that might have an impact on performances (provided by MS)\n",
" options = SessionOptions()\n",
" options.intra_op_num_threads = 1\n",
" options.graph_optimization_level = GraphOptimizationLevel.ORT_ENABLE_ALL\n",
"\n",
" # Load the model as a graph and prepare the CPU backend \n",
" return InferenceSession(model_path, options, providers=[provider])"
" session = InferenceSession(model_path, options, providers=[provider])\n",
" session.disable_fallback()\n",
" \n",
" return session\n",
"\n",
"\n",
"@contextmanager\n",
"def track_infer_time(buffer: List[float]):\n",
"    \"\"\"Time the body of a ``with`` block and append the elapsed seconds to ``buffer``.\n",
"\n",
"    Nothing is appended when the body raises, so failed runs never end up in the timings.\n",
"    \"\"\"\n",
"    # perf_counter() is monotonic and high resolution; the wall-clock time() can jump\n",
"    # and is too coarse on some platforms for inferences lasting a few milliseconds.\n",
"    start = perf_counter()\n",
"    yield\n",
"    end = perf_counter()\n",
"\n",
"    buffer.append(end - start)\n",
"\n",
"\n",
"@dataclass\n",
"class OnnxInferenceResult:\n",
"    \"\"\"Benchmark outcome for a single backend.\"\"\"\n",
"\n",
"    # Elapsed time, in seconds, of every timed inference call (filled by track_infer_time)\n",
"    model_inference_time: List[float]\n",
"    # Optimized model path reported by the ONNX Runtime session options;\n",
"    # None for backends that do not produce one (e.g. the PyTorch runs)\n",
"    optimized_model_path: Optional[str] = None"
]
},
{
......@@ -222,7 +488,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": 28,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
......@@ -233,6 +499,13 @@
"outputId": "f3aba5dc-15c0-4f82-b38c-1bbae1bf112e"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"loading file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-vocab.txt from cache at /home/mfuntowicz/.cache/torch/transformers/5e8a2b4893d13790ed4150ca1906be5f7a03d6c4ddf62296c383f6db42814db2.e13dbb970cb325137104fb2e5f36fe865f27746c6b526f6352861b1980eb80b1\n"
]
},
{
"name": "stdout",
"output_type": "stream",
......@@ -259,6 +532,101 @@
"print(f\"Sequence output: {sequence.shape}, Pooled output: {pooled.shape}\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Benchmarking PyTorch model\n",
"\n",
"_Note: PyTorch model benchmark is run on CPU_"
]
},
{
"cell_type": "code",
"execution_count": 29,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
},
"colab_type": "code",
"id": "PS_49goe197g",
"outputId": "0ef0f70c-f5a7-46a0-949a-1a93f231d193"
},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"loading configuration file https://s3.amazonaws.com/models.huggingface.co/bert/bert-base-cased-config.json from cache at /home/mfuntowicz/.cache/torch/transformers/b945b69218e98b3e2c95acf911789741307dec43c698d35fad11c1ae28bda352.9da767be51e1327499df13488672789394e2ca38b877837e52618a67d7002391\n",
"Model config BertConfig {\n",
" \"architectures\": [\n",
" \"BertForMaskedLM\"\n",
" ],\n",
" \"attention_probs_dropout_prob\": 0.1,\n",
" \"gradient_checkpointing\": false,\n",
" \"hidden_act\": \"gelu\",\n",
" \"hidden_dropout_prob\": 0.1,\n",
" \"hidden_size\": 768,\n",
" \"initializer_range\": 0.02,\n",
" \"intermediate_size\": 3072,\n",
" \"layer_norm_eps\": 1e-12,\n",
" \"max_position_embeddings\": 512,\n",
" \"model_type\": \"bert\",\n",
" \"num_attention_heads\": 12,\n",
" \"num_hidden_layers\": 12,\n",
" \"pad_token_id\": 0,\n",
" \"type_vocab_size\": 2,\n",
" \"vocab_size\": 28996\n",
"}\n",
"\n",
"loading weights file https://cdn.huggingface.co/bert-base-cased-pytorch_model.bin from cache at /home/mfuntowicz/.cache/torch/transformers/d8f11f061e407be64c4d5d7867ee61d1465263e24085cfa26abf183fdc830569.3fadbea36527ae472139fe84cddaa65454d7429f12d543d80bfc3ad70de55ac2\n",
"All model checkpoint weights were used when initializing BertModel.\n",
"\n",
"All the weights of BertModel were initialized from the model checkpoint at bert-base-cased.\n",
"If your task is similar to the task the model of the checkpoint was trained on, you can already use BertModel for predictions without further training.\n",
"Warming up: 100%|██████████| 10/10 [00:00<00:00, 39.30it/s]\n",
"Tracking inference time on PyTorch: 100%|██████████| 100/100 [00:02<00:00, 41.09it/s]\n"
]
}
],
"source": [
"import torch\n",
"from transformers import BertModel\n",
"\n",
"# Ordered (device, label) pairs: a list keeps the benchmark and plot order deterministic\n",
"PROVIDERS = [\n",
"    (\"cpu\", \"PyTorch CPU\"),\n",
"#  Uncomment this line to enable GPU benchmarking\n",
"#    (\"cuda:0\", \"PyTorch GPU\")\n",
"]\n",
"\n",
"# label -> OnnxInferenceResult, shared with the ONNX benchmarks and the plots below\n",
"results = {}\n",
"\n",
"for device, label in PROVIDERS:\n",
"\n",
"    # Move inputs to the correct device\n",
"    model_inputs_on_device = {\n",
"        arg_name: tensor.to(device)\n",
"        for arg_name, tensor in model_inputs.items()\n",
"    }\n",
"\n",
"    # Load the PyTorch model on the target device; eval() makes the dropout-free\n",
"    # inference mode explicit\n",
"    model_pt = BertModel.from_pretrained(\"bert-base-cased\").to(device).eval()\n",
"\n",
"    # Inference only: without no_grad() autograd bookkeeping is included in the\n",
"    # measured time, which unfairly penalises PyTorch against ONNX Runtime\n",
"    with torch.no_grad():\n",
"        for _ in trange(10, desc=\"Warming up\"):\n",
"            model_pt(**model_inputs_on_device)\n",
"\n",
"        # Compute\n",
"        time_buffer = []\n",
"        for _ in trange(100, desc=\"Tracking inference time on PyTorch\"):\n",
"            with track_infer_time(time_buffer):\n",
"                model_pt(**model_inputs_on_device)\n",
"                if device.startswith(\"cuda\"):\n",
"                    # CUDA kernels are launched asynchronously: wait for them so the\n",
"                    # timer covers the actual computation, not just the launch\n",
"                    torch.cuda.synchronize()\n",
"\n",
"    # Store the result (PyTorch has no optimized ONNX model path)\n",
"    results[label] = OnnxInferenceResult(\n",
"        time_buffer,\n",
"        None\n",
"    )"
]
},
{
"cell_type": "markdown",
"metadata": {
......@@ -266,14 +634,14 @@
"id": "Kda1e7TkEqNR"
},
"source": [
"## Benchmarking different CPU & GPU providers\n",
"## Benchmarking PyTorch & ONNX on CPU\n",
"\n",
"_**Disclaimer: results may vary depending on the actual hardware used to run the model**_"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": 30,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
......@@ -284,126 +652,191 @@
"outputId": "bfd779a1-0bc7-42db-8587-e52a485ec5e3"
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Doing GPU inference on TITAN RTX\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Warming up: 100%|██████████| 10/10 [00:00<00:00, 333.82it/s]\n",
"Tracking inference time on CUDAExecutionProvider: 100%|██████████| 100/100 [00:00<00:00, 521.76it/s]\n",
"Warming up: 100%|██████████| 10/10 [00:00<00:00, 62.95it/s]\n",
"Tracking inference time on CPUExecutionProvider: 100%|██████████| 100/100 [00:01<00:00, 68.65it/s]\n",
"Warming up: 100%|██████████| 10/10 [00:00<00:00, 69.72it/s]\n",
"Tracking inference time on TensorrtExecutionProvider: 100%|██████████| 100/100 [00:01<00:00, 71.31it/s]\n",
"Warming up: 100%|██████████| 10/10 [00:00<00:00, 66.28it/s]\n",
"Tracking inference time on DnnlExecutionProvider: 100%|██████████| 100/100 [00:01<00:00, 72.03it/s]\n"
"Tracking inference time on CPUExecutionProvider: 100%|██████████| 100/100 [00:01<00:00, 63.62it/s]\n"
]
}
],
"source": [
"from torch.cuda import get_device_name\n",
"from contextlib import contextmanager\n",
"from dataclasses import dataclass\n",
"from time import time\n",
"from tqdm import trange\n",
"PROVIDERS = {\n",
" (\"CPUExecutionProvider\", \"ONNX CPU\"),\n",
"# Uncomment this line to enable GPU benchmarking\n",
"# (\"CUDAExecutionProvider\", \"ONNX GPU\")\n",
"}\n",
"\n",
"print(f\"Doing GPU inference on {get_device_name(0)}\", flush=True)\n",
"\n",
"@contextmanager\n",
"def track_infer_time(buffer: [int]):\n",
" start = time()\n",
" yield\n",
" end = time()\n",
"for provider, label in PROVIDERS:\n",
" # Create the model with the specified provider\n",
" model = create_model_for_provider(\"onnx/bert-base-cased.onnx\", provider)\n",
"\n",
" buffer.append(end - start)\n",
" # Keep track of the inference time\n",
" time_buffer = []\n",
"\n",
" # Warm up the model\n",
" model.run(None, inputs_onnx)\n",
"\n",
"@dataclass\n",
"class OnnxInferenceResult:\n",
" model_inference_time: [int] \n",
" optimized_model_path: str\n",
" # Compute \n",
" for _ in trange(100, desc=f\"Tracking inference time on {provider}\"):\n",
" with track_infer_time(time_buffer):\n",
" model.run(None, inputs_onnx)\n",
"\n",
" # Store the result\n",
" results[label] = OnnxInferenceResult(\n",
" time_buffer,\n",
" model.get_session_options().optimized_model_filepath\n",
" )"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 1600x1200 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%matplotlib inline\n",
"\n",
"import matplotlib\n",
"import matplotlib.pyplot as plt\n",
"import numpy as np\n",
"import os\n",
"\n",
"\n",
"# All the providers we'll be using in the test\n",
"results = {}\n",
"providers = [\n",
" \"CUDAExecutionProvider\",\n",
" \"CPUExecutionProvider\", \n",
" \"TensorrtExecutionProvider\",\n",
" \"DnnlExecutionProvider\", \n",
"]\n",
"# Compute the average inference time and its standard deviation per provider (in ms)\n",
"time_results = {k: np.mean(v.model_inference_time) * 1e3 for k, v in results.items()}\n",
"# One std per bar, in the same order as time_results; a single std over all buffers would\n",
"# mix the spread between providers into every error bar\n",
"time_results_std = [np.std(v.model_inference_time) * 1e3 for v in results.values()]\n",
"\n",
"plt.rcdefaults()\n",
"fig, ax = plt.subplots(figsize=(16, 12))\n",
"ax.set_ylabel(\"Avg Inference time (ms)\")\n",
"ax.set_title(\"Average inference time (ms) for each provider\")\n",
"ax.bar(time_results.keys(), time_results.values(), yerr=time_results_std)\n",
"plt.show()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Quantization support from transformers\n",
"\n",
"# Iterate over all the providers\n",
"for provider in providers:\n",
"Quantization enables the use of integers (_instead of floatting point_) arithmetic to run neural networks models faster. From a high-level point of view, quantization works as mapping the float32 ranges of values as int8 with the less loss in the performances of the model.\n",
"\n",
" # Create the model with the specified provider\n",
" model = create_model_for_provider(\"onnx/bert-base-cased.onnx\", provider)\n",
"Hugging Face provides a conversion tool as part of the transformers repository to easily export quantized models to ONNX Runtime. For more information, please refer to the following: \n",
"\n",
" # Keep track of the inference time\n",
" time_buffer = []\n",
"- [Hugging Face Documentation on ONNX Runtime quantization supports](https://huggingface.co/transformers/master/serialization.html#quantization)\n",
"- [Intel's Explanation of Quantization](https://nervanasystems.github.io/distiller/quantization.html)\n",
"\n",
" # Warm up the model\n",
" for _ in trange(10, desc=\"Warming up\"):\n",
" model.run(None, inputs_onnx)\n",
"With this method, the accuracy of the model remains at the same level than the full-precision model. If you want to see benchmarks on model performances, we recommand reading the [ONNX Runtime notebook](https://github.com/microsoft/onnxruntime/blob/master/onnxruntime/python/tools/quantization/notebooks/Bert-GLUE_OnnxRuntime_quantization.ipynb) on the subject."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Benchmarking PyTorch quantized model"
]
},
{
"cell_type": "code",
"execution_count": 32,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 100/100 [00:01<00:00, 90.15it/s]\n"
]
}
],
"source": [
"import torch \n",
"\n",
" # Compute \n",
" for _ in trange(100, desc=f\"Tracking inference time on {provider}\"):\n",
" with track_infer_time(time_buffer):\n",
" model.run(None, inputs_onnx)\n",
"# Quantize\n",
"model_pt_quantized = torch.quantization.quantize_dynamic(\n",
" model_pt.to(\"cpu\"), {torch.nn.Linear}, dtype=torch.qint8\n",
")\n",
"\n",
" # Store the result\n",
" results[provider] = OnnxInferenceResult(\n",
" time_buffer,\n",
" model.get_session_options().optimized_model_filepath\n",
" )"
"# Warm up \n",
"model_pt_quantized(**model_inputs)\n",
"\n",
"# Benchmark PyTorch quantized model\n",
"time_buffer = []\n",
"for _ in trange(100):\n",
" with track_infer_time(time_buffer):\n",
" model_pt_quantized(**model_inputs)\n",
" \n",
"results[\"PyTorch CPU Quantized\"] = OnnxInferenceResult(\n",
" time_buffer,\n",
" None\n",
")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Benchmarking ONNX quantized model"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 51
},
"colab_type": "code",
"id": "PS_49goe197g",
"outputId": "0ef0f70c-f5a7-46a0-949a-1a93f231d193"
},
"execution_count": 33,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"As of onnxruntime 1.4.0, models larger than 2GB will fail to quantize due to protobuf constraint.\n",
"This limitation will be removed in the next release of onnxruntime.\n",
"Quantized model has been written at bert.onnx: ✔\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"Warming up: 100%|██████████| 10/10 [00:00<00:00, 18.04it/s]\n",
"Tracking inference time on PyTorch: 100%|██████████| 100/100 [00:05<00:00, 18.88it/s]\n"
"Tracking inference time on CPUExecutionProvider with quantized model: 100%|██████████| 100/100 [00:00<00:00, 237.49it/s]\n"
]
}
],
"source": [
"from transformers import BertModel\n",
"from transformers.convert_graph_to_onnx import quantize\n",
"\n",
"# Transformers allow you to easily convert float32 model to quantized int8 with ONNX Runtime\n",
"quantized_model_path = quantize(Path(\"bert.opt.onnx\"))\n",
"\n",
"# Add PyTorch to the providers\n",
"model_pt = BertModel.from_pretrained(\"bert-base-cased\")\n",
"for _ in trange(10, desc=\"Warming up\"):\n",
" model_pt(**model_inputs)\n",
"# Then you just have to load through ONNX runtime as you would normally do\n",
"quantized_model = create_model_for_provider(quantized_model_path.as_posix(), \"CPUExecutionProvider\")\n",
"\n",
"# Compute \n",
"# Warm up the overall model to have a fair comparaison\n",
"outputs = quantized_model.run(None, inputs_onnx)\n",
"\n",
"# Evaluate performances\n",
"time_buffer = []\n",
"for _ in trange(100, desc=f\"Tracking inference time on PyTorch\"):\n",
" with track_infer_time(time_buffer):\n",
" model_pt(**model_inputs)\n",
"for _ in trange(100, desc=f\"Tracking inference time on CPUExecutionProvider with quantized model\"):\n",
" with track_infer_time(time_buffer):\n",
" outputs = quantized_model.run(None, inputs_onnx)\n",
"\n",
"# Store the result\n",
"results[\"Pytorch\"] = OnnxInferenceResult(\n",
"results[\"ONNX CPU Quantized\"] = OnnxInferenceResult(\n",
" time_buffer, \n",
" model.get_session_options().optimized_model_filepath\n",
" quantized_model_path\n",
") "
]
},
......@@ -411,14 +844,12 @@
"cell_type": "markdown",
"metadata": {},
"source": [
"## Show the inference performance of each providers \n",
"\n",
"_Note: PyTorch model benchmark is run on CPU_"
"## Show the inference performance of each providers "
]
},
{
"cell_type": "code",
"execution_count": 24,
"execution_count": 34,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
......@@ -431,7 +862,7 @@
"outputs": [
{
"data": {
"image/png": "\n",
"image/png": "\n",
"text/plain": [
"<Figure size 1600x1200 with 1 Axes>"
]
......@@ -448,6 +879,7 @@
"import numpy as np\n",
"import os\n",
"\n",
"\n",
"# Compute average inference time + std\n",
"time_results = {k: np.mean(v.model_inference_time) * 1e3 for k, v in results.items()}\n",
"time_results_std = np.std([v.model_inference_time for v in results.values()]) * 1000\n",
......@@ -484,7 +916,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.9"
"version": "3.8.0"
}
},
"nbformat": 4,
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment