"nndet/core/boxes/sampler.py" did not exist on "3e94607ae148111559a60b97ecd6c4bf7f46eafe"
Unverified Commit 1b098fd7 authored by Paul Fultz II's avatar Paul Fultz II Committed by GitHub
Browse files

Merge branch 'develop' into type-string-driver

parents 05f2ee1c c0398ded
# Natural Language Processing Inference Examples
- [Python BERT-SQuAD](./python_bert_squad)
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# BERT-SQuAD Inference Example with AMD MIGraphX"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"This tutorial shows how to run the BERT-Squad model on ONNX-Runtime with MIGraphX backend."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Requirements "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip3 install -r requirements_bertsquad.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import json\n",
"import time\n",
"import os.path\n",
"from os import path\n",
"import sys\n",
"\n",
"import tokenizers\n",
"from run_onnx_squad import *\n",
"\n",
"import migraphx"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download BERT ONNX file"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!wget -nc https://github.com/onnx/models/raw/main/text/machine_comprehension/bert-squad/model/bertsquad-10.onnx"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Download uncased file / vocabulary"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!apt-get install unzip\n",
"!wget -q -nc https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip\n",
"!unzip -n uncased_L-12_H-768_A-12.zip"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Input data"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"input_file = 'inputs.json'\n",
"with open(input_file) as json_file:\n",
" test_data = json.load(json_file)\n",
" print(json.dumps(test_data, indent=2))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Configuration for inference"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"max_seq_length = 256\n",
"doc_stride = 128\n",
"max_query_length = 64\n",
"batch_size = 1\n",
"n_best_size = 20\n",
"max_answer_length = 30"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Read vocabulary file and tokenize"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"vocab_file = os.path.join('uncased_L-12_H-768_A-12', 'vocab.txt')\n",
"tokenizer = tokenizers.BertWordPieceTokenizer(vocab_file)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Convert the example to features to input"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# preprocess input\n",
"predict_file = 'inputs.json'\n",
"\n",
"# Use read_squad_examples method from run_onnx_squad to read the input file\n",
"eval_examples = read_squad_examples(input_file=predict_file)\n",
"\n",
"# Use convert_examples_to_features method from run_onnx_squad to get parameters from the input\n",
"input_ids, input_mask, segment_ids, extra_data = convert_examples_to_features(\n",
" eval_examples, tokenizer, max_seq_length, doc_stride, max_query_length)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Compile with MIGraphX for GPU"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = migraphx.parse_onnx(\"bertsquad-10.onnx\")\n",
"model.compile(migraphx.get_target(\"gpu\"))\n",
"#model.print()\n",
"\n",
"model.get_parameter_names()\n",
"model.get_parameter_shapes()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Run the input through the model"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"n = len(input_ids)\n",
"bs = batch_size\n",
"all_results = []\n",
"\n",
"for idx in range(0, n):\n",
" item = eval_examples[idx]\n",
" print(item)\n",
"\n",
" result = model.run({\n",
" \"unique_ids_raw_output___9:0\":\n",
" np.array([item.qas_id], dtype=np.int64),\n",
" \"input_ids:0\":\n",
" input_ids[idx:idx + bs],\n",
" \"input_mask:0\":\n",
" input_mask[idx:idx + bs],\n",
" \"segment_ids:0\":\n",
" segment_ids[idx:idx + bs]\n",
" })\n",
"\n",
" in_batch = result[1].get_shape().lens()[0]\n",
" print(in_batch)\n",
" start_logits = [float(x) for x in result[1].tolist()]\n",
" end_logits = [float(x) for x in result[0].tolist()]\n",
" # print(start_logits)\n",
" # print(end_logits)\n",
" for i in range(0, in_batch):\n",
" unique_id = len(all_results)\n",
" all_results.append(\n",
" RawResult(unique_id=unique_id,\n",
" start_logits=start_logits,\n",
" end_logits=end_logits))"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Get the predictions"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"output_dir = 'predictions'\n",
"os.makedirs(output_dir, exist_ok=True)\n",
"output_prediction_file = os.path.join(output_dir, \"predictions.json\")\n",
"output_nbest_file = os.path.join(output_dir, \"nbest_predictions.json\")\n",
"write_predictions(eval_examples, extra_data, all_results, n_best_size,\n",
" max_answer_length, True, output_prediction_file,\n",
" output_nbest_file)\n",
"\n",
"with open(output_prediction_file) as json_file:\n",
" test_data = json.load(json_file)\n",
" print(json.dumps(test_data, indent=2))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
# BERT-SQuAD Example with MIGraphX
Question answering with BERT using MIGraphX optimizations on ROCm platform.
There are two ways to run the example:
1) Install MIGraphX and Jupyter Notebook on your system, then work through the `BERT-Squad.ipynb` notebook.
2) Install MIGraphX on your system and follow the steps below, executing the Python script `bert-squad-migraphx.py`.
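Either way, the example boils down to the same short MIGraphX Python API sequence. Here is a minimal sketch (parameter names are those of `bertsquad-10.onnx`; the zero-filled arrays merely stand in for real tokenized SQuAD inputs):
```
import numpy as np
import migraphx

# Parse the ONNX file and compile it for the GPU target
model = migraphx.parse_onnx("bertsquad-10.onnx")
model.compile(migraphx.get_target("gpu"))

seq_len = 256  # max_seq_length used throughout this example
result = model.run({
    "unique_ids_raw_output___9:0": np.zeros((1,), dtype=np.int64),
    "input_ids:0": np.zeros((1, seq_len), dtype=np.int64),
    "input_mask:0": np.zeros((1, seq_len), dtype=np.int64),
    "segment_ids:0": np.zeros((1, seq_len), dtype=np.int64)
})
print(result[0].get_shape().lens())  # shape of the first model output
```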
# Steps
1) Install MIGraphX in your environment. Follow the build steps at https://github.com/ROCmSoftwarePlatform/AMDMIGraphX
2) Upgrade pip3 to the latest version:
```
pip3 install --upgrade pip
```
3) Install the packages in the requirements file:
```
pip3 install -r requirements_bertsquad.txt
```
4) Install `unzip` and fetch the uncased file (vocabulary):
```
apt-get install unzip
wget -q https://storage.googleapis.com/bert_models/2018_10_18/uncased_L-12_H-768_A-12.zip
unzip uncased_L-12_H-768_A-12.zip
```
5) Get the BERT ONNX model (`bertsquad-10.onnx`):
```
wget https://github.com/onnx/models/raw/main/text/machine_comprehension/bert-squad/model/bertsquad-10.onnx
```
6) Run the inference. This compiles the model, then runs it on the three questions and the small passage provided in `inputs.json`:
```
python3 bert-squad-migraphx.py
```
## References
This example adapts the following notebook :notebook: to MIGraphX:
https://github.com/onnx/models/blob/master/text/machine_comprehension/bert-squad/BERT-Squad.ipynb
import numpy as np
import json
import os.path
import tokenizers
import collections
from run_onnx_squad import read_squad_examples, write_predictions, convert_examples_to_features
import migraphx
RawResult = collections.namedtuple("RawResult",
["unique_id", "start_logits", "end_logits"])
#######################################
input_file = 'inputs_amd.json'
with open(input_file) as json_file:
test_data = json.load(json_file)
print(json.dumps(test_data, indent=2))
# preprocess input
predict_file = 'inputs_amd.json'
# Use read_squad_examples method from run_onnx_squad to read the input file
eval_examples = read_squad_examples(input_file=predict_file)
max_seq_length = 256
doc_stride = 128
max_query_length = 64
batch_size = 1
n_best_size = 20
max_answer_length = 30
vocab_file = os.path.join('uncased_L-12_H-768_A-12', 'vocab.txt')
tokenizer = tokenizers.BertWordPieceTokenizer(vocab_file)
# Use convert_examples_to_features method from run_onnx_squad to get parameters from the input
input_ids, input_mask, segment_ids, extra_data = convert_examples_to_features(
eval_examples, tokenizer, max_seq_length, doc_stride, max_query_length)
#######################################
# Compile
print("INFO: Parsing and compiling the model...")
model = migraphx.parse_onnx("bertsquad-10.onnx")
model.compile(migraphx.get_target("gpu"))
#model.print()
print(model.get_parameter_names())
print(model.get_parameter_shapes())
n = len(input_ids)
bs = batch_size
all_results = []
for idx in range(0, n):
item = eval_examples[idx]
print(item)
result = model.run({
"unique_ids_raw_output___9:0":
np.array([item.qas_id], dtype=np.int64),
"input_ids:0":
input_ids[idx:idx + bs],
"input_mask:0":
input_mask[idx:idx + bs],
"segment_ids:0":
segment_ids[idx:idx + bs]
})
in_batch = result[1].get_shape().lens()[0]
start_logits = [float(x) for x in result[1].tolist()]
end_logits = [float(x) for x in result[0].tolist()]
for i in range(0, in_batch):
unique_id = len(all_results)
all_results.append(
RawResult(unique_id=unique_id,
start_logits=start_logits,
end_logits=end_logits))
output_dir = 'predictions'
os.makedirs(output_dir, exist_ok=True)
output_prediction_file = os.path.join(output_dir, "predictions.json")
output_nbest_file = os.path.join(output_dir, "nbest_predictions.json")
write_predictions(eval_examples, extra_data, all_results, n_best_size,
max_answer_length, True, output_prediction_file,
output_nbest_file)
with open(output_prediction_file) as json_file:
test_data = json.load(json_file)
print(json.dumps(test_data, indent=2))
{
"version": "1.4",
"data": [
{
"paragraphs": [
{
"context": "In its early years, the new convention center failed to meet attendance and revenue expectations.[12] By 2002, many Silicon Valley businesses were choosing the much larger Moscone Center in San Francisco over the San Jose Convention Center due to the latter's limited space. A ballot measure to finance an expansion via a hotel tax failed to reach the required two-thirds majority to pass. In June 2005, Team San Jose built the South Hall, a $6.77 million, blue and white tent, adding 80,000 square feet (7,400 m2) of exhibit space",
"qas": [
{
"question": "where is the businesses choosing to go?",
"id": "1"
},
{
"question": "how may votes did the ballot measure need?",
"id": "2"
},
{
"question": "By what year many Silicon Valley businesses were choosing the Moscone Center?",
"id": "3"
}
]
}
],
"title": "Conference Center"
}
]
}
{
"data": [
{
"paragraphs": [
{
"context": "ROCm is the first open-source exascale-class platform for accelerated computing that’s also programming-language independent. It brings a philosophy of choice, minimalism and modular software development to GPU computing. You are free to choose or even develop tools and a language run time for your application. ROCm is built for scale, it supports multi-GPU computing and has a rich system run time with the critical features that large-scale application, compiler and language-run-time development requires. Since the ROCm ecosystem is comprised of open technologies: frameworks (Tensorflow / PyTorch), libraries (MIOpen / Blas / RCCL), programming model (HIP), inter-connect (OCD) and up streamed Linux® Kernel support – the platform is continually optimized for performance and extensibility.",
"qas": [
{
"question": "What is ROCm?",
"id": "1"
},
{
"question": "Which frameworks does ROCm support?",
"id": "2"
},
{
"question": "What is ROCm built for?",
"id": "3"
}
]
}
],
"title": "AMD ROCm"
}
]
}
tensorflow==2.7.2
onnxruntime
tokenizers
# Modifications Copyright (C) 2022, Advanced Micro Devices, Inc. All rights reserved
# Copyright 2018 The Google AI Language Team Authors.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
"""
Inference for squad/bert using onnx.
This is going to do the samem as 'python run_squad.py --do_predict=True ...' using a squad/bert model
that was converted to onnx. Lots of code was taken from run_squad.py.
You run it with:
python onnx_squad.py --model $SQUAD_MODEL/squad.onnx \
--vocab_file $BERT_BASE_DIR/uncased_L-12_H-768_A-12/vocab.txt
--predict_file $SQUAD_DATA/dev-v1.1.json \
--bert_config_file $BERT_BASE_DIR/uncased_L-12_H-768_A-12/bert_config.json \
--output /tmp/
"""
import argparse
import collections
import json
import math
import os
import sys
from timeit import default_timer as timer
import numpy as np
import onnxruntime as onnxrt
import six
from tokenizers import BertWordPieceTokenizer
from tokenizers import pre_tokenizers
RawResult = collections.namedtuple(
    "RawResult", ["unique_id", "start_logits", "end_logits"])

Feature = collections.namedtuple("Feature", [
    "unique_id", "tokens", "example_index", "token_to_orig_map",
    "token_is_max_context"
])
class SquadExample(object):
    """A single training/test example for simple sequence classification."""
    def __init__(self,
                 qas_id,
                 question_text,
                 doc_tokens,
                 orig_answer_text=None,
                 start_position=None,
                 end_position=None):
        self.qas_id = qas_id
        self.question_text = question_text
        self.doc_tokens = doc_tokens
        self.orig_answer_text = orig_answer_text
        self.start_position = start_position
        self.end_position = end_position

    def __str__(self):
        return self.__repr__()

    def __repr__(self):
        s = []
        s.append("qas_id: %s" % (self.qas_id))
        s.append("question_text: %s" % (self.question_text))
        s.append("doc_tokens: [%s]" % (" ".join(self.doc_tokens)))
        if self.start_position:
            s.append("start_position: %d" % (self.start_position))
        if self.end_position:
            s.append("end_position: %d" % (self.end_position))
        return ", ".join(s)
def _check_is_max_context(doc_spans, cur_span_index, position):
    """Check if this is the 'max context' doc span for the token."""
    # Because of the sliding window approach taken to scoring documents, a single
    # token can appear in multiple documents. E.g.
    #   Doc: the man went to the store and bought a gallon of milk
    #   Span A: the man went to the
    #   Span B: to the store and bought
    #   Span C: and bought a gallon of
    #   ...
    #
    # Now the word 'bought' will have two scores from spans B and C. We only
    # want to consider the score with "maximum context", which we define as
    # the *minimum* of its left and right context (the *sum* of left and
    # right context will always be the same, of course).
    #
    # In the example the maximum context for 'bought' would be span C since
    # it has 1 left context and 3 right context, while span B has 4 left context
    # and 0 right context.
    best_score = None
    best_span_index = None
    for (span_index, doc_span) in enumerate(doc_spans):
        end = doc_span.start + doc_span.length - 1
        if position < doc_span.start:
            continue
        if position > end:
            continue
        num_left_context = position - doc_span.start
        num_right_context = end - position
        score = min(num_left_context,
                    num_right_context) + 0.01 * doc_span.length
        if best_score is None or score > best_score:
            best_score = score
            best_span_index = span_index
    return cur_span_index == best_span_index
def convert_examples_to_features(examples, tokenizer, max_seq_length,
                                 doc_stride, max_query_length):
    """Loads a data file into a list of `InputBatch`s."""
    res_input_ids = []
    res_input_mask = []
    res_segment_ids = []
    extra = []
    unique_id = 0
    for (example_index, example) in enumerate(examples):
        query_tokens = tokenizer.encode(example.question_text)
        if len(query_tokens) > max_query_length:
            query_tokens = query_tokens[0:max_query_length]
        tok_to_orig_index = []
        orig_to_tok_index = []
        all_doc_tokens = []
        for (i, token) in enumerate(example.doc_tokens):
            orig_to_tok_index.append(len(all_doc_tokens))
            sub_tokens = tokenizer.encode(token, add_special_tokens=False)
            for sub_token in sub_tokens.tokens:
                tok_to_orig_index.append(i)
                all_doc_tokens.append(sub_token)

        # The -3 accounts for [CLS], [SEP] and [SEP]
        max_tokens_for_doc = max_seq_length - len(query_tokens) - 3

        # We can have documents that are longer than the maximum sequence length.
        # To deal with this we do a sliding window approach, where we take chunks
        # of the up to our max length with a stride of `doc_stride`.
        _DocSpan = collections.namedtuple("DocSpan", ["start", "length"])
        doc_spans = []
        start_offset = 0
        while start_offset < len(all_doc_tokens):
            length = len(all_doc_tokens) - start_offset
            if length > max_tokens_for_doc:
                length = max_tokens_for_doc
            doc_spans.append(_DocSpan(start=start_offset, length=length))
            if start_offset + length == len(all_doc_tokens):
                break
            start_offset += min(length, doc_stride)

        for (doc_span_index, doc_span) in enumerate(doc_spans):
            tokens = []
            token_to_orig_map = {}
            token_is_max_context = {}
            segment_ids = []
            tokens.append("[CLS]")
            segment_ids.append(0)
            for token in query_tokens.tokens:
                tokens.append(token)
                segment_ids.append(0)
            tokens.append("[SEP]")
            segment_ids.append(0)
            for i in range(doc_span.length):
                split_token_index = doc_span.start + i
                token_to_orig_map[len(
                    tokens)] = tok_to_orig_index[split_token_index]
                is_max_context = _check_is_max_context(doc_spans,
                                                       doc_span_index,
                                                       split_token_index)
                token_is_max_context[len(tokens)] = is_max_context
                tokens.append(all_doc_tokens[split_token_index])
                segment_ids.append(1)
            tokens.append("[SEP]")
            segment_ids.append(1)
            input_ids = []
            for token in tokens:
                input_ids.append(tokenizer.token_to_id(token))

            # The mask has 1 for real tokens and 0 for padding tokens. Only real
            # tokens are attended to.
            input_mask = [1] * len(input_ids)

            # Zero-pad up to the sequence length.
            while len(input_ids) < max_seq_length:
                input_ids.append(0)
                input_mask.append(0)
                segment_ids.append(0)

            res_input_ids.append(np.array(input_ids, dtype=np.int64))
            res_input_mask.append(np.array(input_mask, dtype=np.int64))
            res_segment_ids.append(np.array(segment_ids, dtype=np.int64))
            feature = Feature(unique_id=unique_id,
                              tokens=tokens,
                              example_index=example_index,
                              token_to_orig_map=token_to_orig_map,
                              token_is_max_context=token_is_max_context)
            extra.append(feature)
            unique_id += 1
    return np.array(res_input_ids), np.array(res_input_mask), np.array(
        res_segment_ids), extra
def read_squad_examples(input_file):
    """Read a SQuAD json file into a list of SquadExample."""
    with open(input_file, "r") as f:
        input_data = json.load(f)["data"]

    def is_whitespace(c):
        if c == " " or c == "\t" or c == "\r" or c == "\n" or ord(c) == 0x202F:
            return True
        return False

    examples = []
    for idx, entry in enumerate(input_data):
        for paragraph in entry["paragraphs"]:
            paragraph_text = paragraph["context"]
            doc_tokens = []
            char_to_word_offset = []
            prev_is_whitespace = True
            for c in paragraph_text:
                if is_whitespace(c):
                    prev_is_whitespace = True
                else:
                    if prev_is_whitespace:
                        doc_tokens.append(c)
                    else:
                        doc_tokens[-1] += c
                    prev_is_whitespace = False
                char_to_word_offset.append(len(doc_tokens) - 1)

            for qa in paragraph["qas"]:
                qas_id = qa["id"]
                question_text = qa["question"]
                start_position = None
                end_position = None
                orig_answer_text = None
                example = SquadExample(qas_id=qas_id,
                                       question_text=question_text,
                                       doc_tokens=doc_tokens,
                                       orig_answer_text=orig_answer_text,
                                       start_position=start_position,
                                       end_position=end_position)
                examples.append(example)
    return examples
def write_predictions(all_examples, all_features, all_results, n_best_size,
                      max_answer_length, do_lower_case, output_prediction_file,
                      output_nbest_file):
    """Write final predictions to the json file."""
    example_index_to_features = collections.defaultdict(list)
    for feature in all_features:
        example_index_to_features[feature.example_index].append(feature)

    unique_id_to_result = {}
    for result in all_results:
        unique_id_to_result[result.unique_id] = result

    _PrelimPrediction = collections.namedtuple(  # pylint: disable=invalid-name
        "PrelimPrediction", [
            "feature_index", "start_index", "end_index", "start_logit",
            "end_logit"
        ])

    all_predictions = collections.OrderedDict()
    all_nbest_json = collections.OrderedDict()
    for (example_index, example) in enumerate(all_examples):
        features = example_index_to_features[example_index]
        prelim_predictions = []
        for (feature_index, feature) in enumerate(features):
            if feature.unique_id not in unique_id_to_result:
                print("feature not in unique_id", feature.unique_id)
                continue
            result = unique_id_to_result[feature.unique_id]
            start_indexes = _get_best_indexes(result.start_logits, n_best_size)
            end_indexes = _get_best_indexes(result.end_logits, n_best_size)
            for start_index in start_indexes:
                for end_index in end_indexes:
                    # We could hypothetically create invalid predictions, e.g., predict
                    # that the start of the span is in the question. We throw out all
                    # invalid predictions.
                    if start_index >= len(feature.tokens):
                        continue
                    if end_index >= len(feature.tokens):
                        continue
                    if start_index not in feature.token_to_orig_map:
                        continue
                    if end_index not in feature.token_to_orig_map:
                        continue
                    if not feature.token_is_max_context.get(
                            start_index, False):
                        continue
                    if end_index < start_index:
                        continue
                    length = end_index - start_index + 1
                    if length > max_answer_length:
                        continue
                    prelim_predictions.append(
                        _PrelimPrediction(
                            feature_index=feature_index,
                            start_index=start_index,
                            end_index=end_index,
                            start_logit=result.start_logits[start_index],
                            end_logit=result.end_logits[end_index]))

        prelim_predictions = sorted(prelim_predictions,
                                    key=lambda x:
                                    (x.start_logit + x.end_logit),
                                    reverse=True)
        _NbestPrediction = collections.namedtuple(  # pylint: disable=invalid-name
            "NbestPrediction", ["text", "start_logit", "end_logit"])
        seen_predictions = {}
        nbest = []
        for pred in prelim_predictions:
            if len(nbest) >= n_best_size:
                break
            feature = features[pred.feature_index]
            tok_tokens = feature.tokens[pred.start_index:(pred.end_index + 1)]
            orig_doc_start = feature.token_to_orig_map[pred.start_index]
            orig_doc_end = feature.token_to_orig_map[pred.end_index]
            orig_tokens = example.doc_tokens[orig_doc_start:(orig_doc_end + 1)]
            tok_text = " ".join(tok_tokens)

            # De-tokenize WordPieces that have been split off.
            tok_text = tok_text.replace(" ##", "")
            tok_text = tok_text.replace("##", "")

            # Clean whitespace
            tok_text = tok_text.strip()
            tok_text = " ".join(tok_text.split())
            orig_text = " ".join(orig_tokens)
            final_text = get_final_text(tok_text, orig_text, do_lower_case)
            if final_text in seen_predictions:
                continue
            seen_predictions[final_text] = True
            nbest.append(
                _NbestPrediction(text=final_text,
                                 start_logit=pred.start_logit,
                                 end_logit=pred.end_logit))

        # In very rare edge cases we could have no valid predictions. So we
        # just create a nonce prediction in this case to avoid failure.
        if not nbest:
            nbest.append(
                _NbestPrediction(text="empty", start_logit=0.0, end_logit=0.0))
        assert len(nbest) >= 1

        total_scores = []
        for entry in nbest:
            total_scores.append(entry.start_logit + entry.end_logit)
        probs = _compute_softmax(total_scores)
        nbest_json = []
        for (i, entry) in enumerate(nbest):
            output = collections.OrderedDict()
            output["text"] = entry.text
            output["probability"] = probs[i]
            output["start_logit"] = float(entry.start_logit)
            output["end_logit"] = float(entry.end_logit)
            nbest_json.append(output)
        all_predictions[example.qas_id] = nbest_json[0]["text"]
        all_nbest_json[example.qas_id] = nbest_json

    with open(output_prediction_file, "w") as writer:
        writer.write(json.dumps(all_predictions, indent=4) + "\n")
    with open(output_nbest_file, "w") as writer:
        writer.write(json.dumps(all_nbest_json, indent=4) + "\n")
def get_final_text(pred_text, orig_text, do_lower_case):
    """Project the tokenized prediction back to the original text."""
    # When we created the data, we kept track of the alignment between original
    # (whitespace tokenized) tokens and our WordPiece tokenized tokens. So
    # now `orig_text` contains the span of our original text corresponding to the
    # span that we predicted.
    #
    # However, `orig_text` may contain extra characters that we don't want in
    # our prediction.
    #
    # For example, let's say:
    #   pred_text = steve smith
    #   orig_text = Steve Smith's
    #
    # We don't want to return `orig_text` because it contains the extra "'s".
    #
    # We don't want to return `pred_text` because it's already been normalized
    # (the SQuAD eval script also does punctuation stripping/lower casing but
    # our tokenizer does additional normalization like stripping accent
    # characters).
    #
    # What we really want to return is "Steve Smith".
    #
    # Therefore, we have to apply a semi-complicated alignment heuristic between
    # `pred_text` and `orig_text` to get a character-to-character alignment. This
    # can fail in certain cases in which case we just return `orig_text`.
    def _strip_spaces(text):
        ns_chars = []
        ns_to_s_map = collections.OrderedDict()
        for (i, c) in enumerate(text):
            if c == " ":
                continue
            ns_to_s_map[len(ns_chars)] = i
            ns_chars.append(c)
        ns_text = "".join(ns_chars)
        return (ns_text, ns_to_s_map)

    # We first tokenize `orig_text`, strip whitespace from the result
    # and `pred_text`, and check if they are the same length. If they are
    # NOT the same length, the heuristic has failed. If they are the same
    # length, we assume the characters are one-to-one aligned.
    tokenizer = pre_tokenizers.Sequence(
        [pre_tokenizers.Whitespace(),
         pre_tokenizers.Punctuation()])
    tok_text = []
    for item in tokenizer.pre_tokenize_str(orig_text):
        tok_text.append(item[0])
    tok_text = " ".join(tok_text)
    start_position = tok_text.find(pred_text)
    if start_position == -1:
        return orig_text
    end_position = start_position + len(pred_text) - 1
    (orig_ns_text, orig_ns_to_s_map) = _strip_spaces(orig_text)
    (tok_ns_text, tok_ns_to_s_map) = _strip_spaces(tok_text)
    if len(orig_ns_text) != len(tok_ns_text):
        return orig_text

    # We then project the characters in `pred_text` back to `orig_text` using
    # the character-to-character alignment.
    tok_s_to_ns_map = {}
    for (i, tok_index) in six.iteritems(tok_ns_to_s_map):
        tok_s_to_ns_map[tok_index] = i
    orig_start_position = None
    if start_position in tok_s_to_ns_map:
        ns_start_position = tok_s_to_ns_map[start_position]
        if ns_start_position in orig_ns_to_s_map:
            orig_start_position = orig_ns_to_s_map[ns_start_position]
    if orig_start_position is None:
        return orig_text
    orig_end_position = None
    if end_position in tok_s_to_ns_map:
        ns_end_position = tok_s_to_ns_map[end_position]
        if ns_end_position in orig_ns_to_s_map:
            orig_end_position = orig_ns_to_s_map[ns_end_position]
    if orig_end_position is None:
        return orig_text
    output_text = orig_text[orig_start_position:(orig_end_position + 1)]
    return output_text
def _get_best_indexes(logits, n_best_size):
    """Get the n-best logits from a list."""
    index_and_score = sorted(enumerate(logits),
                             key=lambda x: x[1],
                             reverse=True)
    best_indexes = []
    for i in range(len(index_and_score)):
        if i >= n_best_size:
            break
        best_indexes.append(index_and_score[i][0])
    return best_indexes

def _compute_softmax(scores):
    """Compute softmax probability over raw logits."""
    if not scores:
        return []
    max_score = None
    for score in scores:
        if max_score is None or score > max_score:
            max_score = score
    exp_scores = []
    total_sum = 0.0
    for score in scores:
        x = math.exp(score - max_score)
        exp_scores.append(x)
        total_sum += x
    probs = []
    for score in exp_scores:
        probs.append(score / total_sum)
    return probs
def main():
    parser = argparse.ArgumentParser(description='onnx squad')
    parser.add_argument('--model', required=True, help='model')
    parser.add_argument('--vocab_file', required=True, help='vocab_file')
    parser.add_argument('--bert_config_file', help='bert_config_file')
    parser.add_argument('--predict_file', required=True, help='predict_file')
    parser.add_argument('--output_dir', help='output dir')
    parser.add_argument('--max_seq_length',
                        type=int,
                        default=256,
                        help='max_seq_length')
    parser.add_argument('--max_query_length',
                        type=int,
                        default=64,
                        help='max_query_length')
    parser.add_argument('--max_answer_length',
                        type=int,
                        default=30,
                        help='max_answer_length')
    parser.add_argument('--n_best_size',
                        type=int,
                        default=20,
                        help='n_best_size')
    parser.add_argument('--doc_stride',
                        type=int,
                        default=128,
                        help='doc_stride')
    parser.add_argument('--batch_size', type=int, default=1, help='batch_size')
    parser.add_argument('--profile',
                        action='store_true',
                        help='enable chrome timeline trace profiling.')
    parser.add_argument('--log', type=int, help='log level.')
    args = parser.parse_args()

    sess_options = None
    if args.profile:
        sess_options = onnxrt.SessionOptions()
        sess_options.enable_profiling = True
        sess_options.profile_file_prefix = os.path.basename(args.model)
    if args.log:
        sess_options = onnxrt.SessionOptions()
        sess_options.session_log_verbosity_level = args.log

    tokenizer = BertWordPieceTokenizer(args.vocab_file)
    eval_examples = read_squad_examples(input_file=args.predict_file)
    input_ids, input_mask, segment_ids, extra_data = \
        convert_examples_to_features(eval_examples, tokenizer, args.max_seq_length,
                                     args.doc_stride, args.max_query_length)

    sess = onnxrt.InferenceSession(args.model, sess_options)
    for input_meta in sess.get_inputs():
        print(input_meta)
    n = len(input_ids)
    bs = args.batch_size
    all_results = []
    start = timer()
    for idx in range(0, n, bs):
        data = {
            "input_ids:0": input_ids[idx:idx + bs],
            "input_mask:0": input_mask[idx:idx + bs],
            "segment_ids:0": segment_ids[idx:idx + bs]
        }
        result = sess.run(["unstack:0", "unstack:1"], data)
        in_batch = result[0].shape[1]
        for i in range(0, in_batch):
            unique_id = len(all_results)
            all_results.append(
                RawResult(unique_id=unique_id,
                          start_logits=result[0][0][i],
                          end_logits=result[1][0][i]))
        if unique_id > 0 and unique_id % 100 == 0:
            print("at {} {}sec per item".format(
                unique_id, (timer() - start) / unique_id))
    end = timer()
    print("total time: {}sec, {}sec per item".format(
        end - start, (end - start) / len(all_results)))

    if args.output_dir:
        output_prediction_file = os.path.join(args.output_dir,
                                              "predictions.json")
        output_nbest_file = os.path.join(args.output_dir,
                                         "nbest_predictions.json")
        write_predictions(eval_examples, extra_data, all_results,
                          args.n_best_size, args.max_answer_length, True,
                          output_prediction_file, output_nbest_file)
    if args.profile:
        trace_file = sess.end_profiling()
        print("trace file written to: {}".format(trace_file))
    return 0

if __name__ == "__main__":
    sys.exit(main())
# Vision Inference Examples
- [CPP MNIST](./cpp_mnist)
- [Python Resnet50](./python_resnet50)
- [Python Super Resolution](./python_super_resolution)
- [Python NFNet](./python_nfnet)
- [Python U-Net](./python_unet)
- [Python 3D-UNet](./python_3dunet)
cmake_minimum_required(VERSION 3.5)
project (CAI)
set (CMAKE_CXX_STANDARD 14)
set (EXAMPLE mnist_inference)
list (APPEND CMAKE_PREFIX_PATH /opt/rocm/hip /opt/rocm)
find_package (migraphx)
message("source file: " ${EXAMPLE}.cpp " ---> bin: " ${EXAMPLE})
add_executable(${EXAMPLE} ${EXAMPLE}.cpp)
target_link_libraries(${EXAMPLE} migraphx::c)
# Performing Inference Using C++ API
## Description
This example demonstrates how to perform inference using the MIGraphX C++ API. The model used is a convolutional network pre-trained on the MNIST dataset, and inference is performed on a random digit selected from the test set.
## Content
- [Basic Setup](#Basic-Setup)
- [Quantization](#Quantization)
- [Compilation](#Compilation)
- [Preparing Input Data](#Preparing-Input-Data)
- [Evaluating Inputs and Handling Outputs](#Evaluating-Inputs-and-Handling-Outputs)
- [**Running this Example**](#Running-this-Example)
## Basic Setup
Before running inference, we must first instantiate a network graph and select a compilation target. See [this example](../cpp_parse_load_save) for more information about working with MIGraphX program objects.
```
migraphx::program prog;
migraphx::onnx_options onnx_opts;
prog = parse_onnx("../mnist-8.onnx", onnx_opts);

std::string target_str;
if(CPU)
    target_str = "cpu";
else if(GPU)
    target_str = "gpu";
else
    target_str = "ref";

migraphx::target targ = migraphx::target(target_str.c_str());
```
## Quantization
Optionally, graph programs may be quantized to fp16 or int8 precision to improve performance and memory usage.
##### Floating Point 16-bit Precision
To quantize using fp16, we simply add the following line:
```
migraphx::quantize_fp16(prog);
```
##### Integer 8-bit Precision
Int8 quantization requires calibration to accurately map ranges of floating point values onto integer values.
To calibrate prior to inference, one or more inputs can be supplied as follows:
```
std::vector<float> calib_dig;
// ... read in data
migraphx::quantize_int8_options quant_opts;
migraphx::program_parameters quant_params;
auto param_shapes = prog.get_parameter_shapes();
for(auto&& name : param_shapes.names())
{
    quant_params.add(name, migraphx::argument(param_shapes[name], calib_dig.data()));
}
quant_opts.add_calibration_data(quant_params);
migraphx::quantize_int8(prog, targ, quant_opts);
```
## Compilation
Network graphs saved in a format such as ONNX or protobuf are not target-specific. In order to run inference, we must compile the graph into a target-specific program.
Two options may be turned on when compiling:
- `set_offload_copy(bool value)`: For targets with offloaded memory (such as the GPU), this will insert instructions during compilation to copy the input parameters to the offloaded memory and to copy the final result from the offloaded memory back to main memory. The default value is `false`.
- `set_fast_math(bool value)`: Optimize math functions to use faster approximate versions. There may be slight accuracy degradation when enabled. The default value is `true`.
The following snippet assumes `targ` has been set as "gpu", and will compile the program without the fast_math optimization.
```
migraphx::compile_options comp_opts;
comp_opts.set_offload_copy();
comp_opts.set_fast_math(false);
prog.compile(targ, comp_opts);
```
To compile a program with the default options, we simply call:
```
prog.compile(targ);
```
The targets "ref" and "cpu" both compile the program to run on the CPU. The target "ref" is primarily used for correctness checking. The target "cpu" is under ongoing development and has more optimizations enabled. Additionally, the "cpu" target requires MIGraphX to be built with the `-DMIGRAPHX_ENABLE_CPU=On` flag. Specifically,
```
CXX=/opt/rocm/llvm/bin/clang++ cmake -DMIGRAPHX_ENABLE_CPU=On ..
```
## Preparing Input Data
Now that we have a compiled program, the last step before running inference is to prepare the input data as program parameters.
The first step is to read in the data and store it in a `std::vector<float>`, which in this case we will call `digit`.
Next, we create a program parameter containing the data stored in `digit`:
```
migraphx::program_parameters prog_params;
auto param_shapes = prog.get_parameter_shapes();
for(auto&& name : param_shapes.names())
{
    prog_params.add(name, migraphx::argument(param_shapes[name], digit.data()));
}
```
## Evaluating Inputs and Handling Outputs
Now that everything is in place, the final step to run inference is to call:
```
auto outputs = prog.eval(prog_params);
```
The output layer(s) will be returned and stored in `outputs`. Our network for this example returns a single output layer with the shape (1, 10). The index of the largest value in this output layer corresponds to the digit that the model has predicted.
```
auto shape = outputs[0].get_shape();
auto lengths = shape.lengths();
auto num_results =
    std::accumulate(lengths.begin(), lengths.end(), 1, std::multiplies<size_t>());
float* results = reinterpret_cast<float*>(outputs[0].data());
float* max = std::max_element(results, results + num_results);
int answer = max - results;
```
Other networks may require alternative processing of outputs.
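For instance, a network that returns raw logits rather than probabilities could be post-processed with a softmax before taking the argmax. A hypothetical sketch (not part of this example's sources):
```
#include <algorithm>
#include <cmath>
#include <vector>

// Convert raw logits to probabilities; subtracting the max keeps exp() numerically stable
std::vector<float> softmax(const float* logits, size_t n)
{
    std::vector<float> probs(logits, logits + n);
    float max_val = *std::max_element(probs.begin(), probs.end());
    float sum = 0.0f;
    for(auto& p : probs)
    {
        p = std::exp(p - max_val);
        sum += p;
    }
    for(auto& p : probs)
        p /= sum;
    return probs;
}
```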
## Running this Example
This directory contains everything that is needed to perform inference on an MNIST digit. To create the executable:
```
$ mkdir build
$ cd build
$ CXX=/opt/rocm/llvm/bin/clang++ cmake ..
$ make
```
There will now be an executable named `mnist_inference` in the `build` directory. This can be run with or without options. Executing without any options will produce the following output:
```
Usage: ./mnist_inference [options]
options:
-c, --cpu Compile for CPU
-g, --gpu Compile for GPU
-f, --fp16 FP16 Quantization
-i, --int8 Int8 Quantization
--cal Int8 Calibration ON
-p, --print Print Graph at Each Stage
Parsing ONNX model...
Compiling program for ref...
Model input:
@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@%=@@@@@@@@@
@@@@@@@@@@@@@0+. +@@@@@@@@
@@@@@@@@@@@0+ .. 0@@@@@@@
@@@@@@@@@@+ .00 #@@@@@@@
@@@@@@@@@% .0@0 #@@@@@@@
@@@@@@@@@- .*0@@% #@@@@@@@
@@@@@@@@@0+#@@@@@% #@@@@@@@
@@@@@@@@@@@@@@@@@* #@@@@@@@
@@@@@@@@@@@@@====- -@@@@@@@@
@@@@@@@@@@@#- .0@@@@@@@@
@@@@@@@@@#. .* =@@@@@@@@
@@@@@@@@% =#@@. %@@@@@@@
@@@@@@@+ -@@@- +* -#00@@@
@@@@@@+ =@@#- .#@@#* .@@@
@@@@@= %@#* =0@@@@@%--0@@@
@@@@@ .. =@@@@@@@@@@@@@@
@@@@@. *=0@@@@@@@@@@@@@@@
@@@@@@%+=@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@
@@@@@@@@@@@@@@@@@@@@@@@@@@@@
Model evaluating input...
Inference complete
Inference time: 0.022ms
Randomly chosen digit: 2
Result from inference: 2
CORRECT
```
*Note: the actual digit selected and printed will not necessarily be the same as shown above.*
#include <iostream>
#include <fstream>
#include <vector>
#include <string>
#include <numeric>
#include <algorithm>
#include <chrono>
#include <random>
#include <migraphx/migraphx.hpp>
void read_nth_digit(const int, std::vector<float>&);
int main(int argc, char** argv)
{
    if(argc == 1)
    {
        std::cout << "Usage: " << argv[0] << " [options]" << std::endl
                  << "options:" << std::endl
                  << "\t -c, --cpu Compile for CPU" << std::endl
                  << "\t -g, --gpu Compile for GPU" << std::endl
                  << "\t -f, --fp16 FP16 Quantization" << std::endl
                  << "\t -i, --int8 Int8 Quantization" << std::endl
                  << "\t --cal Int8 Calibration ON" << std::endl
                  << "\t -p, --print Print Graph at Each Stage" << std::endl
                  << std::endl
                  << std::endl;
    }

    char** begin = argv + 1;
    char** end = argv + argc;
    const bool CPU = (std::find(begin, end, std::string("-c")) != end) ||
                     std::find(begin, end, std::string("--cpu")) != end;
    const bool GPU = std::find(begin, end, std::string("-g")) != end ||
                     std::find(begin, end, std::string("--gpu")) != end;
    const bool FP16 = std::find(begin, end, std::string("-f")) != end ||
                      std::find(begin, end, std::string("--fp16")) != end;
    const bool INT8 = std::find(begin, end, std::string("-i")) != end ||
                      std::find(begin, end, std::string("--int8")) != end;
    const bool CALIB = std::find(begin, end, std::string("--cal")) != end;
    const bool PRINT = std::find(begin, end, std::string("-p")) != end ||
                       std::find(begin, end, std::string("--print")) != end;

    migraphx::program prog;
    migraphx::onnx_options onnx_opts;
    prog = parse_onnx("../mnist-8.onnx", onnx_opts);
    std::cout << "Parsing ONNX model..." << std::endl;
    if(PRINT)
        prog.print();
    std::cout << std::endl;

    std::string target_str;
    if(CPU)
        target_str = "cpu";
    else if(GPU)
        target_str = "gpu";
    else
        target_str = "ref";
    migraphx::target targ = migraphx::target(target_str.c_str());

    if(FP16)
    {
        migraphx::quantize_fp16(prog);
        std::cout << "Quantizing program for FP16..." << std::endl;
        if(PRINT)
            prog.print();
        std::cout << std::endl;
    }
    else if(INT8)
    {
        if(CALIB)
        {
            std::cout << "Calibration data: " << std::endl;
            std::vector<float> calib_dig;
            read_nth_digit(9, calib_dig);
            migraphx::quantize_int8_options quant_opts;
            migraphx::program_parameters quant_params;
            auto param_shapes = prog.get_parameter_shapes();
            for(auto&& name : param_shapes.names())
            {
                quant_params.add(name, migraphx::argument(param_shapes[name], calib_dig.data()));
            }
            quant_opts.add_calibration_data(quant_params);
            migraphx::quantize_int8(prog, targ, quant_opts);
        }
        else
        {
            migraphx::quantize_int8(prog, targ, migraphx::quantize_int8_options());
        }
        std::cout << "Quantizing program for INT8..." << std::endl;
        if(PRINT)
            prog.print();
        std::cout << std::endl;
    }

    if(GPU)
    {
        migraphx::compile_options comp_opts;
        comp_opts.set_offload_copy();
        prog.compile(targ, comp_opts);
    }
    else
    {
        prog.compile(targ);
    }
    std::cout << "Compiling program for " << target_str << "..." << std::endl;
    if(PRINT)
        prog.print();
    std::cout << std::endl;

    std::vector<float> digit;
    std::random_device rd;
    std::uniform_int_distribution<int> dist(0, 9);
    const int rand_digit = dist(rd);
    std::cout << "Model input: " << std::endl;
    read_nth_digit(rand_digit, digit);

    migraphx::program_parameters prog_params;
    auto param_shapes = prog.get_parameter_shapes();
    auto input = param_shapes.names().front();
    prog_params.add(input, migraphx::argument(param_shapes[input], digit.data()));

    std::cout << "Model evaluating input..." << std::endl;
    auto start = std::chrono::high_resolution_clock::now();
    auto outputs = prog.eval(prog_params);
    auto stop = std::chrono::high_resolution_clock::now();
    auto elapsed = std::chrono::duration_cast<std::chrono::milliseconds>(stop - start);
    std::cout << "Inference complete" << std::endl;
    std::cout << "Inference time: " << elapsed.count() * 1e-3 << "ms" << std::endl;

    auto shape = outputs[0].get_shape();
    auto lengths = shape.lengths();
    auto num_results =
        std::accumulate(lengths.begin(), lengths.end(), 1, std::multiplies<size_t>());
    float* results = reinterpret_cast<float*>(outputs[0].data());
    float* max = std::max_element(results, results + num_results);
    int answer = max - results;

    std::cout << std::endl
              << "Randomly chosen digit: " << rand_digit << std::endl
              << "Result from inference: " << answer << std::endl
              << std::endl
              << (answer == rand_digit ? "CORRECT" : "INCORRECT") << std::endl
              << std::endl;

    return 0;
}

void read_nth_digit(const int n, std::vector<float>& digit)
{
    const std::string SYMBOLS = "@0#%=+*-. ";
    std::ifstream file("../digits.txt");
    const int DIGITS = 10;
    const int HEIGHT = 28;
    const int WIDTH = 28;
    if(!file.is_open())
    {
        return;
    }
    for(int d = 0; d < DIGITS; ++d)
    {
        for(int i = 0; i < HEIGHT * WIDTH; ++i)
        {
            unsigned char temp = 0;
            file.read((char*)&temp, sizeof(temp));
            if(d == n)
            {
                float data = temp / 255.0;
                digit.push_back(data);
                std::cout << SYMBOLS[(int)(data * 10) % 11];
                if((i + 1) % WIDTH == 0)
                    std::cout << std::endl;
            }
        }
    }
    std::cout << std::endl;
}
{
"cells": [
{
"cell_type": "markdown",
"id": "fee8cfa5",
"metadata": {},
"source": [
"# 3D-UNet Example with MIGraphX\n",
"References:<br>\n",
"https://github.com/naomifridman/Unet_Brain_tumor_segmentation"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "09ceec31",
"metadata": {},
"outputs": [],
"source": [
"!pip install SimpleITK matplotlib scikit-image"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "bb22bcc4",
"metadata": {},
"outputs": [],
"source": [
"import migraphx\n",
"from PIL import Image\n",
"import numpy as np\n",
"import os\n",
"import SimpleITK as sitk"
]
},
{
"cell_type": "markdown",
"id": "cb973c63",
"metadata": {},
"source": [
"## Fetch U-NET ONNX Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1928662c",
"metadata": {},
"outputs": [],
"source": [
"!wget -nc https://zenodo.org/record/3928973/files/224_224_160.onnx"
]
},
{
"cell_type": "markdown",
"id": "1a64a616",
"metadata": {},
"source": [
"## Load ONNX Model"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "53928a98",
"metadata": {},
"outputs": [],
"source": [
"model = migraphx.parse_onnx(\"224_224_160.onnx\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "27e8587f",
"metadata": {},
"outputs": [],
"source": [
"model.compile(migraphx.get_target(\"gpu\"))"
]
},
{
"cell_type": "markdown",
"id": "2f6014a4",
"metadata": {},
"source": [
"## Print model parameters"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "9e73728c",
"metadata": {},
"outputs": [],
"source": [
"print(model.get_parameter_names())\n",
"print(model.get_parameter_shapes())\n",
"print(model.get_output_shapes())"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "a4cac52e",
"metadata": {},
"outputs": [],
"source": [
"img_type=['FLAIR', 'T1','T1CE', 'T2']\n",
"label_type_shrt = ['background', 'necrotic',\n",
" 'edema', 'enhancing']\n",
"label_type = ['background', 'necrotic and non-enhancing tumor', 'edema', 'enhancing tumor']"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "b65f9297",
"metadata": {},
"outputs": [],
"source": [
"red_multiplier = [1, 0.2, 0.2]\n",
"green_multiplier = [0.35,0.75,0.25]\n",
"blue_multiplier = [0,0.5,1.]#[0,0.25,0.9]\n",
"yellow_multiplier = [1,1,0.25]\n",
"brown_miltiplier = [40./255, 26./255, 13./255]\n",
"my_colors=[blue_multiplier, yellow_multiplier, brown_miltiplier]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0e175ac5",
"metadata": {},
"outputs": [],
"source": [
"from importlib import reload # Python 3.4+ only."
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "530e4f97",
"metadata": {},
"outputs": [],
"source": [
"import visualization_utils as vu\n",
"from visualization_utils import show_label_on_image4\n",
"reload(vu)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "865c46a2",
"metadata": {},
"outputs": [],
"source": [
"def show_img_label(img, lbl, modality = 0):\n",
" \n",
" if (len(lbl.shape)> 2):\n",
" lbl[0,0,3]=1 # for uniqe colors in plot\n",
" lbl = lbl_from_cat(lbl)\n",
" vu.show_n_images([img[:,:,modality],lbl, show_label_on_image4(img[:,:,modality],lbl)],\n",
" titles = [img_type[modality], 'Label', 'Label on '+ img_type[modality]]);\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1e926482",
"metadata": {},
"outputs": [],
"source": [
"def read_img_sitk(img):\n",
" inputImage = sitk.ReadImage( img )\n",
" inputImage = sitk.Cast( inputImage, sitk.sitkFloat32 )\n",
" image = sitk.GetArrayFromImage(inputImage)\n",
" return image"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "0b620138",
"metadata": {},
"outputs": [],
"source": [
"# ima files are of the form\n",
"# BraTS19_TCIA04_192_1_flair.nii.gz \n",
"# BraTS19_TCIA04_192_1_t1.nii.gz \n",
"# BraTS19_TCIA04_192_1_t2.nii.gz\n",
"# BraTS19_TCIA04_192_1_seg.nii.gz \n",
"# BraTS19_TCIA04_192_1_t1ce.nii.gz\n",
"\n",
"def read_image_into_numpy(dirpath):\n",
" \n",
" img_id = os.path.basename(dirpath)\n",
" np_image=np.zeros((4, 160, 224, 224), dtype=np.float32)\n",
" \n",
" ## Flair\n",
" flair_img = os.path.join(dirpath, img_id+'_flair.nii.gz')\n",
" if (not os.path.isfile(flair_img)):\n",
" print(flair_img,' not found aborting')\n",
" return None\n",
" np_image[0] = read_img_sitk(flair_img)\n",
" \n",
" ## T1\n",
" t1_nb4_img = os.path.join(dirpath, img_id+'_t1_nb4.nii.gz')\n",
" if (not os.path.isfile(t1_nb4_img)):\n",
" #print(t1_nb4_img,' not found')\n",
" t1_img = os.path.join(dirpath, img_id+'_t1.nii.gz')\n",
" if (not os.path.isfile(t1_img)):\n",
" print(t1_img,' not found aborting')\n",
" return None\n",
" np_image[1] = read_img_sitk(t1_img)\n",
" else:\n",
" np_image[1] = read_img_sitk(t1_nb4_img) \n",
" \n",
" ## T1CE\n",
" t1ce_nb4_img = os.path.join(dirpath, img_id+'_t1ce_nb4.nii.gz')\n",
" if (not os.path.isfile(t1ce_nb4_img)):\n",
" #print(t1ce_nb4_img,' not found')\n",
" t1ce_img = os.path.join(dirpath, img_id+'_t1ce.nii.gz')\n",
" if (not os.path.isfile(t1ce_img)):\n",
" print(t1ce_img,' not found aborting')\n",
" return None\n",
" np_image[2] = read_img_sitk(t1ce_img)\n",
" else:\n",
" np_image[2] = read_img_sitk(t1ce_nb4_img) \n",
" \n",
" \n",
" ## T2\n",
" t2_img = os.path.join(dirpath, img_id+'_t2.nii.gz')\n",
" if (not os.path.isfile(t2_img)):\n",
" print(t2_img,' not found aborting')\n",
" return None\n",
" np_image[3] = read_img_sitk(t2_img)\n",
"\n",
" return np_image"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "2fb66f17",
"metadata": {},
"outputs": [],
"source": [
"def read_label_into_numpy(dirpath):\n",
" \n",
" img_id = os.path.basename(dirpath)\n",
" np_image=np.zeros((160, 224, 224), dtype=np.int)\n",
" \n",
" ## label\n",
" label_img = os.path.join(dirpath, img_id+'_seg.nii.gz')\n",
" if (not os.path.isfile(label_img)):\n",
" print(label_img,' not found aborting')\n",
" return None\n",
" np_image = read_img_sitk(label_img).astype(int)\n",
"\n",
" return np_image"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "558d47b9",
"metadata": {},
"outputs": [],
"source": [
"def bbox2_3D(img):\n",
"\n",
" r = np.any(img, axis=(1, 2))\n",
" c = np.any(img, axis=(0, 2))\n",
" z = np.any(img, axis=(0, 1))\n",
"\n",
" rmin, rmax = np.where(r)[0][[0, -1]]\n",
" cmin, cmax = np.where(c)[0][[0, -1]]\n",
" zmin, zmax = np.where(z)[0][[0, -1]]\n",
"\n",
" return [rmin, rmax, cmin, cmax, zmin, zmax]"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1405e186",
"metadata": {},
"outputs": [],
"source": [
"def lbl_from_cat(cat_lbl):\n",
" \n",
" lbl=0\n",
" if (len(cat_lbl.shape)==3):\n",
" for i in range(1,4):\n",
" lbl = lbl + cat_lbl[:,:,i]*i\n",
" elif (len(cat_lbl.shape)==4):\n",
" for i in range(1,4):\n",
" lbl = lbl + cat_lbl[:,:,:,i]*i\n",
" else:\n",
" print('Error in lbl_from_cat', cat_lbl.shape)\n",
" return None\n",
" return lbl"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24eb472f",
"metadata": {},
"outputs": [],
"source": [
"def show_label(lbl):\n",
" vu.show_n_images([lbl[:,:,k] for k in range(4)]+[lbl_from_cat(lbl)],\n",
" titles = label_type_shrt + ['Label'])\n",
"\n",
"def show_pred_im_label(im, lb, pred):\n",
" \n",
" vu.show_n_images([im[:,:,1], lb[:,:], \n",
" show_label_on_image4(im[:,:,1], lb[:,:]),\n",
" show_label_on_image4(im[:,:,1], pred[:,:])],\n",
" titles=['Flair', 'Label', 'Label on T1', 'Prediction on Flair'])\n",
"\n",
"def show_pred_im(im, pred):\n",
" \n",
" vu.show_n_images([im[:,:,1], \n",
" im[:,:,0],pred,\n",
" show_label_on_image4(im[:,:,1], pred[:,:])],\n",
" titles=['Flair','T1', 'Pred', 'Prediction on Flair'])"
]
},
{
"cell_type": "markdown",
"id": "d15f788b",
"metadata": {},
"source": [
"Multiple image inputs:\n",
"- Native (T1)\n",
"- Post-contrast T1-weighted (T1Gd)\n",
"- T2-weighted (T2)\n",
"- T2 Fluid Attenuated Inversion Recovery (T2-FLAIR)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7a7aad87",
"metadata": {},
"outputs": [],
"source": [
"# Resize input images\n",
"from scipy.ndimage import zoom\n",
"\n",
"def resize(img, shape, mode='constant', orig_shape=(155, 240, 240)):\n",
" \"\"\"\n",
" Wrapper for scipy.ndimage.zoom suited for MRI images.\n",
" \"\"\"\n",
" assert len(shape) == 3, \"Can not have more than 3 dimensions\"\n",
" factors = (\n",
" shape[0]/orig_shape[0],\n",
" shape[1]/orig_shape[1], \n",
" shape[2]/orig_shape[2]\n",
" )\n",
" \n",
" # Resize to the given shape\n",
" return zoom(img, factors, mode=mode)\n",
"\n",
"def preprocess_label(img, out_shape=None, mode='nearest'):\n",
" \"\"\"\n",
" Separates out the 3 labels from the segmentation provided, namely:\n",
" GD-enhancing tumor (ET — label 4), the peritumoral edema (ED — label 2))\n",
" and the necrotic and non-enhancing tumor core (NCR/NET — label 1)\n",
" \"\"\"\n",
" ncr = img == 1 # Necrotic and Non-Enhancing Tumor (NCR/NET)\n",
" \n",
" ed = img == 2 # Peritumoral Edema (ED)\n",
" et = img == 4 # GD-enhancing Tumor (ET)\n",
" \n",
" if out_shape is not None:\n",
" ncr = resize(ncr, out_shape, mode=mode)\n",
" ed = resize(ed, out_shape, mode=mode)\n",
" et = resize(et, out_shape, mode=mode)\n",
" return np.array([ncr, ed, et], dtype=np.uint8)\n",
"\n",
"hgg_path = \"/code/AMDMIGraphX/bratsdata/MICCAI_BraTS_2019_Data_Training/HGG\"\n",
"np_image=np.zeros((4, 160, 224, 224), dtype=np.float32)\n",
"tmp = read_img_sitk('%s/BraTS19_TMC_30014_1/BraTS19_TMC_30014_1_flair.nii.gz'%hgg_path)\n",
"tmp = resize(tmp, [160,224,224])\n",
"mean = tmp.mean()\n",
"std = tmp.std()\n",
"np_image[0] = (tmp - mean) / std\n",
"\n",
"tmp = read_img_sitk('%s/BraTS19_TMC_30014_1/BraTS19_TMC_30014_1_t1.nii.gz'%hgg_path)\n",
"tmp = resize(tmp, [160,224,224])\n",
"mean = tmp.mean()\n",
"std = tmp.std()\n",
"np_image[1] = (tmp - mean) / std\n",
"\n",
"tmp = read_img_sitk('%s/BraTS19_TMC_30014_1/BraTS19_TMC_30014_1_t1ce.nii.gz'%hgg_path)\n",
"tmp = resize(tmp, [160,224,224])\n",
"mean = tmp.mean()\n",
"std = tmp.std()\n",
"np_image[2] = (tmp - mean) / std\n",
"\n",
"tmp = read_img_sitk('%s/BraTS19_TMC_30014_1/BraTS19_TMC_30014_1_t2.nii.gz'%hgg_path)\n",
"tmp = resize(tmp, [160,224,224])\n",
"mean = tmp.mean()\n",
"std = tmp.std()\n",
"np_image[3] = (tmp - mean) / std\n",
"\n",
"print(np_image.shape)\n",
"np_image_tmp = np_image.copy()"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "d7e5b3c6",
"metadata": {},
"outputs": [],
"source": [
"vu.show_n_images(np_image[:,100,:,:], titles=img_type)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "19117da5",
"metadata": {},
"outputs": [],
"source": [
"np_lbl=np.zeros((160, 224, 224), dtype=np.int)\n",
"tmp = read_img_sitk('/code/AMDMIGraphX/bratsdata/MICCAI_BraTS_2019_Data_Training/HGG/BraTS19_TMC_30014_1/BraTS19_TMC_30014_1_seg.nii.gz').astype(int)\n",
"tmp = resize(tmp, [160,224,224])\n",
"print(tmp.shape)\n",
"np_lbl = tmp.astype(int)\n",
"print(np_lbl.shape)\n",
"\n",
"print(np_image.shape)\n",
"\n",
"img1 = vu.show_label_on_image4(np_image[1,100,:,:], np_lbl[100])\n",
"img2 = vu.show_label_on_image(np_image[1,100,:,:], np_lbl[100])\n",
"vu.show_n_images([img1,img2,np_image[0,100]])"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "facdea15",
"metadata": {},
"outputs": [],
"source": [
"def get_pred(img, threshold=0.5):\n",
" out_img=img.copy()\n",
" out_img=np.where(out_img>threshold, 1,0)\n",
" return out_img\n",
"\n",
"def prediction_from_probabily_3D(img):\n",
" \n",
" int_image = get_pred(img)\n",
" return lbl_from_cat(int_image)\n",
"\n",
"def get_prediction_for_batch(pred_batch, threshold=0.5):\n",
" \n",
" out_batch = np.zeros((pred_batch.shape[0], 224, 224),dtype=np.int)\n",
" \n",
" for j in range(pred_batch.shape[0]):\n",
" pred = get_prediction(pred_batch[j])\n",
" if (pred.sum()>0):\n",
" print(j, np.unique(pred , return_counts=True))\n",
" out_batch[j] = lbl_from_cat(get_prediction(pred_batch[j]))\n",
" return out_batch\n",
"\n",
"def get_label_from_pred_batch(labels_batch):\n",
" \n",
" batch = np.zeros((labels_batch.shape[0], 224, 224), np.uint8)\n",
" \n",
" for j in range(labels_batch.shape[0]):\n",
" batch[j]=get_pred(labels_batch[j,:,:,0])+\\\n",
" get_pred(labels_batch[j,:,:,1])*2+\\\n",
" get_pred(labels_batch[j,:,:,2])*4\n",
"\n",
" return batch\n",
"\n",
"def predict_3D_img_prob(np_file):\n",
" \n",
" np_img = np.load(np_file)\n",
" for_pred_img = np.zeros((160, 224, 224, 4), np.float32)\n",
"\n",
" # Normalize image\n",
" for_pred_img = normalize_3D_image(np_img)\n",
"\n",
" mdl_pred_img = model.predict(for_pred_img)\n",
"\n",
" #pred_label = prediction_from_probabily_3D(mdl_pred_img)\n",
"\n",
" return mdl_pred_img\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "7f7fe7ee",
"metadata": {},
"outputs": [],
"source": [
"#Remember the MIGraphX model inputs\n",
"print(model.get_parameter_names())\n",
"print(model.get_parameter_shapes())\n",
"\n",
"np_image = np_image.transpose((0,2,3,1))\n",
"\n",
"print(np_image.shape)\n",
"print(np_image.strides)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "dfc47b53",
"metadata": {},
"outputs": [],
"source": [
"def normalize_3D_image(img):\n",
" for z in range(img.shape[0]):\n",
" for k in range(4):\n",
" if (img[z,:,:,k].max()>0):\n",
" img[z,:,:,k] /= img[z,:,:,k].max()\n",
" return img"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "f990cb50",
"metadata": {},
"outputs": [],
"source": [
"print(np_image_tmp.shape)\n",
"np_image_tmp = np_image_tmp.transpose((1,2,3,0))\n",
"print(np_image_tmp.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "24c3736d",
"metadata": {},
"outputs": [],
"source": [
"np_image = np.expand_dims(np_image, 0)\n",
"print(np_image.shape)\n",
"print(np_image.strides)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "1aac6285",
"metadata": {},
"outputs": [],
"source": [
"input_im = np.zeros((1,4,224,224,160),dtype='float32')\n",
"np.lib.stride_tricks.as_strided(input_im, shape=np_image.shape, strides=input_im.strides)[:] = np_image #getting correct stride\n",
"print(input_im.strides)\n",
"print(input_im.shape)\n",
"\n",
"#input_im = normalize_3D_image(input_im)\n",
"\n",
"print(input_im.strides)\n",
"print(input_im.shape)\n",
"\n",
"result = model.run({\n",
" \"input\": input_im\n",
" })"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "5848b63d",
"metadata": {},
"outputs": [],
"source": [
"output = np.array(result[0])\n",
"print(output.shape)\n",
"output = output[0]\n",
"print(output.shape)\n",
"output = output.transpose((3,1,2,0))\n",
"print(output.shape)"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "ab77f7e9",
"metadata": {},
"outputs": [],
"source": [
"out = prediction_from_probabily_3D(output)\n",
"print(np_image_tmp.shape)\n",
"print(np_lbl.shape)\n",
"print(out.shape)\n",
"print(np.unique(out))\n",
"ind=[100]\n",
"for i in ind:\n",
" show_label(output[i])\n",
" show_label(get_pred(output[i]))\n",
" show_pred_im_label(np_image_tmp[i], np_lbl[i], out[i])"
]
},
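  {
   "cell_type": "markdown",
   "id": "dice-note",
   "metadata": {},
   "source": [
    "As a quick sanity check, the overlap between the predicted labels and the ground truth can be quantified with a per-class Dice coefficient. This is a minimal sketch (not part of the original pipeline) that assumes `out` and `np_lbl` from the cells above hold label volumes of matching shape."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "id": "dice-check",
   "metadata": {},
   "outputs": [],
   "source": [
    "def dice_score(pred, truth, cls):\n",
    "    # Dice = 2 * |intersection| / (|pred| + |truth|) for one class\n",
    "    p = (pred == cls)\n",
    "    t = (truth == cls)\n",
    "    denom = p.sum() + t.sum()\n",
    "    return 2.0 * np.logical_and(p, t).sum() / denom if denom > 0 else float('nan')\n",
    "\n",
    "for cls in np.unique(np_lbl):\n",
    "    if cls == 0:\n",
    "        continue  # skip background\n",
    "    print(f\"class {cls}: Dice = {dice_score(out, np_lbl, cls):.3f}\")"
   ]
  },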
{
"cell_type": "markdown",
"id": "d2862d81",
"metadata": {},
"source": [
"The possible prediction discrepancy is due to the not-perfect resizing 3D input image, as BRATS dataset has 3D images of size 160x240x240, meanwhile the ONNX model utilized here requires 155x224x224. This example is representative for how to utilize MIGraphX for such an application. All data processing should follow and match the model requirements otherwise. "
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
# 3D-Unet Inference with AMD MIGraphX
This example applies image segmentation to 3D images using AMD MIGraphX on a given AMD GPU.
## How to:
1) You will need access to the BRATS dataset; see https://www.med.upenn.edu/cbica/brats2019/data.html for how to request it.
2) Follow the provided notebook `3dunet_inference.ipynb`.
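
For reference, here is a minimal sketch of driving the model through the MIGraphX Python API (the ONNX file name is illustrative; the input name and shape follow the notebook):

```python
import numpy as np
import migraphx

# Parse the 3D-UNet ONNX file and compile it for the GPU target
model = migraphx.parse_onnx("3dunet.onnx")  # illustrative file name
model.compile(migraphx.get_target("gpu"))
print(model.get_parameter_shapes())

# The notebook feeds one 4-channel volume of shape (1, 4, 224, 224, 160)
input_im = np.zeros((1, 4, 224, 224, 160), dtype='float32')
result = model.run({"input": input_im})
output = np.array(result[0])
print(output.shape)
```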
import matplotlib.pyplot as plt
import matplotlib.gridspec as gridspec
import matplotlib.pylab as pylab
import numpy as np
params = {
'legend.fontsize': 'x-large',
'figure.figsize': (6, 5),
'axes.labelsize': 'x-large',
'axes.titlesize': 'x-large',
'xtick.labelsize': 'x-large',
'ytick.labelsize': 'x-large'
}
pylab.rcParams.update(params)
#-----------------------------------------------------------
def show_n_images(imgs, titles=None, enlarge=20, cmap='jet'):
plt.set_cmap(cmap)
n = len(imgs)
gs1 = gridspec.GridSpec(1, n)
fig1 = plt.figure()
# create a figure with the default size
fig1.set_size_inches(enlarge, 2 * enlarge)
for i in range(n):
ax1 = fig1.add_subplot(gs1[i])
ax1.imshow(imgs[i], interpolation='none')
if (titles is not None):
ax1.set_title(titles[i])
ax1.set_ylim(ax1.get_ylim()[::-1])
plt.show()
#--------------------------------------------------------------
from skimage import color, img_as_float
from skimage.exposure import adjust_gamma
# Creates an image of original brain with segmentation overlay
def show_label_on_image(test_img, test_lbl):
    label_im = test_lbl
    gray_img = img_as_float(test_img / test_img.max())
    # adjust gamma so the underlying anatomy stays visible
    image = adjust_gamma(np.abs(color.gray2rgb(gray_img)), 0.45)
    green_multiplier = [0.35, 0.75, 0.25]
    blue_multiplier = [0, 0.5, 1.]
    yellow_multiplier = [1, 1, 0.25]
    brown_multiplier = [40. / 255, 26. / 255, 13. / 255]
    # recolor the pixels of each segmented class
    image[label_im == 1] = blue_multiplier
    image[label_im == 2] = yellow_multiplier
    image[label_im == 3] = brown_multiplier
    image[label_im == 4] = green_multiplier
return image
#-------------------------------------------------------------------------------------
def show_label_on_image4(test_img, label_im):
alpha = 0.8
img = img_as_float(test_img / test_img.max())
rows, cols = img.shape
# Construct a colour image to superimpose
color_mask = np.zeros((rows, cols, 3))
    green_multiplier = [0.35, 0.75, 0.25]
    blue_multiplier = [0, 0.25, 0.9]
    yellow_multiplier = [1, 1, 0.25]
    brown_multiplier = [40. / 255, 26. / 255, 13. / 255]
    # color each segmented class in the mask
    color_mask[label_im == 1] = blue_multiplier
    color_mask[label_im == 2] = yellow_multiplier
    color_mask[label_im == 3] = brown_multiplier
    color_mask[label_im == 4] = green_multiplier
# Construct RGB version of grey-level image
img_color = np.dstack((img, img, img))
# Convert the input image and color mask to Hue Saturation Value (HSV)
# colorspace
img_hsv = color.rgb2hsv(img_color)
color_mask_hsv = color.rgb2hsv(color_mask)
# Replace the hue and saturation of the original image
# with that of the color mask
img_hsv[..., 0] = color_mask_hsv[..., 0]
img_hsv[..., 1] = color_mask_hsv[..., 1] * alpha
img_masked = color.hsv2rgb(img_hsv)
return img_masked
#------------------------------------------------------------------------------
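# Illustrative usage sketch: overlay a segmentation slice on a brain slice and
# show the results side by side, mirroring how the notebook calls these helpers.
# The random arrays below are stand-ins for the volumes loaded in the notebook.
if __name__ == '__main__':
    np_image = np.random.rand(4, 160, 224, 224).astype('float32')  # stand-in MRI channels
    np_lbl = np.random.randint(0, 5, (160, 224, 224))              # stand-in labels 0-4
    img1 = show_label_on_image4(np_image[1, 100, :, :], np_lbl[100])
    img2 = show_label_on_image(np_image[1, 100, :, :], np_lbl[100])
    show_n_images([img1, img2, np_image[0, 100]],
                  titles=['HSV overlay', 'RGB overlay', 'raw slice'])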
# NFNet Inference with MIGraphX
## NFNet
NFNet (Normalizer-Free Nets) is an image recognition model that can be trained without batch normalization layers. It instead uses an adaptive gradient clipping algorithm to provide the same effect as BatchNorm.
<ins>**Summary:**</ins>
- SOTA on ImageNet (86.5% top-1 w/o extra data)
- Up to 8.7x faster to train than EfficientNets to a given accuracy
- Normalizer-free (no BatchNorm)
**Paper**: https://arxiv.org/pdf/2102.06171.pdf
**Colab notebook**: https://github.com/deepmind/deepmind-research/tree/master/nfnets
### Why not batch norm?
Batch normalization has three significant practical disadvantages:
1. It is an expensive computational primitive, which incurs memory overhead and significantly increases the time required to evaluate the gradient in some networks.
2. It introduces a discrepancy between the behavior of the model during training and at inference time, introducing hidden hyper-parameters that have to be tuned.
3. Last and most important, batch normalization breaks the independence between training examples in the minibatch (batch size matters with batch norm, and distributed training becomes extremely cumbersome).
Instead:
- The authors provide Adaptive Gradient Clipping (AGC), which clips gradients based on the unit-wise ratio of gradient norms to parameter norms, and they demonstrate that AGC allows them to train normalizer-free networks with larger batch sizes and stronger data augmentations (a minimal sketch follows this list).
- They design a family of Normalizer-Free ResNets, called NFNets, which set new state-of-the-art validation accuracies on ImageNet for a range of training latencies. Their NFNet-F1 model achieves similar accuracy to EfficientNet-B7 while being 8.7× faster to train, and their largest model sets a new overall state of the art without extra data of 86.5% top-1 accuracy.
- They show that NFNets achieve substantially higher validation accuracies than batch-normalized networks when fine-tuning on ImageNet after pre-training on a large private dataset of 300 million labelled images. Their best model achieves 89.2% top-1 accuracy after fine-tuning.
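
For intuition, here is a minimal NumPy sketch of the AGC rescaling rule (illustrative only: the real method is applied unit-wise to each layer's parameters during training, and the constants below are assumptions):

```python
import numpy as np

def adaptive_gradient_clip(grad, weight, clipping=0.01, eps=1e-3):
    # One norm per output unit (row) of the weight/gradient matrices
    w_norm = np.maximum(np.linalg.norm(weight, axis=-1, keepdims=True), eps)
    g_norm = np.linalg.norm(grad, axis=-1, keepdims=True)
    # Shrink any gradient whose unit-wise ratio ||g|| / ||w|| exceeds the threshold
    scale = np.minimum(clipping * w_norm / np.maximum(g_norm, 1e-6), 1.0)
    return grad * scale
```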
## Inference with MIGraphX using NFNet ONNX Model
As of June 2021 no ONNX model has been released for NFNet; however, a PyTorch model is available at
https://github.com/rwightman/pytorch-image-models.
We provide an in-house produced and optimized ONNX model, which can be parsed and compiled using MIGraphX for AMD GPUs. The ONNX model file can be fetched using the Jupyter notebook we provide.
### Requirements:
1) AMD GPU system with ROCm installed.
2) Jupyter notebook library.
### How to use NFNet for image recognition:
Please utilize the notebook example provided:
1) Install Jupyter Notebook in your environment if it is not already installed: https://jupyter.org/install
2) Connect to your Jupyter server and open the `nfnet_inference.ipynb` notebook file.
### How to compare MIGraphX to ONNX Runtime for NFNet ONNX model:
First install requirements:
```
pip3 install -r requirements_nfnet.txt
```
On your terminal, invoke:
```
python3 ort_comparison.py
```
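
Conceptually, the comparison runs the same input through both backends and checks timing and numerical agreement. A minimal sketch of the idea (the actual `ort_comparison.py` may differ):

```python
import numpy as np
import onnxruntime as ort
import migraphx

# Same preprocessed input for both backends (NFNet-F0 takes 1x3x192x192)
data = np.random.rand(1, 3, 192, 192).astype('float32')

# MIGraphX: parse, compile for the GPU target, run
prog = migraphx.parse_onnx("dm_nfnet_f0.onnx")
prog.compile(migraphx.get_target("gpu"))
mgx_out = np.array(prog.run({"inputs": data})[0])

# ONNX Runtime on the same model
sess = ort.InferenceSession("dm_nfnet_f0.onnx")
ort_out = sess.run(None, {"inputs": data})[0]

print("max abs difference:", np.abs(mgx_out - ort_out).max())
```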
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# NFNet Inference with AMD MIGraphX\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Normalizer-Free ResNet is a new residual convolutional network providing new state-of-the-art Top-1 accuracy of 86.5% at ImageNet dataset. The most important feature of the model is removing batch normalization. Instead of batch normalization, it uses adaptive gradient clipping to provide same regularization effect of BatchNorm. <br> Details of this network: https://arxiv.org/abs/2102.06171\n",
"\n",
"In this notebook, we are showing: <br>\n",
"- How to optimize NFNet ONNX model with AMD MIGraphX.\n",
"- How to run inference on AMD GPU with the optimized ONNX model.\n",
"\n",
"The NFNet utilized in this example is the smallest NFNet version, F0: 71.5M parameters (83.6% top-1 accuracy on ImageNet)\n",
"\n",
"Please make sure MIGraphX Python API is installed following the instructions at Github page."
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Requirements"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!apt-get update\n",
"!apt-get install ffmpeg libsm6 libxext6 -y "
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!pip3 install --upgrade pip\n",
"!pip3 install -r requirements_nfnet.txt"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import numpy as np\n",
"import cv2\n",
"import json\n",
"from PIL import Image\n",
"import time\n",
"from os import path "
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Importing AMD MIGraphX Python Module"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import migraphx"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Create NFNet ONNX file\n",
"Following repository provides functionality to create NFNet ONNX file from PyTorch model."
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"!wget -nc https://www.dropbox.com/s/u4ga8zyxtppfzxc/dm_nfnet_f0.onnx"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Load ImageNet labels"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open('../python_resnet50/imagenet_simple_labels.json') as json_data:\n",
" labels = json.load(json_data)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"## Load ONNX model using MIGraphX"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"model = migraphx.parse_onnx(\"dm_nfnet_f0.onnx\")\n",
"model.compile(migraphx.get_target(\"gpu\"))\n",
"\n",
"print(model.get_parameter_names())\n",
"print(model.get_parameter_shapes())\n",
"print(model.get_output_shapes())"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Functions for image processing"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"def make_nxn(image, n):\n",
" height, width = image.shape[:2] \n",
" if height > width:\n",
" dif = height - width\n",
" bar = dif // 2 \n",
" square = image[(bar + (dif % 2)):(height - bar),:]\n",
" return cv2.resize(square, (n, n))\n",
" elif width > height:\n",
" dif = width - height\n",
" bar = dif // 2\n",
" square = image[:,(bar + (dif % 2)):(width - bar)]\n",
" return cv2.resize(square, (n, n))\n",
" else:\n",
" return cv2.resize(image, (n, n))\n",
" \n",
"def preprocess(img_data):\n",
" mean_vec = np.array([0.485, 0.456, 0.406])\n",
" stddev_vec = np.array([0.229, 0.224, 0.225])\n",
" norm_img_data = np.zeros(img_data.shape).astype('float32')\n",
" for i in range(img_data.shape[0]): \n",
" norm_img_data[i,:,:] = (img_data[i,:,:]/255 - mean_vec[i]) / stddev_vec[i]\n",
" return norm_img_data\n",
"\n",
"def input_process(frame, dim):\n",
" # Crop and resize original image\n",
" cropped = make_nxn(frame, dim)\n",
" # Convert from HWC to CHW\n",
" chw = cropped.transpose(2,0,1)\n",
" # Apply normalization\n",
" pp = preprocess(chw)\n",
" # Add singleton dimension (CHW to NCHW)\n",
" data = np.expand_dims(pp.astype('float32'),0)\n",
" return data"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Download example image"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Fetch example image: traffic light\n",
"!wget -nc http://farm5.static.flickr.com/4072/4462811418_8bc2bd42ca_z_d.jpg -O traffic_light.jpg\n",
"# Read the image\n",
"im = cv2.imread('traffic_light.jpg')"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Process the read image to conform input requirements\n",
"data_input = input_process(im, 192)\n",
"\n",
"# Run the model\n",
"start = time.time()\n",
"results = model.run({'inputs':data_input}) # Your first inference would take longer than the following ones.\n",
"print(f\"Time inference took: {1000*(time.time() - start):.2f}ms\")\n",
"# Extract the index of the top prediction\n",
"res_npa = np.array(results[0])\n",
"print(f\"\\nResult: {labels[np.argmax(res_npa)]}\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Run the model again, first one would take long\n",
"start = time.time()\n",
"results = model.run({'inputs':data_input}) # Your first inference would take longer than the following ones.\n",
"print(f\"Time inference took: {1000*(time.time() - start):.2f}ms\")\n",
"# Extract the index of the top prediction\n",
"res_npa = np.array(results[0])\n",
"print(f\"\\nResult: {labels[np.argmax(res_npa)]}\")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}