Merge branch 'develop' into jit-reduce-reg

dae94657 · Chris Austen · GitHub · b013d991 · 56c43445 · dae94657
Unverified Commit dae94657 authored Dec 14, 2022 by Chris Austen Committed by GitHub Dec 14, 2022
20 changed files
--- a/.github/workflows/ci.yaml
+++ b/.github/workflows/ci.yaml
@@ -7,17 +7,27 @@ jobs:
    runs-on: ubuntu-latest
    steps:
      - name: Cancel Previous Runs
-        uses: styfle/cancel-workflow-action@0.6.0
+        uses: styfle/cancel-workflow-action@0.11.0
        with:
          access_token: ${{ github.token }}
  tidy:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-20.04

    steps:
    - name: Free space
-      run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android
-
-    - uses: actions/checkout@v2
+      run: |
+        sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws* /usr/local/lib/heroku
+        du . --max-depth=1 -h  # diagnostics only: log disk usage of the current directory
+        ls -la
+        cd /usr/local  # diagnostics only: same report for /usr/local
+        du . --max-depth=1 -h
+        ls -la
+        cd /usr/local/lib  # diagnostics only: same report for /usr/local/lib
+        pwd
+        du . --max-depth=1 -h
+        ls -la
+
+    - uses: actions/checkout@v3

    # In this step, this action saves a list of existing images,
    # the cache is created without them in the post run.
@@ -34,7 +44,7 @@ jobs:
        message("::set-output name=timestamp::${current_date}")

    - name: Cache files for tidy
-      uses: pat-s/always-upload-cache@v2.1.3
+      uses: pat-s/always-upload-cache@v3.0.11
      with:
        path: tidy-cache
        key: tidy-cache-${{ steps.cache_timestamp.outputs.timestamp }}
@@ -61,12 +71,12 @@ jobs:
        make -j2 -k onnx-proto tf-proto tidy

  cppcheck:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-20.04

    steps:
    - name: Free space
-      run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android
-    - uses: actions/checkout@v2
+      run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws* /usr/local/lib/heroku
+    - uses: actions/checkout@v3

    # In this step, this action saves a list of existing images,
    # the cache is created without them in the post run.
@@ -106,12 +116,12 @@ jobs:
        make -j2 cppcheck

  format:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-20.04

    steps:
    - name: Free space
-      run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android
-    - uses: actions/checkout@v2
+      run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws* /usr/local/lib/heroku 
+    - uses: actions/checkout@v3

    # In this step, this action saves a list of existing images,
    # the cache is created without them in the post run.
@@ -142,14 +152,14 @@ jobs:
          | xargs -n 1 -P 1 -I{} -t sh -c 'yapf {} | diff - {}'

  pyflakes:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-20.04

    steps:
    - name: Free space
-      run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android
-    - uses: actions/checkout@v2
+      run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws* /usr/local/lib/heroku
+    - uses: actions/checkout@v3
    - name: Set up Python
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
      with:
        python-version: 3.8
    - name: Install pyflakes
@@ -163,14 +173,14 @@ jobs:
        mypy tools/api.py

  licensing:
-    runs-on: ubuntu-18.04
+    runs-on: ubuntu-20.04

    steps:
    - name: Free space
-      run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android
-    - uses: actions/checkout@v2
+      run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws* /usr/local/lib/heroku
+    - uses: actions/checkout@v3
    - name: Set up Python
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
      with:
        python-version: 3.8
    - name: run License Check
@@ -190,7 +200,6 @@ jobs:
    strategy:
      matrix:
        os:
-          - ubuntu-18.04
          - ubuntu-20.04
        configuration:
          - debug
@@ -199,16 +208,16 @@ jobs:

    steps:
    - name: Free space
-      run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android
-    - uses: actions/checkout@v2
+      run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws*  /usr/local/lib/heroku
+    - uses: actions/checkout@v3
    - name: Set up Python
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
      with:
-        python-version: 3.6
+        python-version: 3.7
    - name: Cache dependencies
      # Ignore the failure of a step and avoid terminating the job.
      continue-on-error: true
-      uses: actions/cache@v2
+      uses: actions/cache@v3
      with:
        # This path is specific to Ubuntu
        path: ${{ github.workspace }}/cget
@@ -287,7 +296,6 @@ jobs:
    strategy:
      matrix:
        os:
-          - ubuntu-18.04
          - ubuntu-20.04
        configuration:
          - debug
@@ -296,16 +304,16 @@ jobs:

    steps:
    - name: Free space
-      run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android
-    - uses: actions/checkout@v2
+      run: sudo rm -rf /usr/local/android /usr/share/dotnet /usr/local/share/boost /opt/ghc /usr/local/share/chrom* /usr/share/swift /usr/local/julia* /usr/local/lib/android /usr/local/graalvm /usr/local/aws* /usr/local/lib/heroku
+    - uses: actions/checkout@v3
    - name: Set up Python
-      uses: actions/setup-python@v2
+      uses: actions/setup-python@v4
      with:
-        python-version: 3.6
+        python-version: 3.7
    - name: Cache dependencies
      # Ignore the failure of a step and avoid terminating the job.
      continue-on-error: true
-      uses: actions/cache@v2
+      uses: actions/cache@v3
      with:
        # This path is specific to Ubuntu
        path: ${{ github.workspace }}/cget

--- a/.github/workflows/performance.yaml
+++ b/.github/workflows/performance.yaml
@@ -5,14 +5,14 @@ on:
    branches: [develop]
    types: [opened, synchronize, closed]
  schedule:
-    - cron: "0 5 * * 1-6"
+    - cron: "0 6 * * 1-6"

  workflow_dispatch:
    inputs:
      rocm_release:
        description: ROCm Version
        required: true
-        default: '5.2'
+        default: '5.3'
      performance_reports_repo:
        description: Result repository
        required: true
@@ -30,9 +30,9 @@ concurrency: "perftest-${{ github.head_ref ||  github.base_ref || 'schedule' }}"

 jobs:
  release:
-    uses: rocmsoftwareplatform/migraphx-benchmark/.github/workflows/perf-test.yml@main
+    uses: ROCmSoftwarePlatform/migraphx-benchmark/.github/workflows/perf-test.yml@main
    with:
-      rocm_release: ${{ github.event.inputs.rocm_release || '5.2' }}
+      rocm_release: ${{ github.event.inputs.rocm_release || '5.3' }}
      result_number: ${{ github.event.inputs.result_number || '10' }}
      flags: ${{ github.event.inputs.flags || '-s' }} 
      performance_reports_repo: ${{ github.event.inputs.performance_reports_repo || 'ROCmSoftwarePlatform/migraphx-reports' }} 

--- a/CMakeLists.txt
+++ b/CMakeLists.txt
@@ -63,7 +63,7 @@ set(CMAKE_EXTRA_INCLUDE_FILES)

 include(ROCMSetupVersion)

-rocm_setup_version(VERSION 2.4)
+rocm_setup_version(VERSION 2.5)
 set(MIGRAPHX_SO_VERSION ${PROJECT_VERSION_MAJOR}.${PROJECT_VERSION_MINOR})

 option( BUILD_SHARED_LIBS "Build as a shared library" ON )
@@ -114,6 +114,7 @@ rocm_enable_clang_tidy(
        hicpp-signed-bitwise
        llvm-namespace-comment
        misc-*
+	-misc-confusable-identifiers
        modernize-*
        performance-*
        readability-*

--- a/Dockerfile
+++ b/Dockerfile
@@ -6,7 +6,7 @@ ARG PREFIX=/usr/local
 RUN dpkg --add-architecture i386

 # Add rocm repository
-RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/5.0.2/ ubuntu main > /etc/apt/sources.list.d/rocm.list'
+RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/5.3/ ubuntu main > /etc/apt/sources.list.d/rocm.list'

 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \
@@ -71,10 +71,11 @@ RUN /download_models.sh && rm /download_models.sh

 # Install latest ccache version
 RUN cget -p $PREFIX install facebook/zstd@v1.4.5 -X subdir -DCMAKE_DIR=build/cmake
-RUN cget -p $PREFIX install ccache@v4.1
+RUN cget -p $PREFIX install ccache@v4.1 -DENABLE_TESTING=OFF

 # Install newer cmake for onnx runtime
-RUN cget -p /opt/cmake install kitware/cmake@v3.13.4
+ARG CMAKE_VERSION=3.24.2
+RUN cget -p /opt/cmake install -X binary https://github.com/Kitware/CMake/releases/download/v${CMAKE_VERSION}/cmake-${CMAKE_VERSION}-Linux-x86_64.tar.gz

 ARG ONNXRUNTIME_REPO=https://github.com/Microsoft/onnxruntime
 ARG ONNXRUNTIME_BRANCH=main
@@ -86,7 +87,7 @@ RUN git clone --single-branch --branch ${ONNXRUNTIME_BRANCH} --recursive ${ONNXR

 ADD tools/build_and_test_onnxrt.sh /onnxruntime/build_and_test_onnxrt.sh

-RUN cget -p /usr/local install ROCmSoftwarePlatform/llvm-project-mlir@e8e77eb16be413d301ea8509726d47f265d9011f -DBUILD_MIXR_TARGET=On
+RUN cget -p /usr/local install ROCmSoftwarePlatform/rocMLIR@0f38fb33f518b53b94b541feb9b079668c5518e8 -DBUILD_MIXR_TARGET=On -DLLVM_ENABLE_ZSTD=Off -DLLVM_ENABLE_THREADS=Off

 ENV MIOPEN_FIND_DB_PATH=/tmp/miopen/find-db
 ENV MIOPEN_USER_DB_PATH=/tmp/miopen/user-db

--- a/dev-requirements.txt
+++ b/dev-requirements.txt
@@ -21,9 +21,9 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 #####################################################################################
-pfultz2/rocm-recipes
+ROCmSoftwarePlatform/rocm-recipes
 facebook/zstd@v1.4.5 -X subdir -DCMAKE_DIR=build/cmake
-ccache@v4.1
+ccache@v4.1 -DENABLE_TESTING=OFF
 pcre,pfultz2/pcre@8.45 -H sha256:d6f7182602a775a7d500a0cedca6449af0400c6493951513046d17615ed0bf11
 danmar/cppcheck@2.9 -DHAVE_RULES=1
 RadeonOpenCompute/rocm-cmake@1ebf7e7bc61bb5e949c171562b421264065230a7 --build

--- a/examples/migraphx/migraphx_driver/README.md
+++ b/examples/migraphx/migraphx_driver/README.md
@@ -29,6 +29,7 @@ See below for a comprehensive list of commands and option arguments, as well as
 | --tf | Load file as a tensorflow graph |
 | --migraphx | Load file as a migraphx graph |
 | --migraphx-json | Load file as a migraphx JSON graph |
+| --batch | Set batch size for the model | 
 | --nhwc | Treat tensorflow format as nhwc | 
 | --nchw | Treat tensorflow format as nchw |
 | --skip-unknown-operators | Skip unknown operators when parsing and continue to parse |

--- a/examples/nlp/python_bert_squad/requirements_bertsquad.txt
+++ b/examples/nlp/python_bert_squad/requirements_bertsquad.txt
@@ -21,6 +21,6 @@
 # OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 # THE SOFTWARE.
 #####################################################################################
-tensorflow==2.7.2
+tensorflow==2.9.3
 onnxruntime
 tokenizers
\ No newline at end of file
--- a/hip-clang.docker
+++ b/hip-clang.docker
@@ -6,7 +6,7 @@ ARG PREFIX=/usr/local
 RUN dpkg --add-architecture i386

 # Add rocm repository
-RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/5.0.2/ ubuntu main > /etc/apt/sources.list.d/rocm.list'
+RUN sh -c 'echo deb [arch=amd64 trusted=yes] http://repo.radeon.com/rocm/apt/5.3/ ubuntu main > /etc/apt/sources.list.d/rocm.list'

 # Install dependencies
 RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-unauthenticated \

--- a/rbuild.ini
+++ b/rbuild.ini
@@ -2,7 +2,7 @@
 cxx = ${rocm_path}/llvm/bin/clang++
 cc = ${rocm_path}/llvm/bin/clang
 deps =
-    pfultz2/rocm-recipes
+    ROCmSoftwarePlatform/rocm-recipes
    -f requirements.txt

 [gh]
@@ -24,4 +24,4 @@ deps =
 define =
    CMAKE_C_COMPILER_LAUNCHER=${deps_dir}/bin/ccache
    CMAKE_CXX_COMPILER_LAUNCHER=${deps_dir}/bin/ccache
-    MIGRAPHX_ENABLE_CPU=On
\ No newline at end of file
+    MIGRAPHX_ENABLE_CPU=On
--- a/requirements.txt
+++ b/requirements.txt
@@ -24,7 +24,7 @@
 google/protobuf@v3.11.0 -DCMAKE_POSITION_INDEPENDENT_CODE=On -X subdir -Dprotobuf_BUILD_TESTS=Off
 nlohmann/json@v3.8.0
 live-clones/blaze@v3.8 -X header -DHEADER_DIR=blaze -H sha256:d0ff011f47538285178908ea5f2cab46bb6a8f55b1edb6e03224a82dbc1a3212
-half,https://github.com/pfultz2/half/archive/1.12.0.tar.gz -X header -H sha256:0a08660b68abb176ebc2a0cdf8de46e3182a7f46c66443bb80dbfaaec98cf969
+half,https://github.com/ROCmSoftwarePlatform/half/archive/1.12.0.tar.gz -X header -H sha256:0a08660b68abb176ebc2a0cdf8de46e3182a7f46c66443bb80dbfaaec98cf969
 pybind/pybind11@d159a563383d10c821ba7b2a71905d1207db6de4 --build
 msgpack/msgpack-c@cpp-3.3.0 -DMSGPACK_BUILD_TESTS=Off
 sqlite3@3.17 -DCMAKE_POSITION_INDEPENDENT_CODE=On
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -55,6 +55,7 @@ add_library(migraphx
    insert_pad.cpp
    instruction.cpp
    json.cpp
+    layout_nhwc.cpp
    load_save.cpp
    make_op.cpp
    module.cpp
@@ -81,7 +82,6 @@ add_library(migraphx
    replace_allocate.cpp
    simplify_qdq.cpp
    sqlite.cpp
-    rewrite_batchnorm.cpp
    rewrite_gelu.cpp
    rewrite_pooling.cpp
    rewrite_quantization.cpp
@@ -115,7 +115,6 @@ register_migraphx_ops(
    as_shape
    atanh
    atan
-    batch_norm_inference
    broadcast
    capture
    ceil
@@ -146,6 +145,7 @@ register_migraphx_ops(
    if_op
    im2col
    isnan
+    layout
    leaky_relu
    less
    load

--- a/src/api/include/migraphx/migraphx.hpp
+++ b/src/api/include/migraphx/migraphx.hpp
@@ -32,6 +32,7 @@
 #include <memory>
 #include <numeric>
 #include <exception>
+#include <array>
 #include <vector>
 #include <cassert>
 #include <iostream>

--- a/src/auto_contiguous.cpp
+++ b/src/auto_contiguous.cpp
@@ -59,6 +59,8 @@ void auto_contiguous::apply(module& m) const
    auto last = std::prev(m.end());
    for(auto ins : iterator_for(m))
    {
+        if(ins->name() == "layout")
+            continue;
        // for last instruction that is NOT a return
        if(ins->outputs().empty() and ins != last)
            continue;

--- a/src/common.cpp
+++ b/src/common.cpp
@@ -27,6 +27,7 @@
 #include <migraphx/algorithm.hpp>
 #include <migraphx/stringutils.hpp>
 #include <migraphx/instruction.hpp>
+#include <migraphx/ranges.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -43,6 +44,7 @@ inline namespace MIGRAPHX_INLINE_NS {
 // In this case we need to broadcast the (:,:,1:,:) axis
 // of s0 plus the 1st dimension of s1 giving
 // output_lens = (3,2,7,5)
+//
 std::vector<std::size_t> compute_broadcasted_lens(std::vector<std::size_t> s0,
                                                  std::vector<std::size_t> s1)
 {
@@ -50,25 +52,62 @@ std::vector<std::size_t> compute_broadcasted_lens(std::vector<std::size_t> s0,
        return s0;
    if(s0.size() > s1.size())
        s0.swap(s1);
-
    std::vector<std::size_t> out_lens(s1);
    auto offset = s1.size() - s0.size();
    std::transform(
        s0.begin(), s0.end(), s1.begin() + offset, out_lens.begin() + offset, [&](auto a, auto b) {
            if(a != b and a != 1 and b != 1)
            {
-                MIGRAPHX_THROW("COMPUTE_BROADCASTLEN: shape {" + to_string_range(s0) + "} and {" +
-                               to_string_range(s1) + "} mismatch!");
+                MIGRAPHX_THROW("COMPUTE_BROADCASTLEN: shape {" + migraphx::to_string_range(s0) +
+                               "} and {" + migraphx::to_string_range(s1) + "} mismatch!");
            }
            return std::max(a, b);
        });
-
    return out_lens;
 }

+std::vector<shape::dynamic_dimension> compute_broadcasted_dyn_dims(shape s0, shape s1)
+{
+    // Returns the broadcasted dims of s0 and s1, computed on their dynamic_dimension form.
+    s0 = s0.to_dynamic();
+    s1 = s1.to_dynamic();
+    if(s0.ndim() > s1.ndim()) // ensure s1 has at least as many dims as s0
+    {
+        std::swap(s0, s1);
+    }
+    auto offset = s1.ndim() - s0.ndim(); // leading dims of s1 with no counterpart in s0
+    std::vector<shape::dynamic_dimension> out_dims(s1.dyn_dims()); // starts as a copy of s1's dims
+    std::transform(
+        s0.dyn_dims().cbegin(),
+        s0.dyn_dims().cend(),
+        s1.dyn_dims().cbegin() + offset, // pair each s0 dim with the trailing dims of s1
+        out_dims.begin() + offset,
+        [&](auto a, auto b) {
+            if(a == b) // identical dimensions are kept as they are
+            {
+                return a;
+            }
+            else if(a == 1 or b == 1)
+            {
+                // one side is 1: keep the larger min/max; opt set to 0, may need to be changed
+                return shape::dynamic_dimension{std::max(a.min, b.min), std::max(a.max, b.max), 0};
+            }
+            else // dimensions differ and neither compares equal to 1: cannot broadcast
+            {
+                MIGRAPHX_THROW("COMPUTE_BROADCASTED_DYN_DIMS: dynamic shapes {" +
+                               migraphx::to_string_range(s0.dyn_dims()) + "} and {" +
+                               migraphx::to_string_range(s1.dyn_dims()) + "} mismatch!");
+            }
+        });
+    return out_dims;
+}
+
+// Compute the common (broadcasted) dimensions of a list of fixed shapes
 std::vector<std::size_t> compute_common_lens(const std::vector<shape>& shapes)
 {
    assert(not shapes.empty());
+    assert(
+        std::none_of(shapes.cbegin(), shapes.cend(), [](auto shape) { return shape.dynamic(); }));
    return transform_accumulate(shapes.begin() + 1,
                                shapes.end(),
                                shapes.front().lens(),
@@ -114,20 +153,63 @@ instruction_ref insert_common_op(module& m,
                                 const operation& op,
                                 std::vector<instruction_ref> inputs)
 {
-    auto common = common_shape(to_shapes(inputs));
-    std::transform(inputs.begin(), inputs.end(), inputs.begin(), [&](auto input) {
-        if(input->get_shape().lens() != common.lens())
+    if(std::any_of(
+           inputs.cbegin(), inputs.cend(), [](auto input) { return input->get_shape().dynamic(); }))
+    {
+        // dynamic path (any input has a dynamic shape): only the two-input case is handled
+        if(inputs.size() != 2)
         {
-            input = m.insert_instruction(
-                ins, make_op("multibroadcast", {{"out_lens", common.lens()}}), input);
+            MIGRAPHX_THROW("INSERT_COMMON_OP: not handled; " + migraphx::to_string(inputs.size()) +
+                           " inputs, only handle two inputs if any are dynamic shape");
         }
-        if(input->get_shape().type() != common.type())
+
+        auto c_type = compute_common_types(to_shapes(inputs));
+        auto c_dyn_dims =
+            compute_broadcasted_dyn_dims(inputs[0]->get_shape(), inputs[1]->get_shape());
+
+        // broadcast each input whose dims differ; should work for a static or dynamic shape
+        if(inputs[0]->get_shape().dyn_dims() != c_dyn_dims)
         {
-            input = m.insert_instruction(
-                ins, make_op("convert", {{"target_type", common.type()}}), input);
+            inputs[0] = m.insert_instruction(
+                ins,
+                make_op("multibroadcast", {{"out_dyn_dims", to_value(c_dyn_dims)}}),
+                inputs[0],
+                inputs[1]);
         }
-        return input;
-    });
+        if(inputs[1]->get_shape().dyn_dims() != c_dyn_dims)
+        {
+            inputs[1] = m.insert_instruction(
+                ins,
+                make_op("multibroadcast", {{"out_dyn_dims", to_value(c_dyn_dims)}}),
+                inputs[1],
+                inputs[0]); // NOTE(review): may already be the multibroadcast inserted above
+        }
+        std::transform(inputs.begin(), inputs.end(), inputs.begin(), [&](auto input) {
+            if(input->get_shape().type() != c_type) // insert a convert only when types differ
+            {
+                input =
+                    m.insert_instruction(ins, make_op("convert", {{"target_type", c_type}}), input);
+            }
+            return input;
+        });
+    }
+    else // no dynamic inputs: broadcast to the common lens, then convert to the common type
+    {
+        auto common = common_shape(to_shapes(inputs));
+        std::transform(inputs.begin(), inputs.end(), inputs.begin(), [&](auto input) {
+            if(input->get_shape().lens() != common.lens())
+            {
+                input = m.insert_instruction(
+                    ins, make_op("multibroadcast", {{"out_lens", common.lens()}}), input);
+            }
+            if(input->get_shape().type() != common.type())
+            {
+                input = m.insert_instruction(
+                    ins, make_op("convert", {{"target_type", common.type()}}), input);
+            }
+            return input;
+        });
+    }
    return m.insert_instruction(ins, op, inputs);
 }


--- a/src/dead_code_elimination.cpp
+++ b/src/dead_code_elimination.cpp
@@ -51,8 +51,8 @@ void dead_code_elimination::apply(module& m) const
         // Skip instructions whose output shape is empty, unless the instruction is one of
         // [dynamic, builtin, undefined, identity, allocate]
        if((not i->get_shape().dynamic() and i->get_shape().elements() == 0) and
-           i->name().front() != '@' and
-           not contains({"undefined", "identity", "allocate"}, i->name()))
+           not(i->name().front() == '@') and not contains({"identity", "allocate"}, i->name()) and
+           not i->is_undefined())
            continue;
        assert(std::distance(m.begin(), i) <= std::distance(m.begin(), last));
        std::unordered_set<instruction_ref> visited;

--- a/src/driver/alexnet.cpp
+++ b/src/driver/alexnet.cpp
@@ -74,9 +74,9 @@ migraphx::program alexnet(unsigned batch) // NOLINT(readability-function-size)
    auto x_main_module_19 = mmain->add_literal(migraphx::generate_literal(
        migraphx::shape{migraphx::shape::float_type, {64, 3, 11, 11}}, 18));
    auto x_main_module_20 = mmain->add_instruction(
-        migraphx::make_json_op("convolution",
-                               "{dilation:[1,1],group:1,padding:[2,2,2,2],padding_mode:0,stride:[4,"
-                               "4],use_dynamic_same_auto_pad:0}"),
+        migraphx::make_json_op(
+            "convolution",
+            "{dilation:[1,1],group:1,padding:[2,2,2,2],padding_mode:0,stride:[4,4]}"),
        x_0,
        x_main_module_19);
    auto x_main_module_21 = mmain->add_instruction(
@@ -90,9 +90,9 @@ migraphx::program alexnet(unsigned batch) // NOLINT(readability-function-size)
            "{ceil_mode:0,lengths:[3,3],lp_order:2,mode:1,padding:[0,0,0,0],stride:[2,2]}"),
        x_main_module_23);
    auto x_main_module_25 = mmain->add_instruction(
-        migraphx::make_json_op("convolution",
-                               "{dilation:[1,1],group:1,padding:[2,2,2,2],padding_mode:0,stride:[1,"
-                               "1],use_dynamic_same_auto_pad:0}"),
+        migraphx::make_json_op(
+            "convolution",
+            "{dilation:[1,1],group:1,padding:[2,2,2,2],padding_mode:0,stride:[1,1]}"),
        x_main_module_24,
        x_main_module_17);
    auto x_main_module_26 = mmain->add_instruction(
@@ -106,9 +106,9 @@ migraphx::program alexnet(unsigned batch) // NOLINT(readability-function-size)
            "{ceil_mode:0,lengths:[3,3],lp_order:2,mode:1,padding:[0,0,0,0],stride:[2,2]}"),
        x_main_module_28);
    auto x_main_module_30 = mmain->add_instruction(
-        migraphx::make_json_op("convolution",
-                               "{dilation:[1,1],group:1,padding:[1,1,1,1],padding_mode:0,stride:[1,"
-                               "1],use_dynamic_same_auto_pad:0}"),
+        migraphx::make_json_op(
+            "convolution",
+            "{dilation:[1,1],group:1,padding:[1,1,1,1],padding_mode:0,stride:[1,1]}"),
        x_main_module_29,
        x_main_module_15);
    auto x_main_module_31 = mmain->add_instruction(
@@ -117,9 +117,9 @@ migraphx::program alexnet(unsigned batch) // NOLINT(readability-function-size)
        mmain->add_instruction(migraphx::make_op("add"), x_main_module_30, x_main_module_31);
    auto x_main_module_33 = mmain->add_instruction(migraphx::make_op("relu"), x_main_module_32);
    auto x_main_module_34 = mmain->add_instruction(
-        migraphx::make_json_op("convolution",
-                               "{dilation:[1,1],group:1,padding:[1,1,1,1],padding_mode:0,stride:[1,"
-                               "1],use_dynamic_same_auto_pad:0}"),
+        migraphx::make_json_op(
+            "convolution",
+            "{dilation:[1,1],group:1,padding:[1,1,1,1],padding_mode:0,stride:[1,1]}"),
        x_main_module_33,
        x_main_module_13);
    auto x_main_module_35 = mmain->add_instruction(
@@ -128,9 +128,9 @@ migraphx::program alexnet(unsigned batch) // NOLINT(readability-function-size)
        mmain->add_instruction(migraphx::make_op("add"), x_main_module_34, x_main_module_35);
    auto x_main_module_37 = mmain->add_instruction(migraphx::make_op("relu"), x_main_module_36);
    auto x_main_module_38 = mmain->add_instruction(
-        migraphx::make_json_op("convolution",
-                               "{dilation:[1,1],group:1,padding:[1,1,1,1],padding_mode:0,stride:[1,"
-                               "1],use_dynamic_same_auto_pad:0}"),
+        migraphx::make_json_op(
+            "convolution",
+            "{dilation:[1,1],group:1,padding:[1,1,1,1],padding_mode:0,stride:[1,1]}"),
        x_main_module_37,
        x_main_module_11);
    auto x_main_module_39 = mmain->add_instruction(

--- a/src/driver/inceptionv3.cpp
+++ b/src/driver/inceptionv3.cpp
--- a/src/driver/main.cpp
+++ b/src/driver/main.cpp
@@ -44,7 +44,6 @@
 #include <migraphx/propagate_constant.hpp>
 #include <migraphx/quantization.hpp>
 #include <migraphx/register_op.hpp>
-#include <migraphx/rewrite_batchnorm.hpp>
 #include <migraphx/simplify_algebra.hpp>
 #include <migraphx/simplify_reshapes.hpp>
 #include <migraphx/register_target.hpp>
@@ -110,8 +109,12 @@ struct loader
        ap(brief, {"--brief"}, ap.help("Make the output brief."), ap.set_value(true));
        ap(output_type,
           {"--cpp"},
-           ap.help("Print out the program as cpp program."),
+           ap.help("Print out the program as C++ program."),
           ap.set_value("cpp"));
+        ap(output_type,
+           {"--python", "--py"},
+           ap.help("Print out the program as python program."),
+           ap.set_value("py"));
        ap(output_type, {"--json"}, ap.help("Print out program as json."), ap.set_value("json"));
        ap(output_type,
           {"--text"},
@@ -221,7 +224,6 @@ struct loader
        {
            migraphx::run_passes(*p.get_main_module(),
                                 {
-                                     migraphx::rewrite_batchnorm{},
                                     migraphx::eliminate_identity{},
                                     migraphx::dead_code_elimination{},
                                     migraphx::simplify_algebra{},
@@ -261,7 +263,9 @@ struct loader
                type = "binary";
        }

-        if(type == "cpp")
+        if(type == "py")
+            p.print_py(*os);
+        else if(type == "cpp")
            p.print_cpp(*os);
        else if(type == "graphviz")
            p.print_graph(*os, brief);

--- a/src/driver/resnet50.cpp
+++ b/src/driver/resnet50.cpp
--- a/src/driver/verify.cpp
+++ b/src/driver/verify.cpp
@@ -145,7 +145,7 @@ void verify_reduced(program p,
    auto* mm  = p.get_main_module();
    auto last = std::prev(mm->end(), n + 1);
    mm->remove_instructions(last, mm->end());
-    std::cout << "Verify: " << std::endl;
+    std::cout << "Verify: " << n << std::endl;
    std::cout << p << std::endl;
    verify_program(std::to_string(n), p, t, options, quantize, inputs, tolerance);
 }
@@ -159,6 +159,7 @@ void verify_reduced_program(const program& p,
 {
    const auto* mm = p.get_main_module();
    auto n         = std::distance(mm->begin(), mm->end());
+    std::cout << "Verify steps: " << n << std::endl;
    for(std::size_t i = 0; i < n; i++)
    {
        verify_reduced(p, i, t, options, quantize, inputs, tolerance);