Merge branch 'develop' into jit-reduce-reg

dae94657 · Chris Austen · GitHub · b013d991 · 56c43445 · dae94657
Unverified Commit dae94657 authored Dec 14, 2022 by Chris Austen Committed by GitHub Dec 14, 2022
20 changed files
--- a/src/targets/gpu/mlir.cpp
+++ b/src/targets/gpu/mlir.cpp
@@ -21,6 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+#include "migraphx/make_op.hpp"
 #include <migraphx/gpu/mlir.hpp>
 #ifdef MIGRAPHX_MLIR
@@ -31,7 +32,13 @@
 #include <mlir-c/Dialect/MIGraphX.h>
 #include <mlir-c/IntegerSet.h>
 #include <mlir-c/Pass.h>
-#include <mlir-c/Registration.h>
+#include <mutex>
+#if !defined(MLIR_MIGRAPHX_DIALECT_API_VERSION) || MLIR_MIGRAPHX_DIALECT_API_VERSION != 3
+#warning "Incompatible version of rocMLIR library used, disabling"
+#undef MIGRAPHX_MLIR
+#else
+#include <mlir-c/RegisterRocMLIR.h>
+#endif
 #endif
 #include <migraphx/env.hpp>
@@ -43,15 +50,12 @@
 #include <migraphx/gpu/code_object_op.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/device_name.hpp>
-#include <migraphx/iterator_for.hpp>
 #include <migraphx/gpu/perfdb.hpp>
+#include <migraphx/iterator_for.hpp>
+#include <migraphx/permutation.hpp>
 #include <deque>
 #include <variant>
-#if defined(MLIR_MIGRAPHX_DIALECT_API_VERSION) && MLIR_MIGRAPHX_DIALECT_API_VERSION >= 2
-#define MIGRAPHX_MLIR_BARE_POINTER
-#endif
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
@@ -99,7 +103,10 @@ struct mlir_handle
    mlir_handle(T p) : handle(ptr{p}) {}
-    T get() const { return handle.get().get(); }
+    T get() const
+    {
+        return handle.get().get(); // NOLINT(readability-redundant-smartptr-get)
+    }
    T release() { return handle.release().get(); }
@@ -163,9 +170,11 @@ struct mlir_program
          location(mlirLocationUnknownGet(ctx.get())),
          mmodule(mlirModuleCreateEmpty(location))
    {
-        MlirDialectHandle mixr_handle = mlirGetDialectHandle__migraphx__();
+        MlirDialectRegistry registry = mlirDialectRegistryCreate();
-        mlirDialectHandleRegisterDialect(mixr_handle, ctx.get());
+        mlirRegisterRocMLIRDialects(registry);
-        mlirRegisterAllDialects(ctx.get());
+        mlirContextAppendDialectRegistry(ctx.get(), registry);
+        mlirContextLoadAllAvailableDialects(ctx.get());
+        mlirDialectRegistryDestroy(registry);
        mlirContextSetAllowUnregisteredDialects(ctx.get(), true /*allow*/);
    }
@@ -370,7 +379,11 @@ struct mlir_program
        mlir_operation_state& add_results(const std::vector<shape>& outputs)
        {
-            auto x = prog->make_tensors(outputs);
+            std::vector<shape> reshaped(outputs.size());
+            std::transform(outputs.begin(), outputs.end(), reshaped.begin(), [](const shape& r) {
+                return shape{r.type(), r.lens()};
+            });
+            auto x = prog->make_tensors(reshaped);
            mlirOperationStateAddResults(&op_state, x.size(), x.data());
            return *this;
        }
@@ -443,7 +456,8 @@ struct mlir_program
        auto ops = create_operation_state("func.func");
        ops.add_attributes({{"function_type", make_function_type(inputs, outputs)},
                            {"sym_name", std::string("main")},
-                            {"kernel", std::string("mixr")}});
+                            {"kernel", std::string("mixr")},
+                            {"arch", target_arch}});
        ops.add_region(std::move(region));
        insert(body, std::move(ops));
@@ -502,11 +516,13 @@ struct mlir_program
            {
                pp =
                    problem_params{ins->get_operator(), to_shapes(ins->inputs()), ins->get_shape()};
-                std::string tuned = get_tune_params();
+                // check if HW supports xdlops
+                auto target_chip  = trim(split_string(target_arch, ':').front());
+                bool xdlops       = contains(get_xdlops_archs(), target_chip);
+                std::string tuned = get_tune_params(xdlops);
                if(not tuned.empty())
                    ops.add_attributes({{"perf_config", tuned}});
-                // check if HW supports xdlops
+                if(xdlops)
-                if(contains(get_xdlops_archs(), target_name))
                    ops.add_attributes({{"xdlopsV2", true}});
            }
@@ -530,7 +546,7 @@ struct mlir_program
        // 1st pipeline to call
        mlirMIGraphXAddHighLevelPipeline(pm.get());
        // 2nd pipeline to call
-        mlirMIGraphXAddBackendPipeline(pm.get(), target_name.c_str(), "amdgcn-amd-amdhsa", "");
+        mlirMIGraphXAddBackendPipeline(pm.get(), target_arch.c_str());
        mlirPassManagerRun(pm.get(), mmodule.get());
        code_object_op op{};
@@ -540,16 +556,7 @@ struct mlir_program
        return op;
    }
-    void find_target()
+    void find_target() { target_arch = get_device_name(); }
-    {
-        std::string tname = get_device_name();
-        // HACK: Since MLIR can't handle the full target name
-        target_name = trim(split_string(tname, ':').front());
-        if(tname.size() != target_name.size())
-            std::cout
-                << "*************** WARNING: MLIR may not compile the correct target features for: "
-                << tname << std::endl;
-    }
    std::pair<std::size_t, std::size_t> get_launch_params() const
    {
@@ -571,14 +578,14 @@ struct mlir_program
        MIGRAPHX_THROW("Failed to compile mlir program");
    }
-    std::string get_tune_params() { return get_mlir_perf_for_conv(pp); }
+    std::string get_tune_params(bool xdlops) { return get_mlir_perf_for_conv(pp, xdlops); }
    mlir_context ctx;
    MlirLocation location;
    mlir_module mmodule;
    problem_params pp;
    std::deque<std::string> strings{};
-    std::string target_name;
+    std::string target_arch;
 };
 std::string dump_mlir(const module& m)
@@ -589,11 +596,61 @@ std::string dump_mlir(const module& m)
    return mlir_print(&mlirOperationPrint, mod_op);
 }
-code_object_op compile_mlir(const context&, const module& m)
+void adjust_param_shapes(module& m, const std::vector<instruction_ref>& inputs)
 {
+    auto names = m.get_parameter_names();
+    std::sort(names.begin(), names.end());
+    for(auto i : range(names.size()))
+    {
+        const auto& name  = names[i];
+        const auto& input = inputs[i]->get_shape();
+        auto param        = m.get_parameter(name);
+        if(input.standard())
+            continue;
+        auto lens    = input.lens();
+        auto strides = input.strides();
+        std::vector<operation> ops;
+        if(input.transposed())
+        {
+            auto perm  = find_permutation(input);
+            auto iperm = invert_permutation(perm);
+            lens       = reorder_dims(lens, iperm);
+            strides    = reorder_dims(strides, iperm);
+            ops.push_back(make_op("transpose", {{"permutation", perm}}));
+        }
+        if(input.broadcasted())
+        {
+            std::transform(lens.begin(),
+                           lens.end(),
+                           strides.begin(),
+                           lens.begin(),
+                           [](auto len, auto stride) -> std::size_t {
+                               if(stride == 0)
+                                   return 1;
+                               return len;
+                           });
+            ops.push_back(make_op("multibroadcast", {{"out_lens", input.lens()}}));
+        }
+        auto new_param =
+            std::accumulate(ops.begin(),
+                            ops.end(),
+                            m.add_parameter(name + ".0", shape{input.type(), lens}),
+                            [&](auto x, auto op) { return m.insert_instruction(param, op, x); });
+        m.replace_instruction(param, new_param);
+        m.remove_instruction(param);
+    }
+}
+code_object_op compile_mlir(const context&, module m, const std::vector<instruction_ref>& inputs)
+{
+    adjust_param_shapes(m, inputs);
    const bool trace = enabled(MIGRAPHX_TRACE_MLIR{});
    if(trace)
        std::cout << m << std::endl;
+    // set mutex while llvm thread support is disabled.
+    static std::mutex g_mlirc_mutex; // NOLINT
+    const std::lock_guard<std::mutex> lock(g_mlirc_mutex);
    mlir_program mp;
    mp.find_target();
    mp.parse(m);
@@ -613,46 +670,9 @@ instruction_ref insert_mlir(module& m,
    std::vector<instruction_ref> refs;
    std::size_t last = 0;
-#ifdef MIGRAPHX_MLIR_BARE_POINTER
    refs.reserve(inputs.size());
    std::copy(inputs.begin(), inputs.end(), std::back_inserter(refs));
-    last = refs.size() - 1;
+    last               = refs.size() - 1;
-#else
-    refs.reserve(inputs.size() * 15);
-    std::unordered_map<uint64_t, instruction_ref> literal_map{};
-    auto get_literal = [&](uint64_t value) {
-        auto fi = literal_map.find(value);
-        if(fi != literal_map.end())
-            return fi->second;
-        auto lit = m.add_literal(value);
-        literal_map.emplace(value, lit);
-        return lit;
-    };
-    for(auto input : inputs)
-    {
-        const size_t offset = 0;
-        auto s              = input->get_shape();
-        last                = refs.size();
-        refs.push_back(input);
-        refs.push_back(input);
-        refs.push_back(get_literal(offset)); // offset
-        // dim sizes
-        std::transform(s.lens().begin(),
-                       s.lens().end(),
-                       std::back_inserter(refs),
-                       [&](const auto& lval) { return get_literal(lval); });
-        // refs.push_back(get_literal(1)); // G
-        // dim strides
-        std::transform(s.strides().begin(),
-                       s.strides().end(),
-                       std::back_inserter(refs),
-                       [&](const auto& lval) { return get_literal(lval); });
-        // refs.push_back(get_literal(1)); // G
-    }
-#endif
    co.expected_inputs = to_shapes(refs);
    co.output_arg      = last;
    return m.insert_instruction(ins, co, refs);
@@ -662,13 +682,19 @@ instruction_ref insert_mlir(module& m,
 std::string dump_mlir(const module&) { return {}; }
-code_object_op compile_mlir(const context&, const module&) { return {}; }
 template <class T>
 void use(T&)
 {
 }
+// Disabling clang-tidy warning on non-real useage.
+// NOLINTBEGIN(performance-unnecessary-value-param)
+code_object_op compile_mlir(const context&, module, const std::vector<instruction_ref>&)
+{
+    return {};
+}
+// NOLINTEND(performance-unnecessary-value-param)
 instruction_ref
 // cppcheck-suppress funcArgNamesDifferent
 insert_mlir(module& m, instruction_ref, code_object_op co, const std::vector<instruction_ref>&)

--- a/src/targets/gpu/perfdb.cpp
+++ b/src/targets/gpu/perfdb.cpp
@@ -27,6 +27,7 @@
 #include <migraphx/stringutils.hpp>
 #include <migraphx/permutation.hpp>
 #include <fstream>
+#include <mutex>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -88,6 +89,9 @@ std::string generate_miopen_config(const problem_params& pp)
 auto query_miopen_db(const std::string& query)
 {
+    static std::mutex g_db_mutex; // NOLINT
+    const std::lock_guard<std::mutex> lock(g_db_mutex);
    // TODO: Store db as a static variable
    const auto dbpath = fs::path{"/opt"} / "rocm" / "share" / "miopen" / "db" / "miopen.db";
    // Check if db file exists.
@@ -108,16 +112,17 @@ auto query_miopen_db(const std::string& query)
 } // namespace
-std::string get_mlir_perf_for_conv(const problem_params& pp)
+std::string get_mlir_perf_for_conv(const problem_params& pp, bool xdlops)
 {
-    std::string query = "select P.* \
+    std::string solver = xdlops ? "ConvMlirIgemmFwdXdlops" : "ConvMlirIgemmFwd";
+    std::string query  = "select P.* \
                             from perf_db P, config C \
                             where P.config = C.id AND \
-                             P.solver = 'ConvMlirIgemmFwdXdlops' AND \
+                             P.solver = '${solver}' AND \
                             ${config}";
-    auto results =
+    auto results = query_miopen_db(
-        query_miopen_db(interpolate_string(query, {{"config", generate_miopen_config(pp)}}));
+        interpolate_string(query, {{"config", generate_miopen_config(pp)}, {"solver", solver}}));
    if(results.empty())
        return "";
    return results.front().at("params");

--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -51,17 +51,20 @@ struct layernorm_base
        }
        check_shapes{inputs, static_cast<const Derived&>(*this)}.has(nargs + N);
        auto s = inputs.at(0);
+        auto t = s.type();
+        if(not mods.empty())
+            t = mods.front()->get_output_shapes().front().type();
        if(s.scalar())
        {
            return s;
        }
        else if(s.broadcasted())
        {
-            return {s.type(), s.lens()};
+            return {t, s.lens()};
        }
        else
        {
-            return s.with_lens(s.lens());
+            return s.with_lens(t, s.lens());
        }
    }
 };

--- a/src/targets/gpu/quant_convolution.cpp
+++ b/src/targets/gpu/quant_convolution.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/quant_convolution.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/generate.hpp>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-shape miopen_quant_convolution::compute_shape(const std::vector<shape>& inputs) const
-{
-    check_shapes{inputs, *this}.has(4).standard();
-    return op.normalize_compute_shape({inputs.at(0), inputs.at(1)});
-}
-argument miopen_quant_convolution::compute(context& ctx,
-                                           const shape& output_shape,
-                                           const std::vector<argument>& args) const
-{
-    auto x_desc = make_tensor(args[0].get_shape(), int8_x4_format);
-    auto w_desc = make_tensor(args[1].get_shape(), int8_x4_format);
-    auto y_desc = make_tensor(output_shape);
-    float alpha = 1;
-    float beta  = 0;
-    auto status = miopenConvolutionForward(ctx.get_stream().get_miopen(),
-                                           &alpha,
-                                           x_desc.get(),
-                                           args[0].implicit(),
-                                           w_desc.get(),
-                                           args[1].implicit(),
-                                           cd.get(),
-                                           algo,
-                                           &beta,
-                                           y_desc.get(),
-                                           args[3].implicit(),
-                                           args[2].implicit(),
-                                           args[2].get_shape().bytes());
-    if(status != miopenStatusSuccess)
-    {
-        MIGRAPHX_THROW("QUANT_CONVOLUTION: run convolution forward failed");
-    }
-    return args[3];
-}
-shape miopen_quant_convolution::find(context& ctx,
-                                     const shape& output_shape,
-                                     std::vector<shape> inputs)
-{
-    shape workspace_shape{};
-    auto x_desc = make_tensor(inputs[0], int8_x4_format);
-    auto w_desc = make_tensor(inputs[1], int8_x4_format);
-    auto y_desc = make_tensor(output_shape);
-    std::size_t workspace_size = 0;
-    miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
-                                             w_desc.get(),
-                                             x_desc.get(),
-                                             cd.get(),
-                                             y_desc.get(),
-                                             &workspace_size);
-    workspace_shape = shape{shape::int8_type, {workspace_size}};
-    auto x_shape = inputs[0];
-    auto w_shape = inputs[1];
-    if(int8_x4_format)
-    {
-        x_shape = pack_int8_shape(x_shape);
-        w_shape = pack_int8_shape(w_shape);
-    }
-    auto x         = to_gpu(generate_argument(x_shape));
-    auto w         = to_gpu(generate_argument(w_shape));
-    auto y         = allocate_gpu(output_shape);
-    auto workspace = allocate_gpu(workspace_shape);
-    int algo_count = 1;
-    miopenConvAlgoPerf_t perf;
-    auto status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
-                                                        x_desc.get(),
-                                                        x.implicit(),
-                                                        w_desc.get(),
-                                                        w.implicit(),
-                                                        cd.get(),
-                                                        y_desc.get(),
-                                                        y.implicit(),
-                                                        1,
-                                                        &algo_count,
-                                                        &perf,
-                                                        workspace.implicit(),
-                                                        workspace_size,
-                                                        false);
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Quant Convolution: find convolution failed");
-    algo = perf.fwd_algo;
-    size_t solution_count;
-    status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
-                                                      w_desc.get(),
-                                                      x_desc.get(),
-                                                      cd.get(),
-                                                      y_desc.get(),
-                                                      &solution_count);
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Quant Convolution: get solution count failed");
-    std::vector<miopenConvSolution_t> solutions(solution_count);
-    status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
-                                                 w_desc.get(),
-                                                 x_desc.get(),
-                                                 cd.get(),
-                                                 y_desc.get(),
-                                                 solution_count,
-                                                 &solution_count,
-                                                 solutions.data());
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Quant Convolution: get solution failed");
-    solution_id = solutions.front().solution_id;
-    return shape{shape::int8_type, {perf.memory}};
-}
-void miopen_quant_convolution::finalize(context& ctx,
-                                        const shape& output_shape,
-                                        std::vector<shape> inputs)
-{
-    if(cd == nullptr)
-        cd = make_conv(op);
-    if(solution_id == 0)
-    {
-        // Check that workspace hasn't changed
-        auto size = inputs.at(2).bytes();
-        auto ws   = find(ctx, output_shape, inputs);
-        if(ws.bytes() > size)
-            MIGRAPHX_THROW("MIOpen Quant Convolution: workspace has changed during finalization.");
-    }
-    auto x_desc = make_tensor(inputs[0], int8_x4_format);
-    auto w_desc = make_tensor(inputs[1], int8_x4_format);
-    auto y_desc = make_tensor(output_shape);
-    auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
-                                                          w_desc.get(),
-                                                          x_desc.get(),
-                                                          cd.get(),
-                                                          y_desc.get(),
-                                                          solution_id);
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Quant Convolution: compile solution failed");
-}
-shape miopen_quant_convolution::pack_int8_shape(const shape& s) const
-{
-    if(s.type() != shape::int8_type)
-    {
-        MIGRAPHX_THROW("PACK_INT8_SHAPE: only process int8_type");
-    }
-    auto lens    = s.lens();
-    auto strides = s.strides();
-    lens[1]      = (lens[1] + 3) / 4 * 4;
-    strides[0]   = strides[1] * lens[1];
-    return {s.type(), lens, strides};
-}
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -35,13 +35,13 @@
 #include <migraphx/fuse_pointwise.hpp>
 #include <migraphx/inline_module.hpp>
 #include <migraphx/insert_pad.hpp>
+#include <migraphx/layout_nhwc.hpp>
 #include <migraphx/memory_coloring.hpp>
 #include <migraphx/normalize_ops.hpp>
 #include <migraphx/preallocate_param.hpp>
 #include <migraphx/propagate_constant.hpp>
 #include <migraphx/register_target.hpp>
 #include <migraphx/replace_allocate.hpp>
-#include <migraphx/rewrite_batchnorm.hpp>
 #include <migraphx/rewrite_gelu.hpp>
 #include <migraphx/rewrite_pooling.hpp>
 #include <migraphx/rewrite_quantization.hpp>
@@ -51,6 +51,7 @@
 #include <migraphx/simplify_qdq.hpp>
 #include <migraphx/simplify_reshapes.hpp>
 #include <migraphx/gpu/allocation_model.hpp>
+#include <migraphx/gpu/compile_miopen.hpp>
 #include <migraphx/gpu/compile_ops.hpp>
 #include <migraphx/gpu/concat_gpu_opt.hpp>
 #include <migraphx/gpu/context.hpp>
@@ -71,6 +72,7 @@ namespace gpu {
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_POINTWISE_FUSION)
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC)
 struct id_pass
 {
@@ -110,8 +112,6 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        dead_code_elimination{},
        insert_pad{},
        dead_code_elimination{},
-        rewrite_batchnorm{},
-        dead_code_elimination{},
        rewrite_rnn{},
        dead_code_elimination{},
        inline_module{},
@@ -123,6 +123,9 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        dead_code_elimination{},
        simplify_algebra{},
        simplify_reshapes{},
+        enable_pass(enabled(MIGRAPHX_ENABLE_NHWC{}), layout_nhwc{}),
+        dead_code_elimination{},
+        simplify_reshapes{},
        simplify_algebra{},
        prefuse_ops{},
        dead_code_elimination{},
@@ -139,14 +142,16 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        dead_code_elimination{},
        eliminate_concat{concat_gpu_optimization{}},
        dead_code_elimination{},
-        pack_int8_args{},
+        compile_miopen{&gctx},
        dead_code_elimination{},
-        adjust_allocation{gpu_allocation_model{}},
+        pack_int8_args{},
        dead_code_elimination{},
        fuse_ops{&ctx, options.fast_math},
        dead_code_elimination{},
        replace_allocate{gpu_allocation_model{}, options.offload_copy},
        dead_code_elimination{},
+        adjust_allocation{gpu_allocation_model{}},
+        dead_code_elimination{},
        compile_ops{&ctx},
        dead_code_elimination{},
        write_literals{&ctx},

--- a/src/targets/ref/lowering.cpp
+++ b/src/targets/ref/lowering.cpp
@@ -26,15 +26,12 @@
 #include <migraphx/instruction.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/op/identity.hpp>
-#include <migraphx/op/batch_norm_inference.hpp>
 #include <migraphx/op/convolution.hpp>
 #include <migraphx/op/deconvolution.hpp>
 #include <migraphx/op/quant_convolution.hpp>
 #include <migraphx/op/dot.hpp>
 #include <migraphx/op/quant_dot.hpp>
-#include <migraphx/op/elu.hpp>
 #include <migraphx/op/im2col.hpp>
-#include <migraphx/op/leaky_relu.hpp>
 #include <migraphx/op/logsoftmax.hpp>
 #include <migraphx/op/loop.hpp>
 #include <migraphx/op/lrn.hpp>
@@ -75,84 +72,6 @@ typename std::conditional_t<std::is_integral<T>{}, std::make_signed<T>, std::ena
    return x;
 }
-//
-// ref implemenataion of batch norm for inference
-//
-// inputs are:
-// args[0] -> input data buffer
-// args[1] -> mini batch mean
-// args[2] -> mini batch variance
-// args[3] -> gamma
-// args[4] -> bias
-//
-// The equation to compute batch norm for inference is:
-//
-// output[i] = bias + gamma * (input[i] + mean) / sqrt(variance + epsilon)
-//
-// the input data format should be nchw
-//
-struct ref_batch_norm_inference
-{
-    op::batch_norm_inference op;
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-    std::string name() const { return "ref::batch_norm_inference"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        argument output{output_shape};
-        double epsilon           = op.epsilon;
-        auto input               = args[0];
-        auto arg_gamma           = args[1];
-        auto arg_bias            = args[2];
-        auto mini_batch_mean     = args[3];
-        auto mini_batch_variance = args[4];
-        if(op.bn_mode == op::batch_norm_inference::spatial)
-        {
-            visit_all(output, input, mini_batch_mean, mini_batch_variance, arg_gamma, arg_bias)(
-                [&](auto result, auto buffer, auto mean, auto variance, auto gamma, auto bias) {
-                    par_for(output_shape.elements(), [&](auto i) {
-                        auto idx = output_shape.multi(i);
-                        auto c   = idx[1];
-                        assert((variance[c] + epsilon) > 0);
-                        result[i] =
-                            gamma[c] * (buffer[i] - mean[c]) / std::sqrt(variance[c] + epsilon) +
-                            bias[c];
-                    });
-                });
-        }
-        if(op.bn_mode == op::batch_norm_inference::per_activation)
-        {
-            visit_all(output, input, mini_batch_mean, mini_batch_variance, arg_gamma, arg_bias)(
-                [&](auto result, auto buffer, auto mean, auto variance, auto gamma, auto bias) {
-                    par_for(output_shape.elements(), [&](auto i) {
-                        auto idx   = output_shape.multi(i);
-                        idx[0]     = 0;
-                        auto index = output_shape.index(idx);
-                        assert((variance[index] + epsilon) > 0);
-                        result[i] = gamma[index] * (buffer[i] - mean[index]) /
-                                        std::sqrt(variance[index] + epsilon) +
-                                    bias[index];
-                    });
-                });
-        }
-        return output;
-    }
-};
-MIGRAPHX_REGISTER_OP(ref_batch_norm_inference)
 struct ref_lrn
 {
    op::lrn op;
@@ -237,15 +156,16 @@ struct ref_convolution : auto_register_op<ref_convolution<Op>>
    argument compute(context&, shape output_shape, std::vector<argument> args) const
    {
        std::vector<std::size_t> padding;
-        if(op.use_dynamic_same_auto_pad)
+        if(op.padding_mode != op::padding_mode_t::default_)
        {
-            auto input_lens = args[0].get_shape().lens();
+            auto input_lens   = args[0].get_shape().lens();
-            std::vector<std::size_t> img_lens{input_lens.begin() + 2, input_lens.end()};
            auto weights_lens = args[1].get_shape().lens();
-            std::vector<std::size_t> k_lens{weights_lens.begin() + 2, weights_lens.end()};
+            padding =
-            padding = calc_dyn_auto_pad(img_lens, k_lens, op.stride, op.dilation);
+                op.padding_mode == op::same_upper
-            output_shape =
+                    ? calc_dyn_auto_pad(input_lens, weights_lens, op.stride, op.dilation, true)
-                compute_padded_shape({args.at(0).get_shape(), args.at(1).get_shape()}, padding);
+                    : calc_dyn_auto_pad(input_lens, weights_lens, op.stride, op.dilation, false);
+            output_shape = compute_padded_shape(
+                args[0].get_shape(), args[1].get_shape(), padding, op.stride, op.dilation);
        }
        else
        {
@@ -313,34 +233,6 @@ struct ref_convolution : auto_register_op<ref_convolution<Op>>
        });
        return result;
    }
-    private:
-    /*!
-     * Used for dynamic auto padding since padding needs to be computed at evaulation time.
-     * \param inputs two fixed shape inputs [input_tensor, weights]
-     * \param padding from auto_pad calculation
-     */
-    shape compute_padded_shape(const std::vector<shape>& inputs,
-                               const std::vector<std::size_t>& padding) const
-    {
-        const shape& input            = inputs.at(0);
-        const shape& weights          = inputs.at(1);
-        const size_t num_spatial_dims = input.lens().size() - 2;
-        std::vector<size_t> output_lens{input.lens()[0], weights.lens()[0]};
-        // calculate the output shape of the convolution: ((W - K + 2P) / S) + 1
-        for(size_t i = 0; i < num_spatial_dims; i++)
-        {
-            auto padding_factor = padding[i] + padding[i + num_spatial_dims];
-            output_lens.push_back(std::size_t(std::max<std::ptrdiff_t>(
-                1,
-                (input.lens()[i + 2] - (1 + op.dilation[i] * (weights.lens()[i + 2] - 1)) +
-                 padding_factor) /
-                        op.stride[i] +
-                    1)));
-        }
-        return inputs[0].with_lens(output_lens);
-    }
 };
 struct ref_im2col
@@ -491,9 +383,9 @@ struct ref_gemm
    std::string name() const { return "ref::dot"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
+    argument compute(context&, const dyn_output& dyn_out, std::vector<argument> args) const
    {
-        argument result{output_shape};
+        argument result{dyn_out.computed_shape};
        migemm(result, args[0], args[1], 1.0f, 0.0f);
        return result;
@@ -537,65 +429,6 @@ struct ref_quant_gemm
 };
 MIGRAPHX_REGISTER_OP(ref_gemm)
-struct leaky_relu_op
-{
-    op::leaky_relu op;
-    std::string name() const { return "ref::leaky_relu"; }
-    auto fcn() const
-    {
-        auto a = op.alpha;
-        return [a](auto x) { return x > 0 ? x : x * a; };
-    }
-};
-struct elu_op
-{
-    op::elu op;
-    std::string name() const { return "ref::elu"; }
-    auto fcn() const
-    {
-        auto a = op.alpha;
-        return [a](auto x) { return x > 0 ? x : a * std::expm1(x); };
-    }
-};
-template <typename Op>
-struct ref_unary : auto_register_op<ref_unary<Op>>
-{
-    ref_unary() = default;
-    template <class T>
-    ref_unary(T pop) : op(Op{std::move(pop)})
-    {
-    }
-    Op op;
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op.op, f);
-    }
-    std::string name() const { return op.name(); }
-    shape compute_shape(const std::vector<shape>& inputs) const
-    {
-        check_shapes{inputs, *this}.has(1);
-        const auto& s = inputs.at(0);
-        return {s.type(), s.lens()};
-    }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        visit_all(result, args[0])([&](auto output, auto input) {
-            assert(input.get_shape().standard());
-            std::transform(input.begin(), input.end(), output.begin(), op.fcn());
-        });
-        return result;
-    }
-};
 template <class Op>
 struct ref_softmax : auto_register_op<ref_softmax<Op>>
 {
@@ -616,10 +449,10 @@ struct ref_softmax : auto_register_op<ref_softmax<Op>>
    {
        return op.normalize_compute_shape(inputs);
    }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
+    argument compute(context&, const dyn_output& dyn_out, std::vector<argument> args) const
    {
-        argument result{output_shape};
+        argument result{dyn_out.computed_shape};
-        auto batch_lens        = output_shape.lens();
+        auto batch_lens        = dyn_out.computed_shape.lens();
        int64_t tuned_axis     = tune_axis(args[0].get_shape().lens().size(), op.axis, op.name());
        std::size_t n_dims     = batch_lens[tuned_axis];
        batch_lens[tuned_axis] = 1;
@@ -642,7 +475,7 @@ struct ref_softmax : auto_register_op<ref_softmax<Op>>
                for(std::size_t j = 0; j < n_dims; ++j)
                {
                    idx[tuned_axis]   = j;
-                    std::size_t index = output_shape.index(idx);
+                    std::size_t index = dyn_out.computed_shape.index(idx);
                    output[index]     = std::exp(input[index] - batch_max[i]);
                }
@@ -731,16 +564,12 @@ struct ref_apply
    void init()
    {
-        apply_map["batch_norm_inference"] =
-            extend_op<ref_batch_norm_inference, op::batch_norm_inference>();
        apply_map["convolution"] = extend_op<ref_convolution<op::convolution>, op::convolution>();
        apply_map["dot"]         = extend_op<ref_gemm, op::dot>();
        apply_map["quant_dot"]   = extend_op<ref_quant_gemm, op::quant_dot>();
        apply_map["quant_convolution"] =
            extend_op<ref_convolution<op::quant_convolution>, op::quant_convolution>();
-        apply_map["elu"]        = extend_op<ref_unary<elu_op>, op::elu>();
        apply_map["im2col"]     = extend_op<ref_im2col, op::im2col>();
-        apply_map["leaky_relu"] = extend_op<ref_unary<leaky_relu_op>, op::leaky_relu>();
        apply_map["logsoftmax"] = extend_op<ref_softmax<op::logsoftmax>, op::logsoftmax>();
        apply_map["lrn"]        = extend_op<ref_lrn, op::lrn>();
        apply_map["pad"]        = extend_op<ref_pad, op::pad>();

--- a/src/tf/parse_batchnorm.cpp
+++ b/src/tf/parse_batchnorm.cpp
@@ -23,6 +23,7 @@
 */
 #include <migraphx/tf/op_parser.hpp>
 #include <migraphx/tf/tf_parser.hpp>
+#include <migraphx/instruction.hpp>
 #include <migraphx/ranges.hpp>
 #include <migraphx/make_op.hpp>
@@ -38,16 +39,37 @@ struct parse_batchnorm : op_parser<parse_batchnorm>
    instruction_ref parse(const op_desc& /*opd*/,
                          const tf_parser& /*parser*/,
                          tf_parser::node_info info,
-                          const std::vector<instruction_ref>& args) const
+                          std::vector<instruction_ref> args) const
    {
-        float epsilon  = 1e-5f;
+        // different default epsilon than from ONNX
-        float momentum = 0.9f;
+        float epsilon = 1e-4f;
        if(contains(info.attributes, "epsilon"))
        {
            epsilon = info.attributes.at("epsilon").f();
        }
-        auto op = make_op("batch_norm_inference", {{"epsilon", epsilon}, {"momentum", momentum}});
-        return info.add_instruction(op, args);
+        auto x_lens = args[0]->get_shape().lens();
+        auto x_type = args[0]->get_shape().type();
+        // unsqueeze tensors of shape (C) to broadcast correctly
+        auto rt  = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {0.5}});
+        auto eps = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {epsilon}});
+        auto scale_unsqueeze =
+            info.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), args[1]);
+        auto bias_unsqueeze =
+            info.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), args[2]);
+        auto mean_unsqueeze =
+            info.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), args[3]);
+        auto var_unsqueeze =
+            info.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), args[4]);
+        auto numer   = info.add_broadcastable_binary_op("sub", args[0], mean_unsqueeze);
+        auto var_eps = info.add_broadcastable_binary_op("add", var_unsqueeze, eps);
+        auto denom   = info.add_broadcastable_binary_op("pow", var_eps, rt);
+        auto div0    = info.add_broadcastable_binary_op("div", numer, denom);
+        auto r0      = info.add_broadcastable_binary_op("mul", div0, scale_unsqueeze);
+        return info.add_broadcastable_binary_op("add", r0, bias_unsqueeze);
    }
 };

--- a/src/tf/parse_conv.cpp
+++ b/src/tf/parse_conv.cpp
@@ -75,7 +75,6 @@ struct parse_conv : op_parser<parse_conv>
            const std::string& pad_mode = info.attributes.at("padding").s();
            if(pad_mode.find("SAME") != std::string::npos)
            {
-                op.padding_mode                 = op::padding_mode_t::same;
                std::vector<size_t> weight_dims = weights->get_shape().lens();
                size_t weight_h                 = weight_dims[2];
                size_t weight_w                 = weight_dims[3];
@@ -87,10 +86,6 @@ struct parse_conv : op_parser<parse_conv>
                op.padding = std::vector<size_t>(pads.begin(), pads.end());
            }
-            else if(pad_mode.find("VALID") != std::string::npos)
-            {
-                op.padding_mode = op::padding_mode_t::valid;
-            }
            else if(pad_mode.find("EXPLICIT") != std::string::npos)
            {
                std::vector<size_t> padding;

--- a/src/tf/parse_depthwiseconv.cpp
+++ b/src/tf/parse_depthwiseconv.cpp
@@ -80,7 +80,6 @@ struct parse_depthwiseconv : op_parser<parse_depthwiseconv>
            if(pad_mode.find("SAME") != std::string::npos)
            {
-                op.padding_mode                 = op::padding_mode_t::same;
                std::vector<size_t> weight_dims = weights->get_shape().lens();
                size_t weight_h                 = weight_dims[2];
                size_t weight_w                 = weight_dims[3];
@@ -101,10 +100,6 @@ struct parse_depthwiseconv : op_parser<parse_depthwiseconv>
                    op.padding[1] = pads[1];
                }
            }
-            else if(pad_mode.find("VALID") != std::string::npos)
-            {
-                op.padding_mode = op::padding_mode_t::valid;
-            }
        }
        std::vector<int64_t> new_weights_shape;

--- a/test/api/test_custom_op_gpu.cpp
+++ b/test/api/test_custom_op_gpu.cpp
@@ -55,7 +55,8 @@ struct half_copy_host final : migraphx::experimental_custom_op_base
                                           hipMemcpyHostToHost,
                                           ctx.get_queue<hipStream_t>()));
        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
-        MIGRAPHX_HIP_ASSERT(hipMemset(output_buffer_ptr, 0, copy_bytes));
+        MIGRAPHX_HIP_ASSERT(
+            hipMemsetAsync(output_buffer_ptr, 0, copy_bytes, ctx.get_queue<hipStream_t>()));
        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
        return inputs[1];
    }
@@ -97,7 +98,8 @@ struct half_copy_device final : migraphx::experimental_custom_op_base
                                           hipMemcpyDeviceToDevice,
                                           ctx.get_queue<hipStream_t>()));
        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
-        MIGRAPHX_HIP_ASSERT(hipMemset(output_buffer_ptr, 0, copy_bytes));
+        MIGRAPHX_HIP_ASSERT(
+            hipMemsetAsync(output_buffer_ptr, 0, copy_bytes, ctx.get_queue<hipStream_t>()));
        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
        return inputs[1];
    }
@@ -124,7 +126,7 @@ struct half_copy_device_same_buffer final : migraphx::experimental_custom_op_bas
    virtual bool runs_on_offload_target() const override { return true; }
    virtual migraphx::argument
-    compute(migraphx::context, migraphx::shape, migraphx::arguments inputs) const override
+    compute(migraphx::context ctx, migraphx::shape, migraphx::arguments inputs) const override
    {
        // This custom op simply sets first half size_bytes of the input 0, and rest of the half
        // bytes are copied. for this custom_op, it does its computation on the "device". Therefore,
@@ -133,7 +135,8 @@ struct half_copy_device_same_buffer final : migraphx::experimental_custom_op_bas
        auto input_bytes = inputs[0].get_shape().bytes();
        auto copy_bytes  = input_bytes / 2;
        MIGRAPHX_HIP_ASSERT(hipSetDevice(0));
-        MIGRAPHX_HIP_ASSERT(hipMemset(buffer_ptr, 0, copy_bytes));
+        MIGRAPHX_HIP_ASSERT(
+            hipMemsetAsync(buffer_ptr, 0, copy_bytes, ctx.get_queue<hipStream_t>()));
        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
        return inputs[0];
    }

--- a/test/fuse_pointwise.cpp
+++ b/test/fuse_pointwise.cpp
@@ -272,6 +272,35 @@ TEST_CASE(contiguous_input)
    EXPECT(p1 == p2);
 }
+TEST_CASE(contiguous_boolean_input)
+{
+    migraphx::shape s{migraphx::shape::bool_type, {2, 3}};
+    migraphx::shape s_lit{migraphx::shape::bool_type, {1}, {0}};
+    migraphx::program p1;
+    {
+        auto* mm = p1.get_main_module();
+        auto x   = mm->add_parameter("x", s);
+        auto one = mm->add_literal(migraphx::literal(s_lit, {1.0}));
+        auto yb =
+            mm->add_instruction(migraphx::make_op("multibroadcast", {{"out_lens", s.lens()}}), one);
+        auto y    = mm->add_instruction(migraphx::make_op("contiguous"), yb);
+        auto xor1 = mm->add_instruction(migraphx::make_op("logical_xor"), x, y);
+        mm->add_return({xor1});
+    }
+    run_pass(p1);
+    migraphx::program p2;
+    {
+        auto* mm  = p2.get_main_module();
+        auto x    = mm->add_parameter("x", s);
+        auto xor1 = add_pointwise(p2, "main:pointwise0", {x}, [=](auto* pm, const auto& inputs) {
+            auto y = pm->add_literal(migraphx::literal(s_lit, {1}));
+            return pm->add_instruction(migraphx::make_op("logical_xor"), inputs[0], y);
+        });
+        mm->add_return({xor1});
+    }
+}
 TEST_CASE(all_scalar_input)
 {
    migraphx::shape s{migraphx::shape::float_type};

--- a/src/targets/gpu/elu.cpp
+++ b/src/targets/gpu/elu.cpp
@@ -21,44 +21,30 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#include <migraphx/gpu/elu.hpp>
-#include <migraphx/gpu/context.hpp>
-namespace migraphx {
+#include <test.hpp>
-inline namespace MIGRAPHX_INLINE_NS {
+#include <migraphx/argument.hpp>
-namespace gpu {
+#include <migraphx/gpu/hip.hpp>
+#include <migraphx/gpu/target.hpp>
-shape miopen_elu::compute_shape(const std::vector<shape>& inputs) const
+TEST_CASE(tuple_to_from_gpu)
 {
-    check_shapes{inputs, *this}.has(2).not_broadcasted();
+    migraphx::shape s1{migraphx::shape::float_type, {2, 3}};
-    return inputs.at(1);
+    migraphx::shape s2{migraphx::shape::int32_type, {2, 4}};
+    std::vector<float> p1_data = {1.1, 2.2, 3.3, 4.4, 5.5, 6.6};
+    std::vector<int> p2_data   = {1, 2, 3, 4, 5, 6, 7, 8};
+    auto p1                    = migraphx::argument{s1, p1_data.data()};
+    auto p2                    = migraphx::argument{s2, p2_data.data()};
+    auto p1_gpu                = migraphx::gpu::to_gpu(p1);
+    auto p2_gpu                = migraphx::gpu::to_gpu(p2);
+    auto p_tuple               = migraphx::gpu::from_gpu(migraphx::argument({p1_gpu, p2_gpu}));
+    std::vector<migraphx::argument> results = p_tuple.get_sub_objects();
+    std::vector<float> result1;
+    results[0].visit([&](auto output) { result1.assign(output.begin(), output.end()); });
+    std::vector<int> result2;
+    results[1].visit([&](auto output) { result2.assign(output.begin(), output.end()); });
+    EXPECT(result1 == p1_data);
+    EXPECT(result2 == p2_data);
 }
-argument miopen_elu::compute(context& ctx,
+int main(int argc, const char* argv[]) { test::run(argc, argv); }
-                             const shape& output_shape,
-                             const std::vector<argument>& args) const
-{
-    float alpha = 1;
-    float beta  = 0;
-    auto x_desc = make_tensor(args[0].get_shape());
-    auto y_desc = make_tensor(output_shape);
-    miopenActivationForward(ctx.get_stream().get_miopen(),
-                            ad.get(),
-                            &alpha,
-                            x_desc.get(),
-                            args[0].implicit(),
-                            &beta,
-                            y_desc.get(),
-                            args[1].implicit());
-    return args[1];
-}
-void miopen_elu::finalize(context&, const shape&, const std::vector<shape>&)
-{
-    ad = make_elu(op.alpha);
-}
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/test/gpu/literal.cpp
+++ b/test/gpu/literal.cpp
@@ -48,4 +48,4 @@ void gpu_literal_test()
    }
 }
-int main() { gpu_literal_test(); }
+int main() { gpu_literal_test(); } // NOLINT (bugprone-exception-escape)
--- a/test/gpu/mlir.cpp
+++ b/test/gpu/mlir.cpp
@@ -84,7 +84,7 @@ migraphx::program create_program_from_mlir(const migraphx::module& mmlir)
    inputs.push_back(mm->add_parameter("output", mmlir.get_output_shapes().front()));
    migraphx::gpu::context ctx;
-    migraphx::gpu::insert_mlir(*mm, mm->end(), compile_mlir(ctx, mmlir), inputs);
+    migraphx::gpu::insert_mlir(*mm, mm->end(), compile_mlir(ctx, mmlir, inputs), inputs);
    return p;
 }
@@ -140,8 +140,8 @@ TEST_CASE(conv)
 {
    const std::string mlir_output = R"__migraphx__(
 module {
-  func.func @main(%arg0: tensor<2x8x3x3xf32>, %arg1: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {kernel = "mixr"} {
+  func.func @main(%arg0: tensor<2x8x3x3xf32>, %arg1: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {arch = "", kernel = "mixr"} {
-    %0 = migraphx.convolution(%arg1, %arg0) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1], use_dynamic_same_auto_pad = 0 : i64} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
+    %0 = migraphx.convolution(%arg1, %arg0) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
    return %0 : tensor<1x2x2x2xf32>
  }
 }
@@ -163,8 +163,8 @@ TEST_CASE(conv_add_relu)
 {
    const std::string mlir_output = R"__migraphx__(
 module {
-  func.func @main(%arg0: tensor<1x2x2x2xf32>, %arg1: tensor<2x8x3x3xf32>, %arg2: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {kernel = "mixr"} {
+  func.func @main(%arg0: tensor<1x2x2x2xf32>, %arg1: tensor<2x8x3x3xf32>, %arg2: tensor<1x8x4x4xf32>) -> tensor<1x2x2x2xf32> attributes {arch = "", kernel = "mixr"} {
-    %0 = migraphx.convolution(%arg2, %arg1) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1], use_dynamic_same_auto_pad = 0 : i64} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
+    %0 = migraphx.convolution(%arg2, %arg1) {dilation = [1, 1], group = 1 : i64, padding = [0, 0, 0, 0], padding_mode = 0 : i64, stride = [1, 1]} : (tensor<1x8x4x4xf32>, tensor<2x8x3x3xf32>) -> tensor<1x2x2x2xf32>
    %1 = migraphx.add(%0, %arg0) : (tensor<1x2x2x2xf32>, tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
    %2 = migraphx.relu(%1) : (tensor<1x2x2x2xf32>) -> tensor<1x2x2x2xf32>
    return %2 : tensor<1x2x2x2xf32>

--- a/test/gpu/quantization.cpp
+++ b/test/gpu/quantization.cpp
@@ -30,7 +30,6 @@
 #include <migraphx/ref/target.hpp>
 #include <migraphx/gpu/target.hpp>
 #include <migraphx/verify.hpp>
-#include <migraphx/quantization.hpp>
 #include <migraphx/dead_code_elimination.hpp>
 #include <migraphx/propagate_constant.hpp>
 #include <migraphx/pass_manager.hpp>

--- a/test/instruction.cpp
+++ b/test/instruction.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/instruction.hpp>
+#include <migraphx/program.hpp>
+#include <migraphx/make_op.hpp>
+#include "test.hpp"
+TEST_CASE(check_undefined)
+{
+    migraphx::module m;
+    auto und = m.add_instruction(migraphx::make_op("undefined"));
+    auto cov = m.add_instruction(
+        migraphx::make_op("convert", {{"target_type", migraphx::shape::half_type}}), und);
+    auto abs = m.add_instruction(migraphx::make_op("abs"), cov);
+    migraphx::shape xs{migraphx::shape::float_type, {2, 3}};
+    std::vector<float> datax = {1, 2, 3, 4, 5, 6};
+    auto lit = m.add_literal(migraphx::literal(xs, datax));
+    auto mul = m.add_instruction(migraphx::make_op("mul"), lit, lit);
+    EXPECT(und->is_undefined());
+    EXPECT(cov->is_undefined());
+    EXPECT(abs->is_undefined());
+    EXPECT(not lit->is_undefined());
+    EXPECT(not mul->is_undefined());
+}
+int main(int argc, const char* argv[]) { test::run(argc, argv); }
--- a/test/layout_nhwc.cpp
+++ b/test/layout_nhwc.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <migraphx/layout_nhwc.hpp>
+#include <migraphx/dead_code_elimination.hpp>
+#include <migraphx/pass_manager.hpp>
+#include <migraphx/operators.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/ranges.hpp>
+#include <migraphx/instruction.hpp>
+#include <basic_ops.hpp>
+#include <migraphx/make_op.hpp>
+#include <test.hpp>
+void run_pass(migraphx::module& m)
+{
+    migraphx::run_passes(m, {migraphx::layout_nhwc{}, migraphx::dead_code_elimination{}});
+}
+migraphx::operation layout(std::vector<int64_t> permutation = {0, 1, 2, 3})
+{
+    return migraphx::make_op("layout", {{"permutation", permutation}});
+}
+migraphx::instruction_ref add_layout_nhwc(migraphx::module& m, migraphx::instruction_ref ins)
+{
+    return m.add_instruction(layout({0, 2, 3, 1}), ins);
+}
+TEST_CASE(conv_relu)
+{
+    migraphx::module m1;
+    {
+        auto x = m1.add_parameter("x", {migraphx::shape::float_type, {1, 8, 16, 16}});
+        auto w = m1.add_literal(
+            migraphx::generate_literal({migraphx::shape::float_type, {16, 8, 3, 3}}));
+        auto conv = m1.add_instruction(
+            migraphx::make_op("convolution",
+                              {{"padding", {1, 1}}, {"stride", {2, 2}}, {"dilation", {1, 1}}}),
+            x,
+            w);
+        m1.add_instruction(migraphx::make_op("relu"), conv);
+    }
+    run_pass(m1);
+    migraphx::module m2;
+    {
+        auto x = add_layout_nhwc(
+            m2, m2.add_parameter("x", {migraphx::shape::float_type, {1, 8, 16, 16}}));
+        auto w    = add_layout_nhwc(m2,
+                                 m2.add_literal(migraphx::generate_literal(
+                                     {migraphx::shape::float_type, {16, 8, 3, 3}})));
+        auto conv = m2.add_instruction(
+            migraphx::make_op("convolution",
+                              {{"padding", {1, 1}}, {"stride", {2, 2}}, {"dilation", {1, 1}}}),
+            x,
+            w);
+        auto conv_layout = m2.add_instruction(layout(), conv);
+        m2.add_instruction(migraphx::make_op("relu"), conv_layout);
+    }
+    EXPECT(m1.sort() == m2.sort());
+}
+TEST_CASE(conv_add)
+{
+    migraphx::module m1;
+    {
+        auto x = m1.add_parameter("x", {migraphx::shape::float_type, {1, 8, 16, 16}});
+        auto w = m1.add_literal(
+            migraphx::generate_literal({migraphx::shape::float_type, {16, 8, 3, 3}}));
+        auto y    = m1.add_literal(migraphx::generate_literal({migraphx::shape::float_type, {16}}));
+        auto conv = m1.add_instruction(
+            migraphx::make_op("convolution",
+                              {{"padding", {1, 1}}, {"stride", {2, 2}}, {"dilation", {1, 1}}}),
+            x,
+            w);
+        auto b = m1.add_instruction(
+            migraphx::make_op("broadcast", {{"axis", 1}, {"out_lens", conv->get_shape().lens()}}),
+            y);
+        m1.add_instruction(migraphx::make_op("add"), conv, b);
+    }
+    run_pass(m1);
+    migraphx::module m2;
+    {
+        auto x = add_layout_nhwc(
+            m2, m2.add_parameter("x", {migraphx::shape::float_type, {1, 8, 16, 16}}));
+        auto w    = add_layout_nhwc(m2,
+                                 m2.add_literal(migraphx::generate_literal(
+                                     {migraphx::shape::float_type, {16, 8, 3, 3}})));
+        auto y    = m2.add_literal(migraphx::generate_literal({migraphx::shape::float_type, {16}}));
+        auto conv = m2.add_instruction(
+            migraphx::make_op("convolution",
+                              {{"padding", {1, 1}}, {"stride", {2, 2}}, {"dilation", {1, 1}}}),
+            x,
+            w);
+        auto conv_layout = m2.add_instruction(layout(), conv);
+        auto b           = m2.add_instruction(
+            migraphx::make_op("broadcast", {{"axis", 1}, {"out_lens", conv->get_shape().lens()}}),
+            y);
+        m2.add_instruction(migraphx::make_op("add"), conv_layout, b);
+    }
+    EXPECT(m1.sort() == m2.sort());
+}
+int main(int argc, const char* argv[]) { test::run(argc, argv); }
--- a/test/literal_test.cpp
+++ b/test/literal_test.cpp
@@ -49,6 +49,25 @@ TEST_CASE(literal_test)
    EXPECT(l4.empty());
 }
+TEST_CASE(literal_nstd_shape_vector)
+{
+    migraphx::shape nstd_shape{migraphx::shape::float_type, {1, 3, 2, 2}, {12, 1, 6, 3}};
+    std::vector<float> data(12);
+    std::iota(data.begin(), data.end(), 0);
+    auto l0 = migraphx::literal{nstd_shape, data};
+    // check data buffer is read in correctly
+    std::vector<float> expected_buffer = {0, 4, 8, 1, 5, 9, 2, 6, 10, 3, 7, 11};
+    const auto* start                  = reinterpret_cast<const float*>(l0.data());
+    std::vector<float> l0_data{start, start + 12};
+    EXPECT(l0_data == expected_buffer);
+    // check that using visit() (that uses a tensor view) gives data in correct order
+    std::vector<float> results_vector(12);
+    l0.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
+    EXPECT(results_vector == data);
+}
 TEST_CASE(literal_os1)
 {
    migraphx::literal l{1};

--- a/test/onnx/argmax_dyn_test.onnx
+++ b/test/onnx/argmax_dyn_test.onnx
--- a/test/onnx/averagepool_dyn_asym_padding_error_test.onnx
+++ b/test/onnx/averagepool_dyn_asym_padding_error_test.onnx