Merge remote-tracking branch 'origin/optimize' into ck-gsg

e7f7ea10 · Alan Turner · 9c6ba1ed · ae13eb93 · e7f7ea10 · 9c6ba1ed
Commit e7f7ea10 authored Nov 30, 2022 by Alan Turner
20 changed files
--- a/src/targets/gpu/include/migraphx/gpu/convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/convolution.hpp
@@ -25,18 +25,40 @@
 #define MIGRAPHX_GUARD_RTGLIB_CONVOLUTION_HPP
 #include <migraphx/shape.hpp>
-#include <migraphx/op/convolution.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/operation.hpp>
+#include <migraphx/register_op.hpp>
 #include <migraphx/gpu/miopen.hpp>
+#include <migraphx/op/identity.hpp>
+#include <migraphx/op/convolution.hpp>
+#include <migraphx/op/quant_convolution.hpp>
+#include <migraphx/op/deconvolution.hpp>
+#include <unordered_map>
+#include <migraphx/reflect.hpp>
+#include <migraphx/gpu/context.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
-struct context;
+// Returns `input` unchanged unless it has exactly three dimensions; in that
+// case a dimension of size 1 is inserted at index 2 and a new shape of the
+// same type is built from the resulting four lengths (the original strides
+// are not carried over).
+inline shape reshape_if_1d(const shape& input)
+{
+    auto dims = input.lens();
+    if(dims.size() != 3)
+        return shape{input};
+    // {d0, d1, d2} -> {d0, d1, 1, d2}
+    std::vector<size_t> expanded = dims;
+    expanded.insert(expanded.begin() + 2, 1);
+    return shape{input.type(), expanded};
+}
+template <class Op>
 struct miopen_convolution
 {
-    op::convolution op;
+    Op op;
+    bool int8_x4_format               = false;
    shared<convolution_descriptor> cd = nullptr;
    miopenConvFwdAlgorithm_t algo{};
 #ifdef MIGRAPHX_HAS_FIND_2_API
@@ -48,29 +70,273 @@ struct miopen_convolution
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
-        return pack(f(self.op.padding, "padding"),
+        return pack(f(self.op, "op"),
-                    f(self.op.stride, "stride"),
-                    f(self.op.dilation, "dilation"),
-                    f(self.op.group, "group"),
-                    f(self.op.padding_mode, "padding_mode"),
 #ifdef MIGRAPHX_HAS_FIND_2_API
                    f(self.solution_object, "solution_object"),
 #endif
+                    f(self.algo, "algo"),
+                    f(self.int8_x4_format, "int8_x4_format"),
                    f(self.solution_id, "solution_id"));
    }
-    std::string name() const { return "gpu::convolution"; }
+    std::string name() const { return "gpu::" + op.name(); }
-    shape compute_shape(const std::vector<shape>& inputs) const;
+    // Validates the operator inputs and returns the output shape of the
+    // wrapped op. Exactly four input shapes are required; only the first two
+    // are forwarded to the wrapped operator's shape computation.
+    inline shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        check_shapes{inputs, op}.has(4);
+        const auto first = inputs.begin();
+        std::vector<shape> leading_two(first, first + 2);
+        // At most 5 dimensions, and only the listed packed layouts are accepted.
+        check_shapes{leading_two, *this}.max_ndims(5).packed_layouts(
+            {{0, 1, 2}, {0, 1, 2, 3}, {0, 2, 3, 1}, {0, 1, 2, 3, 4}});
+        return migraphx::compute_shape<Op>(op, leading_two);
+    }
    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
-    shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+    {
-    void finalize(context& ctx, const shape& output_shape, const std::vector<shape>& inputs);
+        auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()), int8_x4_format);
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+        auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()), int8_x4_format);
+        auto y_desc = make_tensor(reshape_if_1d(output_shape));
+        auto* miopen_stream_handle = ctx.get_stream().get_miopen();
+        auto workspace_size        = args[2].get_shape().bytes();
+#ifdef MIGRAPHX_HAS_FIND_2_API
+        {
+            const miopenTensorArgument_t tensor_args[3] = {
+                {miopenTensorConvolutionX, nullptr, args[0].implicit()},
+                {miopenTensorConvolutionW, nullptr, args[1].implicit()},
+                {miopenTensorConvolutionY, nullptr, args[3].implicit()},
+            };
+            if(solution_ptr.get() == nullptr)
+                MIGRAPHX_THROW("MIOpen " + op.name() + " : Load MIOpen Solution before running it");
+            auto status = miopenRunSolution(miopen_stream_handle,
+                                            solution_ptr.get(),
+                                            3,
+                                            tensor_args,
+                                            args[2].implicit(),
+                                            workspace_size);
+            if(status != miopenStatusSuccess)
+                MIGRAPHX_THROW("MIOpen " + op.name() +
+                               " : running convolution using find_2.0 failed");
+            return args[3];
+        }
+#else
+        // else use immediate mode
+        if(solution_id == 0)
+            MIGRAPHX_THROW("MIOpen " + op.name() + " : invalid solution ID");
+        auto status = miopenConvolutionForwardImmediate(miopen_stream_handle,
+                                                        w_desc.get(),
+                                                        args[1].implicit(),
+                                                        x_desc.get(),
+                                                        args[0].implicit(),
+                                                        cd.get(),
+                                                        y_desc.get(),
+                                                        args[3].implicit(),
+                                                        args[2].implicit(),
+                                                        workspace_size,
+                                                        solution_id);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen " + op.name() + ": running convolution failed");
+        return args[3];
+#endif
+    }
+    // (Re)creates the descriptor `cd` from the wrapped operator: an operator
+    // named "deconvolution" goes through make_deconv, every other operator
+    // through make_conv.
+    void set_conv_descriptor()
+    {
+        const bool is_deconv = (op.name() == "deconvolution");
+        if(is_deconv)
+            cd = make_deconv(op);
+        else
+            cd = make_conv(op);
+    }
+    // Builds the convolution descriptor, runs find() on the GPU context held
+    // by `ctx`, and reports the resulting workspace size in bytes under the
+    // "workspace" key.
+    value compile(migraphx::context& ctx, const shape& output, const std::vector<shape>& input)
+    {
+        set_conv_descriptor();
+        auto& gpu_ctx        = any_cast<migraphx::gpu::context>(ctx);
+        auto workspace_shape = find(gpu_ctx, output, input);
+        return {{"workspace", workspace_shape.bytes()}};
+    }
+    // Asks MIOpen for a forward-convolution solution for the given input and
+    // output shapes and records it on this object.
+    //
+    // With MIGRAPHX_HAS_FIND_2_API: stores the found solution in `solution_ptr`
+    // and its serialized form in `solution_object`.
+    // Otherwise (immediate mode): stores `algo` from the find result and
+    // `solution_id` from the first solution MIOpen returns.
+    //
+    // `cd` must already be set (see set_conv_descriptor()).
+    //
+    // Returns an int8 shape whose single dimension is the workspace size.
+    // Throws (MIGRAPHX_THROW) when any MIOpen call fails or, in immediate
+    // mode, when MIOpen reports no solutions.
+    shape find(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
+    {
+        auto x_desc                = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
+        auto w_desc                = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+        auto y_desc                = make_tensor(reshape_if_1d(output_shape));
+        std::size_t workspace_size = 0;
+#ifdef MIGRAPHX_HAS_FIND_2_API
+        {
+            auto conv_problem = make_obj<miopen_problem>(
+                &miopenCreateConvProblem, cd.get(), miopenProblemDirectionForward);
+            set_tensor_descriptor(miopenTensorConvolutionX, x_desc, conv_problem);
+            set_tensor_descriptor(miopenTensorConvolutionW, w_desc, conv_problem);
+            set_tensor_descriptor(miopenTensorConvolutionY, y_desc, conv_problem);
+            auto* miopen_stream_handle = ctx.get_stream().get_miopen();
+            solution_ptr = find_solution(miopen_stream_handle, conv_problem.get());
+            auto status  = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
+            if(status != miopenStatusSuccess)
+                MIGRAPHX_THROW("MIOpen " + op.name() + " : failed to get solution's workspace size");
+            std::size_t solution_size = 0;
+            status = miopenGetSolutionSize(solution_ptr.get(), &solution_size);
+            if(status != miopenStatusSuccess)
+                MIGRAPHX_THROW("MIOpen " + op.name() + ": Failed to fetch solution size");
+            // Serialize the solution so it can be reloaded later by finalize().
+            auto solution_binary = std::vector<char>{};
+            solution_binary.resize(solution_size);
+            status = miopenSaveSolution(solution_ptr.get(), solution_binary.data());
+            if(status != miopenStatusSuccess)
+                MIGRAPHX_THROW("MIOpen " + op.name() + ": Saving solution failed");
+            solution_object = value::binary{solution_binary.data(), solution_size};
+            return shape{shape::int8_type, {workspace_size}};
+        }
+#else
+        auto status = miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
+                                                               w_desc.get(),
+                                                               x_desc.get(),
+                                                               cd.get(),
+                                                               y_desc.get(),
+                                                               &workspace_size);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen " + op.name() + " : Failed to get forward workspace size");
+        shape workspace_shape{shape::int8_type, {workspace_size}};
+        auto x_shape = inputs[0];
+        auto w_shape = inputs[1];
+        if(int8_x4_format)
+        {
+            x_shape = pack_int8_shape(x_shape);
+            w_shape = pack_int8_shape(w_shape);
+        }
+        // Generated inputs and freshly allocated output/workspace buffers are
+        // handed to the MIOpen find call below.
+        auto x         = to_gpu(generate_argument(x_shape));
+        auto w         = to_gpu(generate_argument(w_shape));
+        auto y         = allocate_gpu(output_shape);
+        auto workspace = allocate_gpu(workspace_shape);
+        int algo_count = 1;
+        miopenConvAlgoPerf_t perf{};
+        status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
+                                                       x_desc.get(),
+                                                       x.implicit(),
+                                                       w_desc.get(),
+                                                       w.implicit(),
+                                                       cd.get(),
+                                                       y_desc.get(),
+                                                       y.implicit(),
+                                                       1,
+                                                       &algo_count,
+                                                       &perf,
+                                                       workspace.implicit(),
+                                                       workspace_size,
+                                                       false);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen " + op.name() + " : find convolution failed");
+        algo = perf.fwd_algo;
+        size_t solution_count = 0;
+        status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
+                                                          w_desc.get(),
+                                                          x_desc.get(),
+                                                          cd.get(),
+                                                          y_desc.get(),
+                                                          &solution_count);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen " + op.name() + ": get solution count failed");
+        // front() on an empty vector is undefined behaviour, so fail loudly
+        // when MIOpen has nothing to offer.
+        if(solution_count == 0)
+            MIGRAPHX_THROW("MIOpen " + op.name() + ": no solutions found");
+        std::vector<miopenConvSolution_t> solutions(solution_count);
+        status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
+                                                     w_desc.get(),
+                                                     x_desc.get(),
+                                                     cd.get(),
+                                                     y_desc.get(),
+                                                     solution_count,
+                                                     &solution_count,
+                                                     solutions.data());
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen " + op.name() + ": get solution failed");
+        solution_id = solutions.front().solution_id;
+        return shape{shape::int8_type, {perf.memory}};
+#endif
+    }
+    // Prepares the operator for execution after compilation.
+    //
+    // With MIGRAPHX_HAS_FIND_2_API: if no solution is loaded yet, rebuilds
+    // `solution_ptr` from the serialized `solution_object`; the arguments are
+    // unused in this configuration.
+    // Otherwise (immediate mode): rebuilds the convolution descriptor, re-runs
+    // find() when `solution_id` is still 0 (rejecting a workspace larger than
+    // the one in inputs[2]), then asks MIOpen to compile the chosen solution.
+    //
+    // Throws (MIGRAPHX_THROW) when loading or compiling the solution fails, or
+    // when the re-run find() needs more workspace than was provided.
+    void finalize(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
+    {
+#ifdef MIGRAPHX_HAS_FIND_2_API
+        {
+            (void)(ctx); // avoid warnings
+            (void)(output_shape);
+            (void)(inputs);
+            // load solution
+            if(solution_ptr == nullptr)
+            {
+                miopenSolution_t ptr = nullptr;
+                auto status =
+                    miopenLoadSolution(&ptr,
+                                       reinterpret_cast<const char*>(solution_object.data()),
+                                       solution_object.size());
+                // Check the status before taking ownership so that a failed
+                // load never leaves solution_ptr owning an invalid handle.
+                if(status != miopenStatusSuccess)
+                    MIGRAPHX_THROW("MIOpen " + op.name() + ": loading convolution solution failed");
+                solution_ptr = miopen_solution{ptr};
+            }
+        }
+#else
+        // Use immediate mode API
+        {
+            set_conv_descriptor();
+            if(solution_id == 0)
+            {
+                // Check that workspace hasn't changed
+                auto size = inputs.at(2).bytes();
+                auto ws   = find(ctx, output_shape, inputs);
+                if(ws.bytes() > size)
+                    MIGRAPHX_THROW("MIOpen " + op.name() +
+                                   ": workspace has changed during finalization.");
+            }
+            auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
+            auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+            auto y_desc = make_tensor(reshape_if_1d(output_shape));
+            auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
+                                                                  w_desc.get(),
+                                                                  x_desc.get(),
+                                                                  cd.get(),
+                                                                  y_desc.get(),
+                                                                  solution_id);
+            if(status != miopenStatusSuccess)
+                MIGRAPHX_THROW("MIOpen " + op.name() + ": compile solution failed");
+        }
+#endif
+    }
+    inline std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
-};
+    // Returns the shape used for int8 tensors in the x4 format: lens[1] is
+    // rounded up to the next multiple of 4 and strides[0] is recomputed as
+    // strides[1] * lens[1]. Shapes of any other type are returned unchanged.
+    // NOTE(review): indexes lens[1]/strides[1] unconditionally, so this
+    // presumably expects at least two dimensions - confirm against callers.
+    inline shape pack_int8_shape(const shape& s) const
+    {
+        if(s.type() == shape::int8_type)
+        {
+            auto padded_lens    = s.lens();
+            auto padded_strides = s.strides();
+            padded_lens[1]      = (padded_lens[1] + 3) / 4 * 4;
+            padded_strides[0]   = padded_strides[1] * padded_lens[1];
+            return {s.type(), padded_lens, padded_strides};
+        }
+        return s;
+    }
+};
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/targets/gpu/include/migraphx/gpu/elu.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/elu.hpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#ifndef MIGRAPHX_GUARD_RTGLIB_ELU_HPP
-#define MIGRAPHX_GUARD_RTGLIB_ELU_HPP
-#include <migraphx/op/elu.hpp>
-#include <migraphx/shape.hpp>
-#include <migraphx/reflect.hpp>
-#include <migraphx/gpu/miopen.hpp>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-struct context;
-struct miopen_elu
-{
-    op::elu op;
-    shared<activation_descriptor> ad;
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-    std::string name() const { return "gpu::elu"; }
-    shape compute_shape(const std::vector<shape>& inputs) const;
-    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    void finalize(context&, const shape&, const std::vector<shape>&);
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-    {
-        return shapes.size() - 1;
-    }
-};
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/mlir.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/mlir.hpp
@@ -36,7 +36,8 @@ struct module;
 namespace gpu {
 std::string dump_mlir(const module& m);
-code_object_op compile_mlir(const context& ctx, const module& m);
+code_object_op
+compile_mlir(const context& ctx, module m, const std::vector<instruction_ref>& inputs);
 instruction_ref insert_mlir(module& m,
                            instruction_ref ins,

--- a/src/targets/gpu/include/migraphx/gpu/perfdb.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/perfdb.hpp
@@ -41,7 +41,7 @@ struct problem_params
    shape output;
 };
-std::string get_mlir_perf_for_conv(const problem_params& pp);
+std::string get_mlir_perf_for_conv(const problem_params& pp, bool xdlops);
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/rocblas.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/rocblas.hpp
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_MIGRAPHLIB_ROCBLAS_HPP
 #include <migraphx/manage_ptr.hpp>
 #include <migraphx/config.hpp>
-#include <rocblas.h>
+#include <rocblas/rocblas.h>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

--- a/src/targets/gpu/jit/concat.cpp
+++ b/src/targets/gpu/jit/concat.cpp
@@ -38,16 +38,19 @@ using namespace migraphx::gpu::gen; // NOLINT
 static const char* const concat_kernel = R"__migraphx__(
 #include <migraphx/kernels/concat.hpp>
 #include <migraphx/kernels/vectorize.hpp>
+#include <migraphx/kernels/ops.hpp>
 #include <args.hpp>
 namespace migraphx {
+${preamble}
 extern "C" {
 __global__ void ${kernel}(${params}) 
 {
-    transform_args(make_tensors(), rotate_last(), ${transformers})(${args})([](auto y, auto... xs) {
+    transform_args(make_tensors(), rotate_last(), ${transformers})(${args})([](auto y, ${concat_params}, auto... xs) {
-        concat<${axis}>(y, xs...);
+        concat<${axis}>(${concat_args})(${post}, y, xs...);
    });
 }
@@ -68,28 +71,42 @@ struct concat_compiler : compiler<concat_compiler>
    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
    {
-        // TODO: Use reduce_dims
+        auto num_of_concat_inputs = v.get("concat_inputs", inputs.size() - 1);
        hip_compile_options options;
        options.inputs      = inputs;
        options.output      = inputs.back();
        options.params      = "-Wno-float-equal";
+        options.kernel_name = v.get("kernel", "concat_kernel");
        auto axis           = find_fast_axis(options.inputs);
        auto vec            = vectorize::elements(ctx, axis, options.inputs);
-        options.kernel_name = v.get("kernel", "concat_kernel");
        options.set_launch_params(
            v, compute_global_for(ctx, get_concat_elements(options.inputs) / vec.size, 256));
-        auto src = interpolate_string(concat_kernel,
+        auto src = interpolate_string(
-                                      {{"kernel", options.kernel_name},
+            concat_kernel,
-                                       {"params", enum_params(inputs.size(), "void * private_p")},
+            {{"kernel", options.kernel_name},
-                                       {"args", enum_params(inputs.size(), "private_p")},
+             {"params", enum_params(inputs.size(), "void * private_p")},
-                                       {"transformers", make_transformer_args(vec)},
+             {"args", enum_params(inputs.size(), "private_p")},
-                                       {"axis", v.at("axis").to<std::string>()}});
+             {"concat_params", enum_params(num_of_concat_inputs, "auto concat_x")},
+             {"concat_args", enum_params(num_of_concat_inputs, "concat_x")},
+             {"post", v.get("post", std::string{"op::id{}"})},
+             {"transformers", make_transformer_args(vec)},
+             {"preamble", v.get("preamble", std::string{})},
+             {"axis", v.at("axis").to<std::string>()}});
        return compile_hip_code_object(src, options);
    }
    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
    {
-        return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
+        auto v = op.to_value();
+        if(not ins->module_inputs().empty())
+        {
+            auto* pm           = ins->module_inputs().front();
+            v["concat_inputs"] = ins->inputs().size() - pm->get_parameter_names().size();
+            v["preamble"]      = generate_pointwise(*pm, "post_concat");
+            v["post"]          = "MIGRAPHX_LIFT(post_concat)";
+            v["kernel"]        = "concat_" + generate_name_from_ops(*pm) + "_kernel";
+        }
+        return replace(compile_op(ctx, to_shapes(ins->inputs()), v));
    }
 };

--- a/src/targets/gpu/jit/mlir.cpp
+++ b/src/targets/gpu/jit/mlir.cpp
@@ -41,7 +41,7 @@ struct mlir_compiler : compiler<mlir_compiler>
    {
        auto* smod = ins->module_inputs().front();
        assert(smod->get_parameter_names().size() == ins->inputs().size() - 1);
-        return insert(compile_mlir(ctx, *smod));
+        return insert(compile_mlir(ctx, *smod, ins->inputs()));
    }
    compiler_replace insert(code_object_op co) const

--- a/src/targets/gpu/batch_norm_inference.cpp
+++ b/src/targets/gpu/batch_norm_inference.cpp
@@ -21,65 +21,80 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#include <migraphx/gpu/batch_norm_inference.hpp>
+#include <migraphx/gpu/compiler.hpp>
 #include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/compile_hip_code_object.hpp>
+#include <migraphx/gpu/compile_hip.hpp>
+#include <migraphx/gpu/compile_gen.hpp>
+#include <migraphx/reduce_dims.hpp>
+#include <migraphx/float_equal.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
-shape miopen_batch_norm_inference::compute_shape(const std::vector<shape>& inputs) const
+using namespace migraphx::gpu::gen; // NOLINT
+static const char* const pointwise_kernel = R"__migraphx__(
+#include <migraphx/kernels/pad.hpp>
+#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/ops.hpp>
+#include <args.hpp>
+namespace migraphx {
+extern "C" {
+__global__ void pad_kernel(void* input_p, void* output_p) 
 {
-    check_shapes{inputs, *this}.has(6);
+    auto offsets = index_ints<${offsets}>{};
-    check_shapes{inputs.data(), inputs.data() + 1, *this}.same_ndims().max_ndims(5);
+    auto idx     = make_index();
-    return op.compute_shape({inputs.at(0), inputs.at(1), inputs.at(2), inputs.at(3), inputs.at(4)});
+    make_tensors()(input_p, output_p)([&](auto input, auto output) {
+        pad(idx, offsets, input, output, ${pad_val});
+    });
+}
 }
-inline shape reshape_to_2d(const shape& input)
+} // namespace migraphx
-{
-    auto dims = input.lens();
-    if(dims.size() >= 4)
-        return input;
-    std::vector<size_t> new_dims(dims.begin(), dims.end());
+)__migraphx__";
-    std::size_t num = 4 - dims.size();
-    new_dims.insert(new_dims.end(), num, 1);
-    return {input.type(), new_dims};
-}
-argument miopen_batch_norm_inference::compute(context& ctx,
+struct pad_compiler : compiler<pad_compiler>
-                                              const shape& output_shape,
-                                              const std::vector<argument>& args) const
 {
-    shape x_shape  = args[0].get_shape();
+    std::vector<std::string> names() const { return {"pad"}; }
-    shape y_shape  = output_shape;
-    shape bn_shape = args[3].get_shape();
-    auto x_desc  = make_tensor(reshape_to_2d(x_shape));
+    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
-    auto y_desc  = make_tensor(reshape_to_2d(y_shape));
+    {
-    auto bn_desc = make_tensor(reshape_to_2d(bn_shape));
+        hip_compile_options options;
+        options.inputs         = inputs;
+        options.output         = inputs.back();
+        options.virtual_inputs = reduce_dims(inputs);
+        options.kernel_name    = "pad_kernel";
+        options.set_launch_params(v, compute_global_for(ctx, inputs.at(1).elements()));
-    float alpha = 1.0;
+        auto pad_val        = v.get("value", 0.f);
-    float beta  = 0.0f;
+        auto pad_val_string = to_string(pad_val);
+        if(float_equal(pad_val, std::numeric_limits<float>::lowest()))
+            pad_val_string = "lowest{}";
+        if(float_equal(pad_val, std::numeric_limits<float>::max()))
+            pad_val_string = "highest{}";
-    miopenBatchNormalizationForwardInference(ctx.get_stream().get_miopen(),
+        auto padding    = v.at("pads").to_vector<int64_t>();
-                                             miopenBatchNormMode_t(op.bn_mode),
+        auto input_lens = inputs.front().lens();
-                                             &alpha,
+        std::vector<size_t> offsets(input_lens.size());
-                                             &beta,
+        std::copy(padding.begin(), padding.begin() + offsets.size(), offsets.begin());
-                                             x_desc.get(),
-                                             args[0].implicit(),
-                                             y_desc.get(),
-                                             args[5].implicit(),
-                                             bn_desc.get(),
-                                             args[1].implicit(),
-                                             args[2].implicit(),
-                                             args[3].implicit(),
-                                             args[4].implicit(),
-                                             op.epsilon);
-    return args[5];
+        auto src = interpolate_string(
-}
+            pointwise_kernel,
+            {{"pad_val", to_string(pad_val_string)}, {"offsets", to_string_range(offsets)}});
+        return compile_hip_code_object(src, options);
+    }
+    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
+    {
+        return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
+    }
+};
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
--- a/src/targets/gpu/jit/pointwise.cpp
+++ b/src/targets/gpu/jit/pointwise.cpp
@@ -58,7 +58,7 @@ __global__ void ${kernel}(${params})
 struct pointwise_compiler : compiler<pointwise_compiler>
 {
-    std::vector<std::string> names() const { return {"pointwise", "contiguous"}; }
+    std::vector<std::string> names() const { return {"pointwise", "contiguous", "layout"}; }
    static std::size_t oversubscribe_if(bool b)
    {
@@ -91,12 +91,12 @@ struct pointwise_compiler : compiler<pointwise_compiler>
    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
    {
-        if(op.name() == "contiguous")
+        if(contains({"layout", "contiguous"}, op.name()))
        {
            return replace(compile_op(
                ctx,
                to_shapes(ins->inputs()),
-                {{"lambda", "[](auto x) { return x; }"}, {"kernel", "contiguous_kernel"}}));
+                {{"lambda", "[](auto x) { return x; }"}, {"kernel", op.name() + "_kernel"}}));
        }
        else
        {

--- a/src/targets/gpu/jit/scatternd.cpp
+++ b/src/targets/gpu/jit/scatternd.cpp
@@ -79,9 +79,10 @@ struct scatternd_compiler : compiler<scatternd_compiler>
    {
        assert(starts_with(op.name(), "scatternd_"));
        auto reduction = op.name().substr(10);
-        return insert(compile_op(ctx,
+        return insert(compile_op(
-                                 to_shapes({ins->inputs().begin() + 1, ins->inputs().end()}),
+            ctx,
-                                 {{"reduction", reduction}}));
+            to_shapes(std::vector<instruction_ref>{ins->inputs().begin() + 1, ins->inputs().end()}),
+            {{"reduction", reduction}}));
    }
    compiler_replace insert(const operation& op) const

--- a/src/targets/gpu/kernels/include/migraphx/kernels/concat.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/concat.hpp
@@ -41,7 +41,15 @@ constexpr auto concat_slice(Output out, Input, Start)
        return Start{} * output_shape.strides[Axis];
    });
    constexpr auto s       = make_shape(lens, strides);
-    return make_tensor_view(&out[offset], s);
+    MIGRAPHX_ASSERT(offset < out.get_shape().element_space());
+    MIGRAPHX_ASSERT((s.element_space() + offset) <= out.get_shape().element_space());
+    return make_tensor_view(out.data() + offset, s);
+}
+// Returns a callable that, given a function `f`, invokes `f` with one
+// concat_slice<Axis>(x, input, start) view for every tensor `x` in `xs`,
+// i.e. each of `xs` is sliced to the region that `input` occupies starting
+// at `start` along `Axis`.
+template <index_int Axis, class Input, class Start, class... Ts>
+constexpr auto concat_slices(Input input, Start start, Ts... xs)
+{
+    return [=](auto f) { f(concat_slice<Axis>(xs, input, start)...); };
 }
 template <index_int Axis, class Input>
@@ -51,15 +59,19 @@ constexpr auto concat_ends(Input)
    return _c<lens[Axis]>;
 }
-template <index_int Axis, class Output, class... Inputs>
+template <index_int Axis, class... Inputs>
-__device__ void concat(Output output, Inputs... inputs)
+__device__ auto concat(Inputs... inputs)
 {
-    auto idx = make_index();
+    return [=](auto f, auto... ts) {
-    fold([&](auto start, auto input) {
+        auto idx = make_index();
-        auto y = concat_slice<Axis>(output, input, start);
+        fold([&](auto start, auto input) {
-        idx.global_stride(input.get_shape().elements(), [&](auto i) { y[i] = input[i]; });
+            concat_slices<Axis>(input, start, ts...)([&](auto y, auto... xs) {
-        return start + concat_ends<Axis>(input);
+                idx.global_stride(input.get_shape().elements(),
-    })(_c<0>, inputs...);
+                                  [&](auto i) { y[i] = f(input[i], xs[i]...); });
+            });
+            return start + concat_ends<Axis>(input);
+        })(_c<0>, inputs...);
+    };
 }
 } // namespace migraphx

--- a/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
@@ -21,53 +21,43 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP
+#ifndef MIGRAPHX_GUARD_KERNELS_PAD_HPP
-#define MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP
+#define MIGRAPHX_GUARD_KERNELS_PAD_HPP
-#include <migraphx/shape.hpp>
+#include <migraphx/kernels/shape.hpp>
-#include <migraphx/reflect.hpp>
+#include <migraphx/kernels/index.hpp>
-#include <migraphx/op/quant_convolution.hpp>
+#include <migraphx/kernels/algorithm.hpp>
-#include <migraphx/gpu/miopen.hpp>
+#include <migraphx/kernels/ranges.hpp>
 namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-struct context;
+template <class Offsets, class Input, class Output, class PadVal>
+__device__ void pad(const index& idx,
-struct miopen_quant_convolution
+                    const Offsets& offsets,
+                    const Input& input,
+                    Output& output,
+                    const PadVal& pad_val)
 {
-    op::quant_convolution op;
+    auto output_shape = output.get_shape();
-    bool int8_x4_format = false;
+    idx.global_stride(output_shape.elements(), [&](auto i) {
-    shared<convolution_descriptor> cd;
+        // 1. get current multi-index for output
-    miopenConvFwdAlgorithm_t algo{};
+        // 2. get the size of the input to determine input boundaries
-    uint64_t solution_id = 0;
+        // 3. compute the corresponding multi-index for input by accounting for offsets
+        // 4. if current multi-index is within offsets or input's new multi-index is out of bounds,
-    template <class Self, class F>
+        //    use pad value instead of input's value
-    static auto reflect(Self& self, F f)
+        auto multi        = output_shape.multi(i);
-    {
+        auto input_bounds = input.get_shape().lens;
-        // TODO: Add algo
+        auto input_idx    = multi - offsets;
-        return pack_join(migraphx::reflect(self.op, f),
+        auto range_multi  = range(multi.size());
-                         pack(f(self.int8_x4_format, "int8_x4_format")));
-    }
+        if(any_of(range_multi.begin(), range_multi.end(), [&](auto j) {
+               return multi[j] < offsets[j] or input_idx[j] >= input_bounds[j];
-    std::string name() const { return "gpu::quant_convolution"; }
+           }))
-    shape compute_shape(const std::vector<shape>& inputs) const;
+            output[multi] = pad_val;
-    argument
+        else
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
+            output[multi] = input[input_idx];
-    shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+    });
-    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+}
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-    {
-        return shapes.size() - 1;
-    }
-    private:
-    shape pack_int8_shape(const shape& s) const;
-};
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
 #endif
--- a/src/targets/gpu/kernels/include/migraphx/kernels/ranges.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/ranges.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_KERNELS_RANGES_HPP
+#define MIGRAPHX_GUARD_KERNELS_RANGES_HPP
+#include <migraphx/kernels/iota_iterator.hpp>
+namespace migraphx {
+template <class Iterator>
+struct iterator_range
+{
+    Iterator start;
+    Iterator last;
+    constexpr Iterator begin() const { return start; }
+    constexpr Iterator end() const { return last; }
+};
+constexpr iterator_range<iota_iterator> range(diff_int start, diff_int last)
+{
+    return {{start, {}}, {last, {}}};
+}
+constexpr iterator_range<iota_iterator> range(diff_int last) { return range(0, last); }
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_KERNELS_RANGES_HPP
--- a/src/targets/gpu/leaky_relu.cpp
+++ b/src/targets/gpu/leaky_relu.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/leaky_relu.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/gpu/miopen.hpp>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-shape miopen_leaky_relu::compute_shape(const std::vector<shape>& inputs) const
-{
-    check_shapes{inputs, *this}.has(2).not_broadcasted();
-    return inputs.at(1);
-}
-argument miopen_leaky_relu::compute(context& ctx,
-                                    const shape& output_shape,
-                                    const std::vector<argument>& args) const
-{
-    float alpha = 1;
-    float beta  = 0;
-    auto x_desc = make_tensor(args[0].get_shape());
-    auto y_desc = make_tensor(output_shape);
-    miopenActivationForward(ctx.get_stream().get_miopen(),
-                            ad.get(),
-                            &alpha,
-                            x_desc.get(),
-                            args[0].implicit(),
-                            &beta,
-                            y_desc.get(),
-                            args[1].implicit());
-    return args[1];
-}
-void miopen_leaky_relu::finalize(context&, const shape&, const std::vector<shape>&)
-{
-    ad = make_leaky_relu(op.alpha);
-}
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -29,23 +29,15 @@
 #include <migraphx/instruction_ref.hpp>
 #include <migraphx/stringutils.hpp>
-#include <migraphx/op/convolution.hpp>
-#include <migraphx/op/deconvolution.hpp>
 #include <migraphx/op/dot.hpp>
 #include <migraphx/op/if_op.hpp>
 #include <migraphx/op/reshape.hpp>
-#include <migraphx/op/quant_convolution.hpp>
 #include <migraphx/op/quant_dot.hpp>
-#include <migraphx/gpu/batch_norm_inference.hpp>
 #include <migraphx/gpu/context.hpp>
-#include <migraphx/gpu/convolution.hpp>
-#include <migraphx/gpu/deconvolution.hpp>
 #include <migraphx/gpu/device_name.hpp>
 #include <migraphx/gpu/gemm.hpp>
-#include <migraphx/gpu/int8_conv_pack.hpp>
 #include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/quant_convolution.hpp>
 #include <migraphx/gpu/rocblas.hpp>
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/iterator_for.hpp>
@@ -98,14 +90,11 @@ struct miopen_apply
        add_extend_op("argmax");
        add_extend_op("argmin");
-        add_extend_op("elu");
        add_extend_op("gather");
-        add_extend_op("leaky_relu");
        add_extend_op("logsoftmax");
        add_extend_op("lrn");
        add_extend_op("multinomial");
        add_extend_op("nonzero");
-        add_extend_op("pad");
        add_extend_op("pooling");
        add_extend_op("prefix_scan_sum");
        add_extend_op("reverse");
@@ -115,16 +104,15 @@ struct miopen_apply
        add_extend_op("scatter_none");
        add_extend_op("topk");
-        add_batch_norm_inference_op();
+        add_convolution_op("convolution");
-        add_convolution_op();
+        add_convolution_op("deconvolution");
-        add_deconvolution_op();
+        add_convolution_op("quant_convolution");
        add_gemm_op<op::dot>("dot");
        add_gemm_op<op::quant_dot>("quant_dot");
        add_if_op();
        add_loop_op();
        add_neg_op();
        add_nms_op();
-        add_quant_convolution_op();
    }
    void copy_params() const
@@ -232,38 +220,6 @@ struct miopen_apply
        return mod->insert_instruction(ins, make_op("allocate", {{"shape", to_value(s)}}));
    }
-    void add_convolution_op()
-    {
-        apply_map.emplace("convolution", [=](instruction_ref ins) {
-            auto&& op = any_cast<op::convolution>(ins->get_operator());
-            auto conv = miopen_convolution{op, make_conv(op)};
-            auto ws   = conv.find(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
-            auto workspace = insert_allocation(ins, ws);
-            auto output    = insert_allocation(ins, ins->get_shape());
-            return mod->replace_instruction(
-                ins, conv, ins->inputs().at(0), ins->inputs().at(1), workspace, output);
-        });
-    }
-    void add_deconvolution_op()
-    {
-        apply_map.emplace("deconvolution", [=](instruction_ref ins) {
-            auto&& op = any_cast<op::deconvolution>(ins->get_operator());
-            auto conv = miopen_deconvolution{op, make_deconv(op)};
-            auto ws   = conv.find(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
-            auto workspace = insert_allocation(ins, ws);
-            auto output    = insert_allocation(ins, ins->get_shape());
-            return mod->replace_instruction(
-                ins, conv, ins->inputs().at(0), ins->inputs().at(1), workspace, output);
-        });
-    }
    template <typename Op>
    void add_gemm_op(const std::string& name)
    {
@@ -277,32 +233,19 @@ struct miopen_apply
        });
    }
-    void add_quant_convolution_op()
+    void add_convolution_op(const std::string& name)
    {
-        apply_map.emplace("quant_convolution", [=](instruction_ref ins) {
+        apply_map.emplace(name, [=](instruction_ref ins) {
-            auto&& op = any_cast<op::quant_convolution>(ins->get_operator());
+            operation conv = make_op(
-            shape ws;
+                "gpu::" + name,
-            miopen_quant_convolution conv;
+                {{"op", ins->get_operator().to_value()}, {"int8_x4_format", int8_x4_format}});
-            auto compile_quant_conv_with_format = [&](bool format) {
+            auto output = insert_allocation(ins, ins->get_shape());
-                conv = miopen_quant_convolution{op, format, make_conv(op)};
-                ws   = conv.find(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
-            };
-            try
-            {
-                compile_quant_conv_with_format(int8_x4_format);
-            }
-            catch(migraphx::exception&)
-            {
-                // In case no solver supports the default format, retry using the other format.
-                compile_quant_conv_with_format(not int8_x4_format);
-            }
-            auto args      = ins->inputs();
-            auto workspace = insert_allocation(ins, ws);
-            auto output    = insert_allocation(ins, ins->get_shape());
-            return mod->replace_instruction(ins, conv, args[0], args[1], workspace, output);
+            return mod->replace_instruction(ins,
+                                            make_op("gpu::miopen_op", {{"op", to_value(conv)}}),
+                                            ins->inputs().at(0),
+                                            ins->inputs().at(1),
+                                            output);
        });
    }
@@ -336,43 +279,6 @@ struct miopen_apply
        });
    }
-    void add_batch_norm_inference_op()
-    {
-        apply_map.emplace("batch_norm_inference", [=](instruction_ref ins) {
-            auto&& op       = any_cast<op::batch_norm_inference>(ins->get_operator());
-            auto output     = insert_allocation(ins, ins->get_shape());
-            shape old_shape = ins->inputs().at(1)->get_shape();
-            auto input      = ins->inputs()[0];
-            auto input_lens = input->get_shape().lens();
-            std::vector<int64_t> rsp_lens(input_lens.size(), 1);
-            // for per_activation case, also need to reshape input
-            if(op.bn_mode == op::batch_norm_inference::per_activation)
-            {
-                std::copy(input_lens.begin() + 1, input_lens.end(), rsp_lens.begin() + 1);
-            }
-            else
-            {
-                rsp_lens[1] = static_cast<int64_t>(old_shape.elements());
-            }
-            auto reshape_op = op::reshape{rsp_lens};
-            std::vector<instruction_ref> reshapes;
-            std::transform(ins->inputs().begin() + 1,
-                           ins->inputs().end(),
-                           std::back_inserter(reshapes),
-                           [&](auto i) { return mod->insert_instruction(ins, reshape_op, i); });
-            return mod->replace_instruction(ins,
-                                            miopen_batch_norm_inference{op},
-                                            input,
-                                            reshapes[0],
-                                            reshapes[1],
-                                            reshapes[2],
-                                            reshapes[3],
-                                            output);
-        });
-    }
    // use 0 - input to represent neg
    void add_neg_op()
    {

--- a/src/targets/gpu/mlir.cpp
+++ b/src/targets/gpu/mlir.cpp
@@ -21,6 +21,7 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
+#include "migraphx/make_op.hpp"
 #include <migraphx/gpu/mlir.hpp>
 #ifdef MIGRAPHX_MLIR
@@ -43,8 +44,9 @@
 #include <migraphx/gpu/code_object_op.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/device_name.hpp>
-#include <migraphx/iterator_for.hpp>
 #include <migraphx/gpu/perfdb.hpp>
+#include <migraphx/iterator_for.hpp>
+#include <migraphx/permutation.hpp>
 #include <deque>
 #include <variant>
@@ -99,7 +101,10 @@ struct mlir_handle
    mlir_handle(T p) : handle(ptr{p}) {}
-    T get() const { return handle.get().get(); }
+    T get() const
+    {
+        return handle.get().get(); // NOLINT(readability-redundant-smartptr-get)
+    }
    T release() { return handle.release().get(); }
@@ -370,7 +375,11 @@ struct mlir_program
        mlir_operation_state& add_results(const std::vector<shape>& outputs)
        {
-            auto x = prog->make_tensors(outputs);
+            std::vector<shape> reshaped(outputs.size());
+            std::transform(outputs.begin(), outputs.end(), reshaped.begin(), [](const shape& r) {
+                return shape{r.type(), r.lens()};
+            });
+            auto x = prog->make_tensors(reshaped);
            mlirOperationStateAddResults(&op_state, x.size(), x.data());
            return *this;
        }
@@ -502,11 +511,12 @@ struct mlir_program
            {
                pp =
                    problem_params{ins->get_operator(), to_shapes(ins->inputs()), ins->get_shape()};
-                std::string tuned = get_tune_params();
+                // check if HW supports xdlops
+                bool xdlops       = contains(get_xdlops_archs(), target_name);
+                std::string tuned = get_tune_params(xdlops);
                if(not tuned.empty())
                    ops.add_attributes({{"perf_config", tuned}});
-                // check if HW supports xdlops
+                if(xdlops)
-                if(contains(get_xdlops_archs(), target_name))
                    ops.add_attributes({{"xdlopsV2", true}});
            }
@@ -571,7 +581,7 @@ struct mlir_program
        MIGRAPHX_THROW("Failed to compile mlir program");
    }
-    std::string get_tune_params() { return get_mlir_perf_for_conv(pp); }
+    std::string get_tune_params(bool xdlops) { return get_mlir_perf_for_conv(pp, xdlops); }
    mlir_context ctx;
    MlirLocation location;
@@ -589,8 +599,54 @@ std::string dump_mlir(const module& m)
    return mlir_print(&mlirOperationPrint, mod_op);
 }
-code_object_op compile_mlir(const context&, const module& m)
+void adjust_param_shapes(module& m, const std::vector<instruction_ref>& inputs)
 {
+    auto names = m.get_parameter_names();
+    std::sort(names.begin(), names.end());
+    for(auto i : range(names.size()))
+    {
+        const auto& name  = names[i];
+        const auto& input = inputs[i]->get_shape();
+        auto param        = m.get_parameter(name);
+        if(input.standard())
+            continue;
+        auto lens    = input.lens();
+        auto strides = input.strides();
+        std::vector<operation> ops;
+        if(input.transposed())
+        {
+            auto perm  = find_permutation(input);
+            auto iperm = invert_permutation(perm);
+            lens       = reorder_dims(lens, iperm);
+            strides    = reorder_dims(strides, iperm);
+            ops.push_back(make_op("transpose", {{"permutation", perm}}));
+        }
+        if(input.broadcasted())
+        {
+            std::transform(lens.begin(),
+                           lens.end(),
+                           strides.begin(),
+                           lens.begin(),
+                           [](auto len, auto stride) -> std::size_t {
+                               if(stride == 0)
+                                   return 1;
+                               return len;
+                           });
+            ops.push_back(make_op("multibroadcast", {{"out_lens", input.lens()}}));
+        }
+        auto new_param =
+            std::accumulate(ops.begin(),
+                            ops.end(),
+                            m.add_parameter(name + ".0", shape{input.type(), lens}),
+                            [&](auto x, auto op) { return m.insert_instruction(param, op, x); });
+        m.replace_instruction(param, new_param);
+        m.remove_instruction(param);
+    }
+}
+code_object_op compile_mlir(const context&, module m, const std::vector<instruction_ref>& inputs)
+{
+    adjust_param_shapes(m, inputs);
    const bool trace = enabled(MIGRAPHX_TRACE_MLIR{});
    if(trace)
        std::cout << m << std::endl;
@@ -662,13 +718,19 @@ instruction_ref insert_mlir(module& m,
 std::string dump_mlir(const module&) { return {}; }
-code_object_op compile_mlir(const context&, const module&) { return {}; }
 template <class T>
 void use(T&)
 {
 }
+// Disable the clang-tidy warning: this stub never uses its by-value parameter.
+// NOLINTBEGIN(performance-unnecessary-value-param)
+code_object_op compile_mlir(const context&, module, const std::vector<instruction_ref>&)
+{
+    return {};
+}
+// NOLINTEND(performance-unnecessary-value-param)
 instruction_ref
 // cppcheck-suppress funcArgNamesDifferent
 insert_mlir(module& m, instruction_ref, code_object_op co, const std::vector<instruction_ref>&)

--- a/src/targets/gpu/perfdb.cpp
+++ b/src/targets/gpu/perfdb.cpp
@@ -108,16 +108,17 @@ auto query_miopen_db(const std::string& query)
 } // namespace
-std::string get_mlir_perf_for_conv(const problem_params& pp)
+std::string get_mlir_perf_for_conv(const problem_params& pp, bool xdlops)
 {
-    std::string query = "select P.* \
+    std::string solver = xdlops ? "ConvMlirIgemmFwdXdlops" : "ConvMlirIgemmFwd";
+    std::string query  = "select P.* \
                             from perf_db P, config C \
                             where P.config = C.id AND \
-                             P.solver = 'ConvMlirIgemmFwdXdlops' AND \
+                             P.solver = '${solver}' AND \
                             ${config}";
-    auto results =
+    auto results = query_miopen_db(
-        query_miopen_db(interpolate_string(query, {{"config", generate_miopen_config(pp)}}));
+        interpolate_string(query, {{"config", generate_miopen_config(pp)}, {"solver", solver}}));
    if(results.empty())
        return "";
    return results.front().at("params");

--- a/src/targets/gpu/prefuse_ops.cpp
+++ b/src/targets/gpu/prefuse_ops.cpp
@@ -104,7 +104,9 @@ struct find_add_layernorm
    {
        auto ins     = r.result;
        auto add_ins = r.instructions["add"];
-        auto eps     = r.instructions["eps"]->eval().at<float>();
+        float eps    = 0;
+        if(contains(r.instructions, "eps"))
+            eps = r.instructions["eps"]->eval().at<float>();
        m.replace_instruction(ins, add_layernorm{eps}, add_ins->inputs());
    }

--- a/src/targets/gpu/quant_convolution.cpp
+++ b/src/targets/gpu/quant_convolution.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/quant_convolution.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/generate.hpp>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-shape miopen_quant_convolution::compute_shape(const std::vector<shape>& inputs) const
-{
-    check_shapes{inputs, *this}.has(4).standard();
-    return op.normalize_compute_shape({inputs.at(0), inputs.at(1)});
-}
-argument miopen_quant_convolution::compute(context& ctx,
-                                           const shape& output_shape,
-                                           const std::vector<argument>& args) const
-{
-    auto x_desc = make_tensor(args[0].get_shape(), int8_x4_format);
-    auto w_desc = make_tensor(args[1].get_shape(), int8_x4_format);
-    auto y_desc = make_tensor(output_shape);
-    float alpha = 1;
-    float beta  = 0;
-    auto status = miopenConvolutionForward(ctx.get_stream().get_miopen(),
-                                           &alpha,
-                                           x_desc.get(),
-                                           args[0].implicit(),
-                                           w_desc.get(),
-                                           args[1].implicit(),
-                                           cd.get(),
-                                           algo,
-                                           &beta,
-                                           y_desc.get(),
-                                           args[3].implicit(),
-                                           args[2].implicit(),
-                                           args[2].get_shape().bytes());
-    if(status != miopenStatusSuccess)
-    {
-        MIGRAPHX_THROW("QUANT_CONVOLUTION: run convolution forward failed");
-    }
-    return args[3];
-}
-shape miopen_quant_convolution::find(context& ctx,
-                                     const shape& output_shape,
-                                     std::vector<shape> inputs)
-{
-    shape workspace_shape{};
-    auto x_desc = make_tensor(inputs[0], int8_x4_format);
-    auto w_desc = make_tensor(inputs[1], int8_x4_format);
-    auto y_desc = make_tensor(output_shape);
-    std::size_t workspace_size = 0;
-    miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
-                                             w_desc.get(),
-                                             x_desc.get(),
-                                             cd.get(),
-                                             y_desc.get(),
-                                             &workspace_size);
-    workspace_shape = shape{shape::int8_type, {workspace_size}};
-    auto x_shape = inputs[0];
-    auto w_shape = inputs[1];
-    if(int8_x4_format)
-    {
-        x_shape = pack_int8_shape(x_shape);
-        w_shape = pack_int8_shape(w_shape);
-    }
-    auto x         = to_gpu(generate_argument(x_shape));
-    auto w         = to_gpu(generate_argument(w_shape));
-    auto y         = allocate_gpu(output_shape);
-    auto workspace = allocate_gpu(workspace_shape);
-    int algo_count = 1;
-    miopenConvAlgoPerf_t perf;
-    auto status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
-                                                        x_desc.get(),
-                                                        x.implicit(),
-                                                        w_desc.get(),
-                                                        w.implicit(),
-                                                        cd.get(),
-                                                        y_desc.get(),
-                                                        y.implicit(),
-                                                        1,
-                                                        &algo_count,
-                                                        &perf,
-                                                        workspace.implicit(),
-                                                        workspace_size,
-                                                        false);
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Quant Convolution: find convolution failed");
-    algo = perf.fwd_algo;
-    size_t solution_count;
-    status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
-                                                      w_desc.get(),
-                                                      x_desc.get(),
-                                                      cd.get(),
-                                                      y_desc.get(),
-                                                      &solution_count);
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Quant Convolution: get solution count failed");
-    std::vector<miopenConvSolution_t> solutions(solution_count);
-    status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
-                                                 w_desc.get(),
-                                                 x_desc.get(),
-                                                 cd.get(),
-                                                 y_desc.get(),
-                                                 solution_count,
-                                                 &solution_count,
-                                                 solutions.data());
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Quant Convolution: get solution failed");
-    solution_id = solutions.front().solution_id;
-    return shape{shape::int8_type, {perf.memory}};
-}
-void miopen_quant_convolution::finalize(context& ctx,
-                                        const shape& output_shape,
-                                        std::vector<shape> inputs)
-{
-    if(cd == nullptr)
-        cd = make_conv(op);
-    if(solution_id == 0)
-    {
-        // Check that workspace hasn't changed
-        auto size = inputs.at(2).bytes();
-        auto ws   = find(ctx, output_shape, inputs);
-        if(ws.bytes() > size)
-            MIGRAPHX_THROW("MIOpen Quant Convolution: workspace has changed during finalization.");
-    }
-    auto x_desc = make_tensor(inputs[0], int8_x4_format);
-    auto w_desc = make_tensor(inputs[1], int8_x4_format);
-    auto y_desc = make_tensor(output_shape);
-    auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
-                                                          w_desc.get(),
-                                                          x_desc.get(),
-                                                          cd.get(),
-                                                          y_desc.get(),
-                                                          solution_id);
-    if(status != miopenStatusSuccess)
-        MIGRAPHX_THROW("MIOpen Quant Convolution: compile solution failed");
-}
-shape miopen_quant_convolution::pack_int8_shape(const shape& s) const
-{
-    if(s.type() != shape::int8_type)
-    {
-        MIGRAPHX_THROW("PACK_INT8_SHAPE: only process int8_type");
-    }
-    auto lens    = s.lens();
-    auto strides = s.strides();
-    lens[1]      = (lens[1] + 3) / 4 * 4;
-    strides[0]   = strides[1] * lens[1];
-    return {s.type(), lens, strides};
-}
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -35,13 +35,14 @@
 #include <migraphx/fuse_pointwise.hpp>
 #include <migraphx/inline_module.hpp>
 #include <migraphx/insert_pad.hpp>
+#include <migraphx/layout_nhwc.hpp>
 #include <migraphx/memory_coloring.hpp>
 #include <migraphx/normalize_ops.hpp>
+#include <migraphx/optimize.hpp>
 #include <migraphx/preallocate_param.hpp>
 #include <migraphx/propagate_constant.hpp>
 #include <migraphx/register_target.hpp>
 #include <migraphx/replace_allocate.hpp>
-#include <migraphx/rewrite_batchnorm.hpp>
 #include <migraphx/rewrite_gelu.hpp>
 #include <migraphx/rewrite_pooling.hpp>
 #include <migraphx/rewrite_quantization.hpp>
@@ -51,6 +52,7 @@
 #include <migraphx/simplify_qdq.hpp>
 #include <migraphx/simplify_reshapes.hpp>
 #include <migraphx/gpu/allocation_model.hpp>
+#include <migraphx/gpu/compile_miopen.hpp>
 #include <migraphx/gpu/compile_ops.hpp>
 #include <migraphx/gpu/concat_gpu_opt.hpp>
 #include <migraphx/gpu/context.hpp>
@@ -73,6 +75,7 @@ namespace gpu {
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_POINTWISE_FUSION)
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC)
 struct id_pass
 {
@@ -112,20 +115,15 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        dead_code_elimination{},
        insert_pad{},
        dead_code_elimination{},
-        rewrite_batchnorm{},
-        dead_code_elimination{},
        rewrite_rnn{},
        dead_code_elimination{},
        inline_module{},
        rewrite_pooling{},
        dead_code_elimination{},
        rewrite_gelu{},
+        optimize{},
+        enable_pass(enabled(MIGRAPHX_ENABLE_NHWC{}), layout_nhwc{}),
        dead_code_elimination{},
-        eliminate_common_subexpression{},
-        dead_code_elimination{},
-        simplify_algebra{},
-        simplify_reshapes{},
-        simplify_algebra{},
        prefuse_ops{},
        dead_code_elimination{},
        auto_contiguous{},
@@ -134,6 +132,7 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        dead_code_elimination{},
        fuse_ck_gemm_softmax_gemm{&ctx},
        dead_code_elimination{},
+        optimize{},
        enable_pass(not enabled(MIGRAPHX_DISABLE_POINTWISE_FUSION{}), fuse_pointwise{}),
        dead_code_elimination{},
        fuse_mlir{&ctx},
@@ -145,6 +144,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        dead_code_elimination{},
        eliminate_concat{concat_gpu_optimization{}},
        dead_code_elimination{},
+        compile_miopen{&gctx},
+        dead_code_elimination{},
        pack_int8_args{},
        dead_code_elimination{},
        adjust_allocation{gpu_allocation_model{}},
@@ -153,6 +154,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
        dead_code_elimination{},
        replace_allocate{gpu_allocation_model{}, options.offload_copy},
        dead_code_elimination{},
+        adjust_allocation{gpu_allocation_model{}},
+        dead_code_elimination{},
        compile_ops{&ctx},
        dead_code_elimination{},
        write_literals{&ctx},