Merge branch 'develop' into jit-reduce-reg

dae94657 · Chris Austen · GitHub · b013d991 · 56c43445 · dae94657
Unverified Commit dae94657 authored Dec 14, 2022 by Chris Austen Committed by GitHub Dec 14, 2022
20 changed files
--- a/src/targets/gpu/include/migraphx/gpu/convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/convolution.hpp
@@ -25,18 +25,40 @@
 #define MIGRAPHX_GUARD_RTGLIB_CONVOLUTION_HPP
 #include <migraphx/shape.hpp>
-#include <migraphx/op/convolution.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/operation.hpp>
+#include <migraphx/register_op.hpp>
 #include <migraphx/gpu/miopen.hpp>
+#include <migraphx/op/identity.hpp>
+#include <migraphx/op/convolution.hpp>
+#include <migraphx/op/quant_convolution.hpp>
+#include <migraphx/op/deconvolution.hpp>
+#include <unordered_map>
+#include <migraphx/reflect.hpp>
+#include <migraphx/gpu/context.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
-struct context;
+// Returns `input` unchanged unless it has exactly three dimensions; a rank-3
+// shape {d0, d1, d2} is rebuilt as the rank-4 shape {d0, d1, 1, d2} of the
+// same element type (the new shape is constructed from type and lens only).
+inline shape reshape_if_1d(const shape& input)
+{
+    const auto& lens = input.lens();
+    if(lens.size() != 3)
+        return input;
+    // Keep the two leading dimensions, then place a unit dimension ahead of the last one.
+    std::vector<size_t> expanded(lens.begin(), lens.begin() + 2);
+    expanded.push_back(1);
+    expanded.push_back(lens.back());
+    return shape{input.type(), expanded};
+}
+template <class Op>
 struct miopen_convolution
 {
-    op::convolution op;
+    Op op;
+    bool int8_x4_format               = false;
    shared<convolution_descriptor> cd = nullptr;
    miopenConvFwdAlgorithm_t algo{};
 #ifdef MIGRAPHX_HAS_FIND_2_API
@@ -48,29 +70,273 @@ struct miopen_convolution
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
-        return pack(f(self.op.padding, "padding"),
+        return pack(f(self.op, "op"),
-                    f(self.op.stride, "stride"),
-                    f(self.op.dilation, "dilation"),
-                    f(self.op.group, "group"),
-                    f(self.op.padding_mode, "padding_mode"),
 #ifdef MIGRAPHX_HAS_FIND_2_API
                    f(self.solution_object, "solution_object"),
 #endif
+                    f(self.algo, "algo"),
+                    f(self.int8_x4_format, "int8_x4_format"),
                    f(self.solution_id, "solution_id"));
    }
-    std::string name() const { return "gpu::convolution"; }
+    std::string name() const { return "gpu::" + op.name(); }
-    shape compute_shape(const std::vector<shape>& inputs) const;
+    // Checks that exactly four input shapes were supplied, validates the first
+    // two (at most five dimensions, one of the listed packed layouts) and lets
+    // the wrapped operator compute the output shape from those two.
+    inline shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        check_shapes{inputs, op}.has(4);
+        // Only the leading pair of inputs is forwarded to the wrapped operator;
+        // the remaining two are not inspected here.
+        const auto first = inputs.begin();
+        std::vector<shape> data_and_weights(first, first + 2);
+        check_shapes{data_and_weights, *this}.max_ndims(5).packed_layouts(
+            {{0, 1, 2}, {0, 1, 2, 3}, {0, 2, 3, 1}, {0, 1, 2, 3, 4}});
+        return migraphx::compute_shape<Op>(op, data_and_weights);
+    }
    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
-    shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+    {
-    void finalize(context& ctx, const shape& output_shape, const std::vector<shape>& inputs);
+        auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()), int8_x4_format);
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+        auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()), int8_x4_format);
+        auto y_desc = make_tensor(reshape_if_1d(output_shape));
+        auto* miopen_stream_handle = ctx.get_stream().get_miopen();
+        auto workspace_size        = args[2].get_shape().bytes();
+#ifdef MIGRAPHX_HAS_FIND_2_API
+        {
+            const miopenTensorArgument_t tensor_args[3] = {
+                {miopenTensorConvolutionX, nullptr, args[0].implicit()},
+                {miopenTensorConvolutionW, nullptr, args[1].implicit()},
+                {miopenTensorConvolutionY, nullptr, args[3].implicit()},
+            };
+            if(solution_ptr.get() == nullptr)
+                MIGRAPHX_THROW("MIOpen " + op.name() + " : Load MIOpen Solution before running it");
+            auto status = miopenRunSolution(miopen_stream_handle,
+                                            solution_ptr.get(),
+                                            3,
+                                            tensor_args,
+                                            args[2].implicit(),
+                                            workspace_size);
+            if(status != miopenStatusSuccess)
+                MIGRAPHX_THROW("MIOpen " + op.name() +
+                               " : running convolution using find_2.0 failed");
+            return args[3];
+        }
+#else
+        // else use immediate mode
+        if(solution_id == 0)
+            MIGRAPHX_THROW("MIOpen " + op.name() + " : invalid solution ID");
+        auto status = miopenConvolutionForwardImmediate(miopen_stream_handle,
+                                                        w_desc.get(),
+                                                        args[1].implicit(),
+                                                        x_desc.get(),
+                                                        args[0].implicit(),
+                                                        cd.get(),
+                                                        y_desc.get(),
+                                                        args[3].implicit(),
+                                                        args[2].implicit(),
+                                                        workspace_size,
+                                                        solution_id);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen " + op.name() + ": running convolution failed");
+        return args[3];
+#endif
+    }
+    // Rebuilds `cd` from `op`: an operator named "deconvolution" goes through
+    // make_deconv, every other operator through make_conv.
+    void set_conv_descriptor()
+    {
+        const bool is_deconv = (op.name() == "deconvolution");
+        if(is_deconv)
+            cd = make_deconv(op);
+        else
+            cd = make_conv(op);
+    }
+    // Prepares the convolution descriptor, runs find() on the GPU context held
+    // inside the type-erased `ctx`, and reports the resulting workspace size in
+    // bytes under the key "workspace".
+    value compile(migraphx::context& ctx, const shape& output, const std::vector<shape>& input)
+    {
+        set_conv_descriptor();
+        auto& gpu_ctx        = any_cast<migraphx::gpu::context>(ctx);
+        const auto workspace = find(gpu_ctx, output, input);
+        return {{"workspace", workspace.bytes()}};
+    }
+    // Asks MIOpen for a forward solution for the given shapes and reports the
+    // workspace it needs. `cd` is read here, so it must already be populated
+    // (see set_conv_descriptor()). Only inputs[0] (x) and inputs[1] (w) are read.
+    //
+    // With MIGRAPHX_HAS_FIND_2_API the solution is kept in `solution_ptr` and its
+    // serialized bytes in `solution_object`; otherwise `algo` and `solution_id`
+    // are stored.
+    //
+    // Returns an int8 shape whose single dimension is the workspace size
+    // reported by MIOpen. Throws through MIGRAPHX_THROW when a MIOpen call does
+    // not return miopenStatusSuccess, or when MIOpen offers no solution.
+    shape find(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
+    {
+        shape workspace_shape{};
+        auto x_desc                = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
+        auto w_desc                = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+        auto y_desc                = make_tensor(reshape_if_1d(output_shape));
+        std::size_t workspace_size = 0;
+#ifdef MIGRAPHX_HAS_FIND_2_API
+        {
+            auto conv_problem = make_obj<miopen_problem>(
+                &miopenCreateConvProblem, cd.get(), miopenProblemDirectionForward);
+            set_tensor_descriptor(miopenTensorConvolutionX, x_desc, conv_problem);
+            set_tensor_descriptor(miopenTensorConvolutionW, w_desc, conv_problem);
+            set_tensor_descriptor(miopenTensorConvolutionY, y_desc, conv_problem);
+            auto* miopen_stream_handle = ctx.get_stream().get_miopen();
+            solution_ptr = find_solution(miopen_stream_handle, conv_problem.get());
+            auto status  = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
+            if(status != miopenStatusSuccess)
+                MIGRAPHX_THROW("MIOpen " + op.name() +
+                               " : failed to get solution's workspace size");
+            // Zero-initialised so a failed query can never leave an indeterminate size behind.
+            std::size_t solution_size = 0;
+            status = miopenGetSolutionSize(solution_ptr.get(), &solution_size);
+            if(status != miopenStatusSuccess)
+                MIGRAPHX_THROW("MIOpen " + op.name() + ": Failed to fetch solution size");
+            // Serialize the solution so finalize() can reload it from `solution_object`.
+            auto solution_binary = std::vector<char>{};
+            solution_binary.resize(solution_size);
+            status = miopenSaveSolution(solution_ptr.get(), solution_binary.data());
+            if(status != miopenStatusSuccess)
+                MIGRAPHX_THROW("MIOpen " + op.name() + ": Saving solution failed");
+            solution_object = value::binary{solution_binary.data(), solution_size};
+            return shape{shape::int8_type, {workspace_size}};
+        }
+#else
+        auto status = miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
+                                                               w_desc.get(),
+                                                               x_desc.get(),
+                                                               cd.get(),
+                                                               y_desc.get(),
+                                                               &workspace_size);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen " + op.name() + " : Failed to get forward workspace size");
+        workspace_shape = shape{shape::int8_type, {workspace_size}};
+        auto x_shape = inputs[0];
+        auto w_shape = inputs[1];
+        if(int8_x4_format)
+        {
+            x_shape = pack_int8_shape(x_shape);
+            w_shape = pack_int8_shape(w_shape);
+        }
+        // The algorithm search runs on real device buffers, so generated
+        // arguments and scratch allocations are created just for this call.
+        auto x         = to_gpu(generate_argument(x_shape));
+        auto w         = to_gpu(generate_argument(w_shape));
+        auto y         = allocate_gpu(output_shape);
+        auto workspace = allocate_gpu(workspace_shape);
+        int algo_count = 1;
+        miopenConvAlgoPerf_t perf;
+        status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
+                                                       x_desc.get(),
+                                                       x.implicit(),
+                                                       w_desc.get(),
+                                                       w.implicit(),
+                                                       cd.get(),
+                                                       y_desc.get(),
+                                                       y.implicit(),
+                                                       1,
+                                                       &algo_count,
+                                                       &perf,
+                                                       workspace.implicit(),
+                                                       workspace_size,
+                                                       false);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen " + op.name() + " : find convolution failed");
+        algo = perf.fwd_algo;
+        size_t solution_count = 0;
+        status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
+                                                          w_desc.get(),
+                                                          x_desc.get(),
+                                                          cd.get(),
+                                                          y_desc.get(),
+                                                          &solution_count);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen " + op.name() + ": get solution count failed");
+        std::vector<miopenConvSolution_t> solutions(solution_count);
+        status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
+                                                     w_desc.get(),
+                                                     x_desc.get(),
+                                                     cd.get(),
+                                                     y_desc.get(),
+                                                     solution_count,
+                                                     &solution_count,
+                                                     solutions.data());
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen " + op.name() + ": get solution failed");
+        // front() on an empty vector is undefined behaviour; fail loudly instead.
+        if(solutions.empty() or solution_count == 0)
+            MIGRAPHX_THROW("MIOpen " + op.name() + ": no solution found");
+        solution_id = solutions.front().solution_id;
+        return shape{shape::int8_type, {perf.memory}};
+#endif
+    }
+    // Makes the operator ready to run after compilation or deserialization.
+    //
+    // With MIGRAPHX_HAS_FIND_2_API: if no solution is loaded yet, it is
+    // reconstructed from the serialized bytes in `solution_object`; the three
+    // parameters are unused on this path.
+    //
+    // Otherwise (immediate mode): the convolution descriptor is rebuilt; if no
+    // solution id is known, find() is rerun and must not need more workspace
+    // than inputs.at(2) provides; finally the chosen solution is compiled.
+    //
+    // Throws through MIGRAPHX_THROW when a MIOpen call fails or the workspace grew.
+    void finalize(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
+    {
+#ifdef MIGRAPHX_HAS_FIND_2_API
+        {
+            (void)(ctx); // avoid warnings
+            (void)(output_shape);
+            (void)(inputs);
+            // load solution
+            if(solution_ptr == nullptr)
+            {
+                // Start from nullptr: the handle is wrapped in the owning
+                // `solution_ptr` before the status is checked, so a failed load
+                // must not hand an indeterminate value to the owner.
+                miopenSolution_t ptr = nullptr;
+                auto status =
+                    miopenLoadSolution(&ptr,
+                                       reinterpret_cast<const char*>(solution_object.data()),
+                                       solution_object.size());
+                solution_ptr = miopen_solution{ptr};
+                if(status != miopenStatusSuccess)
+                    MIGRAPHX_THROW("MIOpen " + op.name() + ": loading convolution solution failed");
+            }
+        }
+#else
+        // Use immediate mode API
+        {
+            set_conv_descriptor();
+            if(solution_id == 0)
+            {
+                // Check that workspace hasn't changed
+                auto size = inputs.at(2).bytes();
+                auto ws   = find(ctx, output_shape, inputs);
+                if(ws.bytes() > size)
+                    MIGRAPHX_THROW("MIOpen " + op.name() +
+                                   ": workspace has changed during finalization.");
+            }
+            auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
+            auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+            auto y_desc = make_tensor(reshape_if_1d(output_shape));
+            auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
+                                                                  w_desc.get(),
+                                                                  x_desc.get(),
+                                                                  cd.get(),
+                                                                  y_desc.get(),
+                                                                  solution_id);
+            // Report the wrapped operator's name, as every other message in this struct does.
+            if(status != miopenStatusSuccess)
+                MIGRAPHX_THROW("MIOpen " + op.name() + ": compile solution failed");
+        }
+#endif
+    }
+    // Returns the index of the last entry of `shapes`, i.e. the input whose
+    // buffer the result aliases (compute() returns its last argument, args[3]).
+    // NOTE(review): the subtraction is done on the unsigned size(), so an empty
+    // `shapes` wraps before the conversion to ptrdiff_t - confirm callers never pass one.
+    inline std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
-};
+    // For int8 shapes, rounds dimension 1 up to the next multiple of four and
+    // recomputes the stride of dimension 0 to match; shapes of any other type
+    // are returned unchanged. All remaining lens and strides are kept as they are.
+    inline shape pack_int8_shape(const shape& s) const
+    {
+        if(s.type() == shape::int8_type)
+        {
+            auto padded_lens    = s.lens();
+            auto padded_strides = s.strides();
+            // Integer division first, so the product is the rounded-up multiple of 4.
+            padded_lens[1]    = 4 * ((padded_lens[1] + 3) / 4);
+            padded_strides[0] = padded_strides[1] * padded_lens[1];
+            return {s.type(), padded_lens, padded_strides};
+        }
+        return s;
+    }
+};
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/targets/gpu/include/migraphx/gpu/elu.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/elu.hpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#ifndef MIGRAPHX_GUARD_RTGLIB_ELU_HPP
-#define MIGRAPHX_GUARD_RTGLIB_ELU_HPP
-#include <migraphx/op/elu.hpp>
-#include <migraphx/shape.hpp>
-#include <migraphx/reflect.hpp>
-#include <migraphx/gpu/miopen.hpp>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-struct context;
-struct miopen_elu
-{
-    op::elu op;
-    shared<activation_descriptor> ad;
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-    std::string name() const { return "gpu::elu"; }
-    shape compute_shape(const std::vector<shape>& inputs) const;
-    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    void finalize(context&, const shape&, const std::vector<shape>&);
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-    {
-        return shapes.size() - 1;
-    }
-};
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/hip.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/hip.hpp
@@ -105,7 +105,7 @@ struct hip_copy_to_gpu
    std::string name() const { return "hip::copy_to_gpu"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{inputs, *this}.has(1, 2);
+        check_shapes{inputs, *this}.has(1, 2).same_type();
        return inputs.at(0);
    }
    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
@@ -131,7 +131,7 @@ struct hip_copy_from_gpu
    std::string name() const { return "hip::copy_from_gpu"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{inputs, *this}.has(1, 2);
+        check_shapes{inputs, *this}.has(1, 2).same_type();
        return inputs.at(0);
    }
    argument
@@ -159,7 +159,7 @@ struct hip_copy
    std::string name() const { return "hip::copy"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
-        check_shapes{inputs, *this}.has(2);
+        check_shapes{inputs, *this}.has(2).same_type();
        return inputs.at(1);
    }
    argument compute(context& ctx, const shape&, std::vector<argument> args) const

--- a/src/targets/gpu/include/migraphx/gpu/leaky_relu.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/leaky_relu.hpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#ifndef MIGRAPHX_GUARD_RTGLIB_LEAKY_RELU_HPP
-#define MIGRAPHX_GUARD_RTGLIB_LEAKY_RELU_HPP
-#include <migraphx/op/leaky_relu.hpp>
-#include <migraphx/shape.hpp>
-#include <migraphx/reflect.hpp>
-#include <migraphx/gpu/miopen.hpp>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-struct context;
-struct miopen_leaky_relu
-{
-    op::leaky_relu op;
-    shared<activation_descriptor> ad;
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-    std::string name() const { return "gpu::leaky_relu"; }
-    shape compute_shape(const std::vector<shape>& inputs) const;
-    argument
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    void finalize(context&, const shape&, const std::vector<shape>&);
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-    {
-        return shapes.size() - 1;
-    }
-};
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-#endif
--- a/src/targets/gpu/include/migraphx/gpu/mlir.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/mlir.hpp
@@ -36,7 +36,8 @@ struct module;
 namespace gpu {
 std::string dump_mlir(const module& m);
-code_object_op compile_mlir(const context& ctx, const module& m);
+code_object_op
+compile_mlir(const context& ctx, module m, const std::vector<instruction_ref>& inputs);
 instruction_ref insert_mlir(module& m,
                            instruction_ref ins,

--- a/src/targets/gpu/include/migraphx/gpu/perfdb.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/perfdb.hpp
@@ -41,7 +41,7 @@ struct problem_params
    shape output;
 };
-std::string get_mlir_perf_for_conv(const problem_params& pp);
+std::string get_mlir_perf_for_conv(const problem_params& pp, bool xdlops);
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/rocblas.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/rocblas.hpp
@@ -25,7 +25,7 @@
 #define MIGRAPHX_GUARD_MIGRAPHLIB_ROCBLAS_HPP
 #include <migraphx/manage_ptr.hpp>
 #include <migraphx/config.hpp>
-#include <rocblas.h>
+#include <rocblas/rocblas.h>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

--- a/src/targets/gpu/jit/concat.cpp
+++ b/src/targets/gpu/jit/concat.cpp
@@ -38,16 +38,19 @@ using namespace migraphx::gpu::gen; // NOLINT
 static const char* const concat_kernel = R"__migraphx__(
 #include <migraphx/kernels/concat.hpp>
 #include <migraphx/kernels/vectorize.hpp>
+#include <migraphx/kernels/ops.hpp>
 #include <args.hpp>
 namespace migraphx {
+${preamble}
 extern "C" {
 __global__ void ${kernel}(${params}) 
 {
-    transform_args(make_tensors(), rotate_last(), ${transformers})(${args})([](auto y, auto... xs) {
+    transform_args(make_tensors(), rotate_last(), ${transformers})(${args})([](auto y, ${concat_params}, auto... xs) {
-        concat<${axis}>(y, xs...);
+        concat<${axis}>(${concat_args})(${post}, y, xs...);
    });
 }
@@ -68,28 +71,42 @@ struct concat_compiler : compiler<concat_compiler>
    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
    {
-        // TODO: Use reduce_dims
+        auto num_of_concat_inputs = v.get("concat_inputs", inputs.size() - 1);
        hip_compile_options options;
        options.inputs      = inputs;
        options.output      = inputs.back();
        options.params      = "-Wno-float-equal";
+        options.kernel_name = v.get("kernel", "concat_kernel");
        auto axis           = find_fast_axis(options.inputs);
        auto vec            = vectorize::elements(ctx, axis, options.inputs);
-        options.kernel_name = v.get("kernel", "concat_kernel");
        options.set_launch_params(
            v, compute_global_for(ctx, get_concat_elements(options.inputs) / vec.size, 256));
-        auto src = interpolate_string(concat_kernel,
+        auto src = interpolate_string(
-                                      {{"kernel", options.kernel_name},
+            concat_kernel,
-                                       {"params", enum_params(inputs.size(), "void * private_p")},
+            {{"kernel", options.kernel_name},
-                                       {"args", enum_params(inputs.size(), "private_p")},
+             {"params", enum_params(inputs.size(), "void * private_p")},
-                                       {"transformers", make_transformer_args(vec)},
+             {"args", enum_params(inputs.size(), "private_p")},
-                                       {"axis", v.at("axis").to<std::string>()}});
+             {"concat_params", enum_params(num_of_concat_inputs, "auto concat_x")},
+             {"concat_args", enum_params(num_of_concat_inputs, "concat_x")},
+             {"post", v.get("post", std::string{"op::id{}"})},
+             {"transformers", make_transformer_args(vec)},
+             {"preamble", v.get("preamble", std::string{})},
+             {"axis", v.at("axis").to<std::string>()}});
        return compile_hip_code_object(src, options);
    }
    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
    {
-        return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
+        auto v = op.to_value();
+        if(not ins->module_inputs().empty())
+        {
+            auto* pm           = ins->module_inputs().front();
+            v["concat_inputs"] = ins->inputs().size() - pm->get_parameter_names().size();
+            v["preamble"]      = generate_pointwise(*pm, "post_concat");
+            v["post"]          = "MIGRAPHX_LIFT(post_concat)";
+            v["kernel"]        = "concat_" + generate_name_from_ops(*pm) + "_kernel";
+        }
+        return replace(compile_op(ctx, to_shapes(ins->inputs()), v));
    }
 };

--- a/src/targets/gpu/jit/mlir.cpp
+++ b/src/targets/gpu/jit/mlir.cpp
@@ -24,7 +24,6 @@
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/make_op.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/mlir.hpp>
 namespace migraphx {
@@ -41,7 +40,7 @@ struct mlir_compiler : compiler<mlir_compiler>
    {
        auto* smod = ins->module_inputs().front();
        assert(smod->get_parameter_names().size() == ins->inputs().size() - 1);
-        return insert(compile_mlir(ctx, *smod));
+        return insert(compile_mlir(ctx, *smod, ins->inputs()));
    }
    compiler_replace insert(code_object_op co) const

--- a/src/targets/gpu/batch_norm_inference.cpp
+++ b/src/targets/gpu/batch_norm_inference.cpp
@@ -21,65 +21,80 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#include <migraphx/gpu/batch_norm_inference.hpp>
+#include <migraphx/gpu/compiler.hpp>
 #include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/compile_hip_code_object.hpp>
+#include <migraphx/gpu/compile_hip.hpp>
+#include <migraphx/gpu/compile_gen.hpp>
+#include <migraphx/reduce_dims.hpp>
+#include <migraphx/float_equal.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
-shape miopen_batch_norm_inference::compute_shape(const std::vector<shape>& inputs) const
+using namespace migraphx::gpu::gen; // NOLINT
+static const char* const pointwise_kernel = R"__migraphx__(
+#include <migraphx/kernels/pad.hpp>
+#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/ops.hpp>
+#include <args.hpp>
+namespace migraphx {
+extern "C" {
+__global__ void pad_kernel(void* input_p, void* output_p) 
 {
-    check_shapes{inputs, *this}.has(6);
+    auto offsets = index_ints<${offsets}>{};
-    check_shapes{inputs.data(), inputs.data() + 1, *this}.same_ndims().max_ndims(5);
+    auto idx     = make_index();
-    return op.compute_shape({inputs.at(0), inputs.at(1), inputs.at(2), inputs.at(3), inputs.at(4)});
+    make_tensors()(input_p, output_p)([&](auto input, auto output) {
+        pad(idx, offsets, input, output, ${pad_val});
+    });
+}
 }
-inline shape reshape_to_2d(const shape& input)
+} // namespace migraphx
-{
-    auto dims = input.lens();
-    if(dims.size() >= 4)
-        return input;
-    std::vector<size_t> new_dims(dims.begin(), dims.end());
+)__migraphx__";
-    std::size_t num = 4 - dims.size();
-    new_dims.insert(new_dims.end(), num, 1);
-    return {input.type(), new_dims};
-}
-argument miopen_batch_norm_inference::compute(context& ctx,
+struct pad_compiler : compiler<pad_compiler>
-                                              const shape& output_shape,
-                                              const std::vector<argument>& args) const
 {
-    shape x_shape  = args[0].get_shape();
+    std::vector<std::string> names() const { return {"pad"}; }
-    shape y_shape  = output_shape;
-    shape bn_shape = args[3].get_shape();
-    auto x_desc  = make_tensor(reshape_to_2d(x_shape));
+    operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
-    auto y_desc  = make_tensor(reshape_to_2d(y_shape));
+    {
-    auto bn_desc = make_tensor(reshape_to_2d(bn_shape));
+        hip_compile_options options;
+        options.inputs         = inputs;
+        options.output         = inputs.back();
+        options.virtual_inputs = reduce_dims(inputs);
+        options.kernel_name    = "pad_kernel";
+        options.set_launch_params(v, compute_global_for(ctx, inputs.at(1).elements()));
-    float alpha = 1.0;
+        auto pad_val        = v.get("value", 0.f);
-    float beta  = 0.0f;
+        auto pad_val_string = to_string(pad_val);
+        if(float_equal(pad_val, std::numeric_limits<float>::lowest()))
+            pad_val_string = "lowest{}";
+        if(float_equal(pad_val, std::numeric_limits<float>::max()))
+            pad_val_string = "highest{}";
-    miopenBatchNormalizationForwardInference(ctx.get_stream().get_miopen(),
+        auto padding    = v.at("pads").to_vector<int64_t>();
-                                             miopenBatchNormMode_t(op.bn_mode),
+        auto input_lens = inputs.front().lens();
-                                             &alpha,
+        std::vector<size_t> offsets(input_lens.size());
-                                             &beta,
+        std::copy(padding.begin(), padding.begin() + offsets.size(), offsets.begin());
-                                             x_desc.get(),
-                                             args[0].implicit(),
-                                             y_desc.get(),
-                                             args[5].implicit(),
-                                             bn_desc.get(),
-                                             args[1].implicit(),
-                                             args[2].implicit(),
-                                             args[3].implicit(),
-                                             args[4].implicit(),
-                                             op.epsilon);
-    return args[5];
+        auto src = interpolate_string(
-}
+            pointwise_kernel,
+            {{"pad_val", to_string(pad_val_string)}, {"offsets", to_string_range(offsets)}});
+        return compile_hip_code_object(src, options);
+    }
+    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
+    {
+        return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
+    }
+};
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
--- a/src/targets/gpu/jit/pointwise.cpp
+++ b/src/targets/gpu/jit/pointwise.cpp
@@ -58,7 +58,7 @@ __global__ void ${kernel}(${params})
 struct pointwise_compiler : compiler<pointwise_compiler>
 {
-    std::vector<std::string> names() const { return {"pointwise", "contiguous"}; }
+    std::vector<std::string> names() const { return {"pointwise", "contiguous", "layout"}; }
    static std::size_t oversubscribe_if(bool b)
    {
@@ -91,12 +91,12 @@ struct pointwise_compiler : compiler<pointwise_compiler>
    compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
    {
-        if(op.name() == "contiguous")
+        if(contains({"layout", "contiguous"}, op.name()))
        {
            return replace(compile_op(
                ctx,
                to_shapes(ins->inputs()),
-                {{"lambda", "[](auto x) { return x; }"}, {"kernel", "contiguous_kernel"}}));
+                {{"lambda", "[](auto x) { return x; }"}, {"kernel", op.name() + "_kernel"}}));
        }
        else
        {

--- a/src/targets/gpu/jit/scatternd.cpp
+++ b/src/targets/gpu/jit/scatternd.cpp
@@ -79,9 +79,10 @@ struct scatternd_compiler : compiler<scatternd_compiler>
    {
        assert(starts_with(op.name(), "scatternd_"));
        auto reduction = op.name().substr(10);
-        return insert(compile_op(ctx,
+        return insert(compile_op(
-                                 to_shapes({ins->inputs().begin() + 1, ins->inputs().end()}),
+            ctx,
-                                 {{"reduction", reduction}}));
+            to_shapes(std::vector<instruction_ref>{ins->inputs().begin() + 1, ins->inputs().end()}),
+            {{"reduction", reduction}}));
    }
    compiler_replace insert(const operation& op) const

--- a/src/targets/gpu/kernels/include/migraphx/kernels/concat.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/concat.hpp
@@ -41,7 +41,15 @@ constexpr auto concat_slice(Output out, Input, Start)
        return Start{} * output_shape.strides[Axis];
    });
    constexpr auto s       = make_shape(lens, strides);
-    return make_tensor_view(&out[offset], s);
+    MIGRAPHX_ASSERT(offset < out.get_shape().element_space());
+    MIGRAPHX_ASSERT((s.element_space() + offset) <= out.get_shape().element_space());
+    return make_tensor_view(out.data() + offset, s);
+}
+template <index_int Axis, class Input, class Start, class... Ts>
+constexpr auto concat_slices(Input input, Start start, Ts... xs)
+{
+    return [=](auto f) { f(concat_slice<Axis>(xs, input, start)...); };
 }
 template <index_int Axis, class Input>
@@ -51,15 +59,19 @@ constexpr auto concat_ends(Input)
    return _c<lens[Axis]>;
 }
-template <index_int Axis, class Output, class... Inputs>
+template <index_int Axis, class... Inputs>
-__device__ void concat(Output output, Inputs... inputs)
+__device__ auto concat(Inputs... inputs)
 {
-    auto idx = make_index();
+    return [=](auto f, auto... ts) {
-    fold([&](auto start, auto input) {
+        auto idx = make_index();
-        auto y = concat_slice<Axis>(output, input, start);
+        fold([&](auto start, auto input) {
-        idx.global_stride(input.get_shape().elements(), [&](auto i) { y[i] = input[i]; });
+            concat_slices<Axis>(input, start, ts...)([&](auto y, auto... xs) {
-        return start + concat_ends<Axis>(input);
+                idx.global_stride(input.get_shape().elements(),
-    })(_c<0>, inputs...);
+                                  [&](auto i) { y[i] = f(input[i], xs[i]...); });
+            });
+            return start + concat_ends<Axis>(input);
+        })(_c<0>, inputs...);
+    };
 }
 } // namespace migraphx

--- a/src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/layernorm.hpp
@@ -25,6 +25,7 @@
 #define MIGRAPHX_GUARD_KERNELS_LAYERNORM_HPP
 #include <migraphx/kernels/reduce.hpp>
 #include <migraphx/kernels/ops.hpp>
+#include <migraphx/kernels/vec.hpp>
 #include <migraphx/kernels/print.hpp>
 namespace migraphx {

--- a/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/quant_convolution.hpp
@@ -21,53 +21,43 @@
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
 * THE SOFTWARE.
 */
-#ifndef MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP
+#ifndef MIGRAPHX_GUARD_KERNELS_PAD_HPP
-#define MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP
+#define MIGRAPHX_GUARD_KERNELS_PAD_HPP
-#include <migraphx/shape.hpp>
+#include <migraphx/kernels/shape.hpp>
-#include <migraphx/reflect.hpp>
+#include <migraphx/kernels/index.hpp>
-#include <migraphx/op/quant_convolution.hpp>
+#include <migraphx/kernels/algorithm.hpp>
-#include <migraphx/gpu/miopen.hpp>
+#include <migraphx/kernels/ranges.hpp>
 namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-struct context;
+template <class Offsets, class Input, class Output, class PadVal>
+__device__ void pad(const index& idx,
-struct miopen_quant_convolution
+                    const Offsets& offsets,
+                    const Input& input,
+                    Output& output,
+                    const PadVal& pad_val)
 {
-    op::quant_convolution op;
+    auto output_shape = output.get_shape();
-    bool int8_x4_format = false;
+    idx.global_stride(output_shape.elements(), [&](auto i) {
-    shared<convolution_descriptor> cd;
+        // 1. get current multi-index for output
-    miopenConvFwdAlgorithm_t algo{};
+        // 2. get the size of the input to determine input boundaries
-    uint64_t solution_id = 0;
+        // 3. compute the corresponding multi-index for input by accounting for offsets
+        // 4. if current multi-index is within offsets or input's new multi-index is out of bounds,
-    template <class Self, class F>
+        //    use pad value instead of input's value
-    static auto reflect(Self& self, F f)
+        auto multi        = output_shape.multi(i);
-    {
+        auto input_bounds = input.get_shape().lens;
-        // TODO: Add algo
+        auto input_idx    = multi - offsets;
-        return pack_join(migraphx::reflect(self.op, f),
+        auto range_multi  = range(multi.size());
-                         pack(f(self.int8_x4_format, "int8_x4_format")));
-    }
+        if(any_of(range_multi.begin(), range_multi.end(), [&](auto j) {
+               return multi[j] < offsets[j] or input_idx[j] >= input_bounds[j];
-    std::string name() const { return "gpu::quant_convolution"; }
+           }))
-    shape compute_shape(const std::vector<shape>& inputs) const;
+            output[multi] = pad_val;
-    argument
+        else
-    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
+            output[multi] = input[input_idx];
-    shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+    });
-    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+}
-    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-    {
-        return shapes.size() - 1;
-    }
-    private:
-    shape pack_int8_shape(const shape& s) const;
-};
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
 #endif
--- a/src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/pointwise.hpp
@@ -33,38 +33,6 @@
 namespace migraphx {
-template <class T>
-struct implicit_conversion_op
-{
-    T x;
-    template <index_int N, class U>
-    constexpr operator vec<U, N>() const
-    {
-        if constexpr(vec_size<T>() == 0)
-        {
-            return x;
-        }
-        else
-        {
-            static_assert(vec_size<T>() == N, "Vector mismatch size");
-            return __builtin_convertvector(x, vec<U, N>);
-        }
-    }
-    template <class U>
-    constexpr operator U() const
-    {
-        return x;
-    }
-};
-template <class T>
-constexpr implicit_conversion_op<T> implicit_conversion(T x)
-{
-    return {x};
-}
 template <class F, class T, class... Ts>
 __device__ void pointwise_tensor(index idx, F f, T out, Ts... xs)
 {

--- a/src/targets/gpu/kernels/include/migraphx/kernels/ranges.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/ranges.hpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#ifndef MIGRAPHX_GUARD_KERNELS_RANGES_HPP
+#define MIGRAPHX_GUARD_KERNELS_RANGES_HPP
+#include <migraphx/kernels/iota_iterator.hpp>
+namespace migraphx {
+template <class Iterator>
+struct iterator_range
+{
+    Iterator start;
+    Iterator last;
+    constexpr Iterator begin() const { return start; }
+    constexpr Iterator end() const { return last; }
+};
+constexpr iterator_range<iota_iterator> range(diff_int start, diff_int last)
+{
+    return {{start, {}}, {last, {}}};
+}
+constexpr iterator_range<iota_iterator> range(diff_int last) { return range(0, last); }
+} // namespace migraphx
+#endif // MIGRAPHX_GUARD_KERNELS_RANGES_HPP
--- a/src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
+++ b/src/targets/gpu/kernels/include/migraphx/kernels/vec.hpp
@@ -185,5 +185,37 @@ constexpr auto vec_reduce(T x, Op op)
    }
 }
+template <class T>
+struct implicit_conversion_op
+{
+    T x;
+    template <index_int N, class U>
+    constexpr operator vec<U, N>() const
+    {
+        if constexpr(vec_size<T>() == 0)
+        {
+            return x;
+        }
+        else
+        {
+            static_assert(vec_size<T>() == N, "Vector mismatch size");
+            return __builtin_convertvector(x, vec<U, N>);
+        }
+    }
+    template <class U>
+    constexpr operator U() const
+    {
+        return x;
+    }
+};
+template <class T>
+constexpr implicit_conversion_op<T> implicit_conversion(T x)
+{
+    return {x};
+}
 } // namespace migraphx
 #endif // MIGRAPHX_GUARD_KERNELS_VEC_HPP
--- a/src/targets/gpu/leaky_relu.cpp
+++ b/src/targets/gpu/leaky_relu.cpp
-/*
- * The MIT License (MIT)
- *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to deal
- * in the Software without restriction, including without limitation the rights
- * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
- * copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in
- * all copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
- * THE SOFTWARE.
- */
-#include <migraphx/gpu/leaky_relu.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/gpu/miopen.hpp>
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-shape miopen_leaky_relu::compute_shape(const std::vector<shape>& inputs) const
-{
-    check_shapes{inputs, *this}.has(2).not_broadcasted();
-    return inputs.at(1);
-}
-argument miopen_leaky_relu::compute(context& ctx,
-                                    const shape& output_shape,
-                                    const std::vector<argument>& args) const
-{
-    float alpha = 1;
-    float beta  = 0;
-    auto x_desc = make_tensor(args[0].get_shape());
-    auto y_desc = make_tensor(output_shape);
-    miopenActivationForward(ctx.get_stream().get_miopen(),
-                            ad.get(),
-                            &alpha,
-                            x_desc.get(),
-                            args[0].implicit(),
-                            &beta,
-                            y_desc.get(),
-                            args[1].implicit());
-    return args[1];
-}
-void miopen_leaky_relu::finalize(context&, const shape&, const std::vector<shape>&)
-{
-    ad = make_leaky_relu(op.alpha);
-}
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -29,23 +29,15 @@
 #include <migraphx/instruction_ref.hpp>
 #include <migraphx/stringutils.hpp>
-#include <migraphx/op/convolution.hpp>
-#include <migraphx/op/deconvolution.hpp>
 #include <migraphx/op/dot.hpp>
 #include <migraphx/op/if_op.hpp>
 #include <migraphx/op/reshape.hpp>
-#include <migraphx/op/quant_convolution.hpp>
 #include <migraphx/op/quant_dot.hpp>
-#include <migraphx/gpu/batch_norm_inference.hpp>
 #include <migraphx/gpu/context.hpp>
-#include <migraphx/gpu/convolution.hpp>
-#include <migraphx/gpu/deconvolution.hpp>
 #include <migraphx/gpu/device_name.hpp>
 #include <migraphx/gpu/gemm.hpp>
-#include <migraphx/gpu/int8_conv_pack.hpp>
 #include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/quant_convolution.hpp>
 #include <migraphx/gpu/rocblas.hpp>
 #include <migraphx/gpu/compiler.hpp>
 #include <migraphx/iterator_for.hpp>
@@ -98,14 +90,11 @@ struct miopen_apply
        add_extend_op("argmax");
        add_extend_op("argmin");
-        add_extend_op("elu");
        add_extend_op("gather");
-        add_extend_op("leaky_relu");
        add_extend_op("logsoftmax");
        add_extend_op("lrn");
        add_extend_op("multinomial");
        add_extend_op("nonzero");
-        add_extend_op("pad");
        add_extend_op("pooling");
        add_extend_op("prefix_scan_sum");
        add_extend_op("reverse");
@@ -115,16 +104,15 @@ struct miopen_apply
        add_extend_op("scatter_none");
        add_extend_op("topk");
-        add_batch_norm_inference_op();
+        add_convolution_op("convolution");
-        add_convolution_op();
+        add_convolution_op("deconvolution");
-        add_deconvolution_op();
+        add_convolution_op("quant_convolution");
        add_gemm_op<op::dot>("dot");
        add_gemm_op<op::quant_dot>("quant_dot");
        add_if_op();
        add_loop_op();
        add_neg_op();
        add_nms_op();
-        add_quant_convolution_op();
    }
    void copy_params() const
@@ -232,38 +220,6 @@ struct miopen_apply
        return mod->insert_instruction(ins, make_op("allocate", {{"shape", to_value(s)}}));
    }
-    void add_convolution_op()
-    {
-        apply_map.emplace("convolution", [=](instruction_ref ins) {
-            auto&& op = any_cast<op::convolution>(ins->get_operator());
-            auto conv = miopen_convolution{op, make_conv(op)};
-            auto ws   = conv.find(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
-            auto workspace = insert_allocation(ins, ws);
-            auto output    = insert_allocation(ins, ins->get_shape());
-            return mod->replace_instruction(
-                ins, conv, ins->inputs().at(0), ins->inputs().at(1), workspace, output);
-        });
-    }
-    void add_deconvolution_op()
-    {
-        apply_map.emplace("deconvolution", [=](instruction_ref ins) {
-            auto&& op = any_cast<op::deconvolution>(ins->get_operator());
-            auto conv = miopen_deconvolution{op, make_deconv(op)};
-            auto ws   = conv.find(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
-            auto workspace = insert_allocation(ins, ws);
-            auto output    = insert_allocation(ins, ins->get_shape());
-            return mod->replace_instruction(
-                ins, conv, ins->inputs().at(0), ins->inputs().at(1), workspace, output);
-        });
-    }
    template <typename Op>
    void add_gemm_op(const std::string& name)
    {
@@ -277,32 +233,19 @@ struct miopen_apply
        });
    }
-    void add_quant_convolution_op()
+    void add_convolution_op(const std::string& name)
    {
-        apply_map.emplace("quant_convolution", [=](instruction_ref ins) {
+        apply_map.emplace(name, [=](instruction_ref ins) {
-            auto&& op = any_cast<op::quant_convolution>(ins->get_operator());
+            operation conv = make_op(
-            shape ws;
+                "gpu::" + name,
-            miopen_quant_convolution conv;
+                {{"op", ins->get_operator().to_value()}, {"int8_x4_format", int8_x4_format}});
-            auto compile_quant_conv_with_format = [&](bool format) {
+            auto output = insert_allocation(ins, ins->get_shape());
-                conv = miopen_quant_convolution{op, format, make_conv(op)};
-                ws   = conv.find(get_context(), ins->get_shape(), to_shapes(ins->inputs()));
-            };
-            try
-            {
-                compile_quant_conv_with_format(int8_x4_format);
-            }
-            catch(migraphx::exception&)
-            {
-                // In case no solver supports the default format, retry using the other format.
-                compile_quant_conv_with_format(not int8_x4_format);
-            }
-            auto args      = ins->inputs();
-            auto workspace = insert_allocation(ins, ws);
-            auto output    = insert_allocation(ins, ins->get_shape());
-            return mod->replace_instruction(ins, conv, args[0], args[1], workspace, output);
+            return mod->replace_instruction(ins,
+                                            make_op("gpu::miopen_op", {{"op", to_value(conv)}}),
+                                            ins->inputs().at(0),
+                                            ins->inputs().at(1),
+                                            output);
        });
    }
@@ -336,43 +279,6 @@ struct miopen_apply
        });
    }
-    void add_batch_norm_inference_op()
-    {
-        apply_map.emplace("batch_norm_inference", [=](instruction_ref ins) {
-            auto&& op       = any_cast<op::batch_norm_inference>(ins->get_operator());
-            auto output     = insert_allocation(ins, ins->get_shape());
-            shape old_shape = ins->inputs().at(1)->get_shape();
-            auto input      = ins->inputs()[0];
-            auto input_lens = input->get_shape().lens();
-            std::vector<int64_t> rsp_lens(input_lens.size(), 1);
-            // for per_activation case, also need to reshape input
-            if(op.bn_mode == op::batch_norm_inference::per_activation)
-            {
-                std::copy(input_lens.begin() + 1, input_lens.end(), rsp_lens.begin() + 1);
-            }
-            else
-            {
-                rsp_lens[1] = static_cast<int64_t>(old_shape.elements());
-            }
-            auto reshape_op = op::reshape{rsp_lens};
-            std::vector<instruction_ref> reshapes;
-            std::transform(ins->inputs().begin() + 1,
-                           ins->inputs().end(),
-                           std::back_inserter(reshapes),
-                           [&](auto i) { return mod->insert_instruction(ins, reshape_op, i); });
-            return mod->replace_instruction(ins,
-                                            miopen_batch_norm_inference{op},
-                                            input,
-                                            reshapes[0],
-                                            reshapes[1],
-                                            reshapes[2],
-                                            reshapes[3],
-                                            output);
-        });
-    }
    // use 0 - input to represent neg
    void add_neg_op()
    {