"mmdet/vscode:/vscode.git/clone" did not exist on "baf20b93f0143e05b4acf6c85d5398abb24753cc"
Commit c4b1102e authored by charlie's avatar charlie
Browse files

Merge branch 'dyn_model_test' of github.com:ROCmSoftwarePlatform/AMDMIGraphX into dyn_model_test

parents 5fc48e77 31065c7d
......@@ -144,9 +144,8 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
std::size_t compute_block_size(std::size_t n, std::size_t max_block_size)
{
-const std::size_t min_block_size = 64;
-const std::size_t base_block_size = 32;
-auto block_size = (((n - 1) / base_block_size + 1)) * base_block_size;
+const std::size_t min_block_size = 64;
+auto block_size = (((n - 1) / min_block_size + 1)) * min_block_size;
return std::min(std::max(min_block_size, block_size), max_block_size);
}
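The updated expression rounds n up to the next multiple of min_block_size (64) rather than base_block_size (32), then clamps the result to [min_block_size, max_block_size]. A minimal standalone sketch of the new behavior (the helper name is hypothetical):

#include <algorithm>
#include <cassert>
#include <cstddef>

// Round n up to a multiple of 64, then clamp to [64, max_block_size].
std::size_t compute_block_size_sketch(std::size_t n, std::size_t max_block_size)
{
    const std::size_t min_block_size = 64;
    auto block_size = ((n - 1) / min_block_size + 1) * min_block_size;
    return std::min(std::max(min_block_size, block_size), max_block_size);
}

int main()
{
    assert(compute_block_size_sketch(1, 1024) == 64);      // tiny n clamps up to 64
    assert(compute_block_size_sketch(65, 1024) == 128);    // rounds up to the next multiple of 64
    assert(compute_block_size_sketch(5000, 1024) == 1024); // large n clamps at the maximum
}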
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/generate.hpp>
#include <miopen/miopen.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape miopen_convolution::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(4).standard();
std::vector<shape> conv_inputs(inputs.begin(), inputs.begin() + 2);
check_shapes{conv_inputs, *this}.max_ndims(5);
return op.normalize_compute_shape(conv_inputs);
}
inline shape reshape_if_1d(const shape& input)
{
shape new_shape{input};
auto dims = new_shape.lens();
if(dims.size() == 3)
{
std::vector<size_t> new_dims = dims;
new_dims.insert(new_dims.begin() + 2, 1);
new_shape = shape{input.type(), new_dims};
}
return new_shape;
}
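reshape_if_1d widens a rank-3 shape (the NCW layout of a 1-D convolution) with a unit spatial dimension so the 2-D MIOpen descriptors can be reused. A worked example with an assumed shape:

// input:  {float_type, {2, 3, 5}}    -> rank 3, insert a 1 at position 2
// result: {float_type, {2, 3, 1, 5}} -> rank 4 (NC1W), accepted by make_tensor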
argument miopen_convolution::compute(context& ctx,
const shape& output_shape,
const std::vector<argument>& args) const
{
auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()));
auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
auto* miopen_stream_handle = ctx.get_stream().get_miopen();
auto workspace_size = args[2].get_shape().bytes();
#ifdef MIGRAPHX_HAS_FIND_2_API
{
const miopenTensorArgument_t tensor_args[3] = {
{miopenTensorConvolutionX, nullptr, args[0].implicit()},
{miopenTensorConvolutionW, nullptr, args[1].implicit()},
{miopenTensorConvolutionY, nullptr, args[3].implicit()},
};
if(solution_ptr.get() == nullptr)
MIGRAPHX_THROW("MIOpen Convolution : Load MIOpen Solution before running it");
auto status = miopenRunSolution(miopen_stream_handle,
solution_ptr.get(),
3,
tensor_args,
args[2].implicit(),
workspace_size);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: running convolution using find_2.0 failed");
return args[3];
}
#else
// else use immediate mode
if(solution_id == 0)
MIGRAPHX_THROW("MIOpen Convolution: invalid solution ID");
auto status = miopenConvolutionForwardImmediate(miopen_stream_handle,
w_desc.get(),
args[1].implicit(),
x_desc.get(),
args[0].implicit(),
cd.get(),
y_desc.get(),
args[3].implicit(),
args[2].implicit(),
workspace_size,
solution_id);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: running convolution failed");
return args[3];
#endif
}
shape miopen_convolution::find(context& ctx, const shape& output_shape, std::vector<shape> inputs)
{
shape workspace_shape{};
auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
std::size_t workspace_size = 0;
#ifdef MIGRAPHX_HAS_FIND_2_API
{
auto conv_problem = make_obj<miopen_problem>(
&miopenCreateConvProblem, cd.get(), miopenProblemDirectionForward);
set_tensor_descriptor(miopenTensorConvolutionX, x_desc, conv_problem);
set_tensor_descriptor(miopenTensorConvolutionW, w_desc, conv_problem);
set_tensor_descriptor(miopenTensorConvolutionY, y_desc, conv_problem);
auto* miopen_stream_handle = ctx.get_stream().get_miopen();
solution_ptr = find_solution(miopen_stream_handle, conv_problem.get());
auto status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution : failed to get solution's workspace size");
std::size_t solution_size;
status = miopenGetSolutionSize(solution_ptr.get(), &solution_size);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: Failed to fetch solution size");
auto solution_binary = std::vector<char>{};
solution_binary.resize(solution_size);
status = miopenSaveSolution(solution_ptr.get(), solution_binary.data());
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: Saving solution failed");
solution_object = value::binary{solution_binary.data(), solution_size};
return shape{shape::int8_type, {workspace_size}};
}
#else
// else use immediate find mode
auto status = miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
&workspace_size);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: Failed to get forward workspace size");
workspace_shape = shape{shape::int8_type, {workspace_size}};
auto x = to_gpu(generate_argument(inputs[0]));
auto w = to_gpu(generate_argument(inputs[1]));
auto y = allocate_gpu(output_shape);
auto workspace = allocate_gpu(workspace_shape);
int algo_count = 1;
miopenConvAlgoPerf_t perf;
status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
x_desc.get(),
x.implicit(),
w_desc.get(),
w.implicit(),
cd.get(),
y_desc.get(),
y.implicit(),
1,
&algo_count,
&perf,
workspace.implicit(),
workspace_size,
false);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: find convolution failed");
algo = perf.fwd_algo;
size_t solution_count;
status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
&solution_count);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: get solution count failed");
std::vector<miopenConvSolution_t> solutions(solution_count);
status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
solution_count,
&solution_count,
solutions.data());
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: get solution failed");
solution_id = solutions.front().solution_id;
return shape{shape::int8_type, {perf.memory}};
#endif
}
void miopen_convolution::finalize(context& ctx,
const shape& output_shape,
const std::vector<shape>& inputs)
{
#ifdef MIGRAPHX_HAS_FIND_2_API
{
(void)(ctx); // avoid warnings
(void)(output_shape);
(void)(inputs);
// load solution
if(solution_ptr == nullptr)
{
miopenSolution_t ptr;
auto status = miopenLoadSolution(&ptr,
reinterpret_cast<const char*>(solution_object.data()),
solution_object.size());
solution_ptr = miopen_solution{ptr};
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: loading convolution solution failed");
}
}
#else
// Use immediate mode API
{
if(cd == nullptr)
cd = make_conv(op);
if(solution_id == 0)
{
// Check that workspace hasn't changed
auto size = inputs.at(2).bytes();
auto ws = find(ctx, output_shape, inputs);
if(ws.bytes() > size)
MIGRAPHX_THROW("MIOpen Convolution: workspace has changed during finalization.");
}
auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
solution_id);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: compile solution failed");
}
#endif
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
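A note on the lifecycle implemented above, inferred from this file: find() runs once ahead of execution to pick a MIOpen solution and report the workspace shape, finalize() compiles that solution (or, under the Find 2.0 API, loads the serialized one), and compute() then executes the immediate-mode kernel at inference time. A hypothetical driver, for orientation only:

// miopen_convolution conv{...};                        // not a real entry point
// shape ws = conv.find(ctx, out_shape, input_shapes);  // choose solution, size workspace
// conv.finalize(ctx, out_shape, input_shapes);         // compile/load the chosen solution
// argument y = conv.compute(ctx, out_shape, args);     // run; result aliases args[3]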
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/deconvolution.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/generate.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape miopen_deconvolution::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(4).standard();
std::vector<shape> conv_inputs(inputs.begin(), inputs.begin() + 2);
check_shapes{conv_inputs, *this}.max_ndims(5);
return op.compute_shape(conv_inputs);
}
inline shape reshape_if_1d(const shape& input)
{
shape new_shape{input};
auto dims = new_shape.lens();
if(dims.size() == 3)
{
std::vector<size_t> new_dims = dims;
new_dims.insert(new_dims.begin() + 2, 1);
new_shape = shape{input.type(), new_dims};
}
return new_shape;
}
argument miopen_deconvolution::compute(context& ctx,
const shape& output_shape,
const std::vector<argument>& args) const
{
auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()));
auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
if(solution_id == 0)
MIGRAPHX_THROW("MIOpen Deconvolution: invalid solution ID");
auto status = miopenConvolutionForwardImmediate(ctx.get_stream().get_miopen(),
w_desc.get(),
args[1].implicit(),
x_desc.get(),
args[0].implicit(),
cd.get(),
y_desc.get(),
args[3].implicit(),
args[2].implicit(),
args[2].get_shape().bytes(),
solution_id);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Deconvolution: running convolution failed");
return args[3];
}
shape miopen_deconvolution::find(context& ctx, const shape& output_shape, std::vector<shape> inputs)
{
shape workspace_shape{};
auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
std::size_t workspace_size = 0;
miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
&workspace_size);
workspace_shape = shape{shape::int8_type, {workspace_size}};
auto x = to_gpu(generate_argument(inputs[0]));
auto w = to_gpu(generate_argument(inputs[1]));
auto y = allocate_gpu(output_shape);
auto workspace = allocate_gpu(workspace_shape);
int algo_count = 1;
miopenConvAlgoPerf_t perf;
auto status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
x_desc.get(),
x.implicit(),
w_desc.get(),
w.implicit(),
cd.get(),
y_desc.get(),
y.implicit(),
1,
&algo_count,
&perf,
workspace.implicit(),
workspace_size,
false);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Deconvolution: find convolution failed");
algo = perf.fwd_algo;
size_t solution_count;
status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
&solution_count);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Deconvolution: get solution count failed");
std::vector<miopenConvSolution_t> solutions(solution_count);
status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
solution_count,
&solution_count,
solutions.data());
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Deconvolution: get solution failed");
solution_id = solutions.front().solution_id;
return shape{shape::int8_type, {perf.memory}};
}
void miopen_deconvolution::finalize(context& ctx,
const shape& output_shape,
std::vector<shape> inputs)
{
if(cd == nullptr)
cd = make_deconv(op);
if(solution_id == 0)
{
// Check that workspace hasn't changed
auto size = inputs.at(2).bytes();
auto ws = find(ctx, output_shape, inputs);
if(ws.bytes() > size)
MIGRAPHX_THROW("MIOpen Deconvolution: workspace has changed during finalization.");
}
auto x_desc = make_tensor(reshape_if_1d(inputs[0]));
auto w_desc = make_tensor(reshape_if_1d(inputs[1]));
auto y_desc = make_tensor(reshape_if_1d(output_shape));
auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
solution_id);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Deconvolution: compile solution failed");
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#include <migraphx/gpu/elu.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape miopen_elu::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2).not_broadcasted();
return inputs.at(1);
}
argument miopen_elu::compute(context& ctx,
const shape& output_shape,
const std::vector<argument>& args) const
{
float alpha = 1;
float beta = 0;
auto x_desc = make_tensor(args[0].get_shape());
auto y_desc = make_tensor(output_shape);
miopenActivationForward(ctx.get_stream().get_miopen(),
ad.get(),
&alpha,
x_desc.get(),
args[0].implicit(),
&beta,
y_desc.get(),
args[1].implicit());
return args[1];
}
void miopen_elu::finalize(context&, const shape&, const std::vector<shape>&)
{
ad = make_elu(op.alpha);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
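For orientation: the alpha/beta pair passed to miopenActivationForward appears to follow the usual MIOpen blending convention (an assumption worth verifying against the MIOpen docs), y = alpha * act(x) + beta * y, so alpha = 1 and beta = 0 simply overwrite the output buffer with act(x).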
......@@ -49,7 +49,7 @@ struct mlir_conv
std::string name() const { return "gpu::mlir_conv"; }
shape compute_shape(std::vector<shape> inputs, const std::vector<module_ref>& mods) const
{
-check_shapes{inputs, *this}.standard();
+check_shapes{inputs, *this}.packed_or_broadcasted();
if(mods.size() != 1)
MIGRAPHX_THROW("should have one submodule.");
if(inputs.size() < 2)
......@@ -70,6 +70,9 @@ MIGRAPHX_PRED_MATCHER(is_mlir_conv, instruction_ref ins)
auto group = v.at("group").to<int>();
if(group != 1)
return false;
// Avoid MLIR assertion: Index < Length && "Invalid index!"
if(ins->get_shape().lens().size() != 4)
return false;
return true;
}
......@@ -96,9 +99,10 @@ struct find_conv_pointwise
i.name());
}))
return;
-// Only fuse with fp32 for now
+// Only fuse with fp32/fp16
if(std::any_of(ins->inputs().begin(), ins->inputs().end(), [&](auto i) {
-return i->get_shape().type() != shape::type_t::float_type;
+return not contains({shape::type_t::float_type, shape::type_t::half_type},
+                    i->get_shape().type());
}))
}))
return;
std::sort(names.begin(), names.end());
......
......@@ -26,7 +26,6 @@
#include <migraphx/gpu/fuse_ops.hpp>
#include <migraphx/matcher.hpp>
#include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/convolution.hpp>
#include <migraphx/gpu/device_name.hpp>
#include <migraphx/gpu/oper.hpp>
#include <migraphx/gpu/gemm.hpp>
......@@ -190,10 +189,12 @@ MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
return false;
auto wei = ins->inputs().at(1)->get_shape();
assert(wei.lens().size() == 4);
-auto conv = any_cast<miopen_convolution>(ins->get_operator());
-if(conv.op.group > 1)
+auto miopen_conv_op = ins->get_operator().to_value();
+auto algo = miopen_conv_op.at("algo").to<miopenConvFwdAlgorithm_t>();
+auto conv_op = from_value<op::convolution>(miopen_conv_op["op"]);
+if(conv_op.group > 1)
return false;
-if(wei.lens()[1] > 512 and conv.algo != miopenConvolutionFwdAlgoWinograd)
+if(wei.lens()[1] > 512 and algo != miopenConvolutionFwdAlgoWinograd)
return false;
// Do not fuse non-symmetric input
......@@ -201,13 +202,12 @@ MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
if(input_lens[2] != input_lens[3] or wei.lens()[2] != wei.lens()[3])
return false;
-auto op = conv.op;
// Don't fuse winograd for non-3x3s since there is no fused winograd for those configs
-if(conv.algo == miopenConvolutionFwdAlgoWinograd and wei.lens()[2] != 3 and
-wei.lens()[3] != 3 and contains({{1, 1}}, op.stride))
+if(algo == miopenConvolutionFwdAlgoWinograd and wei.lens()[2] != 3 and wei.lens()[3] != 3 and
+contains({{1, 1}}, conv_op.stride))
return false;
-return contains({{0, 0, 0, 0}, {1, 1, 1, 1}, {2, 2, 2, 2}}, op.padding) and
-contains({{0, 0}, {1, 1}}, op.stride) and contains({{1, 1}}, op.dilation);
+return contains({{0, 0, 0, 0}, {1, 1, 1, 1}, {2, 2, 2, 2}}, conv_op.padding) and
+contains({{0, 0}, {1, 1}}, conv_op.stride) and contains({{1, 1}}, conv_op.dilation);
}
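This rewrite pairs with the removal of the gpu/convolution.hpp include earlier in the diff: with miopen_convolution now a class template, fuse_ops can no longer any_cast the operator to one concrete type, so the matcher introspects it through its serialized value form instead. The pattern, restated as a sketch (illustrative, not new API):

// auto v       = ins->get_operator().to_value();               // reflect the op to a value
// auto algo    = v.at("algo").to<miopenConvFwdAlgorithm_t>();  // typed read of one field
// auto conv_op = from_value<op::convolution>(v["op"]);         // rebuild the framework op
// conv_op.group, conv_op.stride, conv_op.padding are then ordinary members.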
void move_broadcasted_back(std::vector<instruction_ref>& args)
......@@ -462,7 +462,7 @@ void apply_conv_bias(context& ctx, module& m, const match::matcher_result& r)
auto ins = r.result;
auto input_ins = conv_ins->inputs().at(0);
auto weights_ins = conv_ins->inputs().at(1);
-auto conv_op = any_cast<miopen_convolution>(conv_ins->get_operator()).op;
+auto conv_op = from_value<op::convolution>((conv_ins->get_operator()).to_value()["op"]);
auto alloc_ins = ins->inputs().back();
auto old_ws_ins = conv_ins->inputs().at(2);
......@@ -528,7 +528,7 @@ struct find_conv_pointwise
auto ins = r.result;
auto input_ins = conv_ins->inputs().at(0);
auto weights_ins = conv_ins->inputs().at(1);
-auto conv_op = any_cast<miopen_convolution>(conv_ins->get_operator()).op;
+auto conv_op = from_value<op::convolution>(conv_ins->get_operator().to_value()["op"]);
auto alloc_ins = ins->inputs().back();
module_ref pm = ins->module_inputs().front();
......
......@@ -183,8 +183,8 @@ argument register_on_gpu(const argument& arg)
{
auto arg_shared = arg.share();
auto p = register_on_gpu(arg_shared.data(), arg_shared.get_shape().bytes());
-return {arg_shared.get_shape(),
-[p, a = std::move(arg_shared)]() mutable { return get_device_ptr(p.get()); }};
+auto s = arg_shared.get_shape();
+return {s, [p, a = std::move(arg_shared)]() mutable { return get_device_ptr(p.get()); }};
}
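The register_on_gpu rewrite above hoists the get_shape() call out of the return expression, so the read of arg_shared is clearly sequenced before the lambda capture that moves it; the old form kept a read and a move of the same object inside one initializer, which is fragile under refactoring and trips use-after-move lints. In miniature (hypothetical names):

// before: f(a.get_shape(), [x = std::move(a)]() { ... });  // read and move in one expression
// after:  auto s = a.get_shape();                          // sequence the read first
//         f(s, [x = std::move(a)]() { ... });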
argument to_gpu(const argument& arg, bool host)
......
......@@ -197,7 +197,9 @@ struct hip_device
struct context
{
context(std::size_t device_id = 0, std::size_t n = value_of(MIGRAPHX_NSTREAMS{}, 1))
-: current_device(std::make_shared<hip_device>(device_id, n))
+: current_device(std::make_shared<hip_device>(device_id, n)),
+begin_event(create_event()),
+finish_event(create_event())
{
}
......@@ -274,6 +276,24 @@ struct context
this->current_device = std::make_shared<hip_device>(0, n_streams);
}
void wait_for(any_ptr queue)
{
auto status = hipEventRecord(begin_event.get(), queue.get<hipStream_t>());
if(status != hipSuccess)
MIGRAPHX_THROW("failed to record " + hip_error(status));
get_stream().wait(begin_event.get());
}
void finish_on(any_ptr queue)
{
get_stream().record(finish_event.get());
auto status = hipStreamWaitEvent(queue.get<hipStream_t>(), finish_event.get(), 0);
if(status != hipSuccess)
MIGRAPHX_THROW("Failed to wait on event " + hip_error(status));
}
any_ptr get_queue() { return get_stream().get(); }
void enable_perf_measurement(bool b = true)
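wait_for and finish_on give the context event-based handoffs with a foreign HIP stream: wait_for records begin_event on the external queue and blocks our stream on it, and finish_on records finish_event on our stream and blocks the external queue on that. A hypothetical caller interleaving with another runtime:

// ctx.wait_for(external_queue);    // our stream starts after work queued there
// ... launch MIGraphX kernels on ctx.get_stream() ...
// ctx.finish_on(external_queue);   // external queue resumes after our kernels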
......@@ -316,9 +336,13 @@ struct context
// TODO: Make this a vector to support multiple devices
std::shared_ptr<hip_device> current_device;
std::vector<shared<hip_event_ptr>> events;
bool measure_perf = false;
// for event perf timing
shared<hip_event_ptr> start_event = nullptr;
shared<hip_event_ptr> stop_event = nullptr;
// for stream synchronization
shared<hip_event_ptr> begin_event = nullptr;
shared<hip_event_ptr> finish_event = nullptr;
};
inline void migraphx_to_value(value& v, const context& ctx) { v = ctx.to_value(); }
......
......@@ -25,18 +25,40 @@
#define MIGRAPHX_GUARD_RTGLIB_CONVOLUTION_HPP
#include <migraphx/shape.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/operation.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/op/identity.hpp>
#include <migraphx/op/quant_convolution.hpp>
#include <migraphx/op/deconvolution.hpp>
#include <unordered_map>
#include <migraphx/reflect.hpp>
#include <migraphx/gpu/context.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
inline shape reshape_if_1d(const shape& input)
{
shape new_shape{input};
auto dims = new_shape.lens();
if(dims.size() == 3)
{
std::vector<size_t> new_dims = dims;
new_dims.insert(new_dims.begin() + 2, 1);
new_shape = shape{input.type(), new_dims};
}
return new_shape;
}
template <class Op>
struct miopen_convolution
{
-op::convolution op;
+Op op;
bool int8_x4_format = false;
shared<convolution_descriptor> cd = nullptr;
miopenConvFwdAlgorithm_t algo{};
#ifdef MIGRAPHX_HAS_FIND_2_API
......@@ -48,29 +70,276 @@ struct miopen_convolution
template <class Self, class F>
static auto reflect(Self& self, F f)
{
-return pack(f(self.op.padding, "padding"),
-f(self.op.stride, "stride"),
-f(self.op.dilation, "dilation"),
-f(self.op.group, "group"),
-f(self.op.padding_mode, "padding_mode"),
+return pack(f(self.op, "op"),
#ifdef MIGRAPHX_HAS_FIND_2_API
f(self.solution_object, "solution_object"),
#endif
f(self.algo, "algo"),
f(self.int8_x4_format, "int8_x4_format"),
f(self.solution_id, "solution_id"));
}
-std::string name() const { return "gpu::convolution"; }
-shape compute_shape(const std::vector<shape>& inputs) const;
-argument
-compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
-void finalize(context& ctx, const shape& output_shape, const std::vector<shape>& inputs);
-std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+std::string name() const { return "gpu::" + op.name(); }
+inline shape compute_shape(const std::vector<shape>& inputs) const
+{
+check_shapes{inputs, op}.has(4).standard();
+std::vector<shape> conv_inputs(inputs.begin(), inputs.begin() + 2);
+check_shapes{conv_inputs, op}.max_ndims(5);
+return migraphx::compute_shape<Op>(op, conv_inputs);
+}
+argument
+compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{
auto x_desc = make_tensor(reshape_if_1d(args[0].get_shape()), int8_x4_format);
auto w_desc = make_tensor(reshape_if_1d(args[1].get_shape()), int8_x4_format);
auto y_desc = make_tensor(reshape_if_1d(output_shape));
auto* miopen_stream_handle = ctx.get_stream().get_miopen();
auto workspace_size = args[2].get_shape().bytes();
#ifdef MIGRAPHX_HAS_FIND_2_API
{
const miopenTensorArgument_t tensor_args[3] = {
{miopenTensorConvolutionX, nullptr, args[0].implicit()},
{miopenTensorConvolutionW, nullptr, args[1].implicit()},
{miopenTensorConvolutionY, nullptr, args[3].implicit()},
};
if(solution_ptr.get() == nullptr)
MIGRAPHX_THROW("MIOpen " + op.name() + " : Load MIOpen Solution before running it");
auto status = miopenRunSolution(miopen_stream_handle,
solution_ptr.get(),
3,
tensor_args,
args[2].implicit(),
workspace_size);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen " + op.name() +
" : running convolution using find_2.0 failed");
return args[3];
}
#else
// else use immediate mode
if(solution_id == 0)
MIGRAPHX_THROW("MIOpen " + op.name() + " : invalid solution ID");
auto status = miopenConvolutionForwardImmediate(miopen_stream_handle,
w_desc.get(),
args[1].implicit(),
x_desc.get(),
args[0].implicit(),
cd.get(),
y_desc.get(),
args[3].implicit(),
args[2].implicit(),
workspace_size,
solution_id);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen " + op.name() + ": running convolution failed");
return args[3];
#endif
}
inline void set_conv_descriptor()
{
if(cd == nullptr)
{
cd = (op.name() == "deconvolution") ? make_deconv(op) : make_conv(op);
}
}
value compile(migraphx::context& ctx, const shape& output, const std::vector<shape>& input)
{
set_conv_descriptor();
auto ws = find(any_cast<migraphx::gpu::context>(ctx), output, input);
return {{"workspace", ws.bytes()}};
}
shape find(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
{
shape workspace_shape{};
auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
auto y_desc = make_tensor(reshape_if_1d(output_shape));
std::size_t workspace_size = 0;
#ifdef MIGRAPHX_HAS_FIND_2_API
{
auto conv_problem = make_obj<miopen_problem>(
&miopenCreateConvProblem, cd.get(), miopenProblemDirectionForward);
set_tensor_descriptor(miopenTensorConvolutionX, x_desc, conv_problem);
set_tensor_descriptor(miopenTensorConvolutionW, w_desc, conv_problem);
set_tensor_descriptor(miopenTensorConvolutionY, y_desc, conv_problem);
auto* miopen_stream_handle = ctx.get_stream().get_miopen();
solution_ptr = find_solution(miopen_stream_handle, conv_problem.get());
auto status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen" + op.name() + " : failed to get solution's workspace size");
std::size_t solution_size;
status = miopenGetSolutionSize(solution_ptr.get(), &solution_size);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen" + op.name() + ": Failed to fetch solution size");
auto solution_binary = std::vector<char>{};
solution_binary.resize(solution_size);
status = miopenSaveSolution(solution_ptr.get(), solution_binary.data());
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen" + op.name() + ": Saving solution failed");
solution_object = value::binary{solution_binary.data(), solution_size};
return shape{shape::int8_type, {workspace_size}};
}
#else
auto status = miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
&workspace_size);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen" + op.name() + " : Failed to get forward workspace size");
workspace_shape = shape{shape::int8_type, {workspace_size}};
auto x_shape = inputs[0];
auto w_shape = inputs[1];
if(int8_x4_format)
{
x_shape = pack_int8_shape(x_shape);
w_shape = pack_int8_shape(w_shape);
}
auto x = to_gpu(generate_argument(x_shape));
auto w = to_gpu(generate_argument(w_shape));
auto y = allocate_gpu(output_shape);
auto workspace = allocate_gpu(workspace_shape);
int algo_count = 1;
miopenConvAlgoPerf_t perf;
status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
x_desc.get(),
x.implicit(),
w_desc.get(),
w.implicit(),
cd.get(),
y_desc.get(),
y.implicit(),
1,
&algo_count,
&perf,
workspace.implicit(),
workspace_size,
false);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen " + op.name() + " : find convolution failed");
algo = perf.fwd_algo;
size_t solution_count;
status = miopenConvolutionForwardGetSolutionCount(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
&solution_count);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen " + op.name() + ": get solution count failed");
std::vector<miopenConvSolution_t> solutions(solution_count);
status = miopenConvolutionForwardGetSolution(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
solution_count,
&solution_count,
solutions.data());
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen " + op.name() + ": get solution failed");
solution_id = solutions.front().solution_id;
return shape{shape::int8_type, {perf.memory}};
#endif
}
void finalize(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
{
#ifdef MIGRAPHX_HAS_FIND_2_API
{
(void)(ctx); // avoid warnings
(void)(output_shape);
(void)(inputs);
// load solution
if(solution_ptr == nullptr)
{
miopenSolution_t ptr;
auto status =
miopenLoadSolution(&ptr,
reinterpret_cast<const char*>(solution_object.data()),
solution_object.size());
solution_ptr = miopen_solution{ptr};
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen " + op.name() + ": loading convolution solution failed");
}
}
#else
// Use immediate mode API
{
set_conv_descriptor();
if(solution_id == 0)
{
// Check that workspace hasn't changed
auto size = inputs.at(2).bytes();
auto ws = find(ctx, output_shape, inputs);
if(ws.bytes() > size)
MIGRAPHX_THROW("MIOpen " + op.name() +
": workspace has changed during finalization.");
}
auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
auto y_desc = make_tensor(reshape_if_1d(output_shape));
auto status = miopenConvolutionForwardCompileSolution(ctx.get_stream().get_miopen(),
w_desc.get(),
x_desc.get(),
cd.get(),
y_desc.get(),
solution_id);
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("MIOpen Convolution: compile solution failed");
}
#endif
}
inline std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
-};
+inline shape pack_int8_shape(const shape& s) const
+{
+if(s.type() != shape::int8_type)
+{
+return s;
+}
+auto lens = s.lens();
+auto strides = s.strides();
+lens[1] = (lens[1] + 3) / 4 * 4;
+strides[0] = strides[1] * lens[1];
+return {s.type(), lens, strides};
+}
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
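Tracing pack_int8_shape on an assumed int8 NCHW input {1, 3, 224, 224} with standard strides {150528, 50176, 224, 1}:

// lens[1]    = (3 + 3) / 4 * 4 = 4       // channels rounded up to a multiple of 4
// strides[0] = 50176 * 4       = 200704  // batch stride spans the padded channels
// result: {int8_type, {1, 4, 224, 224}, {200704, 50176, 224, 1}}

Only the batch stride is recomputed; the inner strides keep addressing the original layout, which matches the vectorized layout the int8_x4_format flag selects.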
......
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_DECONVOLUTION_HPP
#define MIGRAPHX_GUARD_RTGLIB_DECONVOLUTION_HPP
#include <migraphx/shape.hpp>
#include <migraphx/op/deconvolution.hpp>
#include <migraphx/gpu/miopen.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
struct miopen_deconvolution
{
op::deconvolution op;
shared<convolution_descriptor> cd;
miopenConvFwdAlgorithm_t algo{};
uint64_t solution_id = 0;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack_join(op::deconvolution::reflect(self.op, f),
pack(f(self.solution_id, "solution_id")));
}
std::string name() const { return "gpu::deconv"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
/*
* The MIT License (MIT)
*
* Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in
* all copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
#ifndef MIGRAPHX_GUARD_RTGLIB_ELU_HPP
#define MIGRAPHX_GUARD_RTGLIB_ELU_HPP
#include <migraphx/op/elu.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/gpu/miopen.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
struct miopen_elu
{
op::elu op;
shared<activation_descriptor> ad;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "gpu::elu"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
void finalize(context&, const shape&, const std::vector<shape>&);
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -36,7 +36,8 @@ struct module;
namespace gpu {
std::string dump_mlir(const module& m);
-code_object_op compile_mlir(const context& ctx, const module& m);
+code_object_op
+compile_mlir(const context& ctx, module m, const std::vector<instruction_ref>& inputs);
instruction_ref insert_mlir(module& m,
instruction_ref ins,
......
......@@ -41,7 +41,7 @@ struct problem_params
shape output;
};
-std::string get_mlir_perf_for_conv(const problem_params& pp);
+std::string get_mlir_perf_for_conv(const problem_params& pp, bool xdlops);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -41,7 +41,7 @@ struct mlir_compiler : compiler<mlir_compiler>
{
auto* smod = ins->module_inputs().front();
assert(smod->get_parameter_names().size() == ins->inputs().size() - 1);
-return insert(compile_mlir(ctx, *smod));
+return insert(compile_mlir(ctx, *smod, ins->inputs()));
}
compiler_replace insert(code_object_op co) const
......
......@@ -21,65 +21,80 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
-#include <migraphx/gpu/batch_norm_inference.hpp>
+#include <migraphx/gpu/compiler.hpp>
#include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/compile_hip_code_object.hpp>
+#include <migraphx/gpu/compile_hip.hpp>
+#include <migraphx/gpu/compile_gen.hpp>
+#include <migraphx/reduce_dims.hpp>
+#include <migraphx/float_equal.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
-shape miopen_batch_norm_inference::compute_shape(const std::vector<shape>& inputs) const
-{
-check_shapes{inputs, *this}.has(6);
-check_shapes{inputs.data(), inputs.data() + 1, *this}.same_ndims().max_ndims(5);
-return op.compute_shape({inputs.at(0), inputs.at(1), inputs.at(2), inputs.at(3), inputs.at(4)});
-}
-inline shape reshape_to_2d(const shape& input)
-{
-auto dims = input.lens();
-if(dims.size() >= 4)
-return input;
-std::vector<size_t> new_dims(dims.begin(), dims.end());
-std::size_t num = 4 - dims.size();
-new_dims.insert(new_dims.end(), num, 1);
-return {input.type(), new_dims};
-}
-argument miopen_batch_norm_inference::compute(context& ctx,
-const shape& output_shape,
-const std::vector<argument>& args) const
-{
-shape x_shape = args[0].get_shape();
-shape y_shape = output_shape;
-shape bn_shape = args[3].get_shape();
-auto x_desc = make_tensor(reshape_to_2d(x_shape));
-auto y_desc = make_tensor(reshape_to_2d(y_shape));
-auto bn_desc = make_tensor(reshape_to_2d(bn_shape));
-float alpha = 1.0;
-float beta = 0.0f;
-miopenBatchNormalizationForwardInference(ctx.get_stream().get_miopen(),
-miopenBatchNormMode_t(op.bn_mode),
-&alpha,
-&beta,
-x_desc.get(),
-args[0].implicit(),
-y_desc.get(),
-args[5].implicit(),
-bn_desc.get(),
-args[1].implicit(),
-args[2].implicit(),
-args[3].implicit(),
-args[4].implicit(),
-op.epsilon);
-return args[5];
-}
+using namespace migraphx::gpu::gen; // NOLINT
+static const char* const pointwise_kernel = R"__migraphx__(
+#include <migraphx/kernels/pad.hpp>
+#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/ops.hpp>
+#include <args.hpp>
+namespace migraphx {
+extern "C" {
+__global__ void pad_kernel(void* input_p, void* output_p)
+{
+auto offsets = index_ints<${offsets}>{};
+auto idx = make_index();
+make_tensors()(input_p, output_p)([&](auto input, auto output) {
+pad(idx, offsets, input, output, ${pad_val});
+});
+}
+}
+} // namespace migraphx
+)__migraphx__";
+struct pad_compiler : compiler<pad_compiler>
+{
+std::vector<std::string> names() const { return {"pad"}; }
+operation compile_op(context& ctx, const std::vector<shape>& inputs, const value& v) const
+{
+hip_compile_options options;
+options.inputs = inputs;
+options.output = inputs.back();
+options.virtual_inputs = reduce_dims(inputs);
+options.kernel_name = "pad_kernel";
+options.set_launch_params(v, compute_global_for(ctx, inputs.at(1).elements()));
+auto pad_val = v.get("value", 0.f);
+auto pad_val_string = to_string(pad_val);
+if(float_equal(pad_val, std::numeric_limits<float>::lowest()))
+pad_val_string = "lowest{}";
+if(float_equal(pad_val, std::numeric_limits<float>::max()))
+pad_val_string = "highest{}";
+auto padding = v.at("pads").to_vector<int64_t>();
+auto input_lens = inputs.front().lens();
+std::vector<size_t> offsets(input_lens.size());
+std::copy(padding.begin(), padding.begin() + offsets.size(), offsets.begin());
+auto src = interpolate_string(
+pointwise_kernel,
+{{"pad_val", to_string(pad_val_string)}, {"offsets", to_string_range(offsets)}});
+return compile_hip_code_object(src, options);
+}
+compiler_replace compile(context& ctx, instruction_ref ins, const operation& op) const
+{
+return replace(compile_op(ctx, to_shapes(ins->inputs()), op.to_value()));
+}
+};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
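The compiler above specializes its kernel source textually: interpolate_string replaces the ${offsets} and ${pad_val} placeholders in pointwise_kernel before HIP compilation. For an assumed 4-D pad with pads = {0, 0, 1, 1, 0, 0, 1, 1} (so offsets = {0, 0, 1, 1}) and the default pad value of 0, the emitted source would contain:

// auto offsets = index_ints<0, 0, 1, 1>{};
// ...
// pad(idx, offsets, input, output, 0);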
......@@ -32,6 +32,8 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_USE_FAST_SOFTMAX)
using namespace migraphx::gpu::gen; // NOLINT
static const char* const softmax_kernel = R"__migraphx__(
......@@ -81,6 +83,9 @@ struct softmax_compiler : compiler<softmax_compiler>
options.inputs = inputs;
options.kernel_name = "softmax_kernel";
if(enabled(MIGRAPHX_USE_FAST_SOFTMAX{}))
options.params = "-DMIGRAPHX_USE_FAST_SOFTMAX";
auto src = interpolate_string(
softmax_kernel,
{{"transformers", make_transformer_args(vec)}, {"axis", to_string(axis)}});
......
......@@ -21,53 +21,43 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
-#ifndef MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP
-#define MIGRAPHX_GUARD_RTGLIB_QUANT_CONVOLUTION_HPP
+#ifndef MIGRAPHX_GUARD_KERNELS_PAD_HPP
+#define MIGRAPHX_GUARD_KERNELS_PAD_HPP
-#include <migraphx/shape.hpp>
-#include <migraphx/reflect.hpp>
-#include <migraphx/op/quant_convolution.hpp>
-#include <migraphx/gpu/miopen.hpp>
+#include <migraphx/kernels/shape.hpp>
+#include <migraphx/kernels/index.hpp>
+#include <migraphx/kernels/algorithm.hpp>
+#include <migraphx/kernels/ranges.hpp>
namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-struct context;
-struct miopen_quant_convolution
-{
-op::quant_convolution op;
-bool int8_x4_format = false;
-shared<convolution_descriptor> cd;
-miopenConvFwdAlgorithm_t algo{};
-uint64_t solution_id = 0;
-template <class Self, class F>
-static auto reflect(Self& self, F f)
-{
-// TODO: Add algo
-return pack_join(migraphx::reflect(self.op, f),
-pack(f(self.int8_x4_format, "int8_x4_format")));
-}
-std::string name() const { return "gpu::quant_convolution"; }
-shape compute_shape(const std::vector<shape>& inputs) const;
-argument
-compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-shape find(context& ctx, const shape& output_shape, std::vector<shape> inputs);
-void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
-std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-{
-return shapes.size() - 1;
-}
-private:
-shape pack_int8_shape(const shape& s) const;
-};
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
+template <class Offsets, class Input, class Output, class PadVal>
+__device__ void pad(const index& idx,
+const Offsets& offsets,
+const Input& input,
+Output& output,
+const PadVal& pad_val)
+{
+auto output_shape = output.get_shape();
+idx.global_stride(output_shape.elements(), [&](auto i) {
+// 1. get current multi-index for output
+// 2. get the size of the input to determine input boundaries
+// 3. compute the corresponding multi-index for input by accounting for offsets
+// 4. if current multi-index is within offsets or input's new multi-index is out of bounds,
+//    use pad value instead of input's value
+auto multi = output_shape.multi(i);
+auto input_bounds = input.get_shape().lens;
+auto input_idx = multi - offsets;
+auto range_multi = range(multi.size());
+if(any_of(range_multi.begin(), range_multi.end(), [&](auto j) {
+return multi[j] < offsets[j] or input_idx[j] >= input_bounds[j];
+}))
+output[multi] = pad_val;
+else
+output[multi] = input[input_idx];
+});
+}
} // namespace migraphx
#endif
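Tracing the index arithmetic in pad on an assumed 1-D case: a 3-element input, offsets = {2}, a 7-element output, pad_val = 0:

// output position i:  0  1  2   3   4   5  6
// input_idx = i - 2:  underflows  0   1   2   out of range
// stored value:       0  0  x0  x1  x2  0  0

Because the indices are unsigned, i - offset wraps to a huge value whenever i < offset, so the out-of-bounds test alone would already route those positions to the padding branch; the explicit multi[j] < offsets[j] check makes the intent unambiguous.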
......@@ -21,41 +21,29 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
* THE SOFTWARE.
*/
-#ifndef MIGRAPHX_GUARD_RTGLIB_BATCHNORM_HPP
-#define MIGRAPHX_GUARD_RTGLIB_BATCHNORM_HPP
+#ifndef MIGRAPHX_GUARD_KERNELS_RANGES_HPP
+#define MIGRAPHX_GUARD_KERNELS_RANGES_HPP
-#include <migraphx/argument.hpp>
-#include <migraphx/op/batch_norm_inference.hpp>
-#include <migraphx/reflect.hpp>
+#include <migraphx/kernels/iota_iterator.hpp>
namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-namespace gpu {
-struct context;
-struct miopen_batch_norm_inference
-{
-op::batch_norm_inference op;
-template <class Self, class F>
-static auto reflect(Self& self, F f)
-{
-return migraphx::reflect(self.op, f);
-}
-std::string name() const { return "gpu::batch_norm_inference"; }
-shape compute_shape(const std::vector<shape>& inputs) const;
-argument
-compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
-{
-return shapes.size() - 1;
-}
-};
-} // namespace gpu
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-#endif
+template <class Iterator>
+struct iterator_range
+{
+Iterator start;
+Iterator last;
+constexpr Iterator begin() const { return start; }
+constexpr Iterator end() const { return last; }
+};
+constexpr iterator_range<iota_iterator> range(diff_int start, diff_int last)
+{
+return {{start, {}}, {last, {}}};
+}
+constexpr iterator_range<iota_iterator> range(diff_int last) { return range(0, last); }
} // namespace migraphx
#endif // MIGRAPHX_GUARD_KERNELS_RANGES_HPP
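range() gives device code an index sequence without materializing a container; the pad kernel above consumes one through any_of. A usage sketch:

// auto r = range(multi.size());      // half-open [0, size)
// any_of(r.begin(), r.end(), pred);  // algorithm form, as in pad
// for(auto j : range(n)) { ... }     // range-for also works via begin()/end()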
......@@ -197,11 +197,11 @@ struct block
struct reducer
{
index idx;
-Slicer slicer;
+Slicer slice;
template <class Op, class T, class Read>
__device__ auto reduce(Op op, T init, Read read) const
{
-return sliced(slicer, [=](auto x, auto... xs) {
+return sliced(slice, [=](auto x, auto... xs) {
return block_reduce(idx, op, init, x.get_shape().elements(), [&](auto j) {
return vec_reduce(read(x[j], xs[j]...), op);
});
......@@ -218,7 +218,7 @@ struct block
template <class F>
__device__ auto inner(F f) const
{
-return sliced(slicer, [=](auto x, auto... xs) {
+return sliced(slice, [=](auto x, auto... xs) {
idx.local_stride(x.get_shape().elements(), [&](auto j) { f(x[j], xs[j]...); });
});
}
......@@ -226,7 +226,7 @@ struct block
template <class Input>
constexpr auto elements() const
{
-using reduce_type = decltype(slicer(Input{}));
+using reduce_type = decltype(slice(Input{}));
using value_type = typename Input::type;
constexpr auto relements = get_shape_c<reduce_type>{}.elements();
if constexpr(vec_size<value_type>() > 1)
......@@ -260,11 +260,11 @@ struct lane
struct reducer
{
index idx;
-Slicer slicer;
+Slicer slice;
template <class Op, class T, class Read>
__device__ auto reduce(Op op, T init, Read read) const
{
-return sliced(slicer, [=](auto x, auto... xs) {
+return sliced(slice, [=](auto x, auto... xs) {
using type = typename decltype(x)::type;
type r = init;
for(index_int j = 0; j < x.get_shape().elements(); j++)
......@@ -284,7 +284,7 @@ struct lane
template <class F>
__device__ auto inner(F f) const
{
-return sliced(slicer, [=](auto x, auto... xs) {
+return sliced(slice, [=](auto x, auto... xs) {
for(index_int j = 0; j < x.get_shape().elements(); j++)
{
f(x[j], xs[j]...);
......@@ -295,7 +295,7 @@ struct lane
template <class Input>
constexpr auto elements() const
{
-using reduce_type = decltype(slicer(Input{}));
+using reduce_type = decltype(slice(Input{}));
return get_shape_c<reduce_type>{}.elements();
}
};
......
......@@ -33,11 +33,15 @@ template <index_int Axis, class Input, class Output>
__device__ void softmax(Input input, Output output)
{
reduce::block::run<reduce::with_axis<Input, Axis>>([&](auto, auto r) {
-auto batch_max = r.reduce(op::max{}, lowest{}, op::id{})(input);
-auto batch_sum =
-r.reduce(op::sum{}, 0, [&](auto x) { return migraphx::exp(x - batch_max); })(input);
-r.inner([&](auto& y, auto x) { y = migraphx::exp(x - batch_max) / batch_sum; })(output,
-input);
+#ifdef MIGRAPHX_USE_FAST_SOFTMAX
+const auto c = vec_at(r.slice(input)[0], 0);
+#else
+const auto c = r.reduce(op::max{}, lowest{}, op::id{})(input);
+#endif
+auto batch_sum = r.reduce(op::sum{}, 0, [&](auto x) {
+return migraphx::convert<float>(migraphx::exp(x - c));
+})(input);
+r.inner([&](auto& y, auto x) { y = migraphx::exp(x - c) / batch_sum; })(output, input);
});
}
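The shift constant c above is sound for any choice because softmax is invariant under a uniform shift:

exp(x_i - c) / sum_j exp(x_j - c) = (exp(-c) * exp(x_i)) / (exp(-c) * sum_j exp(x_j)) = softmax(x)_i

The default branch takes c = max(x), which keeps exp from overflowing; the MIGRAPHX_USE_FAST_SOFTMAX branch instead grabs the first element of the reduced slice as c, saving an entire block reduction at the cost of that overflow guard. Accumulating batch_sum in float also keeps the sum accurate when the inputs are half precision.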
......