Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into nhwc_workaround

94cf1bf3 · Khalique Ahmed · c0547e9a · 49280e51 · 94cf1bf3 · 94cf1bf3
Commit 94cf1bf3 authored Jul 25, 2023 by Khalique Ahmed
20 changed files
--- a/src/include/migraphx/operation.hpp
+++ b/src/include/migraphx/operation.hpp
@@ -251,9 +251,10 @@ auto compute_op(rank<1>,
                const shape& output,
                const std::vector<argument>& inputs,
                const std::vector<module_ref>& module_args,
-                F f)
-    -> decltype(
-        x.compute(make_compute_output_shape(pack(x, output, inputs)), inputs, module_args, f))
+                F f) -> decltype(x.compute(make_compute_output_shape(pack(x, output, inputs)),
+                                           inputs,
+                                           module_args,
+                                           f))
 {
    return x.compute(make_compute_output_shape(pack(x, output, inputs)), inputs, module_args, f);
 }
@@ -309,9 +310,10 @@ auto compute_op(rank<3>,
                const shape& output,
                const std::vector<argument>& inputs,
                const std::vector<module_ref>& module_args,
-                F f)
-    -> decltype(
-        x.compute(make_compute_output_shape(pack(x, output, inputs)), inputs, module_args, f))
+                F f) -> decltype(x.compute(make_compute_output_shape(pack(x, output, inputs)),
+                                           inputs,
+                                           module_args,
+                                           f))
 {
    return x.compute(make_compute_output_shape(pack(x, output, inputs)), inputs, module_args, f);
 }
@@ -573,7 +575,7 @@ struct operation
    {
        using std::swap;
        auto* derived = this->any_cast<PrivateDetailTypeErasedT>();
-        if(derived and private_detail_te_handle_mem_var.unique())
+        if(derived and private_detail_te_handle_mem_var.use_count() == 1)
        {
            *derived = std::forward<PrivateDetailTypeErasedT>(value);
        }
@@ -1263,7 +1265,7 @@ struct operation
    private_detail_te_handle_base_type& private_detail_te_get_handle()
    {
        assert(private_detail_te_handle_mem_var != nullptr);
-        if(not private_detail_te_handle_mem_var.unique())
+        if(private_detail_te_handle_mem_var.use_count() > 1)
            private_detail_te_handle_mem_var = private_detail_te_handle_mem_var->clone();
        return *private_detail_te_handle_mem_var;
    }

--- a/src/include/migraphx/operators.hpp
+++ b/src/include/migraphx/operators.hpp
@@ -48,6 +48,7 @@
 #include <migraphx/op/convolution_backwards.hpp>
 #include <migraphx/op/cosh.hpp>
 #include <migraphx/op/cos.hpp>
+#include <migraphx/op/dimensions_of.hpp>
 #include <migraphx/op/div.hpp>
 #include <migraphx/op/dot.hpp>
 #include <migraphx/op/elu.hpp>

--- a/src/include/migraphx/pass.hpp
+++ b/src/include/migraphx/pass.hpp
@@ -116,7 +116,7 @@ struct pass
    {
        using std::swap;
        auto* derived = this->any_cast<PrivateDetailTypeErasedT>();
-        if(derived and private_detail_te_handle_mem_var.unique())
+        if(derived and private_detail_te_handle_mem_var.use_count() == 1)
        {
            *derived = std::forward<PrivateDetailTypeErasedT>(value);
        }
@@ -292,7 +292,7 @@ struct pass
    private_detail_te_handle_base_type& private_detail_te_get_handle()
    {
        assert(private_detail_te_handle_mem_var != nullptr);
-        if(not private_detail_te_handle_mem_var.unique())
+        if(private_detail_te_handle_mem_var.use_count() > 1)
            private_detail_te_handle_mem_var = private_detail_te_handle_mem_var->clone();
        return *private_detail_te_handle_mem_var;
    }

--- a/src/include/migraphx/schedule_model.hpp
+++ b/src/include/migraphx/schedule_model.hpp
@@ -99,7 +99,7 @@ struct schedule_model
    {
        using std::swap;
        auto* derived = this->any_cast<PrivateDetailTypeErasedT>();
-        if(derived and private_detail_te_handle_mem_var.unique())
+        if(derived and private_detail_te_handle_mem_var.use_count() == 1)
        {
            *derived = std::forward<PrivateDetailTypeErasedT>(value);
        }
@@ -274,7 +274,7 @@ struct schedule_model
    private_detail_te_handle_base_type& private_detail_te_get_handle()
    {
        assert(private_detail_te_handle_mem_var != nullptr);
-        if(not private_detail_te_handle_mem_var.unique())
+        if(private_detail_te_handle_mem_var.use_count() > 1)
            private_detail_te_handle_mem_var = private_detail_te_handle_mem_var->clone();
        return *private_detail_te_handle_mem_var;
    }

--- a/src/include/migraphx/shape.hpp
+++ b/src/include/migraphx/shape.hpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -240,6 +240,10 @@ struct MIGRAPHX_EXPORT shape
    template <class Iterator>
    std::size_t index(Iterator start, Iterator last) const
    {
+        if(this->dynamic())
+        {
+            MIGRAPHX_THROW("SHAPE: index() called on dynamic shape");
+        }
        assert(std::distance(start, last) <= this->lens().size());
        assert(this->lens().size() == this->strides().size());
        return std::inner_product(start, last, this->strides().begin(), std::size_t{0}); // NOLINT

--- a/src/include/migraphx/stream_model.hpp
+++ b/src/include/migraphx/stream_model.hpp
@@ -100,7 +100,7 @@ struct stream_model
    {
        using std::swap;
        auto* derived = this->any_cast<PrivateDetailTypeErasedT>();
-        if(derived and private_detail_te_handle_mem_var.unique())
+        if(derived and private_detail_te_handle_mem_var.use_count() == 1)
        {
            *derived = std::forward<PrivateDetailTypeErasedT>(value);
        }
@@ -288,7 +288,7 @@ struct stream_model
    private_detail_te_handle_base_type& private_detail_te_get_handle()
    {
        assert(private_detail_te_handle_mem_var != nullptr);
-        if(not private_detail_te_handle_mem_var.unique())
+        if(private_detail_te_handle_mem_var.use_count() > 1)
            private_detail_te_handle_mem_var = private_detail_te_handle_mem_var->clone();
        return *private_detail_te_handle_mem_var;
    }

--- a/src/include/migraphx/target.hpp
+++ b/src/include/migraphx/target.hpp
@@ -167,7 +167,7 @@ struct target
    {
        using std::swap;
        auto* derived = this->any_cast<PrivateDetailTypeErasedT>();
-        if(derived and private_detail_te_handle_mem_var.unique())
+        if(derived and private_detail_te_handle_mem_var.use_count() == 1)
        {
            *derived = std::forward<PrivateDetailTypeErasedT>(value);
        }
@@ -428,7 +428,7 @@ struct target
    private_detail_te_handle_base_type& private_detail_te_get_handle()
    {
        assert(private_detail_te_handle_mem_var != nullptr);
-        if(not private_detail_te_handle_mem_var.unique())
+        if(private_detail_te_handle_mem_var.use_count() > 1)
            private_detail_te_handle_mem_var = private_detail_te_handle_mem_var->clone();
        return *private_detail_te_handle_mem_var;
    }

--- a/src/onnx/parse_batchnorm.cpp
+++ b/src/onnx/parse_batchnorm.cpp
@@ -57,13 +57,12 @@ struct parse_batchnorm : op_parser<parse_batchnorm>
        auto x_rank = x_lens.size();
        if(x_rank == 1 or x_rank == 2)
        {
-            auto rt      = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {0.5}});
-            auto eps     = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {epsilon}});
-            auto numer   = info.add_broadcastable_binary_op("sub", args[0], args[3]);
-            auto var_eps = info.add_broadcastable_binary_op("add", args[4], eps);
-            auto denom   = info.add_broadcastable_binary_op("pow", var_eps, rt);
-            auto div0    = info.add_broadcastable_binary_op("div", numer, denom);
-            auto r0      = info.add_broadcastable_binary_op("mul", div0, args[1]);
+            auto eps = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {epsilon}});
+            auto x_sub_mean = info.add_broadcastable_binary_op("sub", args[0], args[3]);
+            auto var_eps    = info.add_broadcastable_binary_op("add", args[4], eps);
+            auto rsqrt      = info.add_instruction(make_op("rsqrt"), var_eps);
+            auto mul0       = info.add_broadcastable_binary_op("mul", args[1], rsqrt);
+            auto r0         = info.add_broadcastable_binary_op("mul", x_sub_mean, mul0);
            return info.add_broadcastable_binary_op("add", r0, args[2]);
        }
        else if(x_rank > 2)
@@ -71,7 +70,6 @@ struct parse_batchnorm : op_parser<parse_batchnorm>
            // unsqueeze tensors of shape (C) to broadcast correctly
            std::vector<int64_t> unsqueeze_axes(x_lens.size() - 2);
            std::iota(unsqueeze_axes.begin(), unsqueeze_axes.end(), 1);
-            auto rt  = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {0.5}});
            auto eps = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {epsilon}});
            auto scale_unsqueeze = info.add_instruction(
                migraphx::make_op("unsqueeze", {{"axes", unsqueeze_axes}}), args[1]);
@@ -81,11 +79,11 @@ struct parse_batchnorm : op_parser<parse_batchnorm>
                migraphx::make_op("unsqueeze", {{"axes", unsqueeze_axes}}), args[3]);
            auto var_unsqueeze = info.add_instruction(
                migraphx::make_op("unsqueeze", {{"axes", unsqueeze_axes}}), args[4]);
-            auto numer   = info.add_broadcastable_binary_op("sub", args[0], mean_unsqueeze);
-            auto var_eps = info.add_broadcastable_binary_op("add", var_unsqueeze, eps);
-            auto denom   = info.add_broadcastable_binary_op("pow", var_eps, rt);
-            auto div0    = info.add_broadcastable_binary_op("div", numer, denom);
-            auto r0      = info.add_broadcastable_binary_op("mul", div0, scale_unsqueeze);
+            auto x_sub_mean = info.add_broadcastable_binary_op("sub", args[0], mean_unsqueeze);
+            auto var_eps    = info.add_broadcastable_binary_op("add", var_unsqueeze, eps);
+            auto rsqrt      = info.add_instruction(make_op("rsqrt"), var_eps);
+            auto mul0       = info.add_broadcastable_binary_op("mul", scale_unsqueeze, rsqrt);
+            auto r0         = info.add_broadcastable_binary_op("mul", x_sub_mean, mul0);
            return info.add_broadcastable_binary_op("add", r0, bias_unsqueeze);
        }
        else

--- a/src/onnx/parse_instancenorm.cpp
+++ b/src/onnx/parse_instancenorm.cpp
@@ -79,13 +79,11 @@ struct parse_instancenorm : op_parser<parse_instancenorm>
        auto x     = args[0];
        auto scale = args[1];
        auto bias  = args[2];
-        auto dims  = x->get_shape().lens();
        if(not contains(valid_types, dtype))
            MIGRAPHX_THROW(opd.op_name + ": invalid output type: " + std::to_string(dtype) +
                           ". Valid types are 1 (float), 10 (half), and 11 (double).");

-        bool dyn_input = x->get_shape().dynamic();
-        auto ndims     = x->get_shape().ndim();
+        auto ndims = x->get_shape().ndim();
        assert(ndims >= 2);
        auto kdims = ndims - 2;
        std::vector<int64_t> axes(kdims);
@@ -102,6 +100,12 @@ struct parse_instancenorm : op_parser<parse_instancenorm>
            (dtype == shape::half_type and not convert_fp16) ? "reduce_sum" : "reduce_mean";
        if(dtype == shape::half_type and not convert_fp16)
        {
+            if(x->get_shape().dynamic())
+            {
+                MIGRAPHX_THROW("PARSE_INSTANCENORM: half type not supported with dynamic shape "
+                               "unless convert_fp16 is TRUE");
+            }
+            auto dims = x->get_shape().lens();
            double n =
                std::accumulate(dims.begin() + 2, dims.end(), 1, [&](const auto& i, const auto& j) {
                    return i * j;
@@ -122,13 +126,14 @@ struct parse_instancenorm : op_parser<parse_instancenorm>
        // both scale and bias.
        instruction_ref scale_bcast;
        instruction_ref bias_bcast;
-        if(dyn_input)
+        if(x->get_shape().dynamic())
        {
            scale_bcast = info.add_instruction(make_op("broadcast", {{"axis", 1}}), scale, x);
            bias_bcast  = info.add_instruction(make_op("broadcast", {{"axis", 1}}), bias, x);
        }
        else
        {
+            auto dims   = x->get_shape().lens();
            scale_bcast = info.add_instruction(
                make_op("broadcast", {{"axis", 1}, {"out_lens", dims}}), scale);
            bias_bcast =

--- a/src/onnx/parse_shape.cpp
+++ b/src/onnx/parse_shape.cpp
@@ -30,8 +30,11 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace onnx {

-// Use a literal instruction to replace the shape since, output of
-// shape operator are literals in migraphx
+/**
+ * If static shape input, creates a literal in migraphx.
+ * If dynamic shape input, creates a dimensions_of operator in migraphx (runtime evaluation of
+ * shape).
+ */
 struct parse_shape : op_parser<parse_shape>
 {
    std::vector<op_desc> operators() const { return {{"Shape"}}; }
@@ -43,13 +46,54 @@ struct parse_shape : op_parser<parse_shape>
    {
        if(args.size() != 1)
            MIGRAPHX_THROW("Shape: operator should have 1 operand");
-        std::vector<std::size_t> arg_shape = args[0]->get_shape().lens();
-        std::vector<int64_t> vec_shape(arg_shape.size());
-        migraphx::shape s(migraphx::shape::int64_type, {arg_shape.size()});
-        std::transform(arg_shape.begin(), arg_shape.end(), vec_shape.begin(), [](auto i) {
-            return int64_t(i);
-        });
-        return info.add_literal(migraphx::literal{s, vec_shape});
+        auto input_shape  = args[0]->get_shape();
+        int input_ndim    = input_shape.ndim();
+        std::size_t start = 0;
+        std::size_t end   = input_ndim;
+        // Normalizing the start and end is handled here because of how the static shape version
+        // works. Clamping to [-r, r], where r is ndim of input and then making positive.
+        auto normalize_ind = [&](int64_t ind) {
+            if(ind < (-1 * input_ndim))
+            {
+                ind = -1 * input_ndim;
+            }
+            if(ind > input_ndim)
+            {
+                ind = input_ndim;
+            }
+            return (ind >= 0) ? ind : input_ndim + ind;
+        };
+        if(contains(info.attributes, "end"))
+        {
+            end = normalize_ind(info.attributes.at("end").i());
+        }
+        if(contains(info.attributes, "start"))
+        {
+            start = normalize_ind(info.attributes.at("start").i());
+        }
+        if(end <= start)
+        {
+            MIGRAPHX_THROW("PARSE_SHAPE: ending axis <= starting axis, end: " +
+                           std::to_string(end) + " start: " + std::to_string(start));
+        }
+
+        if(input_shape.dynamic())
+        {
+            return info.add_instruction(make_op("dimensions_of", {{"start", start}, {"end", end}}),
+                                        args[0]);
+        }
+        else
+        {
+            std::size_t output_ndim = end - start;
+            std::vector<int64_t> vec_shape(output_ndim);
+            migraphx::shape s(migraphx::shape::int64_type, {output_ndim});
+            std::vector<std::size_t> input_lens = input_shape.lens();
+            std::transform(input_lens.begin() + start,
+                           input_lens.begin() + end,
+                           vec_shape.begin(),
+                           [](auto i) { return int64_t(i); });
+            return info.add_literal(migraphx::literal{s, vec_shape});
+        }
    }
 };


--- a/src/quantization.cpp
+++ b/src/quantization.cpp
@@ -29,6 +29,7 @@
 #include <migraphx/simplify_reshapes.hpp>
 #include <migraphx/simplify_qdq.hpp>
 #include <migraphx/eliminate_common_subexpression.hpp>
+#include <migraphx/optimize_module.hpp>
 #include <migraphx/dead_code_elimination.hpp>
 #include <migraphx/program.hpp>
 #include <migraphx/instruction.hpp>
@@ -48,19 +49,12 @@ MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_INT8_QUANTIZATION_PARAMS)

 // This function is to convert any instructions specified in the input
 // from double or float to float16 by inserting a convert operator.
-// For the conversion, there could be cases of overflowing, but it
-// is very rare in the area of deeping learning, so we just do a
-// truncate of the input to get the fp16.
+// For the conversion, there could be cases of overflowing or underflowing, but it
+// is uncommon. Run optimize_module() before converting to fp16 to const eval and fold in FP32 to
+// avoid loss of precision.
 void quantize_fp16(program& prog, const std::vector<std::string>& ins_names)
 {
-    run_passes(prog,
-               {quantize_fp16_pass{ins_names},
-                eliminate_common_subexpression{},
-                dead_code_elimination{},
-                simplify_reshapes{},
-                dead_code_elimination{},
-                simplify_qdq{},
-                dead_code_elimination{}});
+    run_passes(prog, {optimize_module{}, quantize_fp16_pass{ins_names}, optimize_module{}});
 }

 void quantize_int8(program& prog,

--- a/src/shape.cpp
+++ b/src/shape.cpp
 /*
 * The MIT License (MIT)
 *
- * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ * Copyright (c) 2015-2023 Advanced Micro Devices, Inc. All rights reserved.
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to deal
@@ -273,9 +273,23 @@ shape shape::from_permutation(type_t t,

 shape::type_t shape::type() const { return impl->m_type; }

-const std::vector<std::size_t>& shape::lens() const { return impl->m_lens; }
+const std::vector<std::size_t>& shape::lens() const
+{
+    if(this->dynamic())
+    {
+        MIGRAPHX_THROW("SHAPE: lens() called on a dynamic shape");
+    }
+    return impl->m_lens;
+}

-const std::vector<std::size_t>& shape::strides() const { return impl->m_strides; }
+const std::vector<std::size_t>& shape::strides() const
+{
+    if(this->dynamic())
+    {
+        MIGRAPHX_THROW("SHAPE: strides() called on a dynamic shape");
+    }
+    return impl->m_strides;
+}

 std::size_t shape::ndim() const
 {
@@ -535,7 +549,14 @@ bool shape::any_of_dynamic() const
    });
 }

-const std::vector<shape::dynamic_dimension>& shape::dyn_dims() const { return impl->m_dyn_dims; }
+const std::vector<shape::dynamic_dimension>& shape::dyn_dims() const
+{
+    if(not this->dynamic())
+    {
+        MIGRAPHX_THROW("SHAPE: dyn_dims() called on a static shape");
+    }
+    return impl->m_dyn_dims;
+}

 std::vector<std::size_t> shape::min_lens() const
 {
@@ -679,12 +700,22 @@ const std::vector<shape>& shape::sub_shapes() const { return impl->m_shapes; }
 void migraphx_to_value(value& v, const shape& s)
 {
    value result;
-    result["type"]               = migraphx::to_value(s.type_string());
-    result["lens"]               = migraphx::to_value(s.lens());
-    result["strides"]            = migraphx::to_value(s.strides());
-    result["sub_shapes"]         = migraphx::to_value(s.sub_shapes());
-    result["dynamic_dimensions"] = migraphx::to_value(s.dyn_dims());
-    v                            = result;
+    result["type"]       = migraphx::to_value(s.type_string());
+    result["sub_shapes"] = migraphx::to_value(s.sub_shapes());
+    // avoid calling functions that will throw
+    if(s.dynamic())
+    {
+        result["lens"]               = {};
+        result["strides"]            = {};
+        result["dynamic_dimensions"] = migraphx::to_value(s.dyn_dims());
+    }
+    else
+    {
+        result["lens"]               = migraphx::to_value(s.lens());
+        result["strides"]            = migraphx::to_value(s.strides());
+        result["dynamic_dimensions"] = {};
+    }
+    v = result;
 }

 void migraphx_from_value(const value& v, shape& s)

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -33,7 +33,10 @@ if(NOT TARGET MIOpen)
    message(SEND_ERROR "Cant find miopen")
 endif()

-find_package(composable_kernel 1.0.0 COMPONENTS jit_library REQUIRED) 
+if(NOT WIN32)
+    # TODO: re-enable when CK is ported to Windows
+    find_package(composable_kernel 1.0.0 REQUIRED COMPONENTS jit_library)
+endif()

 if(BUILD_DEV)
    set(MIGRAPHX_USE_HIPRTC OFF CACHE BOOL "Use hipRTC APIs")
@@ -85,6 +88,12 @@ target_link_libraries(kernel_file_check compile_for_gpu)
 rocm_clang_tidy_check(kernel_file_check)

 file(GLOB JIT_GPU_SRCS CONFIGURE_DEPENDS ${CMAKE_CURRENT_SOURCE_DIR}/jit/*.cpp)
+
+if(WIN32)
+    # CK is neither found nor linked on Windows, so drop the CK JIT source there. TODO: re-enable when CK is ported to Windows
+    list(REMOVE_ITEM JIT_GPU_SRCS ${CMAKE_CURRENT_SOURCE_DIR}/jit/ck_gemm.cpp)
+endif()
+
 add_library(migraphx_gpu
    abs.cpp
    analyze_streams.cpp
@@ -133,6 +142,7 @@ add_library(migraphx_gpu
    write_literals.cpp
    ${JIT_GPU_SRCS}
 )
+
 set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
 migraphx_generate_export_header(migraphx_gpu)

@@ -236,7 +246,12 @@ check_library_exists(MIOpen "miopenFindSolutions" "${MIOPEN_LOCATION}" HAS_FIND_
 set(MIGRAPHX_USE_FIND_2_API "${HAS_FIND_2_API}" CACHE BOOL "")

 if(MIGRAPHX_USE_FIND_2_API)
-    target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
+    check_library_exists(MIOpen "miopenSetFindOptionPreallocatedTensor" "${MIOPEN_LOCATION}" HAS_PREALLOCATION_API)
+    if(HAS_PREALLOCATION_API)
+        target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API -DMIGRAPHX_PREALLOCATE_MIOPEN_BUFFERS)
+    else()
+        target_compile_definitions(migraphx_gpu PUBLIC -DMIGRAPHX_HAS_FIND_2_API)
+    endif()
    message(STATUS "MIGraphx is using Find-2.0 API of MIOpen")
 else()
    message(STATUS "MIGraphx is using legacy Find API in MIOpen")
@@ -250,7 +265,11 @@ else()
 endif()

 target_link_libraries(migraphx_gpu PUBLIC migraphx MIOpen roc::rocblas)
-target_link_libraries(migraphx_gpu PRIVATE migraphx_device migraphx_kernels composable_kernel::jit_library)
+target_link_libraries(migraphx_gpu PRIVATE migraphx_device migraphx_kernels)
+if(NOT WIN32)
+    # TODO: re-enable when CK is ported to Windows
+    target_link_libraries(migraphx_gpu PRIVATE composable_kernel::jit_library)
+endif()

 add_subdirectory(driver)
 add_subdirectory(hiprtc)

--- a/src/targets/gpu/compile_hip_code_object.cpp
+++ b/src/targets/gpu/compile_hip_code_object.cpp
@@ -135,14 +135,13 @@ compute_global_for(context& ctx, std::size_t n, std::size_t over)
    std::size_t max_global = ctx.get_current_device().get_cu_count() *
                             ctx.get_current_device().get_max_workitems_per_cu();
    return [n, over, max_global](std::size_t local) {
-        std::size_t num_elements = n;
+        // HIP requires the global work-item count to be a multiple of the local work-item count,
+        // so n is rounded up below; this may degrade performance.
+        // [TODO]: consider adding "fno-hip-uniform-block" flag when it becomes available: https://reviews.llvm.org/D155213
+        std::size_t num_elements = ((n + local - 1) / local) * local;
        std::size_t groups       = (num_elements + local - 1) / local;
        std::size_t max_blocks   = max_global / local;
        std::size_t nglobal      = std::min(max_blocks * over, groups) * local;
-#ifdef MIGRAPHX_USE_HIPRTC
-        if(enabled(MIGRAPHX_ENABLE_HIPRTC_WORKAROUNDS{}))
-            num_elements = ((num_elements + local - 1) / local) * local;
-#endif
        return std::min(nglobal, num_elements);
    };
 }

--- a/src/targets/gpu/fuse_mlir.cpp
+++ b/src/targets/gpu/fuse_mlir.cpp
@@ -216,6 +216,7 @@ struct find_mlir_op
                                                                "quant_dot",
                                                                "add",
                                                                "clip",
+                                                                "relu",
                                                                "sub",
                                                                "mul",
                                                                "div",

--- a/src/targets/gpu/include/migraphx/gpu/convolution.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/convolution.hpp
@@ -160,10 +160,31 @@ struct miopen_convolution
    shape find(context& ctx, const shape& output_shape, const std::vector<shape>& inputs)
    {
        shape workspace_shape{};
-        auto x_desc                = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
-        auto w_desc                = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
-        auto y_desc                = make_tensor(reshape_if_1d(output_shape));
+        auto x_desc = make_tensor(reshape_if_1d(inputs[0]), int8_x4_format);
+        auto w_desc = make_tensor(reshape_if_1d(inputs[1]), int8_x4_format);
+        auto y_desc = make_tensor(reshape_if_1d(output_shape));
+
+        auto* miopen_stream_handle = ctx.get_stream().get_miopen();
        std::size_t workspace_size = 0;
+        auto status                = miopenConvolutionForwardGetWorkSpaceSize(miopen_stream_handle,
+                                                               w_desc.get(),
+                                                               x_desc.get(),
+                                                               cd.get(),
+                                                               y_desc.get(),
+                                                               &workspace_size);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen" + op.name() + " : Failed to get forward workspace size");
+
+        workspace_shape = shape{shape::int8_type, {workspace_size}};
+
+        auto x_shape = inputs[0];
+        auto w_shape = inputs[1];
+        if(int8_x4_format)
+        {
+            x_shape = pack_int8_shape(x_shape);
+            w_shape = pack_int8_shape(w_shape);
+        }
+
 #ifdef MIGRAPHX_HAS_FIND_2_API
        {
            auto conv_problem = make_obj<miopen_problem>(
@@ -171,13 +192,34 @@ struct miopen_convolution

            set_tensor_descriptor(miopenTensorConvolutionX, x_desc, conv_problem);
            set_tensor_descriptor(miopenTensorConvolutionW, w_desc, conv_problem);
+            bool preallocate = false;
+#ifdef MIGRAPHX_PREALLOCATE_MIOPEN_BUFFERS
+            // MIOpen has APIs to pass pre-allocated buffers starting from rocm-5.6
+            preallocate = true;
+#endif
+            auto x = preallocate ? to_gpu(generate_argument(x_shape)) : inputs[0];
+            auto w = preallocate ? to_gpu(generate_argument(w_shape)) : inputs[1];
+            auto y = preallocate ? allocate_gpu(output_shape) : inputs[2];
+            auto workspace =
+                preallocate ? allocate_gpu(workspace_shape) : migraphx::argument(workspace_shape);
+
            set_tensor_descriptor(miopenTensorConvolutionY, y_desc, conv_problem);

-            auto* miopen_stream_handle = ctx.get_stream().get_miopen();
+            const miopenTensorArgument_t tensor_args[3] = {
+                {miopenTensorConvolutionX, nullptr, x.implicit()},
+                {miopenTensorConvolutionW, nullptr, w.implicit()},
+                {miopenTensorConvolutionY, nullptr, y.implicit()},
+            };
+
+            solution_ptr = find_solution(miopen_stream_handle,
+                                         3,
+                                         tensor_args,
+                                         workspace.implicit(),
+                                         workspace_size,
+                                         conv_problem.get(),
+                                         ctx.get_exhaustive_tune_flag());

-            solution_ptr = find_solution(
-                miopen_stream_handle, conv_problem.get(), ctx.get_exhaustive_tune_flag());
-            auto status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
+            status = miopenGetSolutionWorkspaceSize(solution_ptr.get(), &workspace_size);
            if(status != miopenStatusSuccess)
                MIGRAPHX_THROW("MIOpen" + op.name() + " : failed to get solution's workspace size");

@@ -196,29 +238,10 @@ struct miopen_convolution
            return shape{shape::int8_type, {workspace_size}};
        }
 #else
-        auto status = miopenConvolutionForwardGetWorkSpaceSize(ctx.get_stream().get_miopen(),
-                                                               w_desc.get(),
-                                                               x_desc.get(),
-                                                               cd.get(),
-                                                               y_desc.get(),
-                                                               &workspace_size);
-        if(status != miopenStatusSuccess)
-            MIGRAPHX_THROW("MIOpen" + op.name() + " : Failed to get forward workspace size");
-
-        workspace_shape = shape{shape::int8_type, {workspace_size}};
-
-        auto x_shape = inputs[0];
-        auto w_shape = inputs[1];
-        if(int8_x4_format)
-        {
-            x_shape = pack_int8_shape(x_shape);
-            w_shape = pack_int8_shape(w_shape);
-        }
        auto x         = to_gpu(generate_argument(x_shape));
        auto w         = to_gpu(generate_argument(w_shape));
        auto y         = allocate_gpu(output_shape);
        auto workspace = allocate_gpu(workspace_shape);
-
        int algo_count = 1;
        miopenConvAlgoPerf_t perf;
        status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
@@ -338,6 +361,7 @@ struct miopen_convolution
        return {s.type(), lens, strides};
    }
 };
+
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/targets/gpu/include/migraphx/gpu/miopen.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/miopen.hpp
@@ -75,21 +75,43 @@ using miopen_find_options = MIGRAPHX_MANAGE_PTR(miopenFindOptions_t, miopenDestr
 using miopen_problem      = MIGRAPHX_MANAGE_PTR(miopenProblem_t, miopenDestroyProblem);
 using miopen_solution     = MIGRAPHX_MANAGE_PTR(miopenSolution_t, miopenDestroySolution);

-inline miopen_solution
-find_solution(miopenHandle_t handle, miopenProblem_t problem, bool tune = false)
+inline miopen_solution find_solution(miopenHandle_t handle,
+                                     size_t num_inputs,
+                                     const miopenTensorArgument_t* tensor_args,
+                                     void* workspace,
+                                     size_t workspace_size,
+                                     miopenProblem_t problem,
+                                     bool tune = false)
 {
    miopenSolution_t solution;
    size_t found           = 0;
-    miopen_find_options fo = nullptr;
+    miopen_find_options fo = make_obj<miopen_find_options>(&miopenCreateFindOptions);
    if(tune)
    {
-        fo = make_obj<miopen_find_options>(&miopenCreateFindOptions);
        miopenSetFindOptionTuning(fo.get(), 1);
    }
-    auto status = miopenFindSolutions(handle, problem, fo.get(), &solution, &found, 1);
+#ifdef MIGRAPHX_PREALLOCATE_MIOPEN_BUFFERS
+    for(auto i : range(num_inputs))
+    {
+        auto status = miopenSetFindOptionPreallocatedTensor(
+            fo.get(), tensor_args[i].id, tensor_args[i].buffer);
+        if(status != miopenStatusSuccess)
+            MIGRAPHX_THROW("MIOpen: failed to preallocate tensors for the find process");
+    }
+    auto status = miopenSetFindOptionPreallocatedWorkspace(fo.get(), workspace, workspace_size);
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("MIOpen: failed to preallocate workspace for the find process");
+#else
+    miopenStatus_t status;
+    (void)(num_inputs);
+    (void)(tensor_args);
+    (void)(workspace_size);
+    (void)(workspace);
+#endif
+    status      = miopenFindSolutions(handle, problem, fo.get(), &solution, &found, 1);
    auto result = miopen_solution{solution};
    if(status != miopenStatusSuccess or found == 0)
-        MIGRAPHX_THROW("MIOpen miopenFindSolutions failed");
+        MIGRAPHX_THROW("MIOpen: miopenFindSolutions failed");
    return result;
 }


--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -76,7 +76,9 @@ namespace gpu {
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_SCHEDULE_PASS)
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_REDUCE_FUSION)
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NHWC)
+#ifndef _WIN32
 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_CK)
+#endif

 struct id_pass
 {
@@ -139,7 +141,9 @@ std::vector<pass> target::get_passes(migraphx::context& gctx, const compile_opti
         dead_code_elimination{},
         enable_pass(not enabled(MIGRAPHX_DISABLE_REDUCE_FUSION{}), fuse_reduce{}),
         dead_code_elimination{},
+#ifndef _WIN32
         enable_pass(enabled(MIGRAPHX_ENABLE_CK{}), fuse_ck{}),
+#endif
        dead_code_elimination{},
        enable_pass(mlir_enabled(), fuse_mlir{&ctx}),
        dead_code_elimination{},

--- a/src/tf/parse_batchnorm.cpp
+++ b/src/tf/parse_batchnorm.cpp
@@ -52,7 +52,6 @@ struct parse_batchnorm : op_parser<parse_batchnorm>
        auto x_type = args[0]->get_shape().type();

        // unsqueeze tensors of shape (C) to broadcast correctly
-        auto rt  = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {0.5}});
        auto eps = info.add_literal(migraphx::literal{migraphx::shape{x_type}, {epsilon}});

        auto scale_unsqueeze =
@@ -64,11 +63,11 @@ struct parse_batchnorm : op_parser<parse_batchnorm>
        auto var_unsqueeze =
            info.add_instruction(migraphx::make_op("unsqueeze", {{"axes", {1, 2}}}), args[4]);

-        auto numer   = info.add_broadcastable_binary_op("sub", args[0], mean_unsqueeze);
-        auto var_eps = info.add_broadcastable_binary_op("add", var_unsqueeze, eps);
-        auto denom   = info.add_broadcastable_binary_op("pow", var_eps, rt);
-        auto div0    = info.add_broadcastable_binary_op("div", numer, denom);
-        auto r0      = info.add_broadcastable_binary_op("mul", div0, scale_unsqueeze);
+        auto x_sub_mean = info.add_broadcastable_binary_op("sub", args[0], mean_unsqueeze);
+        auto var_eps    = info.add_broadcastable_binary_op("add", var_unsqueeze, eps);
+        auto rsqrt      = info.add_instruction(make_op("rsqrt"), var_eps);
+        auto mul0       = info.add_broadcastable_binary_op("mul", scale_unsqueeze, rsqrt);
+        auto r0         = info.add_broadcastable_binary_op("mul", x_sub_mean, mul0);
        return info.add_broadcastable_binary_op("add", r0, bias_unsqueeze);
    }
 };

--- a/test/api/test_gpu.cpp
+++ b/test/api/test_gpu.cpp
@@ -34,7 +34,6 @@ TEST_CASE(load_and_run)
    auto shapes_before = p.get_output_shapes();
    migraphx::compile_options options;
    options.set_offload_copy();
-    options.set_exhaustive_tune_flag();
    p.compile(migraphx::target("gpu"), options);
    auto shapes_after = p.get_output_shapes();
    CHECK(shapes_before.size() == 1);