added conv+bias+relu+atomic_add

d3bd5922 · Chao Liu · 7dba659f · d3bd5922 · d3bd5922 · d3bd5922
Commit d3bd5922 authored Dec 20, 2021 by Chao Liu
13 changed files
--- a/composable_kernel/include/utility/amd_buffer_addressing.hpp
+++ b/composable_kernel/include/utility/amd_buffer_addressing.hpp
@@ -31,7 +31,7 @@ __device__ int32x4_t make_wave_buffer_resource(T* p_wave, index_t element_space_
    return wave_buffer_resource.content;
 }
-// load
+// buffer load i8
 __device__ int8_t
 llvm_amdgcn_raw_buffer_load_i8(int32x4_t srsrc,
                               index_t voffset,
@@ -50,6 +50,7 @@ llvm_amdgcn_raw_buffer_load_i8x4(int32x4_t srsrc,
                                 index_t soffset,
                                 index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i8");
+// buffer load i16
 __device__ ushort
 llvm_amdgcn_raw_buffer_load_i16(int32x4_t srsrc,
                                index_t voffset,
@@ -68,6 +69,7 @@ llvm_amdgcn_raw_buffer_load_i16x4(int32x4_t srsrc,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i16");
+// buffer load i32
 __device__ int32_t
 llvm_amdgcn_raw_buffer_load_i32(int32x4_t srsrc,
                                index_t voffset,
@@ -85,7 +87,7 @@ llvm_amdgcn_raw_buffer_load_i32x4(int32x4_t srsrc,
                                  index_t voffset,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4i32");
-// half
+// buffer load fp16
 __device__ half_t
 llvm_amdgcn_raw_buffer_load_fp16(int32x4_t srsrc,
                                 index_t voffset,
@@ -104,7 +106,7 @@ llvm_amdgcn_raw_buffer_load_fp16x4(int32x4_t srsrc,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f16");
-// float
+// buffer load fp32
 __device__ float
 llvm_amdgcn_raw_buffer_load_fp32(int32x4_t srsrc,
                                 index_t voffset,
@@ -123,7 +125,7 @@ llvm_amdgcn_raw_buffer_load_fp32x4(int32x4_t srsrc,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.load.v4f32");
-// store
+// buffer store i8
 __device__ void
 llvm_amdgcn_raw_buffer_store_i8(int8_t vdata,
                                int32x4_t rsrc,
@@ -145,6 +147,7 @@ llvm_amdgcn_raw_buffer_store_i8x4(int8x4_t vdata,
                                  index_t soffset,
                                  index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i8");
+// buffer store i16
 __device__ void
 llvm_amdgcn_raw_buffer_store_i16(ushort vdata,
                                 int32x4_t rsrc,
@@ -166,6 +169,7 @@ llvm_amdgcn_raw_buffer_store_i16x4(ushort4_t vdata,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i16");
+// buffer store i32
 __device__ void
 llvm_amdgcn_raw_buffer_store_i32(int32_t vdata,
                                 int32x4_t rsrc,
@@ -187,7 +191,7 @@ llvm_amdgcn_raw_buffer_store_i32x4(int32x4_t vdata,
                                   index_t soffset,
                                   index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4i32");
-// half
+// buffer store fp16
 __device__ void
 llvm_amdgcn_raw_buffer_store_fp16(half_t vdata,
                                  int32x4_t rsrc,
@@ -208,7 +212,7 @@ llvm_amdgcn_raw_buffer_store_fp16x4(half4_t vdata,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f16");
-// float
+// buffer store fp32
 __device__ void
 llvm_amdgcn_raw_buffer_store_fp32(float vdata,
                                  int32x4_t rsrc,
@@ -229,8 +233,15 @@ llvm_amdgcn_raw_buffer_store_fp32x4(float4_t vdata,
                                    index_t voffset,
                                    index_t soffset,
                                    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.store.v4f32");
-// atomic add
+// buffer atomic-add fp16
-// int
+__device__ half2_t llvm_amdgcn_raw_buffer_atomic_add_fp16x2( // declaration only; the asm label below binds it to the LLVM symbol of that name
+    half2_t vdata, // packed pair of fp16 values to add
+    int32x4_t rsrc, // callers in this file pass dst_wave_buffer_resource (presumably the buffer resource descriptor -- confirm)
+    index_t voffset, // callers in this file pass dst_thread_addr_offset
+    index_t soffset, // callers in this file pass dst_wave_addr_offset, plus i * sizeof(half2_t) for each further half2_t pair
+    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.v2f16"); // callers in this file pass 0 for glc_slc and discard the returned half2_t
+// buffer atomic-add i32
 __device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(
    int32_t vdata,
    int32x4_t rsrc,
@@ -238,7 +249,7 @@ __device__ int32_t llvm_amdgcn_raw_buffer_atomic_add_i32(
    index_t soffset,
    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.add.i32");
-// float
+// buffer atomic-add fp32
 __device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32(
    float vdata,
    int32x4_t rsrc,
@@ -752,6 +763,7 @@ __device__ void amd_buffer_atomic_add_impl(const typename vector_type<T, N>::typ
                                           index_t dst_wave_addr_offset)
 {
    static_assert((is_same<T, float>::value && (N == 1 || N == 2 || N == 4)) ||
+                      (is_same<T, half_t>::value && (N == 2 || N == 4 || N == 8)) ||
                      (is_same<T, int32_t>::value && (N == 1 || N == 2 || N == 4)),
                  "wrong! not implemented");
@@ -810,6 +822,41 @@ __device__ void amd_buffer_atomic_add_impl(const typename vector_type<T, N>::typ
                                                   0);
        }
    }
+    else if constexpr(is_same<T, half_t>::value)
+    {
+        if constexpr(N == 2)
+        {
+            llvm_amdgcn_raw_buffer_atomic_add_fp16x2(src_thread_data,
+                                                     dst_wave_buffer_resource,
+                                                     dst_thread_addr_offset,
+                                                     dst_wave_addr_offset,
+                                                     0);
+        }
+        else if constexpr(N == 4)
+        {
+            vector_type<half_t, 4> tmp{src_thread_data};
+            static_for<0, 2, 1>{}([&](auto i) {
+                llvm_amdgcn_raw_buffer_atomic_add_fp16x2(tmp.AsType<half2_t>()[i],
+                                                         dst_wave_buffer_resource,
+                                                         dst_thread_addr_offset,
+                                                         dst_wave_addr_offset + i * sizeof(half2_t),
+                                                         0);
+            });
+        }
+        else if constexpr(N == 8)
+        {
+            vector_type<half_t, 8> tmp{src_thread_data};
+            static_for<0, 4, 1>{}([&](auto i) {
+                llvm_amdgcn_raw_buffer_atomic_add_fp16x2(tmp.AsType<half2_t>()[i],
+                                                         dst_wave_buffer_resource,
+                                                         dst_thread_addr_offset,
+                                                         dst_wave_addr_offset + i * sizeof(half2_t),
+                                                         0);
+            });
+        }
+    }
    else if constexpr(is_same<T, int32_t>::value)
    {
        if constexpr(N == 1)

--- a/device_operation/device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/device_operation/device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp
+#include <stdlib.h>
+#include "config.hpp"
+#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
+#include "element_wise_operation.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace device_conv2d_fwd_bias_activation_atomic_add_instance {
+using F16 = ck::half_t; // used for the InData, WeiData and OutData columns of every row below
+using F32 = float; // used for the AccData column of every row below
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand that keeps the table rows below compact
+using PassThrough = ck::tensor_operation::element_wise::PassThrough; // In and Wei element-wise operation of every row below
+using AddRelu     = ck::tensor_operation::element_wise::AddRelu; // Out element-wise operation of every row below
+static constexpr auto InMemoryAtomicAdd = ck::InMemoryDataOperationEnum_t::AtomicAdd; // Out global-memory data operation of every row below
+using device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instances = std::tuple< // one device-op type per row; every row uses F16 data, F32 accumulation, AddRelu on the output and InMemoryAtomicAdd as the output data operation; rows differ only in the tuning columns named by the header rows
+    // clang-format off
+        //##########################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|               Out| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|   CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //##########################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|      GlobalMemory|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|    MRepeate|   NRepeate| _MBlock_MRepeat_MWaveMPerXdl| ScalarPerVector|
+        //##########################################################################################|       |        |        |        |   Operation|   Operation|   Operation|     DataOperation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle| PerShuffle| _NBlock_NRepeat_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##########################################################################################|       |        |        |        |            |            |            |                  |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |           |                             |                |
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu, InMemoryAtomicAdd,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 8, 1, 1, 32>,               2>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu, InMemoryAtomicAdd,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 8, 1, 1, 32>,               2>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu, InMemoryAtomicAdd,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 4, 1, 1, 32>,               2>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu, InMemoryAtomicAdd,   256,   128,   128,     4,  8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 8, 1, 1, 32>,               2>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu, InMemoryAtomicAdd,   128,   128,    64,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 8, 1, 1, 16>,               2>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu, InMemoryAtomicAdd,   128,    64,   128,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 4, 1, 1, 32>,               2>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu, InMemoryAtomicAdd,    64,    64,    64,     4,  8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 4, 1, 1, 16>,               2>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu, InMemoryAtomicAdd,   256,   128,    64,     4,  8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 8, 1, 1, 32>,               2>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu, InMemoryAtomicAdd,   256,    64,   128,     4,  8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 8, 1, 1, 32>,               2>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu, InMemoryAtomicAdd,   128,   128,    32,     4,  8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 8, 1, 1, 16>,               2>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu, InMemoryAtomicAdd,   128,    32,   128,     4,  8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 4, 1, 1, 32>,               2>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu, InMemoryAtomicAdd,    64,    64,    32,     4,  8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 4, 1, 1, 16>,               2>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu, InMemoryAtomicAdd,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 4, 1, 1, 16>,               2>
+    // clang-format on
+    >;
+// Appends one heap-allocated, default-constructed device operation to
+// `instance_container` for every element type of
+// device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instances,
+// in tuple order. Entries already present in the container are left untouched.
+void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instances(
+    std::vector<DeviceConvFwdBiasActivationPtr<PassThrough, PassThrough, AddRelu>>&
+        instance_container)
+{
+    using Instances =
+        device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instances;
+
+    // The loop index is usable as a template argument, so each element type is named
+    // directly from the tuple type. This avoids building a throw-away tuple of every
+    // instance just to feed decltype, and constructs each instance exactly once (on the
+    // heap) instead of default-constructing a local and then copying it.
+    ck::static_for<0, std::tuple_size_v<Instances>, 1>{}([&](auto i) {
+        using Instance = std::tuple_element_t<i, Instances>;
+
+        instance_container.push_back(std::make_unique<Instance>());
+    });
+}
+} // namespace device_conv2d_fwd_bias_activation_atomic_add_instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/device_operation/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
+++ b/device_operation/device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instance.cpp
@@ -17,25 +17,27 @@ using S = ck::Sequence<Is...>;
 using PassThrough = ck::tensor_operation::element_wise::PassThrough;
 using AddRelu     = ck::tensor_operation::element_wise::AddRelu;
+static constexpr auto MemorySet = ck::InMemoryDataOperationEnum_t::Set;
 using device_conv2d_fwd_xdl_c_shuffle_bias_relu_nhwc_kyxc_nhwk_f16_instances = std::tuple<
    // clang-format off
-        //##########################################################################################| InData| WeiData| OutData| AccData|           A|           B|           C| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|   CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+        //##########################################################################################| InData| WeiData| OutData| AccData|          In|         Wei|         Out|           Out| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|    CShuffle|   CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-        //##########################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|    MRepeate|   NRepeate| _MBlock_MRepeat_MWaveMPerXdl| ScalarPerVector|
+        //##########################################################################################|   Type|    Type|    Type|    Type| Elementwise| Elementwise| Elementwise|  GlobalMemory|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|    MRepeate|   NRepeate| _MBlock_MRepeat_MWaveMPerXdl| ScalarPerVector|
-        //##########################################################################################|       |        |        |        |   Operation|   Operation|   Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle| PerShuffle| _NBlock_NRepeat_NWaveNPerXdl|   _NWaveNPerXdl|
+        //##########################################################################################|       |        |        |        |   Operation|   Operation|   Operation| DataOperation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |  PerShuffle| PerShuffle| _NBlock_NRepeat_NWaveNPerXdl|   _NWaveNPerXdl|
-        //##########################################################################################|       |        |        |        |            |            |            |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |           |                             |                |
+        //##########################################################################################|       |        |        |        |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |            |           |                             |                |
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 32, 1, 1, 8>,               8>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   256,   256,   128,     4,  8,   32,   32,    4,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 32, 1, 1, 8>,               8>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 16, 1, 1, 8>,               8>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   128,   128,   128,     4,  8,   32,   32,    4,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,   256,   128,   128,     4,  8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 32, 1, 1, 8>,               8>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   256,   128,   128,     4,  8,   32,   32,    2,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,   128,   128,    64,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 32, 1, 1, 4>,               8>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   128,   128,    64,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 32, 1, 1, 4>,               8>,
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,   128,    64,   128,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 16, 1, 1, 8>,               8>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   128,    64,   128,     4,  8,   32,   32,    2,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,    64,    64,    64,     4,  8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 16, 1, 1, 4>,               8>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,    64,    64,    64,     4,  8,   32,   32,    2,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 16, 1, 1, 4>,               8>,
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,   256,   128,    64,     4,  8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 32, 1, 1, 8>,               8>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   256,   128,    64,     4,  8,   32,   32,    2,    1,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,   256,    64,   128,     4,  8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 32, 1, 1, 8>,               8>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   256,    64,   128,     4,  8,   32,   32,    1,    2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 32, 1, 1, 8>,               8>,
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,   128,   128,    32,     4,  8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 32, 1, 1, 4>,               8>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   128,   128,    32,     4,  8,   32,   32,    2,    1,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 32, 1, 1, 4>,               8>,
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,   128,    32,   128,     4,  8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 16, 1, 1, 8>,               8>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,   128,    32,   128,     4,  8,   32,   32,    1,    2,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 16, 1, 1, 8>,               8>,
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,    64,    64,    32,     4,  8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 16, 1, 1, 4>,               8>,
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,    64,    64,    32,     4,  8,   32,   32,    2,    1,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 16, 1, 1, 4>,               8>,
-        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 16, 1, 1, 4>,               8>
+        DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K<    F16,     F16,     F16,     F32, PassThrough, PassThrough,     AddRelu,     MemorySet,    64,    32,    64,     4,  8,   32,   32,    1,    2,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 16, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,           1,          1,         S<1, 1, 16, 1, 1, 4>,               8>
    // clang-format on
    >;

--- a/device_operation/include/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
+++ b/device_operation/include/device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp
@@ -26,6 +26,7 @@ template <
    typename InElementwiseOperation,
    typename WeiElementwiseOperation,
    typename OutElementwiseOperation,
+    InMemoryDataOperationEnum_t OutGlobalMemoryDataOperation,
    ck::index_t BlockSize,
    ck::index_t MPerBlock,
    ck::index_t NPerBlock,
@@ -229,7 +230,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X
        ABDataType, // TODO: distinguish A/B datatype
        AccDataType,
        CDataType,
-        InMemoryDataOperationEnum_t::Set,
+        OutGlobalMemoryDataOperation,
        AGridDesc_K0_M_K1,
        BGridDesc_K0_N_K1,
        CGridDesc_M_N,

--- a/example/5_conv2d_fwd_xdl_c_shuffle_bias_relu/conv2d_fwd_xdl_c_shuffle_bias_relu.cpp
+++ b/example/5_conv2d_fwd_xdl_c_shuffle_bias_relu/conv2d_fwd_xdl_c_shuffle_bias_relu.cpp
@@ -30,15 +30,17 @@ using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
 using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
 using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
+static constexpr auto MemorySet = ck::InMemoryDataOperationEnum_t::Set;
 // clang-format off
 using DeviceConvFwdInstance = ck::tensor_operation::device::
    DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
    // clang-format off
-//      |    InData|     WeiData|     OutData|     AccData|          In|         Wei|           Out| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//      |    InData|     WeiData|     OutData|     AccData|          In|         Wei|           Out|           Out| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
-//      |      Type|        Type|        Type|        Type| Elementwise| Elementwise|   Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|   MRepeate|   NRepeate| _MBlock_MRepeat_MWaveMPerXdl| ScalarPerVector|
+//      |      Type|        Type|        Type|        Type| Elementwise| Elementwise|   Elementwise|  GlobalMemory|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|   MRepeate|   NRepeate| _MBlock_MRepeat_MWaveMPerXdl| ScalarPerVector|
-//      |          |            |            |            |   Operation|   Operation|     Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          | PerShuffle| PerShuffle| _NBlock_NRepeat_NWaveNPerXdl|   _NWaveNPerXdl|
+//      |          |            |            |            |   Operation|   Operation|     Operation| DataOperation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          | PerShuffle| PerShuffle| _NBlock_NRepeat_NWaveNPerXdl|   _NWaveNPerXdl|
-//      |          |            |            |            |            |            |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |           |           |                             |                |
+//      |          |            |            |            |            |            |              |              |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |           |           |                             |                |
-        <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,          1,          1,         S<1, 1, 32, 1, 1, 8>,               8>;
+        <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp,     MemorySet,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,          1,          1,         S<1, 1, 32, 1, 1, 8>,               8>;
 // clang-format on
 template <typename TIn,

--- a/example/7_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add/README.md
+++ b/example/7_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add/README.md
+# Instructions for ```conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add``` Example
+## Docker script
+```bash
+docker run                                                                   \
+-it                                                                          \
+--rm                                                                         \
+--privileged                                                                 \
+--group-add sudo                                                             \
+-w /root/workspace                                                           \
+-v ${PATH_TO_LOCAL_WORKSPACE}:/root/workspace                                \
+rocm/tensorflow:rocm4.3.1-tf2.6-dev                                          \
+/bin/bash
+```
+## Build ```conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add```
+```bash
+mkdir build && cd build
+```
+```bash
+# Need to specify target ID, example below is gfx908
+cmake                                                                  \
+-D BUILD_DEV=OFF                                                       \
+-D CMAKE_BUILD_TYPE=Release                                            \
+-D CMAKE_CXX_FLAGS="-DCK_AMD_GPU_GFX908 --amdgpu-target=gfx908 -O3 "   \
+-D CMAKE_CXX_COMPILER=/opt/rocm/bin/hipcc                              \
+-D CMAKE_PREFIX_PATH=/opt/rocm                                         \
+..
+```
+```bash
+ make -j conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add
+```
+## Run ```conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add```
+```bash
+#arg1: verification (0=no, 1=yes)
+#arg2: initialization (0=no init, 1=integer value, 2=decimal value)
+#arg3: run kernel # of times (>1)
+#arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
+./example/conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add 0 1 5
+```
+Result (MI100 @ 1087 MHz, 133.5 TFlops peak FP16)
+```
+in_n_c_hi_wi: dim 4, lengths {128, 192, 71, 71}, strides {967872, 1, 13632, 192}
+wei_k_c_y_x: dim 4, lengths {256, 192, 3, 3}, strides {1728, 1, 576, 192}
+out_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
+bias_k: dim 1, lengths {256}, strides {1}
+resi_n_k_ho_wo: dim 4, lengths {128, 256, 36, 36}, strides {331776, 1, 9216, 256}
+arg.a_grid_desc_k0_m_k1_{216, 165888, 8}
+arg.b_grid_desc_k0_n_k1_{216, 256, 8}
+arg.c_grid_desc_m_n_{ 165888, 256}
+arg.c0_grid_desc_m_n_{ 165888, 256}
+arg.c1_grid_desc_m_n_{ 165888, 256}
+launch_and_time_kernel: grid_dim {1296, 1, 1}, block_dim {256, 1, 1}
+Warm up
+Start running 5 times...
+Perf: 1.71779 ms, 85.4396 TFlops, 194.2 GB/s
+```
--- a/example/7_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add/conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add.cpp
+++ b/example/7_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add/conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "config.hpp"
+#include "print.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "device_tensor.hpp"
+#include "tensor_layout.hpp"
+#include "device_conv2d_fwd_xdl_c_shuffle_bias_activation_nhwc_kyxc_nhwk.hpp"
+#include "element_wise_operation.hpp"
+using InDataType  = ck::half_t; // element type of the input tensor
+using WeiDataType = ck::half_t; // element type of the weight tensor
+using OutDataType = ck::half_t; // element type of the output tensor (also used for the bias vector)
+using AccDataType = float; // passed in the "AccData Type" slot of the device instance below
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>; // shorthand for the integer sequences in the instance parameters
+using InLayout  = ck::tensor_layout::convolution::NHWC; // layout tag used when building the host input descriptor
+using WeiLayout = ck::tensor_layout::convolution::KYXC; // layout tag used when building the host weight descriptor
+using OutLayout = ck::tensor_layout::convolution::NHWK; // layout tag used when building the host output descriptor
+using InElementOp  = ck::tensor_operation::element_wise::PassThrough; // element-wise op applied to input values
+using WeiElementOp = ck::tensor_operation::element_wise::PassThrough; // element-wise op applied to weight values
+using OutElementOp = ck::tensor_operation::element_wise::AddRelu; // element-wise op called with (accumulator, bias)
+static constexpr auto MemoryAtomicAdd = ck::InMemoryDataOperationEnum_t::AtomicAdd; // output global-memory data operation handed to the device instance
+// clang-format off
+using DeviceConvFwdInstance = ck::tensor_operation::device::
+    DeviceConv2dFwdXdl_C_Shuffle_Bias_Activation_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
+    // clang-format off
+//      |    InData|     WeiData|     OutData|     AccData|          In|         Wei|           Out|             Out| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer| BBlockTransfer| BBlockTransfer|BBlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds|   CShuffle|   CShuffle| CBlockTransferClusterLengths|  CBlockTransfer|
+//      |      Type|        Type|        Type|        Type| Elementwise| Elementwise|   Elementwise|    GlobalMemory|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|    MRepeat|    NRepeat| _MBlock_MRepeat_MWaveMPerXdl| ScalarPerVector|
+//      |          |            |            |            |   Operation|   Operation|     Operation|   DataOperation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          | PerShuffle| PerShuffle| _NBlock_NRepeat_NWaveNPerXdl|   _NWaveNPerXdl|
+//      |          |            |            |            |            |            |              |                |      |      |      |      |   |     |     |     |     |                |               |               |               |               |               |          |                |               |               |              |               |               |          |           |           |                             |                |
+        <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp, MemoryAtomicAdd,   256,   128,   256,     4,  8,   32,   32,    2,    4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,          1,          1,         S<1, 1,  8, 1, 1,32>,               2>;
+// clang-format on
+// Host-side reference for the fused convolution + bias + activation example.
+//
+// For every output coordinate (n, k, ho, wo) the convolution sum is accumulated in double
+// over (c, y, x), skipping filter taps whose input coordinate falls outside the input
+// extents. The result of out_element_op(sum, bias_k(k)) is then ADDED to the value already
+// stored in out_n_k_ho_wo, so the caller must pre-fill the output tensor.
+//
+// Tensors are indexed logically as (n, c, h, w) / (k, c, y, x) / (n, k, ho, wo).
+// conv_strides, conv_dilations and in_left_pads are read at index 0 (height) and 1 (width);
+// the right-pad argument is unused.
+template <typename TIn,
+          typename TWei,
+          typename TOut,
+          typename InElementOp,
+          typename WeiElementOp,
+          typename OutElementOp>
+void host_reference_calculation(const Tensor<TIn>& in_n_c_hi_wi,
+                                const Tensor<TWei>& wei_k_c_y_x,
+                                Tensor<TOut>& out_n_k_ho_wo,
+                                const Tensor<TOut>& bias_k,
+                                const std::vector<ck::index_t>& conv_strides,
+                                const std::vector<ck::index_t>& conv_dilations,
+                                const std::vector<ck::index_t>& in_left_pads,
+                                const std::vector<ck::index_t>& /* in_right_pads */,
+                                const InElementOp& in_element_op,
+                                const WeiElementOp& wei_element_op,
+                                const OutElementOp& out_element_op)
+{
+    // Extents are loop-invariant, so look them up once.
+    const auto& wei_lens = wei_k_c_y_x.mDesc.GetLengths();
+    const auto& in_lens  = in_n_c_hi_wi.mDesc.GetLengths();
+    const auto& out_lens = out_n_k_ho_wo.mDesc.GetLengths();
+
+    auto accumulate_output_point = [&](auto n, auto k, auto ho, auto wo) {
+        double acc = 0;
+
+        for(int c = 0; c < wei_lens[1]; ++c)
+        {
+            for(int y = 0; y < wei_lens[2]; ++y)
+            {
+                const int hi = ho * conv_strides[0] + y * conv_dilations[0] - in_left_pads[0];
+
+                // The whole filter row maps to padding: nothing to accumulate.
+                if(hi < 0 || !(hi < in_lens[2]))
+                {
+                    continue;
+                }
+
+                for(int x = 0; x < wei_lens[3]; ++x)
+                {
+                    const int wi = wo * conv_strides[1] + x * conv_dilations[1] - in_left_pads[1];
+
+                    // This filter tap maps to padding.
+                    if(wi < 0 || !(wi < in_lens[3]))
+                    {
+                        continue;
+                    }
+
+                    const auto in_val =
+                        in_element_op(static_cast<double>(in_n_c_hi_wi(n, c, hi, wi)));
+                    const auto wei_val =
+                        wei_element_op(static_cast<double>(wei_k_c_y_x(k, c, y, x)));
+
+                    acc += in_val * wei_val;
+                }
+            }
+        }
+
+        // Accumulate into the existing output value rather than overwriting it.
+        out_n_k_ho_wo(n, k, ho, wo) += out_element_op(acc, bias_k(k));
+    };
+
+    auto parallel_functor = make_ParallelTensorFunctor(
+        accumulate_output_point, out_lens[0], out_lens[1], out_lens[2], out_lens[3]);
+
+    parallel_functor(std::thread::hardware_concurrency());
+}
+// Example driver for the conv2d forward + bias + AddRelu device operation instantiated with
+// the AtomicAdd output memory operation (DeviceConvFwdInstance).
+//
+// Command line:
+//   3 arguments : verification flag, initialization method, number of kernel repeats
+//   18 arguments: the three above followed by
+//                 N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, RightPx
+// Any other argument count prints the usage text and exits.
+//
+// The program builds host tensors, uploads them (the output buffer is seeded with the initial
+// contents of the host reference output), runs and times the device operation, prints
+// throughput numbers and optionally compares against host_reference_calculation.
+int main(int argc, char* argv[])
+{
+    bool do_verification = false;
+    int init_method      = 0;
+    int nrepeat          = 5;
+
+    // Conv shape
+    ck::index_t N               = 128;
+    ck::index_t K               = 256;
+    ck::index_t C               = 192;
+    ck::index_t Y               = 3;
+    ck::index_t X               = 3;
+    ck::index_t Hi              = 71;
+    ck::index_t Wi              = 71;
+    ck::index_t conv_stride_h   = 2;
+    ck::index_t conv_stride_w   = 2;
+    ck::index_t conv_dilation_h = 1;
+    ck::index_t conv_dilation_w = 1;
+    ck::index_t in_left_pad_h   = 1;
+    ck::index_t in_left_pad_w   = 1;
+    ck::index_t in_right_pad_h  = 1;
+    ck::index_t in_right_pad_w  = 1;
+
+    if(argc == 4 || argc == 19)
+    {
+        // The first three arguments are common to both accepted forms.
+        do_verification = std::stoi(argv[1]) != 0;
+        init_method     = std::stoi(argv[2]);
+        nrepeat         = std::stoi(argv[3]);
+
+        if(argc == 19)
+        {
+            N               = std::stoi(argv[4]);
+            K               = std::stoi(argv[5]);
+            C               = std::stoi(argv[6]);
+            Y               = std::stoi(argv[7]);
+            X               = std::stoi(argv[8]);
+            Hi              = std::stoi(argv[9]);
+            Wi              = std::stoi(argv[10]);
+            conv_stride_h   = std::stoi(argv[11]);
+            conv_stride_w   = std::stoi(argv[12]);
+            conv_dilation_h = std::stoi(argv[13]);
+            conv_dilation_w = std::stoi(argv[14]);
+            in_left_pad_h   = std::stoi(argv[15]);
+            in_left_pad_w   = std::stoi(argv[16]);
+            in_right_pad_h  = std::stoi(argv[17]);
+            in_right_pad_w  = std::stoi(argv[18]);
+        }
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: run kernel # of times (>1)\n");
+        printf("arg4 to 18: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
+               "RightPx\n");
+        exit(0);
+    }
+
+    // Effective (dilated) filter extents and the resulting output spatial size.
+    const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
+    const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+
+    const std::vector<ck::index_t> conv_filter_strides{{conv_stride_h, conv_stride_w}};
+    const std::vector<ck::index_t> conv_filter_dilations{{conv_dilation_h, conv_dilation_w}};
+    const std::vector<ck::index_t> input_left_pads{{in_left_pad_h, in_left_pad_w}};
+    const std::vector<ck::index_t> input_right_pads{{in_right_pad_h, in_right_pad_w}};
+
+    // tensor layout
+    // Lengths are always given in (N, C, H, W) order; only the strides depend on the layout tag.
+    auto f_host_tensor_descriptor = [](std::size_t N_,
+                                       std::size_t C_,
+                                       std::size_t H,
+                                       std::size_t W,
+                                       auto layout) {
+        if constexpr(ck::is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
+                     ck::is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
+                     ck::is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
+        {
+            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
+                                        std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
+        }
+        else if constexpr(ck::is_same<decltype(layout),
+                                      ck::tensor_layout::convolution::NHWC>::value ||
+                          ck::is_same<decltype(layout),
+                                      ck::tensor_layout::convolution::KYXC>::value ||
+                          ck::is_same<decltype(layout),
+                                      ck::tensor_layout::convolution::NHWK>::value)
+        {
+            return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
+                                        std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+        }
+    };
+
+    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
+    Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
+    Tensor<OutDataType> out_n_k_ho_wo_host_result(
+        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
+    Tensor<OutDataType> out_n_k_ho_wo_device_result(
+        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
+
+    // bias: assume contiguous 1d vector
+    Tensor<OutDataType> bias_k(
+        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
+
+    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
+    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
+    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
+    std::cout << "bias_k: " << bias_k.mDesc << std::endl;
+
+    // The host-result tensor is initialized too: its contents are the starting value that
+    // both the device kernel and the host reference accumulate into.
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_1<InDataType>{});
+        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_1<WeiDataType>{});
+        out_n_k_ho_wo_host_result.GenerateTensorValue(GeneratorTensor_1<OutDataType>{});
+        bias_k.GenerateTensorValue(GeneratorTensor_1<OutDataType>{});
+        break;
+    case 2:
+        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        out_n_k_ho_wo_host_result.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        break;
+    default:
+        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        out_n_k_ho_wo_host_result.GenerateTensorValue(GeneratorTensor_3<OutDataType>{-0.5, 0.5});
+        bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
+    }
+
+    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
+    DeviceMem out_device_buf(sizeof(OutDataType) *
+                             out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
+    DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
+
+    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
+    wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
+    // Seed the device output with the same initial values the host reference starts from.
+    out_device_buf.ToDevice(out_n_k_ho_wo_host_result.mData.data());
+    bias_device_buf.ToDevice(bias_k.mData.data());
+
+    auto conv    = DeviceConvFwdInstance{};
+    auto invoker = conv.MakeInvoker();
+    auto argument =
+        conv.MakeArgument(static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
+                          static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+                          static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+                          static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
+                          N,
+                          K,
+                          C,
+                          std::vector<ck::index_t>{{Hi, Wi}},
+                          std::vector<ck::index_t>{{Y, X}},
+                          std::vector<ck::index_t>{{Ho, Wo}},
+                          conv_filter_strides,
+                          conv_filter_dilations,
+                          input_left_pads,
+                          input_right_pads,
+                          InElementOp{},
+                          WeiElementOp{},
+                          OutElementOp{});
+
+    if(!conv.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error(
+            "wrong! device operator with the specified compilation parameters does "
+            "not support this problem");
+    }
+
+    float ave_time = invoker.Run(argument, nrepeat);
+
+    // Widen to std::size_t BEFORE multiplying: the shape values are ck::index_t, and
+    // multiplying them in that narrower type can overflow for large problem sizes.
+    const std::size_t in_elems   = static_cast<std::size_t>(N) * C * Hi * Wi;
+    const std::size_t wei_elems  = static_cast<std::size_t>(K) * C * Y * X;
+    const std::size_t out_elems  = static_cast<std::size_t>(N) * K * Ho * Wo;
+    const std::size_t bias_elems = static_cast<std::size_t>(K);
+
+    std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
+    std::size_t num_btype = sizeof(InDataType) * in_elems + sizeof(WeiDataType) * wei_elems +
+                            sizeof(OutDataType) * out_elems + sizeof(OutDataType) * bias_elems;
+
+    // ave_time is used as milliseconds in the report below.
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s"
+              << std::endl;
+
+    if(do_verification)
+    {
+        // NOTE(review): the device instance uses AtomicAdd, so every kernel launch adds into
+        // out_device_buf, while the host reference below adds exactly once. If
+        // invoker.Run(argument, nrepeat) launches the kernel more than once (presumably it
+        // does when nrepeat > 1), the device result will not match the host result unless the
+        // output buffer is re-seeded and the kernel is launched a single time before this
+        // comparison -- confirm against Invoker::Run.
+        host_reference_calculation(in_n_c_hi_wi,
+                                   wei_k_c_y_x,
+                                   out_n_k_ho_wo_host_result,
+                                   bias_k,
+                                   conv_filter_strides,
+                                   conv_filter_dilations,
+                                   input_left_pads,
+                                   input_right_pads,
+                                   InElementOp{},
+                                   WeiElementOp{},
+                                   OutElementOp{});
+
+        out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
+
+        check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
+
+        LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",") << std::endl;
+        LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",") << std::endl;
+        LogRangeAsType<float>(std::cout << "out_host  : ", out_n_k_ho_wo_host_result.mData, ",")
+            << std::endl;
+        LogRangeAsType<float>(std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
+            << std::endl;
+    }
+}
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -19,6 +19,7 @@ set(CONV2D_FWD_XDL_BIAS_RELU_SOURCE 5_conv2d_fwd_xdl_bias_relu/conv2d_fwd_xdl_bi
 set(CONV2D_FWD_XDL_C_SHUFFLE_BIAS_RELU_SOURCE 5_conv2d_fwd_xdl_c_shuffle_bias_relu/conv2d_fwd_xdl_c_shuffle_bias_relu.cpp)
 set(CONV2D_FWD_XDL_BIAS_RELU_ADD_SOURCE 6_conv2d_fwd_xdl_bias_relu_add/conv2d_fwd_xdl_bias_relu_add.cpp)
 set(CONV2D_FWD_XDL_C_SHUFFLE_BIAS_RELU_ADD_SOURCE 6_conv2d_fwd_xdl_c_shuffle_bias_relu_add/conv2d_fwd_xdl_c_shuffle_bias_relu_add.cpp)
+set(CONV2D_FWD_XDL_C_SHUFFLE_BIAS_RELU_ATOMIC_ADD_SOURCE 7_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add/conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add.cpp)
 add_executable(gemm_xdl ${GEMM_XDL_SOURCE})
 add_executable(gemm_xdl_bias_relu_add ${GEMM_XDL_BIAS_RELU_ADD_SOURCE})
@@ -28,6 +29,7 @@ add_executable(conv2d_fwd_xdl_bias_relu ${CONV2D_FWD_XDL_BIAS_RELU_SOURCE})
 add_executable(conv2d_fwd_xdl_c_shuffle_bias_relu ${CONV2D_FWD_XDL_C_SHUFFLE_BIAS_RELU_SOURCE})
 add_executable(conv2d_fwd_xdl_bias_relu_add ${CONV2D_FWD_XDL_BIAS_RELU_ADD_SOURCE})
 add_executable(conv2d_fwd_xdl_c_shuffle_bias_relu_add ${CONV2D_FWD_XDL_C_SHUFFLE_BIAS_RELU_ADD_SOURCE})
+add_executable(conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add ${CONV2D_FWD_XDL_C_SHUFFLE_BIAS_RELU_ATOMIC_ADD_SOURCE})
 target_link_libraries(gemm_xdl PRIVATE host_tensor)
 target_link_libraries(gemm_xdl_bias_relu_add PRIVATE host_tensor)
@@ -37,3 +39,4 @@ target_link_libraries(conv2d_fwd_xdl_bias_relu PRIVATE host_tensor)
 target_link_libraries(conv2d_fwd_xdl_c_shuffle_bias_relu PRIVATE host_tensor)
 target_link_libraries(conv2d_fwd_xdl_bias_relu_add PRIVATE host_tensor)
 target_link_libraries(conv2d_fwd_xdl_c_shuffle_bias_relu_add PRIVATE host_tensor)
+target_link_libraries(conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add PRIVATE host_tensor)
--- a/profiler/CMakeLists.txt
+++ b/profiler/CMakeLists.txt
@@ -69,8 +69,25 @@ target_compile_features(device_conv2d_fwd_bias_relu_add_instance PUBLIC)
 set_target_properties(device_conv2d_fwd_bias_relu_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
 install(TARGETS device_conv2d_fwd_bias_relu_add_instance LIBRARY DESTINATION lib) 
+# device_conv2d_fwd_bias_relu_atomic_add_instance
+set(DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE 
+   ${PROJECT_SOURCE_DIR}/device_operation/device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instance.cpp;
+) 
+add_library(device_conv2d_fwd_bias_relu_atomic_add_instance SHARED ${DEVICE_CONV2D_FWD_BIAS_RELU_ATOMIC_ADD_INSTANCE_SOURCE}) 
+target_include_directories(device_conv2d_fwd_bias_relu_atomic_add_instance SYSTEM PUBLIC $<BUILD_INTERFACE:${HALF_INCLUDE_DIR}>)
+target_compile_features(device_conv2d_fwd_bias_relu_atomic_add_instance PUBLIC)
+set_target_properties(device_conv2d_fwd_bias_relu_atomic_add_instance PROPERTIES POSITION_INDEPENDENT_CODE ON)
+install(TARGETS device_conv2d_fwd_bias_relu_atomic_add_instance LIBRARY DESTINATION lib) 
 # ck_profiler
-set(PROFILER_SOURCE profiler.cpp profile_gemm.cpp profile_conv_fwd.cpp profile_conv_fwd_bias_relu.cpp profile_conv_fwd_bias_relu_add.cpp)
+set(PROFILER_SOURCE 
+    profiler.cpp
+    profile_gemm.cpp
+    profile_conv_fwd.cpp
+    profile_conv_fwd_bias_relu.cpp
+    profile_conv_fwd_bias_relu_add.cpp
+    profile_conv_fwd_bias_relu_atomic_add.cpp)
 add_executable(ckProfiler ${PROFILER_SOURCE})
 target_link_libraries(ckProfiler PRIVATE host_tensor)
@@ -78,3 +95,4 @@ target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_instance)
 target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_add_instance)
+target_link_libraries(ckProfiler PRIVATE device_conv2d_fwd_bias_relu_atomic_add_instance)
--- a/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp
+++ b/profiler/include/profile_conv_fwd_bias_relu_atomic_add_impl.hpp
+#pragma once
+#include "config.hpp"
+#include "device.hpp"
+#include "host_tensor.hpp"
+#include "host_tensor_generator.hpp"
+#include "host_conv.hpp"
+#include "tensor_layout.hpp"
+#include "device_tensor.hpp"
+#include "device_conv_fwd_bias_activation.hpp"
+#include "element_wise_operation.hpp"
+// Forward declaration of the instance factory used by the profiler below. The
+// function appends device operator instances to the vector it is given; its
+// definition is not part of this header (presumably it lives in the separately
+// built instance library - confirm against the build files).
+namespace ck::tensor_operation::device::device_conv2d_fwd_bias_activation_atomic_add_instance {
+// Pointer type for conv-forward + bias + activation operators whose input and
+// weight element-wise ops are PassThrough and whose output op is AddRelu.
+using DeviceConvFwdBiasReluPtr =
+    DeviceConvFwdBiasActivationPtr<ck::tensor_operation::element_wise::PassThrough,
+                                   ck::tensor_operation::element_wise::PassThrough,
+                                   ck::tensor_operation::element_wise::AddRelu>;
+void add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instances(
+    std::vector<DeviceConvFwdBiasReluPtr>& instances);
+} // namespace ck::tensor_operation::device::device_conv2d_fwd_bias_activation_atomic_add_instance
+namespace ck {
+namespace profiler {
+// Host (CPU) reference for forward convolution followed by bias add and ReLU.
+//
+// Tensors are addressed through HostTensorDescriptors built from the lengths
+//   input  : {N, Hi, Wi, C}
+//   weight : {K, Y, X, C}
+//   output : {N, Ho, Wo, K}
+//   bias   : {K}
+//
+// For every output element the in * wei products are accumulated in double
+// precision, bias[k] is added, negative results are clamped to zero, and the
+// value is stored to output_ptr with a plain assignment - whatever the output
+// buffer held before the call is overwritten, not accumulated into.
+//
+// The same Stride / Dilation / Pad value is applied to both spatial dimensions.
+// Pad is subtracted from the input coordinate; filter taps that land outside
+// [0, Hi) x [0, Wi) are skipped.
+//
+// The function is declared inline because its body lives in a header: without
+// inline, including this header from a second translation unit would define
+// the symbol twice and fail at link time. The read-only tensors are taken as
+// pointers-to-const; callers passing non-const pointers are unaffected.
+inline void cpu_conv_bias_relu_atomic_add(const ck::half_t* in_ptr,
+                                          const ck::half_t* weight_ptr,
+                                          ck::half_t* output_ptr,
+                                          const ck::half_t* bias_ptr,
+                                          const ck::index_t N,
+                                          const ck::index_t K,
+                                          const ck::index_t C,
+                                          const ck::index_t Y,
+                                          const ck::index_t X,
+                                          const ck::index_t Hi,
+                                          const ck::index_t Wi,
+                                          const ck::index_t Ho,
+                                          const ck::index_t Wo,
+                                          const ck::index_t Stride,
+                                          const ck::index_t Dilation,
+                                          const ck::index_t Pad)
+{
+    const auto in_desc =
+        HostTensorDescriptor(std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                      static_cast<std::size_t>(Hi),
+                                                      static_cast<std::size_t>(Wi),
+                                                      static_cast<std::size_t>(C)});
+    const auto wei_desc =
+        HostTensorDescriptor(std::vector<std::size_t>{static_cast<std::size_t>(K),
+                                                      static_cast<std::size_t>(Y),
+                                                      static_cast<std::size_t>(X),
+                                                      static_cast<std::size_t>(C)});
+    const auto out_desc =
+        HostTensorDescriptor(std::vector<std::size_t>{static_cast<std::size_t>(N),
+                                                      static_cast<std::size_t>(Ho),
+                                                      static_cast<std::size_t>(Wo),
+                                                      static_cast<std::size_t>(K)});
+    const auto bias_desc =
+        HostTensorDescriptor(std::vector<std::size_t>{static_cast<std::size_t>(K)});
+    // Computes every output element of one output channel k.
+    auto f_k = [&](auto k) {
+        for(int n = 0; n < N; ++n)
+        {
+            for(int ho = 0; ho < Ho; ++ho)
+            {
+                for(int wo = 0; wo < Wo; ++wo)
+                {
+                    double v = 0;
+                    for(int c = 0; c < C; ++c)
+                    {
+                        for(int y = 0; y < Y; ++y)
+                        {
+                            const int hi = ho * Stride + y * Dilation - Pad;
+                            // The row test does not depend on x, so a filter row that
+                            // falls outside the input is skipped as a whole.
+                            if(hi < 0 || hi >= Hi)
+                            {
+                                continue;
+                            }
+                            for(int x = 0; x < X; ++x)
+                            {
+                                const int wi = wo * Stride + x * Dilation - Pad;
+                                if(wi >= 0 && wi < Wi)
+                                {
+                                    const double in =
+                                        in_ptr[in_desc.GetOffsetFromMultiIndex(n, hi, wi, c)];
+                                    const double wei =
+                                        weight_ptr[wei_desc.GetOffsetFromMultiIndex(k, y, x, c)];
+                                    v += in * wei;
+                                }
+                            }
+                        }
+                    }
+                    // bias add, then ReLU
+                    v += bias_ptr[bias_desc.GetOffsetFromMultiIndex(k)];
+                    v = v > 0 ? v : 0;
+                    output_ptr[out_desc.GetOffsetFromMultiIndex(n, ho, wo, k)] = v;
+                }
+            }
+        }
+    };
+    // f_k is run over the K output channels, with the host's hardware concurrency
+    // passed as the requested degree of parallelism.
+    make_ParallelTensorFunctor(f_k, K)(std::thread::hardware_concurrency());
+}
+// Profiles every registered conv-forward + bias + ReLU (atomic-add variant)
+// device instance on one problem size and prints per-instance and best results.
+//
+// do_verification : when non-zero, a host reference is computed and each
+//                   supported instance's output is compared with check_error.
+// init_method     : 0 leaves the host tensors as constructed, 1 fills them with
+//                   GeneratorTensor_2{-5, 5}, anything else uses GeneratorTensor_3.
+// do_log          : when verifying, also dump the tensors to std::cout.
+// nrepeat         : forwarded to the invoker's Run().
+// N, K, C and the *_spatial_lengths / conv_* / *_pads vectors describe the
+// convolution problem; element [0] is read as the H-side value and [1] as the
+// W-side value.
+//
+// Throws std::runtime_error when a required vector is too short or when no
+// device instance is registered for the requested data types.
+template <int NDimSpatial,
+          typename InDataType,
+          typename WeiDataType,
+          typename OutDataType,
+          typename InLayout,
+          typename WeiLayout,
+          typename OutLayout>
+void profile_conv_fwd_bias_relu_atomic_add_impl(int do_verification,
+                                                int init_method,
+                                                bool do_log,
+                                                int nrepeat,
+                                                ck::index_t N,
+                                                ck::index_t K,
+                                                ck::index_t C,
+                                                std::vector<ck::index_t> input_spatial_lengths,
+                                                std::vector<ck::index_t> filter_spatial_lengths,
+                                                std::vector<ck::index_t> output_spatial_lengths,
+                                                std::vector<ck::index_t> conv_filter_strides,
+                                                std::vector<ck::index_t> conv_filter_dilations,
+                                                std::vector<ck::index_t> input_left_pads,
+                                                std::vector<ck::index_t> input_right_pads)
+{
+    // Elements [0] and [1] are read unconditionally below; reject short vectors
+    // instead of indexing out of bounds.
+    if(input_spatial_lengths.size() < 2 || filter_spatial_lengths.size() < 2 ||
+       output_spatial_lengths.size() < 2)
+    {
+        throw std::runtime_error("wrong! input/filter/output need 2 spatial lengths");
+    }
+    const ck::index_t Y = filter_spatial_lengths[0];
+    const ck::index_t X = filter_spatial_lengths[1];
+    const ck::index_t Hi = input_spatial_lengths[0];
+    const ck::index_t Wi = input_spatial_lengths[1];
+    const ck::index_t Ho = output_spatial_lengths[0];
+    const ck::index_t Wo = output_spatial_lengths[1];
+    // Builds a 4-d host descriptor whose lengths are always ordered
+    // {N_, C_, H, W}; only the strides depend on the layout tag.
+    auto f_host_tensor_descriptor =
+        [](std::size_t N_, std::size_t C_, std::size_t H, std::size_t W, auto layout) {
+            if constexpr(is_same<decltype(layout), ck::tensor_layout::convolution::NCHW>::value ||
+                         is_same<decltype(layout), ck::tensor_layout::convolution::KCYX>::value ||
+                         is_same<decltype(layout), ck::tensor_layout::convolution::NKHW>::value)
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
+                                            std::vector<std::size_t>({C_ * H * W, H * W, W, 1}));
+            }
+            else if constexpr(is_same<decltype(layout), tensor_layout::convolution::NHWC>::value ||
+                              is_same<decltype(layout), tensor_layout::convolution::KYXC>::value ||
+                              is_same<decltype(layout), tensor_layout::convolution::NHWK>::value)
+            {
+                return HostTensorDescriptor(std::vector<std::size_t>({N_, C_, H, W}),
+                                            std::vector<std::size_t>({C_ * H * W, 1, W * C_, C_}));
+            }
+        };
+    Tensor<InDataType> in_n_c_hi_wi(f_host_tensor_descriptor(N, C, Hi, Wi, InLayout{}));
+    Tensor<WeiDataType> wei_k_c_y_x(f_host_tensor_descriptor(K, C, Y, X, WeiLayout{}));
+    Tensor<OutDataType> out_n_k_ho_wo_host_result(
+        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
+    Tensor<OutDataType> out_n_k_ho_wo_device_result(
+        f_host_tensor_descriptor(N, K, Ho, Wo, OutLayout{}));
+    // bias: assume contiguous 1d vector
+    Tensor<OutDataType> bias_k(
+        HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(K)})));
+    std::cout << "in_n_c_hi_wi: " << in_n_c_hi_wi.mDesc << std::endl;
+    std::cout << "wei_k_c_y_x: " << wei_k_c_y_x.mDesc << std::endl;
+    std::cout << "out_n_k_ho_wo: " << out_n_k_ho_wo_host_result.mDesc << std::endl;
+    std::cout << "bias_k: " << bias_k.mDesc << std::endl;
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_2<InDataType>{-5, 5});
+        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_2<WeiDataType>{-5, 5});
+        bias_k.GenerateTensorValue(GeneratorTensor_2<OutDataType>{-5, 5});
+        break;
+    default:
+        in_n_c_hi_wi.GenerateTensorValue(GeneratorTensor_3<InDataType>{0.0, 1.0});
+        wei_k_c_y_x.GenerateTensorValue(GeneratorTensor_3<WeiDataType>{-0.5, 0.5});
+        bias_k.GenerateTensorValue(GeneratorTensor_3<OutDataType>{0.0, 1.0});
+    }
+    using InElementOp  = ck::tensor_operation::element_wise::PassThrough;
+    using WeiElementOp = ck::tensor_operation::element_wise::PassThrough;
+    using OutElementOp = ck::tensor_operation::element_wise::AddRelu;
+    if(do_verification)
+    {
+        if(conv_filter_strides.empty() || conv_filter_dilations.empty() ||
+           input_left_pads.empty())
+        {
+            throw std::runtime_error("wrong! strides/dilations/left pads must not be empty");
+        }
+        // NOTE(review): the host reference takes one stride, one dilation and one
+        // pad value, so only the H-side entries are forwarded; the W-side entries
+        // and the right pads are not seen by it. Verification is therefore only
+        // meaningful when the H and W parameters agree.
+        cpu_conv_bias_relu_atomic_add(in_n_c_hi_wi.mData.data(),
+                                      wei_k_c_y_x.mData.data(),
+                                      out_n_k_ho_wo_host_result.mData.data(),
+                                      bias_k.mData.data(),
+                                      N,
+                                      K,
+                                      C,
+                                      Y,
+                                      X,
+                                      Hi,
+                                      Wi,
+                                      Ho,
+                                      Wo,
+                                      conv_filter_strides[0],
+                                      conv_filter_dilations[0],
+                                      input_left_pads[0]);
+    }
+    DeviceMem in_device_buf(sizeof(InDataType) * in_n_c_hi_wi.mDesc.GetElementSpace());
+    DeviceMem wei_device_buf(sizeof(WeiDataType) * wei_k_c_y_x.mDesc.GetElementSpace());
+    DeviceMem out_device_buf(sizeof(OutDataType) *
+                             out_n_k_ho_wo_device_result.mDesc.GetElementSpace());
+    DeviceMem bias_device_buf(sizeof(OutDataType) * bias_k.mDesc.GetElementSpace());
+    in_device_buf.ToDevice(in_n_c_hi_wi.mData.data());
+    wei_device_buf.ToDevice(wei_k_c_y_x.mData.data());
+    bias_device_buf.ToDevice(bias_k.mData.data());
+    // NOTE(review): out_device_buf is never written from the host before the
+    // kernels run. If the device operator accumulates into the output (as the
+    // "atomic_add" name suggests), the verified result would depend on the
+    // buffer's previous contents and on nrepeat - confirm against the kernel.
+    using DeviceConvFwdBiasReluPtr = ck::tensor_operation::device::
+        DeviceConvFwdBiasActivationPtr<InElementOp, WeiElementOp, OutElementOp>;
+    // add device operator instances
+    std::vector<DeviceConvFwdBiasReluPtr> op_ptrs;
+    if constexpr(ck::is_same_v<ck::remove_cv_t<InDataType>, ck::half_t> &&
+                 ck::is_same_v<ck::remove_cv_t<WeiDataType>, ck::half_t> &&
+                 ck::is_same_v<ck::remove_cv_t<OutDataType>, ck::half_t>)
+    {
+        ck::tensor_operation::device::device_conv2d_fwd_bias_activation_atomic_add_instance::
+            add_device_conv2d_fwd_xdl_c_shuffle_bias_relu_atomic_add_nhwc_kyxc_nhwk_f16_instances(
+                op_ptrs);
+    }
+    if(op_ptrs.empty())
+    {
+        throw std::runtime_error("wrong! no device Conv instance found");
+    }
+    // Work and traffic are the same for every instance, so compute them once.
+    // All products are formed in std::size_t: multiplying the ck::index_t
+    // dimensions directly would overflow for large tensors if index_t is a
+    // 32-bit type.
+    const std::size_t flop = std::size_t(2) * N * K * Ho * Wo * C * Y * X;
+    const std::size_t num_btype = sizeof(InDataType) * (std::size_t(N) * C * Hi * Wi) +
+                                  sizeof(WeiDataType) * (std::size_t(K) * C * Y * X) +
+                                  sizeof(OutDataType) * (std::size_t(N) * K * Ho * Wo) +
+                                  sizeof(OutDataType) * std::size_t(K);
+    std::string best_conv_name;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+    // profile device Conv instances
+    for(auto& op_ptr : op_ptrs)
+    {
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            static_cast<const InDataType*>(in_device_buf.GetDeviceBuffer()),
+            static_cast<const WeiDataType*>(wei_device_buf.GetDeviceBuffer()),
+            static_cast<OutDataType*>(out_device_buf.GetDeviceBuffer()),
+            static_cast<const OutDataType*>(bias_device_buf.GetDeviceBuffer()),
+            N,
+            K,
+            C,
+            input_spatial_lengths,
+            filter_spatial_lengths,
+            output_spatial_lengths,
+            conv_filter_strides,
+            conv_filter_dilations,
+            input_left_pads,
+            input_right_pads,
+            InElementOp{},
+            WeiElementOp{},
+            OutElementOp{});
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        // Instances that cannot handle this problem are silently skipped.
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            std::string conv_name = op_ptr->GetTypeString();
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), nrepeat);
+            // ave_time is printed as milliseconds, hence 1e9 / 1e6 rather than 1e12 / 1e9.
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                      << " GB/s, " << conv_name << std::endl;
+            if(tflops > best_tflops)
+            {
+                best_conv_name  = conv_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+            if(do_verification)
+            {
+                out_device_buf.FromDevice(out_n_k_ho_wo_device_result.mData.data());
+                check_error(out_n_k_ho_wo_host_result, out_n_k_ho_wo_device_result);
+                if(do_log)
+                {
+                    LogRangeAsType<float>(std::cout << "in : ", in_n_c_hi_wi.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(std::cout << "wei: ", wei_k_c_y_x.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(
+                        std::cout << "out_host  : ", out_n_k_ho_wo_host_result.mData, ",")
+                        << std::endl;
+                    LogRangeAsType<float>(
+                        std::cout << "out_device: ", out_n_k_ho_wo_device_result.mData, ",")
+                        << std::endl;
+                }
+            }
+        }
+    }
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_conv_name << std::endl;
+}
+} // namespace profiler
+} // namespace ck
--- a/profiler/include/profile_conv_fwd_impl.hpp
+++ b/profiler/include/profile_conv_fwd_impl.hpp
@@ -152,14 +152,18 @@ void profile_conv_fwd_impl(int do_verification,
        ck::tensor_operation::device::device_conv2d_fwd_instance::
            add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
+#if 0 // debug
        ck::tensor_operation::device::device_conv2d_fwd_instance::
            add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_p0_f16_instances(conv_ptrs);
        ck::tensor_operation::device::device_conv2d_fwd_instance::
            add_device_conv2d_fwd_xdl_nhwc_kyxc_nhwk_1x1_s1_p0_f16_instances(conv_ptrs);
+#endif
+#if 1 // debug
        ck::tensor_operation::device::device_conv2d_fwd_instance::
            add_device_conv2d_fwd_xdl_c_shuffle_nhwc_kyxc_nhwk_f16_instances(conv_ptrs);
+#endif
    }
    if(conv_ptrs.size() <= 0)

--- a/profiler/profile_conv_fwd_bias_relu_atomic_add.cpp
+++ b/profiler/profile_conv_fwd_bias_relu_atomic_add.cpp
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+#include <stdlib.h>
+#include <half.hpp>
+#include "profile_conv_fwd_bias_relu_atomic_add_impl.hpp"
+// Selectors decoded from the numeric command-line arguments arg2..arg5.
+// The enumerator values are the integers the user types on the command line.
+// Element types of input / weight / output.
+enum ConvDataType
+{
+    F32_F32_F32 = 0,
+    F16_F16_F16 = 1,
+};
+// Memory layout of the input tensor.
+enum ConvInputLayout
+{
+    NCHW = 0,
+    NHWC = 1,
+};
+// Memory layout of the weight tensor.
+enum ConvWeightLayout
+{
+    KCYX = 0,
+    KYXC = 1,
+};
+// Memory layout of the output tensor.
+enum ConvOutputLayout
+{
+    NKHW = 0,
+    NHWK = 1,
+};
+// Command-line front end for the conv + bias + ReLU + atomic-add profiler.
+//
+// Expects exactly 25 entries in argv (program name, operation name, then the
+// 23 values listed in the usage text). With any other count the usage text is
+// printed and the process exits with status 1.
+//
+// The output size is derived from the input size, padding, dilation and
+// stride. Only the fp16 / NHWC / KYXC / NHWK combination is dispatched; every
+// other combination throws std::runtime_error, as do non-positive strides and
+// problem sizes whose computed output is empty.
+//
+// Returns 1 after a completed run.
+// NOTE(review): the dispatcher in profiler.cpp returns this value from main(),
+// so a successful run exits with a non-zero status - confirm that is intended
+// before changing it, since sibling profile_* entry points may do the same.
+int profile_conv_fwd_bias_relu_atomic_add(int argc, char* argv[])
+{
+    if(argc != 25)
+    {
+        printf("arg1: tensor operation (conv_fwd_bias_relu_atomic_add: "
+               "ForwardConvolution+Bias+ReLu+AtomicAdd)\n");
+        printf("arg2: data type (0: fp32; 1: fp16)\n");
+        printf("arg3: input tensor layout (0: NCHW; 1: NHWC)\n");
+        printf("arg4: weight tensor layout (0: KCYX; 1: KYXC)\n");
+        printf("arg5: output tensor layout (0: NKHW; 1: NHWK)\n");
+        printf("arg6: verification (0: no; 1: yes)\n");
+        printf("arg7: initialization (0: no init; 1: integer value; 2: decimal value)\n");
+        printf("arg8: print tensor value (0: no; 1: yes)\n");
+        printf("arg9: run kernel # of times (>1)\n");
+        printf("arg10 to 24: N, K, C, Y, X, Hi, Wi, Sy, Sx, Dy, Dx, LeftPy, LeftPx, RightPy, "
+               "RightPx\n");
+        exit(1);
+    }
+    const int data_type        = static_cast<ConvDataType>(std::stoi(argv[2]));
+    const int in_layout        = static_cast<ConvInputLayout>(std::stoi(argv[3]));
+    const int wei_layout       = static_cast<ConvWeightLayout>(std::stoi(argv[4]));
+    const int out_layout       = static_cast<ConvOutputLayout>(std::stoi(argv[5]));
+    const bool do_verification = std::stoi(argv[6]);
+    const int init_method      = std::stoi(argv[7]);
+    const bool do_log          = std::stoi(argv[8]);
+    const int nrepeat          = std::stoi(argv[9]);
+    const ck::index_t N  = std::stoi(argv[10]);
+    const ck::index_t K  = std::stoi(argv[11]);
+    const ck::index_t C  = std::stoi(argv[12]);
+    const ck::index_t Y  = std::stoi(argv[13]);
+    const ck::index_t X  = std::stoi(argv[14]);
+    const ck::index_t Hi = std::stoi(argv[15]);
+    const ck::index_t Wi = std::stoi(argv[16]);
+    const ck::index_t conv_stride_h   = std::stoi(argv[17]);
+    const ck::index_t conv_stride_w   = std::stoi(argv[18]);
+    const ck::index_t conv_dilation_h = std::stoi(argv[19]);
+    const ck::index_t conv_dilation_w = std::stoi(argv[20]);
+    const ck::index_t in_left_pad_h   = std::stoi(argv[21]);
+    const ck::index_t in_left_pad_w   = std::stoi(argv[22]);
+    const ck::index_t in_right_pad_h  = std::stoi(argv[23]);
+    const ck::index_t in_right_pad_w  = std::stoi(argv[24]);
+    // The strides are used as divisors below; a zero stride would be an integer
+    // division by zero.
+    if(conv_stride_h <= 0 || conv_stride_w <= 0)
+    {
+        throw std::runtime_error("wrong! convolution strides must be positive");
+    }
+    // Extent covered by the dilated filter along each spatial dimension.
+    const ck::index_t YEff = (Y - 1) * conv_dilation_h + 1;
+    const ck::index_t XEff = (X - 1) * conv_dilation_w + 1;
+    const ck::index_t Ho = (Hi + in_left_pad_h + in_right_pad_h - YEff) / conv_stride_h + 1;
+    const ck::index_t Wo = (Wi + in_left_pad_w + in_right_pad_w - XEff) / conv_stride_w + 1;
+    // A filter larger than the padded input yields a non-positive output size,
+    // which must not reach the tensor descriptors.
+    if(Ho <= 0 || Wo <= 0)
+    {
+        throw std::runtime_error("wrong! computed output spatial size is not positive");
+    }
+    if(data_type == ConvDataType::F16_F16_F16 && in_layout == ConvInputLayout::NHWC &&
+       wei_layout == ConvWeightLayout::KYXC && out_layout == ConvOutputLayout::NHWK)
+    {
+        ck::profiler::profile_conv_fwd_bias_relu_atomic_add_impl<
+            2,
+            ck::half_t,
+            ck::half_t,
+            ck::half_t,
+            ck::tensor_layout::convolution::NHWC,
+            ck::tensor_layout::convolution::KYXC,
+            ck::tensor_layout::convolution::NHWK>(
+            do_verification,
+            init_method,
+            do_log,
+            nrepeat,
+            N,
+            K,
+            C,
+            std::vector<ck::index_t>{Hi, Wi},
+            std::vector<ck::index_t>{Y, X},
+            std::vector<ck::index_t>{Ho, Wo},
+            std::vector<ck::index_t>{conv_stride_h, conv_stride_w},
+            std::vector<ck::index_t>{conv_dilation_h, conv_dilation_w},
+            std::vector<ck::index_t>{in_left_pad_h, in_left_pad_w},
+            std::vector<ck::index_t>{in_right_pad_h, in_right_pad_w});
+    }
+    else
+    {
+        throw std::runtime_error("wrong! data_type & layout for this operator is not implemented");
+    }
+    return 1;
+}
--- a/profiler/profiler.cpp
+++ b/profiler/profiler.cpp
@@ -9,6 +9,7 @@ int profile_gemm(int, char*[]);
 int profile_conv_fwd(int, char*[]);
 int profile_conv_fwd_bias_relu(int, char*[]);
 int profile_conv_fwd_bias_relu_add(int, char*[]);
+int profile_conv_fwd_bias_relu_atomic_add(int, char*[]);
 int main(int argc, char* argv[])
 {
@@ -28,13 +29,18 @@ int main(int argc, char* argv[])
    {
        return profile_conv_fwd_bias_relu_add(argc, argv);
    }
+    else if(strcmp(argv[1], "conv_fwd_bias_relu_atomic_add") == 0)
+    {
+        return profile_conv_fwd_bias_relu_atomic_add(argc, argv);
+    }
    else
    {
-        printf(
+        printf("arg1: tensor operation (gemm: GEMM;\n"
-            "arg1: tensor operation (gemm: GEMM;\n"
               "                        conv_fwd: ForwardConvolution;\n"
               "                        conv_fwd_bias_relu: ForwardConvolution+Bias+ReLU)\n"
-            "                        conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add)\n");
+               "                        conv_fwd_bias_relu_add: ForwardConvolution+Bias+ReLU+Add)\n"
+               "                        conv_fwd_bias_relu_atomic_add: "
+               "ForwardConvolution+Bias+ReLU+AtomicAdd)\n");
        return 0;
    }
 }