Unverified commit 0ffe956a, authored by rocking5566, committed by GitHub

Gemm reduce max (#209)



* [What] Rename the example
[Why] Prepare to add unary reduction

* Add global operation to the parameter

* Add atomicMax

* Fix compile error

* Support atomicMax (hip library)

* Rename the reduction example

* Fix target name

* use p_d1_grid as the indicator directly

* Prevent performance issue. Let passthrough handle it.

* Implement the function template that specializes float2

* No need to separate into two lines

* Remove empty line

* add comment

* Fix compile error due to merge from develop

* make the implementation of atomic_max / atomic_add explicit for each datatype

* Fix typo

* For future CI test

* Fix compiler error in ckProfiler

* Merge commit 'de2769e3a6695b38a20529261273ddc5cdaab2fe'

* simply use remove_pointer

* Rename type and var

* Refine example

* Modify reducemax example

* Fix bug in reduction

* Change initialize range

* Implement F64 version of atomicMax

* Move reduction code together

* Add buffer atomic_max

* Fix coding style by clang-format

* Integrate new api of DeviceGemmReduce_Xdl_CShuffle

* Integrate Batch gemm reduction

* Fix example

* fix example

* clean up

* Fix batch gemm tensor operation

* Fix coding style

* Fix template argument

* Fix clang format

* Keep stride flexibility for each D tensor

* Fix compile error for ckProfiler

* Fix typo

* [What] Fix naming
[Why] Prepare to add out elementop

* Add DoutElementOp
Co-authored-by: Chao Liu <chao.liu2@amd.com>
Co-authored-by: rocking <chunylai@amd.com>
parent aafc3ac2
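
Several bullets above concern atomicMax support, including an F64 version and a buffer atomic_max. For context, double-precision atomicMax is not a native HIP intrinsic, so it is typically emulated with a compare-and-swap loop. The following is a minimal sketch with a hypothetical helper name, not the code added by this commit:

// Sketch: emulate atomicMax for double via atomicCAS on the raw 64-bit pattern.
// Ignores NaN handling; assumes the destination was initialized to the lowest
// representable value, as the example below does.
__device__ double atomic_max_f64(double* p_dst, double x)
{
    unsigned long long* p_bits  = reinterpret_cast<unsigned long long*>(p_dst);
    unsigned long long old_bits = *p_bits;
    unsigned long long assumed;
    do
    {
        assumed = old_bits;
        if(__longlong_as_double(assumed) >= x)
            break; // stored value is already >= x, nothing to write
        old_bits = atomicCAS(p_bits, assumed, __double_as_longlong(x));
    } while(old_bits != assumed);
    return __longlong_as_double(old_bits);
}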
-add_example_executable(example_gemm_reduce_xdl_fp16 gemm_reduce_xdl_fp16.cpp)
+add_example_executable(example_gemm_reduce_xdl_max_fp16 gemm_reduce_xdl_max_fp16.cpp)
+add_example_executable(example_gemm_reduce_xdl_sum_squaresum_fp16 gemm_reduce_xdl_sum_squaresum_fp16.cpp)
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
#include "host_tensor.hpp"
#include "host_tensor_generator.hpp"
#include "device_tensor.hpp"
#include "device_gemm_reduce_xdl_cshuffle.hpp"
#include "element_wise_operation.hpp"
#include "reference_gemm.hpp"
#include "gemm_specialization.hpp"
#include "element_wise_reduce_operation.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using F64 = double;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using ADataType = F16;
using BDataType = F16;
using CDataType = F16;
using ReduceAccDataType = F32;
using DDataType = F64;
using DPtrsGlobal = ck::Tuple<DDataType*>;
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
using CLayout = ck::tensor_layout::gemm::RowMajor;
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
using DsReduceOp = ck::Tuple<ck::reduce::Max<ReduceAccDataType>>;
using DsElementOp = ck::Tuple<
ck::tensor_operation::element_wise::UnaryIdentic<ReduceAccDataType, ReduceAccDataType, false>>;
using DGlobalMemOp =
ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicMax>;
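// One InMemoryDataOperationEnum entry per D output: AtomicMax lets partial tile
// results be folded into the global D buffer directly (hence the Lowest() init below).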
static constexpr auto GemmSpecialization =
ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_Xdl_CShuffle
//######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| Dxs| DxsInEleOp| DxsOutEleOp| D| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
//######| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Reduce| | | MemoryData| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
//######| | | | | | | | | | | Operation| Operation| Operation| Operation| | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< Row, Col, Row, F16, F16, F16, F32, F32, ReduceAccDataType, DPtrsGlobal, AElementOp, BElementOp, CElementOp, DsReduceOp, DsElementOp, DsElementOp, DGlobalMemOp, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
ReferenceGemm<ADataType, BDataType, CDataType, AElementOp, BElementOp, CElementOp>;
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
// GEMM shape
ck::index_t M = 3840;
ck::index_t N = 4096;
ck::index_t K = 4096;
ck::index_t StrideA = 4096;
ck::index_t StrideB = 4096;
ck::index_t StrideC = 4096;
if(argc == 1)
{
// do nothing
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else if(argc == 10)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
M = std::stoi(argv[4]);
N = std::stoi(argv[5]);
K = std::stoi(argv[6]);
StrideA = std::stoi(argv[7]);
StrideB = std::stoi(argv[8]);
StrideC = std::stoi(argv[9]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: run kernel # of times (>1)\n");
printf("arg4 to 9: M (256x), N(128x), K(32x), StrideA, StrideB, StrideC\n");
exit(0);
}
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
}
};
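// For reference: with RowMajor layout the descriptor maps element (row, col) to
// offset row * stride + col; with ColumnMajor it maps to col * stride + row.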
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, StrideA, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, StrideB, BLayout{}));
Tensor<CDataType> c_m_n_host_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<DDataType> d_m_host_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
Tensor<CDataType> c_m_n_device_result(f_host_tensor_descriptor(M, N, StrideC, CLayout{}));
Tensor<DDataType> d_m_device_result(
HostTensorDescriptor(std::vector<std::size_t>({static_cast<std::size_t>(M)})));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
std::cout << "d_m: " << d_m_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
break;
}
DeviceMem a_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
DeviceMem b_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
DeviceMem c_device_buf(sizeof(CDataType) * c_m_n_device_result.mDesc.GetElementSpace());
DeviceMem d_device_buf(sizeof(DDataType) * d_m_device_result.mDesc.GetElementSpace());
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
auto ds_element_op = DsElementOp{};
auto p_ds_global = ck::make_tuple(static_cast<DDataType*>(d_device_buf.GetDeviceBuffer()));
// do GEMM
auto gemm = DeviceGemmReduceInstance{};
auto invoker = gemm.MakeInvoker();
auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
p_ds_global,
M,
N,
K,
StrideA,
StrideB,
StrideC,
a_element_op,
b_element_op,
c_element_op,
ds_element_op,
ds_element_op);
if(!gemm.IsSupportedArgument(argument))
{
throw std::runtime_error(
"wrong! device_gemm with the specified compilation parameters does "
"not support this GEMM problem");
}
// init D
d_device_buf.SetValue(ck::NumericLimits<DDataType>::Lowest());
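// D is combined with AtomicMax, so it must start at the lowest representable
// value for the first comparison to win; an AtomicAdd reduction would start at 0.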
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
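// ave_time is in ms: flop / 1e9 / ms gives TFLOP/s, num_btype / 1e6 / ms gives GB/s.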
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype =
sizeof(ADataType) * M * K + sizeof(BDataType) * K * N + sizeof(CDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< gemm.GetTypeString() << std::endl;
bool pass = true;
if(do_verification)
{
c_device_buf.FromDevice(c_m_n_device_result.mData.data());
d_device_buf.FromDevice(d_m_device_result.mData.data());
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument = ref_gemm.MakeArgument(
a_m_k, b_k_n, c_m_n_host_result, a_element_op, b_element_op, c_element_op);
ref_invoker.Run(ref_argument);
auto d_reduce_op = DsReduceOp{}[ck::Number<0>{}];
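// GetReductionZeroVal() yields the reduction identity; for Max that is the
// numeric lowest value, mirroring the device-side initialization above.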
for(int m = 0; m < M; ++m)
{
ReduceAccDataType d_acc = d_reduce_op.GetReductionZeroVal();
for(int n = 0; n < N; ++n)
d_reduce_op(d_acc, c_m_n_host_result(m, n));
d_m_host_result(m) = d_acc;
}
pass = ck::utils::check_err(c_m_n_device_result.mData,
c_m_n_host_result.mData,
"Error: Incorrect results c") &&
ck::utils::check_err(d_m_device_result.mData,
d_m_host_result.mData,
"Error: Incorrect results d",
1e-3,
1e-3);
}
return pass ? 0 : 1;
}
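
Following the argument parsing in main above, a plausible invocation of the built example (target name per the CMake line at the top) is "example_gemm_reduce_xdl_max_fp16 1 1 0 3840 4096 4096 4096 4096 4096": verification on, integer initialization, timing off, then M, N, K, StrideA, StrideB, StrideC.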
@@ -3,7 +3,7 @@
#include <initializer_list>
#include <cstdlib>
#include <stdlib.h>
-#include <half.hpp>
#include "check_err.hpp"
#include "config.hpp"
#include "device.hpp"
@@ -26,10 +26,12 @@ using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using ADataType = F16;
using BDataType = F16;
using CDataType = F16;
-using DDataType = F32;
+using ReduceAccDataType = F32;
+using DDataType = F32;
+using DPtrsGlobal = ck::Tuple<DDataType*, DDataType*>;
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
@@ -38,20 +40,31 @@ using CLayout = ck::tensor_layout::gemm::RowMajor;
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
-using D0ReduceOp = ck::reduce::Add<float>;
-using D1ReduceOp = ck::reduce::Add<float>;
-using D1ElementOp = ck::tensor_operation::element_wise::UnarySquare<float, float, false>;
+using D0ReduceOp = ck::reduce::Add<ReduceAccDataType>;
+using D1ReduceOp = ck::reduce::Add<ReduceAccDataType>;
+using DxsReduceOp = ck::Tuple<D0ReduceOp, D1ReduceOp>;
+using UnaryIdenticElementOp =
+ck::tensor_operation::element_wise::UnaryIdentic<ReduceAccDataType, ReduceAccDataType, false>;
+using UnarySquareElementOp =
+ck::tensor_operation::element_wise::UnarySquare<ReduceAccDataType, ReduceAccDataType, false>;
+using DxsInElementOp = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
+using DxsOutElementOp = ck::Tuple<UnaryIdenticElementOp, UnaryIdenticElementOp>;
+using DGlobalMemOp =
+ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicAdd,
+ck::InMemoryDataOperationEnum::AtomicAdd>;
static constexpr auto GemmSpecialization =
ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceGemmReduceInstance = ck::tensor_operation::device::DeviceGemmReduce_Xdl_CShuffle
-//######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| D0| D1| D1EleOp| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
-//######| | | | Type| Type| Type| DataType| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
-//######| | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
-//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+//######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| Dxs| DxsInEleOp| DxsOutEleOp| D| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
+//######| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Reduce| | | MemoryData| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
+//######| | | | | | | | | | | Operation| Operation| Operation| Operation| | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
+//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-< Row, Col, Row, F16, F16, F16, F32, F32, F32, F32, AElementOp, BElementOp, CElementOp, D0ReduceOp, D1ReduceOp, D1ElementOp, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
+< Row, Col, Row, F16, F16, F16, F32, F32, F32, DPtrsGlobal, AElementOp, BElementOp, CElementOp, DxsReduceOp, DxsInElementOp, DxsOutElementOp, DGlobalMemOp, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
// clang-format on
using ReferenceGemmInstance = ck::tensor_operation::host::
@@ -162,10 +175,11 @@ int main(int argc, char* argv[])
a_device_buf.ToDevice(a_m_k.mData.data());
b_device_buf.ToDevice(b_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
-auto d1_element_op = D1ElementOp{};
+auto dxs_global = ck::make_tuple(static_cast<DDataType*>(d0_device_buf.GetDeviceBuffer()),
+static_cast<DDataType*>(d1_device_buf.GetDeviceBuffer()));
// do GEMM
auto gemm = DeviceGemmReduceInstance{};
@@ -173,8 +187,7 @@ int main(int argc, char* argv[])
auto argument = gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
-static_cast<DDataType*>(d0_device_buf.GetDeviceBuffer()),
-static_cast<DDataType*>(d1_device_buf.GetDeviceBuffer()),
+dxs_global,
M,
N,
K,
@@ -184,7 +197,8 @@ int main(int argc, char* argv[])
a_element_op,
b_element_op,
c_element_op,
-d1_element_op);
+DxsInElementOp{},
+DxsOutElementOp{});
if(!gemm.IsSupportedArgument(argument))
{
@@ -213,6 +227,7 @@ int main(int argc, char* argv[])
<< gemm.GetTypeString() << std::endl;
bool pass = true;
if(do_verification)
{
c_device_buf.FromDevice(c_m_n_device_result.mData.data());
@@ -237,10 +252,12 @@ int main(int argc, char* argv[])
for(int n = 0; n < N; ++n)
{
-float d0_val = ck::type_convert<float>(c_m_n_host_result(m, n));
-float d1_val;
-d1_element_op(d1_val, d0_val);
+float c_val = ck::type_convert<float>(c_m_n_host_result(m, n));
+float d0_val = 0;
+float d1_val = 0;
+UnaryIdenticElementOp{}(d0_val, c_val);
+UnarySquareElementOp{}(d1_val, c_val);
d0_reduce_op(d0_acc, d0_val);
d1_reduce_op(d1_acc, d1_val);
}
@@ -249,18 +266,19 @@ int main(int argc, char* argv[])
d1_m_host_result(m) = ck::type_convert<DDataType>(d1_acc);
}
-pass &= ck::utils::check_err(
-c_m_n_device_result.mData, c_m_n_host_result.mData, "Error: Incorrect results c");
-pass &= ck::utils::check_err(d0_m_device_result.mData,
-d0_m_host_result.mData,
-"Error: Incorrect results d0",
-1e-3,
-1e-3);
-pass &= ck::utils::check_err(d1_m_device_result.mData,
-d1_m_host_result.mData,
-"Error: Incorrect results d1",
-1e-3,
-1e-3);
+pass = ck::utils::check_err(c_m_n_device_result.mData,
+c_m_n_host_result.mData,
+"Error: Incorrect results c") &&
+ck::utils::check_err(d0_m_device_result.mData,
+d0_m_host_result.mData,
+"Error: Incorrect results d0",
+1e-4,
+1e-5) &&
+ck::utils::check_err(d1_m_device_result.mData,
+d1_m_host_result.mData,
+"Error: Incorrect results d1",
+1e-3,
+1e-5);
}
return pass ? 0 : 1;
...
@@ -25,10 +25,12 @@ using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using ADataType = F16;
using BDataType = F16;
using CDataType = F16;
-using DDataType = F32;
+using ReduceAccDataType = F32;
+using DDataType = F32;
+using DPtrsGlobal = ck::Tuple<DDataType*, DDataType*>;
using ALayout = ck::tensor_layout::gemm::RowMajor;
using BLayout = ck::tensor_layout::gemm::ColumnMajor;
@@ -37,20 +39,31 @@ using CLayout = ck::tensor_layout::gemm::RowMajor;
using AElementOp = ck::tensor_operation::element_wise::PassThrough;
using BElementOp = ck::tensor_operation::element_wise::PassThrough;
using CElementOp = ck::tensor_operation::element_wise::PassThrough;
-using D0ReduceOp = ck::reduce::Add<float>;
-using D1ReduceOp = ck::reduce::Add<float>;
-using D1ElementOp = ck::tensor_operation::element_wise::UnarySquare<float, float, false>;
+using D0ReduceOp = ck::reduce::Add<ReduceAccDataType>;
+using D1ReduceOp = ck::reduce::Add<ReduceAccDataType>;
+using DxsReduceOp = ck::Tuple<D0ReduceOp, D1ReduceOp>;
+using UnaryIdenticElementOp =
+ck::tensor_operation::element_wise::UnaryIdentic<ReduceAccDataType, ReduceAccDataType, false>;
+using UnarySquareElementOp =
+ck::tensor_operation::element_wise::UnarySquare<ReduceAccDataType, ReduceAccDataType, false>;
+using DxsInElementOp = ck::Tuple<UnaryIdenticElementOp, UnarySquareElementOp>;
+using DxsOutElementOp = ck::Tuple<UnaryIdenticElementOp, UnaryIdenticElementOp>;
+using DGlobalMemOp =
+ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicAdd,
+ck::InMemoryDataOperationEnum::AtomicAdd>;
static constexpr auto GemmSpecialization =
ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceBatchedGemmReduceInstance = ck::tensor_operation::device::DeviceBatchedGemmReduce_Xdl_CShuffle
-//######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| D0| D1| D1EleOp| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
-//######| | | | Type| Type| Type| DataType| DataType| DataType| Type| Elementwise| Elementwise| Elementwise| Reduce| Reduce| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
-//######| | | | | | | | | | | Operation| Operation| Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
-//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
+//######| ALayout| BLayout| CLayout|AData| BData| CData| GemmAcc| CShuffle| ReduceAcc| DData| A| B| C| Dxs| DxsInEleOp| DxsOutEleOp| D| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer| CReduce| CReduceThreadLds2VGprCopy| CReduceThreadVgpr2GlobalCopy|
+//######| | | | Type| Type| Type| DataType| DataType| DataType| Type Tuple| Elementwise| Elementwise| Elementwise| Reduce| | | MemoryData| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| ExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MPerBlock| ScalarPerVector| ThreadClusterLengths| SrcDstScalarPerVector| SrcDstScalarPerVector|
+//######| | | | | | | | | | | Operation| Operation| Operation| Operation| | | Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NPerBlock| _NPerBlock| _MPerBlock_NPerBlock| _NPerBlock| _MPerBlock|
+//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-< Row, Col, Row, F16, F16, F16, F32, F32, F32, F32, AElementOp, BElementOp, CElementOp, D0ReduceOp, D1ReduceOp, D1ElementOp, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
+< Row, Col, Row, F16, F16, F16, F32, F32, F32, DPtrsGlobal, AElementOp, BElementOp, CElementOp, DxsReduceOp, DxsInElementOp, DxsOutElementOp, DGlobalMemOp, GemmSpecialization, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 8, S<64, 4>, 4, 1>;
// clang-format on
using ReferenceBatchedGemmInstance = ck::tensor_operation::host::
@@ -170,12 +183,11 @@ int main(int argc, char* argv[])
a_device_buf.ToDevice(a_g_m_k.mData.data());
b_device_buf.ToDevice(b_g_k_n.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto c_element_op = CElementOp{};
-auto d0_reduce_op = D0ReduceOp{};
-auto d1_reduce_op = D1ReduceOp{};
-auto d1_element_op = D1ElementOp{};
+auto dxs_global = ck::make_tuple(static_cast<DDataType*>(d0_device_buf.GetDeviceBuffer()),
+static_cast<DDataType*>(d1_device_buf.GetDeviceBuffer()));
// do GEMM
auto batched_gemm = DeviceBatchedGemmReduceInstance{};
@@ -184,8 +196,7 @@ int main(int argc, char* argv[])
batched_gemm.MakeArgument(static_cast<ADataType*>(a_device_buf.GetDeviceBuffer()),
static_cast<BDataType*>(b_device_buf.GetDeviceBuffer()),
static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
-static_cast<DDataType*>(d0_device_buf.GetDeviceBuffer()),
-static_cast<DDataType*>(d1_device_buf.GetDeviceBuffer()),
+dxs_global,
M,
N,
K,
@@ -195,7 +206,8 @@ int main(int argc, char* argv[])
a_element_op,
b_element_op,
c_element_op,
-d1_element_op,
+DxsInElementOp{},
+DxsOutElementOp{},
BatchCount);
if(!batched_gemm.IsSupportedArgument(argument))
@@ -240,6 +252,9 @@ int main(int argc, char* argv[])
ref_invoker.Run(ref_argument);
+auto d0_reduce_op = D0ReduceOp{};
+auto d1_reduce_op = D1ReduceOp{};
for(int batch = 0; batch < BatchCount; ++batch)
{
for(int m = 0; m < M; ++m)
@@ -249,10 +264,12 @@ int main(int argc, char* argv[])
for(int n = 0; n < N; ++n)
{
-float d0_val = ck::type_convert<float>(c_g_m_n_host_result(batch, m, n));
-float d1_val;
-d1_element_op(d1_val, d0_val);
+float c_val = ck::type_convert<float>(c_g_m_n_host_result(batch, m, n));
+float d0_val = 0;
+float d1_val = 0;
+UnaryIdenticElementOp{}(d0_val, c_val);
+UnarySquareElementOp{}(d1_val, c_val);
d0_reduce_op(d0_acc, d0_val);
d1_reduce_op(d1_acc, d1_val);
}
@@ -262,17 +279,19 @@ int main(int argc, char* argv[])
}
}
-pass &= ck::utils::check_err(c_g_m_n_host_result.mData, c_g_m_n_device_result.mData);
-pass &= ck::utils::check_err(d0_g_m_device_result.mData,
-d0_g_m_host_result.mData,
-"Error: Incorrect results! D0",
-1e-3,
-1e-3);
-pass &= ck::utils::check_err(d1_g_m_device_result.mData,
-d1_g_m_host_result.mData,
-"Error: Incorrect results! D1",
-1e-3,
-1e-3);
+pass = ck::utils::check_err(c_g_m_n_host_result.mData,
+c_g_m_n_device_result.mData,
+"Error: Incorrect results c") &&
+ck::utils::check_err(d0_g_m_device_result.mData,
+d0_g_m_host_result.mData,
+"Error: Incorrect results! D0",
+1e-4,
+1e-5) &&
+ck::utils::check_err(d1_g_m_device_result.mData,
+d1_g_m_host_result.mData,
+"Error: Incorrect results! D1",
+1e-3,
+1e-5);
}
return pass ? 0 : 1;
...
@@ -33,7 +33,7 @@ function(add_example_executable_no_testing EXAMPLE_NAME FILE_NAME)
add_executable(${EXAMPLE_NAME} ${FILE_NAME})
target_link_libraries(${EXAMPLE_NAME} PRIVATE host_tensor)
add_dependencies(examples ${EXAMPLE_NAME})
-endfunction(add_example_executable EXAMPLE_NAME)
+endfunction(add_example_executable_no_testing EXAMPLE_NAME)
add_subdirectory(01_gemm)
add_subdirectory(02_gemm_alpha_beta)
...
@@ -76,6 +76,12 @@
#define CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT 0
#endif
+#if defined(__gfx90a__) // for GPU code
+#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 1
+#else
+#define CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 0
+#endif
// inline asm
#define CK_USE_AMD_INLINE_ASM 1
@@ -91,10 +97,11 @@
// experimental feature: static tensor descriptor
#define CK_EXPERIMENTAL_STATIC_TENSOR_DESCRIPTOR 0
-// experimental feature: buffer load/store/atomic-add OOB trick
+// experimental feature: buffer load/store/atomic-add/ OOB trick
#define CK_EXPERIMENTAL_USE_BUFFER_LOAD_OOB_CHECK_OFFSET_TRICK 0
#define CK_EXPERIMENTAL_USE_BUFFER_STORE_OOB_CHECK_OFFSET_TRICK 1
#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_ADD_OOB_CHECK_OFFSET_TRICK 1
+#define CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK 1
// experimental feature: in-regsiter sub-dword transpose
#define CK_EXPERIMENTAL_USE_IN_REGISTER_SUB_DWORD_TRANSPOSE 1
@@ -142,9 +149,23 @@ enum struct InMemoryDataOperationEnum
{
Set,
AtomicAdd,
+AtomicMax,
Add
};
+template <InMemoryDataOperationEnum... Is>
+struct InMemoryDataOperationEnumSequence
+{
+static constexpr int mSize = sizeof...(Is);
+
+__host__ __device__ static constexpr InMemoryDataOperationEnum At(int I)
+{
+// the last dummy element is to prevent compiler complain about empty array, when mSize = 0
+const InMemoryDataOperationEnum mData[mSize + 1] = {Is..., InMemoryDataOperationEnum::Set};
+return mData[I];
+}
+};
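As an illustrative aside (not part of the diff): the sequence carries one memory operation per D output and is indexed at compile time, e.g.

// Hypothetical usage sketch of the new InMemoryDataOperationEnumSequence:
using DOps = ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicAdd,
                                                   ck::InMemoryDataOperationEnum::AtomicMax>;
static_assert(DOps::At(1) == ck::InMemoryDataOperationEnum::AtomicMax, "one operation per D output");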
// TODO: no longer needed, remove this
enum struct ActivTypeEnum
{
...
@@ -17,11 +17,12 @@ namespace device {
template <typename GridwiseGemm,
typename FloatAB,
typename FloatC,
-typename FloatD,
+typename DPtrsGlobal,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
-typename D1ElementwiseOperation,
+typename DxsInElementwiseOperation,
+typename DxsOutElementwiseOperation,
typename AGridDesc_AK0_M_AK1,
typename BGridDesc_BK0_N_BK1,
typename CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
@@ -37,13 +38,13 @@ __global__ void
const FloatAB* __restrict__ p_a_grid,
const FloatAB* __restrict__ p_b_grid,
FloatC* __restrict__ p_c_grid,
-FloatD* __restrict__ p_d0_grid,
-FloatD* __restrict__ p_d1_grid,
+DPtrsGlobal p_ds_grid,
const index_t batch_count,
const AElementwiseOperation a_element_op,
const BElementwiseOperation b_element_op,
const CElementwiseOperation c_element_op,
-const D1ElementwiseOperation d1_element_op,
+const DxsInElementwiseOperation dxs_in_element_op,
+const DxsOutElementwiseOperation dxs_out_element_op,
const AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1,
const BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1,
const CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock
@@ -64,23 +65,24 @@ __global__ void
const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch_.GetCBasePtr(g_idx)));
-const long_index_t d0_batch_offset = __builtin_amdgcn_readfirstlane(
-static_cast<long_index_t>(compute_base_ptr_of_batch_.GetD0BasePtr(g_idx)));
-const long_index_t d1_batch_offset = __builtin_amdgcn_readfirstlane(
-static_cast<long_index_t>(compute_base_ptr_of_batch_.GetD1BasePtr(g_idx)));
+static_for<0, p_ds_grid.Size(), 1>{}([&](auto In) {
+const long_index_t d_batch_offset = __builtin_amdgcn_readfirstlane(
+static_cast<long_index_t>(compute_base_ptr_of_batch_.GetDBasePtr(g_idx, In)));
+p_ds_grid(In) = p_ds_grid(In) + d_batch_offset;
+});
__shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
GridwiseGemm::template Run<HasMainK0BlockLoop>(p_a_grid + a_batch_offset,
p_b_grid + b_batch_offset,
p_c_grid + c_batch_offset,
-p_d0_grid + d0_batch_offset,
-p_d1_grid + d1_batch_offset,
+p_ds_grid,
p_shared,
a_element_op,
b_element_op,
c_element_op,
-d1_element_op,
+dxs_in_element_op,
+dxs_out_element_op,
a_grid_desc_ak0_m_ak1,
b_grid_desc_bk0_n_bk1,
c_grid_desc_mblock_mperblock_nblock_nperblock,
@@ -90,13 +92,13 @@ __global__ void
ignore = p_a_grid;
ignore = p_b_grid;
ignore = p_c_grid;
-ignore = p_d0_grid;
-ignore = p_d1_grid;
+ignore = p_ds_grid;
ignore = batch_count;
ignore = a_element_op;
ignore = b_element_op;
ignore = c_element_op;
-ignore = d1_element_op;
+ignore = dxs_in_element_op;
+ignore = dxs_out_element_op;
ignore = a_grid_desc_ak0_m_ak1;
ignore = b_grid_desc_bk0_n_bk1;
ignore = c_grid_desc_mblock_mperblock_nblock_nperblock;
@@ -118,13 +120,14 @@ template <typename ALayout,
typename GemmAccDataType,
typename CShuffleDataType,
typename ReduceAccDataType,
-typename DDataType,
+typename DPtrsGlobal,
typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation,
-typename D0ReduceOperation,
-typename D1ReduceOperation,
-typename D1ElementwiseOperation,
+typename DxsReduceOperation,
+typename DxsInElementwiseOperation,
+typename DxsOutElementwiseOperation,
+typename DGlobalMemoryDataOperation,
GemmSpecialization GemmSpec,
index_t NumGemmKPrefetchStage,
index_t BlockSize,
@@ -159,10 +162,12 @@ template <typename ALayout,
index_t CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
LoopScheduler LoopSched = make_default_loop_scheduler()>
-struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOperation,
+struct DeviceBatchedGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
+AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
-D1ElementwiseOperation>
+DxsInElementwiseOperation,
+DxsOutElementwiseOperation>
{
using DeviceOp = DeviceBatchedGemmReduce_Xdl_CShuffle;
@@ -508,13 +513,11 @@
ComputeBasePtrOfStridedBatch(index_t BatchStrideA,
index_t BatchStrideB,
index_t BatchStrideC,
-index_t BatchStrideD0,
-index_t BatchStrideD1)
+index_t BatchStrideD)
: BatchStrideA_(BatchStrideA),
BatchStrideB_(BatchStrideB),
BatchStrideC_(BatchStrideC),
-BatchStrideD0_(BatchStrideD0),
-BatchStrideD1_(BatchStrideD1)
+BatchStrideD_(BatchStrideD)
{
}
@@ -533,22 +536,20 @@
return g_idx * static_cast<long_index_t>(BatchStrideC_);
}
-__host__ __device__ constexpr long_index_t GetD0BasePtr(index_t g_idx) const
-{
-return g_idx * static_cast<long_index_t>(BatchStrideD0_);
-}
-
-__host__ __device__ constexpr long_index_t GetD1BasePtr(index_t g_idx) const
-{
-return g_idx * static_cast<long_index_t>(BatchStrideD1_);
-}
+template <index_t I>
+__host__ __device__ constexpr long_index_t GetDBasePtr(index_t g_idx,
+Number<I> reduction_idx) const
+{
+// TODO - Support sequence of StrideD in MakeArgument()
+(void)reduction_idx;
+return g_idx * static_cast<long_index_t>(BatchStrideD_);
+}
private:
index_t BatchStrideA_;
index_t BatchStrideB_;
index_t BatchStrideC_;
-index_t BatchStrideD0_;
-index_t BatchStrideD1_;
+index_t BatchStrideD_;
};
// GridwiseGemm
@@ -558,15 +559,15 @@
CShuffleDataType,
CDataType,
ReduceAccDataType,
-DDataType,
+DPtrsGlobal,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
-D0ReduceOperation,
-D1ReduceOperation,
-D1ElementwiseOperation,
+DxsReduceOperation,
+DxsInElementwiseOperation,
+DxsOutElementwiseOperation,
InMemoryDataOperationEnum::Set,
-InMemoryDataOperationEnum::AtomicAdd,
+DGlobalMemoryDataOperation,
AGridDesc_AK0_M_AK1,
BGridDesc_BK0_N_BK1,
CGridDesc_M_N,
@@ -615,8 +616,7 @@
Argument(const ADataType* p_a_grid,
const BDataType* p_b_grid,
CDataType* p_c_grid,
-DDataType* p_d0_grid,
-DDataType* p_d1_grid,
+DPtrsGlobal p_ds_grid,
index_t MRaw,
index_t NRaw,
index_t KRaw,
@@ -626,13 +626,13 @@
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
-D1ElementwiseOperation d1_element_op,
+DxsInElementwiseOperation dxs_in_element_op,
+DxsOutElementwiseOperation dxs_out_element_op,
index_t BatchCount)
: p_a_grid_{p_a_grid},
p_b_grid_{p_b_grid},
p_c_grid_{p_c_grid},
-p_d0_grid_{p_d0_grid},
-p_d1_grid_{p_d1_grid},
+p_ds_grid_{p_ds_grid},
BatchCount_(BatchCount),
a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)},
b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
@@ -644,13 +644,13 @@
type_convert<index_t>(a_grid_desc_ak0_m_ak1_.GetElementSpaceSize()),
type_convert<index_t>(b_grid_desc_bk0_n_bk1_.GetElementSpaceSize()),
type_convert<index_t>(c_grid_desc_m_n_.GetElementSpaceSize()),
-type_convert<index_t>(d_grid_desc_m_.GetElementSpaceSize()),
type_convert<index_t>(d_grid_desc_m_.GetElementSpaceSize())},
block_2_ctile_map_{},
a_element_op_{a_element_op},
b_element_op_{b_element_op},
c_element_op_{c_element_op},
-d1_element_op_{d1_element_op}
+dxs_in_element_op_{dxs_in_element_op},
+dxs_out_element_op_{dxs_out_element_op}
{
if(GridwiseGemm::CheckValidity(
a_grid_desc_ak0_m_ak1_, b_grid_desc_bk0_n_bk1_, c_grid_desc_m_n_))
@@ -670,8 +670,7 @@
const ADataType* p_a_grid_;
const BDataType* p_b_grid_;
CDataType* p_c_grid_;
-DDataType* p_d0_grid_;
-DDataType* p_d1_grid_;
+DPtrsGlobal p_ds_grid_;
index_t BatchCount_;
AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
@@ -685,7 +684,8 @@
AElementwiseOperation a_element_op_;
BElementwiseOperation b_element_op_;
CElementwiseOperation c_element_op_;
-D1ElementwiseOperation d1_element_op_;
+DxsInElementwiseOperation dxs_in_element_op_;
+DxsOutElementwiseOperation dxs_out_element_op_;
};
// Invoker
@@ -736,11 +736,12 @@
GridwiseGemm,
ADataType, // TODO: distiguish A/B datatype
CDataType,
-DDataType,
+DPtrsGlobal,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
-D1ElementwiseOperation,
+DxsInElementwiseOperation,
+DxsOutElementwiseOperation,
DeviceOp::AGridDesc_AK0_M_AK1,
DeviceOp::BGridDesc_BK0_N_BK1,
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
@@ -758,13 +759,13 @@
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
-arg.p_d0_grid_,
-arg.p_d1_grid_,
+arg.p_ds_grid_,
arg.BatchCount_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_,
-arg.d1_element_op_,
+arg.dxs_in_element_op_,
+arg.dxs_out_element_op_,
arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
@@ -778,11 +779,12 @@
GridwiseGemm,
ADataType, // TODO: distiguish A/B datatype
CDataType,
-DDataType,
+DPtrsGlobal,
AElementwiseOperation,
BElementwiseOperation,
CElementwiseOperation,
-D1ElementwiseOperation,
+DxsInElementwiseOperation,
+DxsOutElementwiseOperation,
DeviceOp::AGridDesc_AK0_M_AK1,
DeviceOp::BGridDesc_BK0_N_BK1,
typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
@@ -800,13 +802,13 @@
arg.p_a_grid_,
arg.p_b_grid_,
arg.p_c_grid_,
-arg.p_d0_grid_,
-arg.p_d1_grid_,
+arg.p_ds_grid_,
arg.BatchCount_,
arg.a_element_op_,
arg.b_element_op_,
arg.c_element_op_,
-arg.d1_element_op_,
+arg.dxs_in_element_op_,
+arg.dxs_out_element_op_,
arg.a_grid_desc_ak0_m_ak1_,
arg.b_grid_desc_bk0_n_bk1_,
arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
@@ -855,8 +857,7 @@
static auto MakeArgument(const ADataType* p_a,
const BDataType* p_b,
CDataType* p_c,
-DDataType* p_d0,
-DDataType* p_d1,
+DPtrsGlobal p_dxs,
index_t MRaw,
index_t NRaw,
index_t KRaw,
@@ -866,14 +867,14 @@
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
-D1ElementwiseOperation d1_element_op,
+DxsInElementwiseOperation dxs_in_element_op,
+DxsOutElementwiseOperation dxs_out_element_op,
index_t BatchCount)
{
return Argument{p_a,
p_b,
p_c,
-p_d0,
-p_d1,
+p_dxs,
MRaw,
NRaw,
KRaw,
@@ -883,7 +884,8 @@
a_element_op,
b_element_op,
c_element_op,
-d1_element_op,
+dxs_in_element_op,
+dxs_out_element_op,
BatchCount};
}
@@ -893,8 +895,7 @@
std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_c,
-void* p_d0,
-void* p_d1,
+DPtrsGlobal p_dxs,
index_t MRaw,
index_t NRaw,
index_t KRaw,
@@ -904,14 +905,14 @@
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
-D1ElementwiseOperation d1_element_op,
+DxsInElementwiseOperation dxs_in_element_op,
+DxsOutElementwiseOperation dxs_out_element_op,
index_t BatchCount) override
{
return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
static_cast<const BDataType*>(p_b),
static_cast<CDataType*>(p_c),
-static_cast<DDataType*>(p_d0),
-static_cast<DDataType*>(p_d1),
+p_dxs,
MRaw,
NRaw,
KRaw,
@@ -921,7 +922,8 @@
a_element_op,
b_element_op,
c_element_op,
-d1_element_op,
+dxs_in_element_op,
+dxs_out_element_op,
BatchCount);
}
...
@@ -6,40 +6,47 @@ namespace ck {
namespace tensor_operation {
namespace device {

-template <typename AElementwiseOperation,
+template <typename DPtrsGlobal,
+          typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
-          typename D1ElementwiseOperation>
+          typename DxsInElementwiseOperation,
+          typename DxsOutElementwiseOperation>
struct DeviceGemmReduce : public BaseOperator
{
-    virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
-                                                              const void* p_b,
-                                                              void* p_c,
-                                                              void* p_d0,
-                                                              void* p_d1,
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b,
+                        void* p_c,
+                        DPtrsGlobal p_dxs,
                         ck::index_t M,
                         ck::index_t N,
                         ck::index_t K,
                         ck::index_t StrideA,
                         ck::index_t StrideB,
                         ck::index_t StrideC,
                         AElementwiseOperation a_element_op,
                         BElementwiseOperation b_element_op,
                         CElementwiseOperation c_element_op,
-                        D1ElementwiseOperation d1_element_op,
+                        DxsInElementwiseOperation dxs_in_element_op,
+                        DxsOutElementwiseOperation dxs_out_element_op,
                         ck::index_t BatchCount = 1) = 0;

    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};

-template <typename AElementwiseOperation,
+template <typename DPtrsGlobal,
+          typename AElementwiseOperation,
           typename BElementwiseOperation,
           typename CElementwiseOperation,
-          typename D1ElementwiseOperation>
-using DeviceGemmReducePtr = std::unique_ptr<DeviceGemmReduce<AElementwiseOperation,
+          typename DxsInElementwiseOperation,
+          typename DxsOutElementwiseOperation>
+using DeviceGemmReducePtr = std::unique_ptr<DeviceGemmReduce<DPtrsGlobal,
+                                                             AElementwiseOperation,
                                                              BElementwiseOperation,
                                                              CElementwiseOperation,
-                                                             D1ElementwiseOperation>>;
+                                                             DxsInElementwiseOperation,
+                                                             DxsOutElementwiseOperation>>;

} // namespace device
} // namespace tensor_operation
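Callers that type-erase the device op now spell out the same five parameters. A hedged sketch using the aliases from the fp16 max-reduction example:

using DeviceGemmReduceMaxPtr =
    ck::tensor_operation::device::DeviceGemmReducePtr<DPtrsGlobal,
                                                      AElementOp,
                                                      BElementOp,
                                                      CElementOp,
                                                      DsElementOp,
                                                      DsElementOp>;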
......
@@ -26,13 +26,14 @@ template <typename ALayout,
          typename GemmAccDataType,
          typename CShuffleDataType,
          typename ReduceAccDataType,
-         typename DDataType,
+         typename DPtrsGlobal,
          typename AElementwiseOperation,
          typename BElementwiseOperation,
          typename CElementwiseOperation,
-         typename D0ReduceOperation,
-         typename D1ReduceOperation,
-         typename D1ElementwiseOperation,
+         typename DxsReduceOperation,
+         typename DxsInElementwiseOperation,
+         typename DxsOutElementwiseOperation,
+         typename DGlobalMemoryDataOperation,
          GemmSpecialization GemmSpec,
          index_t NumGemmKPrefetchStage,
          index_t BlockSize,
@@ -67,10 +68,12 @@ template <typename ALayout,
          index_t CReduceThreadLds2VGprCopySrcDstScalarPerVector_NPerBlock,
          index_t CReduceThreadVgpr2GlobalCopySrcDstScalarPerVector_MPerBlock,
          LoopScheduler LoopSched = make_default_loop_scheduler()>
-struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOperation,
+struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<DPtrsGlobal,
+                                                               AElementwiseOperation,
                                                                BElementwiseOperation,
                                                                CElementwiseOperation,
-                                                               D1ElementwiseOperation>
+                                                               DxsInElementwiseOperation,
+                                                               DxsOutElementwiseOperation>
{
    using DeviceOp = DeviceGemmReduce_Xdl_CShuffle;
@@ -380,15 +383,15 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
        CShuffleDataType,
        CDataType,
        ReduceAccDataType,
-       DDataType,
+       DPtrsGlobal,
        AElementwiseOperation,
        BElementwiseOperation,
        CElementwiseOperation,
-       D0ReduceOperation,
-       D1ReduceOperation,
-       D1ElementwiseOperation,
+       DxsReduceOperation,
+       DxsInElementwiseOperation,
+       DxsOutElementwiseOperation,
        InMemoryDataOperationEnum::Set,
-       InMemoryDataOperationEnum::AtomicAdd,
+       DGlobalMemoryDataOperation,
        AGridDesc_AK0_M_AK1,
        BGridDesc_BK0_N_BK1,
        CGridDesc_M_N,
@@ -435,8 +438,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
        Argument(const ADataType* p_a_grid,
                 const BDataType* p_b_grid,
                 CDataType* p_c_grid,
-                DDataType* p_d0_grid,
-                DDataType* p_d1_grid,
+                DPtrsGlobal p_ds_grid,
                 index_t MRaw,
                 index_t NRaw,
                 index_t KRaw,
@@ -446,12 +448,12 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
                 AElementwiseOperation a_element_op,
                 BElementwiseOperation b_element_op,
                 CElementwiseOperation c_element_op,
-                D1ElementwiseOperation d1_element_op)
+                DxsInElementwiseOperation dxs_in_element_op,
+                DxsOutElementwiseOperation dxs_out_element_op)
            : p_a_grid_{p_a_grid},
              p_b_grid_{p_b_grid},
              p_c_grid_{p_c_grid},
-             p_d0_grid_{p_d0_grid},
-             p_d1_grid_{p_d1_grid},
+             p_ds_grid_{p_ds_grid},
              a_grid_desc_ak0_m_ak1_{DeviceOp::MakeAGridDescriptor_AK0_M_AK1(MRaw, KRaw, StrideA)},
              b_grid_desc_bk0_n_bk1_{DeviceOp::MakeBGridDescriptor_BK0_N_BK1(KRaw, NRaw, StrideB)},
              c_grid_desc_m_n_{DeviceOp::MakeCGridDescriptor_M_N(MRaw, NRaw, StrideC)},
@@ -462,7 +464,8 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
              a_element_op_{a_element_op},
              b_element_op_{b_element_op},
              c_element_op_{c_element_op},
-             d1_element_op_{d1_element_op}
+             dxs_in_element_op_{dxs_in_element_op},
+             dxs_out_element_op_{dxs_out_element_op}
        {
            if(GridwiseGemm::CheckValidity(
                   a_grid_desc_ak0_m_ak1_, b_grid_desc_bk0_n_bk1_, c_grid_desc_m_n_))
@@ -482,8 +485,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
        const ADataType* p_a_grid_;
        const BDataType* p_b_grid_;
        CDataType* p_c_grid_;
-       DDataType* p_d0_grid_;
-       DDataType* p_d1_grid_;
+       DPtrsGlobal p_ds_grid_;
        AGridDesc_AK0_M_AK1 a_grid_desc_ak0_m_ak1_;
        BGridDesc_BK0_N_BK1 b_grid_desc_bk0_n_bk1_;
        CGridDesc_M_N c_grid_desc_m_n_;
@@ -495,7 +497,8 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
        AElementwiseOperation a_element_op_;
        BElementwiseOperation b_element_op_;
        CElementwiseOperation c_element_op_;
-       D1ElementwiseOperation d1_element_op_;
+       DxsInElementwiseOperation dxs_in_element_op_;
+       DxsOutElementwiseOperation dxs_out_element_op_;
    };

    // Invoker
@@ -543,11 +546,12 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
            GridwiseGemm,
            ADataType, // TODO: distinguish A/B datatype
            CDataType,
-           DDataType,
+           DPtrsGlobal,
            AElementwiseOperation,
            BElementwiseOperation,
            CElementwiseOperation,
-           D1ElementwiseOperation,
+           DxsInElementwiseOperation,
+           DxsOutElementwiseOperation,
            DeviceOp::AGridDesc_AK0_M_AK1,
            DeviceOp::BGridDesc_BK0_N_BK1,
            typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
@@ -564,12 +568,12 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
            arg.p_a_grid_,
            arg.p_b_grid_,
            arg.p_c_grid_,
-           arg.p_d0_grid_,
-           arg.p_d1_grid_,
+           arg.p_ds_grid_,
            arg.a_element_op_,
            arg.b_element_op_,
            arg.c_element_op_,
-           arg.d1_element_op_,
+           arg.dxs_in_element_op_,
+           arg.dxs_out_element_op_,
            arg.a_grid_desc_ak0_m_ak1_,
            arg.b_grid_desc_bk0_n_bk1_,
            arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
@@ -582,11 +586,12 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
            GridwiseGemm,
            ADataType, // TODO: distinguish A/B datatype
            CDataType,
-           DDataType,
+           DPtrsGlobal,
            AElementwiseOperation,
            BElementwiseOperation,
            CElementwiseOperation,
-           D1ElementwiseOperation,
+           DxsInElementwiseOperation,
+           DxsOutElementwiseOperation,
            DeviceOp::AGridDesc_AK0_M_AK1,
            DeviceOp::BGridDesc_BK0_N_BK1,
            typename GridwiseGemm::CGridDescriptor_MBlock_MPerBlock_NBlock_NPerBlock,
@@ -603,12 +608,12 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
            arg.p_a_grid_,
            arg.p_b_grid_,
            arg.p_c_grid_,
-           arg.p_d0_grid_,
-           arg.p_d1_grid_,
+           arg.p_ds_grid_,
            arg.a_element_op_,
            arg.b_element_op_,
            arg.c_element_op_,
-           arg.d1_element_op_,
+           arg.dxs_in_element_op_,
+           arg.dxs_out_element_op_,
            arg.a_grid_desc_ak0_m_ak1_,
            arg.b_grid_desc_bk0_n_bk1_,
            arg.c_grid_desc_mblock_mperblock_nblock_nperblock_,
@@ -648,8 +653,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
    static auto MakeArgument(const ADataType* p_a,
                             const BDataType* p_b,
                             CDataType* p_c,
-                            DDataType* p_d0,
-                            DDataType* p_d1,
+                            DPtrsGlobal p_dxs,
                             index_t MRaw,
                             index_t NRaw,
                             index_t KRaw,
@@ -659,13 +663,13 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
                             AElementwiseOperation a_element_op,
                             BElementwiseOperation b_element_op,
                             CElementwiseOperation c_element_op,
-                            D1ElementwiseOperation d1_element_op)
+                            DxsInElementwiseOperation dxs_in_element_op,
+                            DxsOutElementwiseOperation dxs_out_element_op)
    {
        return Argument{p_a,
                        p_b,
                        p_c,
-                       p_d0,
-                       p_d1,
+                       p_dxs,
                        MRaw,
                        NRaw,
                        KRaw,
@@ -675,7 +679,8 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
                        a_element_op,
                        b_element_op,
                        c_element_op,
-                       d1_element_op};
+                       dxs_in_element_op,
+                       dxs_out_element_op};
    }

    static auto MakeInvoker() { return Invoker{}; }
@@ -684,8 +689,7 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
    std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
                                                      const void* p_b,
                                                      void* p_c,
-                                                     void* p_d0,
-                                                     void* p_d1,
+                                                     DPtrsGlobal p_dxs,
                                                      index_t MRaw,
                                                      index_t NRaw,
                                                      index_t KRaw,
@@ -695,14 +699,14 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
                                                      AElementwiseOperation a_element_op,
                                                      BElementwiseOperation b_element_op,
                                                      CElementwiseOperation c_element_op,
-                                                     D1ElementwiseOperation d1_element_op,
+                                                     DxsInElementwiseOperation dxs_in_element_op,
+                                                     DxsOutElementwiseOperation dxs_out_element_op,
                                                      index_t /* KBatch */ = 1) override
    {
        return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
                                          static_cast<const BDataType*>(p_b),
                                          static_cast<CDataType*>(p_c),
-                                         static_cast<DDataType*>(p_d0),
-                                         static_cast<DDataType*>(p_d1),
+                                         p_dxs,
                                          MRaw,
                                          NRaw,
                                          KRaw,
@@ -712,7 +716,8 @@ struct DeviceGemmReduce_Xdl_CShuffle : public DeviceGemmReduce<AElementwiseOpera
                                          a_element_op,
                                          b_element_op,
                                          c_element_op,
-                                         d1_element_op);
+                                         dxs_in_element_op,
+                                         dxs_out_element_op);
    }

    // polymorphic
......
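The memory operation applied to the D outputs is now the DGlobalMemoryDataOperation template parameter rather than a hard-coded AtomicAdd, so each reduction picks the accumulation that matches its operator. Hedged sketches in the alias style of the examples (the two-output sum/square-sum case is an assumption about that example's layout):

// sum / square-sum reductions keep atomic add for the partial tiles
using DGlobalMemOpAdd =
    ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicAdd,
                                          ck::InMemoryDataOperationEnum::AtomicAdd>;

// a max reduction folds partial tiles in with atomic max instead
using DGlobalMemOpMax =
    ck::InMemoryDataOperationEnumSequence<ck::InMemoryDataOperationEnum::AtomicMax>;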
@@ -258,6 +258,14 @@ __device__ float llvm_amdgcn_raw_buffer_atomic_add_fp32(
    index_t soffset,
    index_t glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fadd.f32");

+// buffer atomic-max fp64
+__device__ double
+llvm_amdgcn_raw_buffer_atomic_max_fp64(double vdata,
+                                       int32x4_t rsrc, // dst_wave_buffer_resource
+                                       int voffset,    // dst_thread_addr_offset
+                                       int soffset,    // dst_wave_addr_offset
+                                       int glc_slc) __asm("llvm.amdgcn.raw.buffer.atomic.fmax.f64");
+
template <typename T, index_t N>
__device__ typename vector_type<T, N>::type amd_buffer_load_impl(int32x4_t src_wave_buffer_resource,
                                                                 index_t src_thread_addr_offset,
@@ -915,6 +923,71 @@ __device__ void amd_buffer_atomic_add_impl(const typename vector_type<T, N>::typ
    }
}
+template <typename T, index_t N>
+__device__ void amd_buffer_atomic_max_impl(const typename vector_type<T, N>::type src_thread_data,
+                                           int32x4_t dst_wave_buffer_resource,
+                                           index_t dst_thread_addr_offset,
+                                           index_t dst_wave_addr_offset)
+{
+    static_assert((is_same<T, double>::value && (N == 1 || N == 2 || N == 4)),
+                  "wrong! not implemented");
+
+    if constexpr(is_same<T, double>::value)
+    {
+        if constexpr(N == 1)
+        {
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(src_thread_data,
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset,
+                                                   0);
+        }
+        else if constexpr(N == 2)
+        {
+            vector_type<double, 2> tmp{src_thread_data};
+
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<0>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset,
+                                                   0);
+
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<1>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset + sizeof(double),
+                                                   0);
+        }
+        else if constexpr(N == 4)
+        {
+            vector_type<double, 4> tmp{src_thread_data};
+
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<0>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset,
+                                                   0);
+
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<1>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset + sizeof(double),
+                                                   0);
+
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<2>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset + 2 * sizeof(double),
+                                                   0);
+
+            llvm_amdgcn_raw_buffer_atomic_max_fp64(tmp.AsType<double>()[Number<3>{}],
+                                                   dst_wave_buffer_resource,
+                                                   dst_thread_addr_offset,
+                                                   dst_wave_addr_offset + 3 * sizeof(double),
+                                                   0);
+        }
+    }
+}
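The ISA exposes only a scalar f64 buffer atomic-fmax, so the vector cases above simply issue one intrinsic per lane at successive double-sized offsets. A hedged sketch of a direct call for N == 2, assuming buf_resource and base_offset come from the usual make_wave_buffer_resource path:

__device__ void atomic_max_two_doubles(int32x4_t buf_resource, index_t base_offset)
{
    vector_type<double, 2> v{0};
    v.template AsType<double>()(Number<0>{}) = 1.0;
    v.template AsType<double>()(Number<1>{}) = 2.0;

    // expands to two llvm_amdgcn_raw_buffer_atomic_max_fp64 calls, the second
    // shifted by sizeof(double) bytes
    amd_buffer_atomic_max_impl<double, 2>(
        v.template AsType<typename vector_type<double, 2>::type>()[Number<0>{}],
        buf_resource,
        base_offset,
        0);
}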
// buffer_load requires:
//   1) p_src_wave must point to global memory space
//   2) p_src_wave must be a wavewise pointer.
// It is user's responsibility to make sure that is true.
@@ -1046,4 +1119,39 @@ amd_buffer_atomic_add(const typename vector_type_maker<T, N>::type::type src_thr
#endif
}
+// buffer_atomic_max requires:
+//   1) p_dst_wave must point to global memory
+//   2) p_dst_wave must be a wavewise pointer.
+// It is user's responsibility to make sure that is true.
+template <typename T, index_t N>
+__device__ void
+amd_buffer_atomic_max(const typename vector_type_maker<T, N>::type::type src_thread_data,
+                      T* p_dst_wave,
+                      const index_t dst_thread_element_offset,
+                      const bool dst_thread_element_valid,
+                      const index_t dst_element_space_size)
+{
+    const int32x4_t dst_wave_buffer_resource =
+        make_wave_buffer_resource(p_dst_wave, dst_element_space_size);
+
+    index_t dst_thread_addr_offset = dst_thread_element_offset * sizeof(T);
+
+    using vector_t = typename vector_type_maker<T, N>::type::type;
+    using scalar_t = typename scalar_type<vector_t>::type;
+
+    constexpr index_t vector_size = scalar_type<vector_t>::vector_size;
+
+#if CK_EXPERIMENTAL_USE_BUFFER_ATOMIC_MAX_OOB_CHECK_OFFSET_TRICK
+    uint32_t dst_addr_shift = dst_thread_element_valid ? 0 : 0x7fffffff;
+
+    amd_buffer_atomic_max_impl<scalar_t, vector_size>(
+        src_thread_data, dst_wave_buffer_resource, dst_addr_shift + dst_thread_addr_offset, 0);
+#else
+    if(dst_thread_element_valid)
+    {
+        amd_buffer_atomic_max_impl<scalar_t, vector_size>(
+            src_thread_data, dst_wave_buffer_resource, dst_thread_addr_offset, 0);
+    }
+#endif
+}
} // namespace ck
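A hedged sketch of the wavewise entry point above: each thread folds its value into p_dst[i], and out-of-range lanes are masked (or offset-shifted under the experimental OOB trick) inside the wrapper.

__device__ void reduce_max_into(double* p_dst, index_t n, index_t i, double v)
{
    amd_buffer_atomic_max<double, 1>(v, p_dst, i, i < n, n);
}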
@@ -32,7 +32,7 @@
#include "debug.hpp"
#include "amd_buffer_addressing.hpp"
-#include "generic_memory_space_atomic_add.hpp"
+#include "generic_memory_space_atomic.hpp"
#include "get_id.hpp"
#include "synchronization.hpp"
#include "amd_address_space.hpp"
......
@@ -3,7 +3,7 @@
#include "enable_if.hpp"
#include "c_style_pointer_cast.hpp"
#include "amd_buffer_addressing.hpp"
-#include "generic_memory_space_atomic_add.hpp"
+#include "generic_memory_space_atomic.hpp"

namespace ck {
@@ -125,6 +125,10 @@ struct DynamicBuffer
        {
            this->template AtomicAdd<X>(i, is_valid_element, x);
        }
+       else if constexpr(Op == InMemoryDataOperationEnum::AtomicMax)
+       {
+           this->template AtomicMax<X>(i, is_valid_element, x);
+       }
        else if constexpr(Op == InMemoryDataOperationEnum::Add)
        {
            auto tmp = this->template Get<X>(i, is_valid_element);
@@ -326,6 +330,42 @@ struct DynamicBuffer
        }
    }
+    template <typename X,
+              typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
+                                         typename scalar_type<remove_cvref_t<T>>::type>::value,
+                                 bool>::type = false>
+    __host__ __device__ void AtomicMax(index_t i, bool is_valid_element, const X& x)
+    {
+        // X contains multiple T
+        constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
+        constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
+
+        static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
+                      "wrong! X should contain multiple T");
+
+        static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem");
+
+#if CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64
+        using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
+        bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, double>;
+#else
+        bool constexpr use_amd_buffer_addressing = false;
+#endif
+
+        if constexpr(use_amd_buffer_addressing)
+        {
+            constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
+
+            amd_buffer_atomic_max<remove_cvref_t<T>, t_per_x>(
+                x, p_data_, i, is_valid_element, element_space_size_);
+        }
+        else if(is_valid_element)
+        {
+            atomic_max<X>(c_style_pointer_cast<X*>(&p_data_[i]), x);
+        }
+    }
    __host__ __device__ static constexpr bool IsStaticBuffer() { return false; }

    __host__ __device__ static constexpr bool IsDynamicBuffer() { return true; }
......
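In kernel code the new branch is reached through the buffer's generic update entry (the if-constexpr chain shown above), so storing partial results differs from the atomic-add path only by the enum. A hedged sketch, assuming that chain lives in DynamicBuffer::Update and that dst_buf is a global-memory buffer of double:

// dispatches to the AtomicMax member above: f64 buffer addressing when
// CK_USE_AMD_BUFFER_ATOMIC_MAX_FLOAT64 is set, generic atomic_max otherwise
dst_buf.template Update<InMemoryDataOperationEnum::AtomicMax>(i, is_valid, x);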
@@ -3,6 +3,10 @@
namespace ck {

+// Caution: DO NOT REMOVE
+// This template intentionally has only a declaration and no definition, so instantiating it
+// fails at build time. The purpose is to make the implementation of atomic_add explicit for
+// each datatype.
template <typename X>
__device__ X atomic_add(X* p_dst, const X& x);
@@ -41,4 +45,53 @@ __device__ float2_t atomic_add<float2_t>(float2_t* p_dst, const float2_t& x)
    return vy.template AsType<float2_t>()[I0];
}
+// Caution: DO NOT REMOVE
+// This template intentionally has only a declaration and no definition, so instantiating it
+// fails at build time. The purpose is to make the implementation of atomic_max explicit for
+// each datatype.
+template <typename X>
+__device__ X atomic_max(X* p_dst, const X& x);
+
+template <>
+__device__ int32_t atomic_max<int32_t>(int32_t* p_dst, const int32_t& x)
+{
+    return atomicMax(p_dst, x);
+}
+
+template <>
+__device__ uint32_t atomic_max<uint32_t>(uint32_t* p_dst, const uint32_t& x)
+{
+    return atomicMax(p_dst, x);
+}
+
+template <>
+__device__ float atomic_max<float>(float* p_dst, const float& x)
+{
+    return atomicMax(p_dst, x);
+}
+
+template <>
+__device__ double atomic_max<double>(double* p_dst, const double& x)
+{
+    return atomicMax(p_dst, x);
+}
+
+template <>
+__device__ float2_t atomic_max<float2_t>(float2_t* p_dst, const float2_t& x)
+{
+    constexpr auto I0 = Number<0>{};
+    constexpr auto I1 = Number<1>{};
+
+    const vector_type<float, 2> vx{x};
+    vector_type<float, 2> vy{0};
+
+    vy.template AsType<float>()(I0) =
+        atomicMax(c_style_pointer_cast<float*>(p_dst), vx.template AsType<float>()[I0]);
+    vy.template AsType<float>()(I1) =
+        atomicMax(c_style_pointer_cast<float*>(p_dst) + 1, vx.template AsType<float>()[I1]);
+
+    return vy.template AsType<float2_t>()[I0];
+}
} // namespace ck
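The declaration-without-definition pattern turns an unsupported element type into a build-time error instead of silently falling back to something generic. A small sketch:

__device__ void atomic_max_usage(int32_t* p_i, double* p_d)
{
    atomic_max<int32_t>(p_i, 7);  // OK: explicit int32_t specialization above
    atomic_max<double>(p_d, 3.0); // OK: explicit double specialization above
    // atomic_max<half_t>(...) would fail to build: only the primary
    // declaration exists for unlisted types, which is the point
}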
@@ -29,6 +29,9 @@ using remove_cv_t = typename std::remove_cv<T>::type;
template <typename T>
using remove_cvref_t = remove_cv_t<std::remove_reference_t<T>>;

+template <typename T>
+using remove_pointer_t = typename std::remove_pointer<T>::type;
+
template <typename T>
inline constexpr bool is_pointer_v = std::is_pointer<T>::value;
......
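The alias mirrors std::remove_pointer_t; it lets the device op recover each D element type directly from the DPtrsGlobal pointer-tuple entries instead of threading a separate DDataType parameter through. A minimal sketch:

static_assert(is_same<remove_pointer_t<double*>, double>::value, "");
static_assert(is_same<remove_pointer_t<const float*>, const float>::value, "");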
@@ -10,6 +10,15 @@
#include "stream_config.hpp"
#include "ck/options.hpp"

+template <typename T>
+__global__ void set_buffer_value(T* p, T x, uint64_t buffer_element_size)
+{
+    for(uint64_t i = threadIdx.x; i < buffer_element_size; i += blockDim.x)
+    {
+        p[i] = x;
+    }
+}
+
inline void hip_check_error(hipError_t x)
{
    if(x != hipSuccess)
@@ -30,6 +39,16 @@ struct DeviceMem
    void ToDevice(const void* p);
    void FromDevice(void* p);
    void SetZero();
+
+    template <typename T>
+    void SetValue(T x)
+    {
+        if(mMemSize % sizeof(T) != 0)
+        {
+            throw std::runtime_error("wrong! not entire DeviceMem will be set");
+        }
+
+        set_buffer_value<T><<<1, 1024>>>(static_cast<T*>(mpDeviceBuf), x, mMemSize / sizeof(T));
+    }
    ~DeviceMem();

    void* mpDeviceBuf;
@@ -74,8 +93,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
        printf("Warm up 1 time\n");

        // warm up
-       hipLaunchKernelGGL(
-           kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
+       kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);

        printf("Start running %d times...\n", nrepeat);
@@ -84,8 +102,7 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
        for(int i = 0; i < nrepeat; ++i)
        {
-           hipLaunchKernelGGL(
-               kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
+           kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);
        }

        timer.End();
@@ -94,13 +111,12 @@ float launch_and_time_kernel(const StreamConfig& stream_config,
    }
    else
    {
-       hipLaunchKernelGGL(
-           kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
+       kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);

        return 0;
    }
#else
-   hipLaunchKernelGGL(kernel, grid_dim, block_dim, lds_byte, stream_config.stream_id_, args...);
+   kernel<<<grid_dim, block_dim, lds_byte, stream_config.stream_id_>>>(args...);

    return 0;
#endif
......
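SetValue exists so a max-reduction output can be seeded with the identity of max before the kernel folds partial tiles in with AtomicMax (SetZero would bias the result for negative data). A hedged host-side sketch using the F64 DDataType from the example:

#include <limits>

DeviceMem d_device_buf(sizeof(DDataType) * M);
// seed with the lowest representable value so the first AtomicMax always wins
d_device_buf.SetValue(std::numeric_limits<DDataType>::lowest());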
set(DEVICE_BATCHED_GEMM_REDUCE_INSTANCE_SOURCE
    device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gkn_gmn_instance.cpp
    device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gmk_gnk_gmn_instance.cpp
    device_batched_gemm_reduce_xdl_cshuffle_f16_f16_f16_f32_f32_gkm_gkn_gmn_instance.cpp
......