Commit 2b27d5fc authored by Chao Liu

Merge remote-tracking branch 'origin/develop' into rosenrodt/gemm-layernorm

parents f689a155 fa9a0a5c
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#include <iostream>
#include <numeric>
#include <initializer_list>
#include <cstdlib>
#include "ck/ck.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/device_gemm_bias_c_permute_xdl.hpp"
#include "ck/tensor_operation/gpu/element/binary_element_wise_operation.hpp"
#include "ck/library/host_tensor/device_memory.hpp"
#include "ck/library/host_tensor/host_tensor.hpp"
#include "ck/library/host_tensor/host_tensor_generator.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
#include "ck/library/utility/check_err.hpp"
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using Add = ck::tensor_operation::element_wise::Add;
using ADataType = F16;
using BDataType = F16;
using AccDataType = F32;
using CShuffleDataType = F32;
using DDataType = F16;
using EDataType = F16;
using ALayout = Row;
using BLayout = Col;
using DLayout = Row;
using ELayout = Row;
using AElementOp = PassThrough;
using BElementOp = PassThrough;
using CDEElementOp = Add;
static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
// clang-format off
using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmBiasCPermute_Xdl
//######| ALayout| BLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//######| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Spacialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
//######| | | | | | | | | | Operation| Operation| Operation| | Stage| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | PerShuffle| PerShuffle| _NBlock_NWaveNPerXdl| _NWaveNPerXdl|
//######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
< ALayout, BLayout, ELayout, ADataType, BDataType, AccDataType, CShuffleDataType, DDataType, EDataType, AElementOp, BElementOp, CDEElementOp, GemmDefault, 1, 256, 256, 128, 32, 8, 8, 32, 32, 4, 2, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, S<4, 64, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, 1, 1, 1, S<1, 32, 1, 8>, 1>;
// clang-format on
int main(int argc, char* argv[])
{
bool do_verification = true;
int init_method = 1;
bool time_kernel = false;
ck::index_t M0 = 4;
ck::index_t M1 = 32;
ck::index_t M2 = 128;
ck::index_t N0 = 16;
ck::index_t N1 = 256;
// GEMM shape
ck::index_t M = M0 * M1 * M2;
ck::index_t N = N0 * N1;
ck::index_t K = 128;
ck::index_t stride_A = K;
ck::index_t stride_B = K;
#if 1
// E = [M0, N0, M1, N1, M2]
ck::index_t stride_E_M0 = N0 * M1 * N1 * M2;
ck::index_t stride_E_M1 = N1 * M2;
ck::index_t stride_E_M2 = 1;
ck::index_t stride_E_N0 = M1 * N1 * M2;
ck::index_t stride_E_N1 = M2;
// D = [0, N0, 0, N1, 0]
ck::index_t stride_D_M0 = 0;
ck::index_t stride_D_M1 = 0;
ck::index_t stride_D_M2 = 0;
ck::index_t stride_D_N0 = N1;
ck::index_t stride_D_N1 = 1;
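// Note: the zero M0/M1/M2 strides above broadcast the bias tensor D along all M
// dimensions, so D effectively varies only along N0 and N1.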
#else
// D = [0, 0, 0, N0, N1]
ck::index_t stride_D_M0 = 0;
ck::index_t stride_D_M1 = 0;
ck::index_t stride_D_M2 = 0;
ck::index_t stride_D_N0 = N1;
ck::index_t stride_D_N1 = 1;
// E = [M0, M1, M2, N0, N1]
ck::index_t stride_E_M0 = M1 * M2 * N0 * N1;
ck::index_t stride_E_M1 = M2 * N0 * N1;
ck::index_t stride_E_M2 = N0 * N1;
ck::index_t stride_E_N0 = N1;
ck::index_t stride_E_N1 = 1;
#endif
const ck::tensor_operation::device::DEGridDesc_M0_M1_M2_N0_N1 d_grid_desc{
M0, M1, M2, N0, N1, stride_D_M0, stride_D_M1, stride_D_M2, stride_D_N0, stride_D_N1};
const ck::tensor_operation::device::DEGridDesc_M0_M1_M2_N0_N1 e_grid_desc{
M0, M1, M2, N0, N1, stride_E_M0, stride_E_M1, stride_E_M2, stride_E_N0, stride_E_N1};
if(argc == 1)
{
// use default case
}
else if(argc == 4)
{
do_verification = std::stoi(argv[1]);
init_method = std::stoi(argv[2]);
time_kernel = std::stoi(argv[3]);
}
else
{
printf("arg1: verification (0=no, 1=yes)\n");
printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
printf("arg3: time kernel (0=no, 1=yes)\n");
exit(0);
}
auto f_host_tensor_descriptor =
[](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({stride, 1}));
}
else
{
return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
std::vector<std::size_t>({1, stride}));
}
};
auto f_host_de_tensor_descriptor =
[](ck::tensor_operation::device::DEGridDesc_M0_M1_M2_N0_N1 de_grid_desc) {
std::size_t m0 = de_grid_desc.M0_;
std::size_t m1 = de_grid_desc.M1_;
std::size_t m2 = de_grid_desc.M2_;
std::size_t n0 = de_grid_desc.N0_;
std::size_t n1 = de_grid_desc.N1_;
std::size_t stride_m0 = de_grid_desc.stride_M0_;
std::size_t stride_m1 = de_grid_desc.stride_M1_;
std::size_t stride_m2 = de_grid_desc.stride_M2_;
std::size_t stride_n0 = de_grid_desc.stride_N0_;
std::size_t stride_n1 = de_grid_desc.stride_N1_;
return HostTensorDescriptor(
std::vector<std::size_t>({m0, m1, m2, n0, n1}),
std::vector<std::size_t>({stride_m0, stride_m1, stride_m2, stride_n0, stride_n1}));
};
Tensor<ADataType> a_m_k(f_host_tensor_descriptor(M, K, stride_A, ALayout{}));
Tensor<BDataType> b_k_n(f_host_tensor_descriptor(K, N, stride_B, BLayout{}));
Tensor<DDataType> d_m0_m1_m2_n0_n1(f_host_de_tensor_descriptor(d_grid_desc));
Tensor<EDataType> e_m0_m1_m2_n0_n1_host_result(f_host_de_tensor_descriptor(e_grid_desc));
Tensor<EDataType> e_m0_m1_m2_n0_n1_device_result(f_host_de_tensor_descriptor(e_grid_desc));
std::cout << "a_m_k: " << a_m_k.mDesc << std::endl;
std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
std::cout << "d_m0_m1_m2_n0_n1: " << d_m0_m1_m2_n0_n1.mDesc << std::endl;
std::cout << "e_m0_m1_m2_n0_n1: " << e_m0_m1_m2_n0_n1_host_result.mDesc << std::endl;
switch(init_method)
{
case 0: break;
case 1:
a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
d_m0_m1_m2_n0_n1.GenerateTensorValue(GeneratorTensor_2<DDataType>{-5, 5});
break;
default:
a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
d_m0_m1_m2_n0_n1.GenerateTensorValue(GeneratorTensor_3<DDataType>{0.0, 1.0});
}
DeviceMem a_m_k_device_buf(sizeof(ADataType) * a_m_k.mDesc.GetElementSpace());
DeviceMem b_k_n_device_buf(sizeof(BDataType) * b_k_n.mDesc.GetElementSpace());
DeviceMem d_m0_m1_m2_n0_n1_device_buf(sizeof(DDataType) *
d_m0_m1_m2_n0_n1.mDesc.GetElementSpace());
DeviceMem e_m0_m1_m2_n0_n1_device_buf(sizeof(EDataType) *
e_m0_m1_m2_n0_n1_device_result.mDesc.GetElementSpace());
a_m_k_device_buf.ToDevice(a_m_k.mData.data());
b_k_n_device_buf.ToDevice(b_k_n.mData.data());
d_m0_m1_m2_n0_n1_device_buf.ToDevice(d_m0_m1_m2_n0_n1.mData.data());
auto a_element_op = AElementOp{};
auto b_element_op = BElementOp{};
auto cde_element_op = CDEElementOp{};
// do GEMM
auto device_op = DeviceOpInstance{};
auto invoker = device_op.MakeInvoker();
auto argument = device_op.MakeArgument(a_m_k_device_buf.GetDeviceBuffer(),
b_k_n_device_buf.GetDeviceBuffer(),
d_m0_m1_m2_n0_n1_device_buf.GetDeviceBuffer(),
e_m0_m1_m2_n0_n1_device_buf.GetDeviceBuffer(),
M,
N,
K,
stride_A,
stride_B,
d_grid_desc,
e_grid_desc,
a_element_op,
b_element_op,
cde_element_op);
if(!device_op.IsSupportedArgument(argument))
{
throw std::runtime_error("wrong! this device_op instance does not support this problem");
}
float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
std::size_t flop = std::size_t(2) * M * N * K;
std::size_t num_btype = sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
sizeof(DDataType) * N + sizeof(EDataType) * M * N;
float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
float gb_per_sec = num_btype / 1.E6 / ave_time;
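// ave_time is reported in milliseconds, so flop / 1e9 / ms yields TFLOPS and
// num_btype / 1e6 / ms yields GB/s.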
std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
<< device_op.GetTypeString() << std::endl;
if(do_verification)
{
Tensor<AccDataType> c_m_n(HostTensorDescriptor(
std::vector<std::size_t>{static_cast<std::size_t>(M), static_cast<std::size_t>(N)}));
using ReferenceGemmInstance = ck::tensor_operation::host::ReferenceGemm<ADataType,
BDataType,
AccDataType,
AccDataType,
AElementOp,
BElementOp,
PassThrough>;
auto ref_gemm = ReferenceGemmInstance{};
auto ref_invoker = ref_gemm.MakeInvoker();
auto ref_argument =
ref_gemm.MakeArgument(a_m_k, b_k_n, c_m_n, a_element_op, b_element_op, PassThrough{});
ref_invoker.Run(ref_argument);
for(int m0 = 0; m0 < M0; ++m0)
for(int m1 = 0; m1 < M1; ++m1)
for(int m2 = 0; m2 < M2; ++m2)
for(int n0 = 0; n0 < N0; ++n0)
for(int n1 = 0; n1 < N1; ++n1)
{
int m = m0 * M1 * M2 + m1 * M2 + m2;
int n = n0 * N1 + n1;
cde_element_op(e_m0_m1_m2_n0_n1_host_result(m0, m1, m2, n0, n1),
ck::type_convert<EDataType>(c_m_n(m, n)),
d_m0_m1_m2_n0_n1(m0, m1, m2, n0, n1));
}
e_m0_m1_m2_n0_n1_device_buf.FromDevice(e_m0_m1_m2_n0_n1_device_result.mData.data());
return ck::utils::check_err(e_m0_m1_m2_n0_n1_device_result.mData,
e_m0_m1_m2_n0_n1_host_result.mData)
? 0
: 1;
}
return 0;
}
@@ -42,3 +42,4 @@ add_subdirectory(20_convnd_bwd_weight_xdl)
 add_subdirectory(21_gemm_layernorm)
 add_subdirectory(22_cgemm)
 add_subdirectory(23_softmax)
+add_subdirectory(25_gemm_bias_c_permute)
@@ -10,7 +10,7 @@
 #include "ck/utility/common_header.hpp"
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
-#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_5ary_Elementwise_1d.hpp"
 #include "ck/device_utility/device_prop.hpp"
 #include "ck/device_utility/kernel_launch.hpp"
@@ -35,7 +35,7 @@ template <typename ADataType,
 index_t DScalarPerVector,
 index_t EScalarPerVector,
 index_t FScalarPerVector>
-struct Device5AryElementwise : public BaseOperator
+struct Device5AryElementwise : public DeviceElementwise<5, 1, NDim, ElementwiseFunctor>
 {
 static constexpr auto I0 = Number<0>{};
@@ -268,12 +268,8 @@ struct Device5AryElementwise : public BaseOperator
 return true;
 };
-static auto MakeArgument(const ADataType* p_a,
-const BDataType* p_b,
-const CDataType* p_c,
-const DDataType* p_d,
-const EDataType* p_e,
-FDataType* p_f,
+static auto MakeArgument(std::array<const void*, 5> p_inputs,
+std::array<void*, 1> p_outputs,
 std::vector<index_t> lengths,
 std::vector<index_t> a_strides,
 std::vector<index_t> b_strides,
@@ -283,12 +279,12 @@ struct Device5AryElementwise : public BaseOperator
 std::vector<index_t> f_strides,
 ElementwiseFunctor functor)
 {
-return Argument{p_a,
-p_b,
-p_c,
-p_d,
-p_e,
-p_f,
+return Argument{static_cast<const ADataType*>(p_inputs[0]),
+static_cast<const BDataType*>(p_inputs[1]),
+static_cast<const CDataType*>(p_inputs[2]),
+static_cast<const DDataType*>(p_inputs[3]),
+static_cast<const EDataType*>(p_inputs[4]),
+static_cast<FDataType*>(p_outputs[0]),
 lengths,
 a_strides,
 b_strides,
@@ -299,40 +295,58 @@ struct Device5AryElementwise : public BaseOperator
 functor};
 }
-std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
-const void* p_b,
-const void* p_c,
-const void* p_d,
-const void* p_e,
-void* p_f,
+std::unique_ptr<BaseArgument>
+MakeArgumentPointer(std::array<const void*, 5> p_inputs,
+std::array<void*, 1> p_outputs,
 std::vector<index_t> lengths,
-std::vector<index_t> a_strides,
-std::vector<index_t> b_strides,
-std::vector<index_t> c_strides,
-std::vector<index_t> d_strides,
-std::vector<index_t> e_strides,
-std::vector<index_t> f_strides,
-ElementwiseFunctor functor)
+std::vector<std::vector<index_t>> input_strides,
+std::vector<std::vector<index_t>> output_strides,
+ElementwiseFunctor functor) override
 {
-return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
-static_cast<const BDataType*>(p_b),
-static_cast<const CDataType*>(p_c),
-static_cast<const DDataType*>(p_d),
-static_cast<const EDataType*>(p_e),
-static_cast<FDataType*>(p_f),
+return std::make_unique<Argument>(static_cast<const ADataType*>(p_inputs[0]),
+static_cast<const BDataType*>(p_inputs[1]),
+static_cast<const CDataType*>(p_inputs[2]),
+static_cast<const DDataType*>(p_inputs[3]),
+static_cast<const EDataType*>(p_inputs[4]),
+static_cast<FDataType*>(p_outputs[0]),
 lengths,
-a_strides,
-b_strides,
-c_strides,
-d_strides,
-e_strides,
-f_strides,
+input_strides[0],
+input_strides[1],
+input_strides[2],
+input_strides[3],
+input_strides[4],
+output_strides[0],
 functor);
 }
 static auto MakeInvoker() { return Invoker{}; }
-std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); }
-};
+std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+{
+return std::make_unique<Invoker>();
+}
+// polymorphic
+std::string GetTypeString() const override
+{
+auto str = std::stringstream();
+// clang-format off
+str << "Device5aryElementwise"
+<< "<"
+<< "NDim = " << NDim
+<< "MPerThread = " << MPerThread
+<< "AScalarPerVector = " << AScalarPerVector
+<< "BScalarPerVector = " << BScalarPerVector
+<< "CScalarPerVector = " << CScalarPerVector
+<< "DScalarPerVector = " << DScalarPerVector
+<< "EScalarPerVector = " << EScalarPerVector
+<< "FScalarPerVector = " << FScalarPerVector
+<< ">";
+// clang-format on
+return str.str();
+}
+};
 } // namespace device
 } // namespace tensor_operation
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include "device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
struct DeviceBatchedGemm : public BaseOperator
{
virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_c,
ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t StrideA,
ck::index_t StrideB,
ck::index_t StrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
ck::index_t Batch) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
using DeviceBatchedGemmPtr = std::unique_ptr<
DeviceBatchedGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>>;
} // namespace device
} // namespace tensor_operation
} // namespace ck
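A minimal usage sketch of this new batched-GEMM interface follows. It is illustrative only: the helper name, pointers, sizes, strides, and batch value are placeholders, and PassThrough element-wise operations are assumed for A, B, and C.

// Hypothetical helper (not part of this commit): launch a batched GEMM through the
// polymorphic DeviceBatchedGemmPtr interface; p_a/p_b/p_c are device buffers.
using PassThrough = ck::tensor_operation::element_wise::PassThrough;
using BatchedGemmPtr =
    ck::tensor_operation::device::DeviceBatchedGemmPtr<PassThrough, PassThrough, PassThrough>;
float run_batched_gemm(BatchedGemmPtr& op,
                       const void* p_a, const void* p_b, void* p_c,
                       ck::index_t M, ck::index_t N, ck::index_t K,
                       ck::index_t StrideA, ck::index_t StrideB, ck::index_t StrideC,
                       ck::index_t Batch)
{
    auto argument = op->MakeArgumentPointer(p_a, p_b, p_c, M, N, K,
                                            StrideA, StrideB, StrideC,
                                            PassThrough{}, PassThrough{}, PassThrough{},
                                            Batch);
    // Run returns the averaged kernel time in milliseconds when timing is enabled.
    return op->MakeInvokerPointer()->Run(argument.get(), StreamConfig{nullptr, true});
}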
@@ -10,7 +10,7 @@
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp"
 #include "ck/device_utility/device_prop.hpp"
@@ -152,7 +152,7 @@ template <typename ADataType,
 ck::index_t CThreadTransferSrcDstVectorDim,
 ck::index_t CThreadTransferDstScalarPerVector>
 struct DeviceBatchedGemmXdl
-: public DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
+: public DeviceBatchedGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
 {
 static constexpr auto I0 = Number<0>{};
 static constexpr auto I1 = Number<1>{};
@@ -339,11 +339,11 @@ struct DeviceBatchedGemmXdl
 AElementwiseOperation a_element_op,
 BElementwiseOperation b_element_op,
 CElementwiseOperation c_element_op,
-index_t BatchCount)
+index_t Batch)
 : p_a_grid_{p_a_grid},
 p_b_grid_{p_b_grid},
 p_c_grid_{p_c_grid},
-BatchCount_(BatchCount),
+Batch_(Batch),
 a_grid_desc_k0_m_k1_{
 DeviceBatchedGemmXdl::MakeAGridDescriptor_K0_M_K1(M, K, StrideA)},
 b_grid_desc_k0_n_k1_{
@@ -376,7 +376,7 @@ struct DeviceBatchedGemmXdl
 const ADataType* p_a_grid_;
 const BDataType* p_b_grid_;
 CDataType* p_c_grid_;
-index_t BatchCount_;
+index_t Batch_;
 AGridDesc_K0_M_K1 a_grid_desc_k0_m_k1_;
 BGridDesc_K0_N_K1 b_grid_desc_k0_n_k1_;
 CGridDesc_M_N c_grid_desc_m_n_;
@@ -420,7 +420,7 @@ struct DeviceBatchedGemmXdl
 }
 const index_t grid_size =
-arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.BatchCount_;
+arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.Batch_;
 const auto K =
 arg.a_grid_desc_k0_m_k1_.GetLength(I0) * arg.a_grid_desc_k0_m_k1_.GetLength(I2);
@@ -451,7 +451,7 @@ struct DeviceBatchedGemmXdl
 arg.p_a_grid_,
 arg.p_b_grid_,
 arg.p_c_grid_,
-arg.BatchCount_,
+arg.Batch_,
 arg.a_grid_desc_k0_m_k1_,
 arg.b_grid_desc_k0_n_k1_,
 arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
@@ -485,7 +485,7 @@ struct DeviceBatchedGemmXdl
 arg.p_a_grid_,
 arg.p_b_grid_,
 arg.p_c_grid_,
-arg.BatchCount_,
+arg.Batch_,
 arg.a_grid_desc_k0_m_k1_,
 arg.b_grid_desc_k0_n_k1_,
 arg.c_grid_desc_m0_n0_m1_n1_m2_m3_m4_n2_,
@@ -539,7 +539,7 @@ struct DeviceBatchedGemmXdl
 AElementwiseOperation a_element_op,
 BElementwiseOperation b_element_op,
 CElementwiseOperation c_element_op,
-index_t BatchCount)
+index_t Batch)
 {
 return Argument{p_a,
 p_b,
@@ -555,7 +555,7 @@ struct DeviceBatchedGemmXdl
 a_element_op,
 b_element_op,
 c_element_op,
-BatchCount};
+Batch};
 }
 static auto MakeInvoker() { return Invoker{}; }
@@ -573,7 +573,7 @@ struct DeviceBatchedGemmXdl
 AElementwiseOperation a_element_op,
 BElementwiseOperation b_element_op,
 CElementwiseOperation c_element_op,
-index_t BatchCount) override
+index_t Batch) override
 {
 return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
 static_cast<const BDataType*>(p_b),
@@ -589,7 +589,7 @@ struct DeviceBatchedGemmXdl
 a_element_op,
 b_element_op,
 c_element_op,
-BatchCount);
+Batch);
 }
 // polymorphic
@@ -9,6 +9,7 @@
 #include "ck/device_utility/device_prop.hpp"
 #include "ck/device_utility/kernel_launch.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/device/device_elementwise.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_binary_elementwise_1d.hpp"
 namespace ck {
@@ -25,7 +26,7 @@ template <typename ADataType,
 index_t AScalarPerVector,
 index_t BScalarPerVector,
 index_t CScalarPerVector>
-struct DeviceBinaryElementwise : public BaseOperator
+struct DeviceBinaryElementwise : public DeviceElementwise<2, 1, NDim, ElementwiseFunctor>
 {
 static constexpr auto I0 = Number<0>{};
@@ -198,27 +199,30 @@ struct DeviceBinaryElementwise : public BaseOperator
 return true;
 };
-std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
-const void* p_b,
-void* p_c,
+virtual std::unique_ptr<BaseArgument>
+MakeArgumentPointer(std::array<const void*, 2> p_inputs,
+std::array<void*, 1> p_outputs,
 std::vector<index_t> lengths,
-std::vector<index_t> a_strides,
-std::vector<index_t> b_strides,
-std::vector<index_t> c_strides,
-ElementwiseFunctor functor)
+std::vector<std::vector<index_t>> input_strides,
+std::vector<std::vector<index_t>> output_strides,
+ElementwiseFunctor functor) override
 {
-return std::make_unique<Argument>(static_cast<const ADataType*>(p_a),
-static_cast<const BDataType*>(p_b),
-static_cast<CDataType*>(p_c),
+return std::make_unique<Argument>(static_cast<const ADataType*>(p_inputs[0]),
+static_cast<const BDataType*>(p_inputs[1]),
+static_cast<CDataType*>(p_outputs[0]),
 lengths,
-a_strides,
-b_strides,
-c_strides,
+input_strides[0],
+input_strides[1],
+output_strides[0],
 functor);
 }
-std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); }
+std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+{
+return std::make_unique<Invoker>();
+}
+// polymorphic
 std::string GetTypeString() const override
 {
 auto str = std::stringstream();
@@ -226,7 +230,11 @@ struct DeviceBinaryElementwise : public BaseOperator
 // clang-format off
 str << "DeviceBinaryElementwise"
 << "<"
+<< "NDim = " << NDim
 << "MPerThread = " << MPerThread
+<< "AScalarPerVector = " << AScalarPerVector
+<< "BScalarPerVector = " << BScalarPerVector
+<< "CScalarPerVector = " << CScalarPerVector
 << ">";
 // clang-format on
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include "device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <ck::index_t NumInputTensor,
ck::index_t NumOutputTensor,
index_t NDim,
typename ElementwiseFunctor>
struct DeviceElementwise : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(std::array<const void*, NumInputTensor> p_inputs,
std::array<void*, NumOutputTensor> p_outputs,
std::vector<index_t> lengths,
std::vector<std::vector<index_t>> input_strides,
std::vector<std::vector<index_t>> output_strides,
ElementwiseFunctor functor) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <ck::index_t NumInputTensor,
ck::index_t NumOutputTensor,
index_t NDim,
typename ElementwiseFunctor>
using DeviceElementwisePtr =
std::unique_ptr<DeviceElementwise<NumInputTensor, NumOutputTensor, NDim, ElementwiseFunctor>>;
} // namespace device
} // namespace tensor_operation
} // namespace ck
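A minimal usage sketch of the new unified elementwise interface follows; it is illustrative only. The helper name and pointers are placeholders, a 1-D problem is assumed, and the concrete operation (e.g. a DeviceBinaryElementwise instance with the Add functor) is created elsewhere and held behind the base pointer.

// Hypothetical helper (not part of this commit): launch a 1-D binary add through the
// polymorphic DeviceElementwise interface; p_a/p_b/p_c are device buffers.
using Add = ck::tensor_operation::element_wise::Add;
using BinaryElementwisePtr = ck::tensor_operation::device::DeviceElementwisePtr<2, 1, 1, Add>;
void run_binary_add(BinaryElementwisePtr& op,
                    const void* p_a, const void* p_b, void* p_c,
                    ck::index_t length)
{
    std::array<const void*, 2> inputs  = {p_a, p_b};
    std::array<void*, 1>       outputs = {p_c};
    std::vector<ck::index_t>   lengths = {length};
    std::vector<std::vector<ck::index_t>> input_strides  = {{1}, {1}}; // one stride vector per input
    std::vector<std::vector<ck::index_t>> output_strides = {{1}};      // one stride vector per output
    auto argument = op->MakeArgumentPointer(inputs, outputs, lengths,
                                            input_strides, output_strides, Add{});
    op->MakeInvokerPointer()->Run(argument.get(), StreamConfig{nullptr, false});
}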
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <array>
#include "device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
struct DEGridDesc_M0_M1_M2_N0_N1
{
ck::index_t M0_, M1_, M2_, N0_, N1_;
ck::index_t stride_M0_, stride_M1_, stride_M2_, stride_N0_, stride_N1_;
};
// input : A[M, K], B[K, N],
// input : D[M, N], ...
// output : E[M, N]
// C = a_op(A) * b_op(B)
// E = cde_op(C, D)
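// For example, with CDEElementwiseOperation = Add, E = (a_op(A) * b_op(B)) + D, where D and E
// are addressed through the 5-D (M0, M1, M2, N0, N1) descriptors below.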
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CDEElementwiseOperation>
struct DeviceGemmBiasCPermute : public BaseOperator
{
virtual std::unique_ptr<BaseArgument>
MakeArgumentPointer(const void* p_a,
const void* p_b,
const void* p_d,
void* p_e,
ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t StrideA,
ck::index_t StrideB,
DEGridDesc_M0_M1_M2_N0_N1 d_grid_desc,
DEGridDesc_M0_M1_M2_N0_N1 e_grid_desc,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
using DeviceGemmBiasCPermutePtr = std::unique_ptr<
DeviceGemmBiasCPermute<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>>;
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -9,91 +9,34 @@ namespace ck {
 namespace tensor_operation {
 namespace device {
-template <typename AElementwiseOperation,
-typename BElementwiseOperation,
-typename CElementwiseOperation,
-typename DxsInElementwiseOperation,
-typename DxsReduceAccElementwiseOperation>
+template <ck::index_t NumDTensor, ck::index_t NumReduce>
 struct DeviceGemmReduce : public BaseOperator
 {
 virtual std::unique_ptr<BaseArgument>
 MakeArgumentPointer(const void* p_a,
 const void* p_b,
-const void* p_bias,
+std::array<const void*, NumDTensor> p_ds,
 void* p_c,
-void* p_dxs,
+std::array<void*, NumReduce> p_reduces,
 ck::index_t M,
 ck::index_t N,
 ck::index_t K,
 ck::index_t StrideA,
 ck::index_t StrideB,
 ck::index_t StrideC,
-AElementwiseOperation a_element_op,
-BElementwiseOperation b_element_op,
-CElementwiseOperation c_element_op,
-DxsInElementwiseOperation dxs_in_element_op,
-DxsReduceAccElementwiseOperation dxs_out_element_op,
+std::array<ck::index_t, NumDTensor> StrideDs,
+std::array<void*, 3> gemm_element_ops,
+std::array<void*, NumDTensor> d_element_ops,
+std::array<void*, NumReduce> reduce_in_element_ops,
+std::array<void*, NumReduce> reduce_out_element_ops,
 ck::index_t BatchCount = 1) = 0;
 virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };
-template <typename AElementwiseOperation,
-typename BElementwiseOperation,
-typename CElementwiseOperation,
-typename DxsInElementwiseOperation,
-typename DxsReduceAccElementwiseOperation>
-using DeviceGemmReducePtr = std::unique_ptr<DeviceGemmReduce<AElementwiseOperation,
-BElementwiseOperation,
-CElementwiseOperation,
-DxsInElementwiseOperation,
-DxsReduceAccElementwiseOperation>>;
-template <typename AElementwiseOperation,
-typename BElementwiseOperation,
-typename CElementwiseOperation,
-typename C1ElementwiseOperation,
-typename DxsInElementwiseOperation,
-typename DxsReduceAccElementwiseOperation>
-struct DeviceGemmBiasAddReduce : public BaseOperator
-{
-virtual std::unique_ptr<BaseArgument>
-MakeArgumentPointer(const void* p_a,
-const void* p_b,
-void* p_c,
-const void* p_c0,
-const void* p_c1,
-void* p_dxs,
-ck::index_t M,
-ck::index_t N,
-ck::index_t K,
-ck::index_t StrideA,
-ck::index_t StrideB,
-ck::index_t StrideC,
-ck::index_t StrideC1,
-AElementwiseOperation a_element_op,
-BElementwiseOperation b_element_op,
-CElementwiseOperation c_element_op,
-C1ElementwiseOperation c1_element_op,
-DxsInElementwiseOperation dxs_in_element_op,
-DxsReduceAccElementwiseOperation dxs_out_element_op,
-ck::index_t BatchCount = 1) = 0;
-virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
-};
-template <typename AElementwiseOperation,
-typename BElementwiseOperation,
-typename CElementwiseOperation,
-typename C1ElementwiseOperation,
-typename DxsInElementwiseOperation,
-typename DxsReduceAccElementwiseOperation>
-using DeviceGemmBiasAddReducePtr =
-std::unique_ptr<DeviceGemmBiasAddReduce<AElementwiseOperation,
-BElementwiseOperation,
-CElementwiseOperation,
-C1ElementwiseOperation,
-DxsInElementwiseOperation,
-DxsReduceAccElementwiseOperation>>;
+template <ck::index_t NumDTensor, ck::index_t NumReduce>
+using DeviceGemmReducePtr = std::unique_ptr<DeviceGemmReduce<NumDTensor, NumReduce>>;
 } // namespace device
 } // namespace tensor_operation
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include "device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
struct DeviceGemmSplitK : public BaseOperator
{
virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const void* p_a,
const void* p_b,
void* p_c,
ck::index_t M,
ck::index_t N,
ck::index_t K,
ck::index_t StrideA,
ck::index_t StrideB,
ck::index_t StrideC,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CElementwiseOperation c_element_op,
ck::index_t KBatch) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
};
template <typename AElementwiseOperation,
typename BElementwiseOperation,
typename CElementwiseOperation>
using DeviceGemmSplitKPtr = std::unique_ptr<
DeviceGemmSplitK<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>>;
} // namespace device
} // namespace tensor_operation
} // namespace ck
@@ -10,7 +10,7 @@
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4.hpp"
 #include "ck/device_utility/device_prop.hpp"
@@ -57,7 +57,7 @@ template <typename ADataType,
 ck::index_t CThreadTransferSrcDstVectorDim,
 ck::index_t CThreadTransferDstScalarPerVector>
 struct DeviceGemmXdlSplitK
-: public DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
+: public DeviceGemmSplitK<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
 {
 static constexpr auto I0 = Number<0>{};
 static constexpr auto I1 = Number<1>{};
@@ -10,7 +10,7 @@
 #include "ck/tensor_description/tensor_descriptor.hpp"
 #include "ck/tensor_description/tensor_descriptor_helper.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
-#include "ck/tensor_operation/gpu/device/device_gemm.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_splitk.hpp"
 #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r4r2.hpp"
 #include "ck/device_utility/device_prop.hpp"
@@ -59,7 +59,7 @@ template <typename ADataType,
 typename CBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
 index_t CBlockTransferScalarPerVector_NWaveNPerXDL>
 struct DeviceGemmXdlSplitKCShuffle
-: public DeviceGemm<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
+: public DeviceGemmSplitK<AElementwiseOperation, BElementwiseOperation, CElementwiseOperation>
 {
 static constexpr auto I0 = Number<0>{};
 static constexpr auto I1 = Number<1>{};
@@ -420,6 +420,7 @@ struct DeviceGemmXdlSplitKCShuffle
 arg.c_grid_desc_mblock_mperblock_nblock_nperblock_.GetElementSpaceSize() *
 sizeof(CDataType)));
+ave_time =
 launch_and_time_kernel(stream_config,
 kernel,
 dim3(grid_size),
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <iostream>
#include <vector>
#include "ck/tensor_operation/gpu/device/device_base.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
struct DeviceNormalization : public BaseOperator
{
// inLengths: input tensor extent(s) from high to low dimension
// inStrides: input tensor stride(s) from high to low dimension
// reduceDims: the dimension(s) the normalization operation is applied
// alpha: typeless pointer in host memory storing the alpha scaling value of type AccDataType
// beta: typeless pointer in host memory storing the beta scaling value of type AccDataType
// in_dev: typeless const pointer in device memory storing the input tensor
// out_dev: typeless pointer in device memory storing the output tensor
virtual std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths,
const std::vector<index_t> inStrides,
const std::vector<int> reduceDims,
const void* alpha,
const void* beta,
const void* in_dev,
void* out_dev) = 0;
virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
virtual index_t GetRank() const = 0;
virtual index_t GetNumReduceDim() const = 0;
};
using DeviceNormalizationPtr = std::unique_ptr<DeviceNormalization>;
} // namespace device
} // namespace tensor_operation
} // namespace ck
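A minimal usage sketch of this interface follows, illustrative only: the helper name is a placeholder, alpha/beta are assumed to hold float values (the AccDataType of the concrete instance), and the DeviceNormalizationPtr would come from a concrete op such as the DeviceSoftmax updated below.

// Hypothetical helper (not part of this commit): run a normalization through the
// polymorphic interface; alpha/beta are host values passed as typeless pointers.
float run_normalization(ck::tensor_operation::device::DeviceNormalizationPtr& op,
                        const std::vector<ck::index_t>& inLengths,
                        const std::vector<ck::index_t>& inStrides,
                        const std::vector<int>& reduceDims,
                        const void* in_dev,
                        void* out_dev)
{
    float alpha = 1.0f; // assumed AccDataType = float
    float beta  = 0.0f;
    auto argument = op->MakeArgumentPointer(inLengths, inStrides, reduceDims,
                                            &alpha, &beta, in_dev, out_dev);
    return op->MakeInvokerPointer()->Run(argument.get(), StreamConfig{nullptr, true});
}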
@@ -9,6 +9,7 @@
 #include "ck/utility/reduction_operator.hpp"
 #include "ck/tensor_operation/gpu/device/device_base.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
 #include "ck/tensor_operation/gpu/grid/gridwise_softmax.hpp"
@@ -33,8 +34,15 @@ template <typename InDataType,
 index_t InSrcVectorDim,
 index_t InSrcVectorSize,
 index_t OutDstVectorSize>
-struct DeviceSoftmax : public BaseOperator
+struct DeviceSoftmax : public DeviceNormalization
 {
+static constexpr index_t kRank = Rank;
+static constexpr index_t kNumReduceDim = NumReduceDim;
+virtual index_t GetRank() const override { return kRank; }
+virtual index_t GetNumReduceDim() const override { return kNumReduceDim; }
 using PassThrough = tensor_operation::element_wise::PassThrough;
 // Used for freeloading of some handy functions from DeviceReduceMultiBlock
@@ -61,7 +69,7 @@ struct DeviceSoftmax : public BaseOperator
 using GridDesc_M_K = decltype(Reduction::MakeSrc2dDescriptor({1}, {1}, 1, 1));
-using GridwiseReduce = GridwiseSoftmax_mk_to_mk<InDataType,
+using GridwiseSoftmaxGeneric = GridwiseSoftmax_mk_to_mk<InDataType,
 OutDataType,
 AccDataType,
 GridDesc_M_K,
@@ -72,7 +80,22 @@ struct DeviceSoftmax : public BaseOperator
 KThreadSliceSize,
 InSrcVectorDim,
 InSrcVectorSize,
-OutDstVectorSize>;
+OutDstVectorSize,
+false>;
+using GridwiseSoftmaxSweepOnce = GridwiseSoftmax_mk_to_mk<InDataType,
+OutDataType,
+AccDataType,
+GridDesc_M_K,
+BlockSize,
+MThreadClusterSize,
+KThreadClusterSize,
+MThreadSliceSize,
+KThreadSliceSize,
+InSrcVectorDim,
+InSrcVectorSize,
+OutDstVectorSize,
+true>;
 struct Argument : public Reduction::Argument
 {
@@ -121,8 +144,19 @@ struct DeviceSoftmax : public BaseOperator
 const auto out_grid_desc_m_k = Reduction::MakeSrc2dDescriptor(
 arg.inLengths_, arg.inStrides_, arg.blkGroupSize, arg.numBlockTileIteration);
-const auto kernel_main =
-kernel_softmax<GridwiseReduce, InDataType, OutDataType, AccDataType, GridDesc_M_K>;
+bool sweep_once =
+in_grid_desc_m_k.GetLength(Number<1>{}) <= KThreadClusterSize * KThreadSliceSize;
+const auto kernel_main = sweep_once ? kernel_softmax<GridwiseSoftmaxSweepOnce,
+InDataType,
+OutDataType,
+AccDataType,
+GridDesc_M_K>
+: kernel_softmax<GridwiseSoftmaxGeneric,
+InDataType,
+OutDataType,
+AccDataType,
+GridDesc_M_K>;
 float avg_time = 0;
@@ -167,24 +201,34 @@ struct DeviceSoftmax : public BaseOperator
 return true;
 };
+// inLengths: input tensor extent(s) from high to low dimension
+// inStrides: input tensor stride(s) from high to low dimension
+// reduceDims: the dimension(s) the softmax normalization operate on
+// alpha: typeless pointer in host memory storing the alpha scaling value as type AccDataType
+// beta: typeless pointer in host memory storing the beta scaling value as type AccDataType
+// in_dev: typeless const pointer in device memory storing the input tensor
+// out_dev: typeless pointer in device memory storing the output tensor
 std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> inLengths,
 const std::vector<index_t> inStrides,
 const std::vector<int> reduceDims,
-AccDataType alpha,
-AccDataType beta,
+const void* alpha,
+const void* beta,
 const void* in_dev,
-void* out_dev)
+void* out_dev) override
 {
 return std::make_unique<Argument>(inLengths,
 inStrides,
 reduceDims,
-alpha,
-beta,
+*static_cast<const AccDataType*>(alpha),
+*static_cast<const AccDataType*>(beta),
 static_cast<const InDataType*>(in_dev),
 static_cast<OutDataType*>(out_dev));
 };
-std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); };
+std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+{
+return std::make_unique<Invoker>();
+};
 std::string GetTypeString() const override
 {
@@ -11,8 +11,8 @@ namespace element_wise {
 struct Add
 {
-template <typename T>
-__host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
+template <typename Y, typename X0, typename X1>
+__host__ __device__ constexpr void operator()(Y& y, const X0& x0, const X1& x1) const;
 template <>
 __host__ __device__ constexpr void
@@ -28,6 +28,13 @@ struct Add
 y = x0 + x1;
 };
+template <>
+__host__ __device__ constexpr void
+operator()<half_t>(half_t& y, const float& x0, const half_t& x1) const
+{
+y = type_convert<half_t>(x0) + x1;
+};
 // Question: should half_t be supported ?
 template <>
 __host__ __device__ constexpr void
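A small illustrative sketch of the new mixed-precision overload added above (the values below are made up): the FP32 operand is converted to half before the add, which matches combining an FP32 GEMM accumulator with an FP16 bias.

ck::half_t y;
float x0 = 1.25f;                                     // e.g. an FP32 partial result
ck::half_t x1 = ck::type_convert<ck::half_t>(0.5f);   // e.g. an FP16 bias element
ck::tensor_operation::element_wise::Add{}(y, x0, x1); // y = type_convert<half_t>(x0) + x1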