Merge remote-tracking branch 'origin/develop' into fused-gemm

000eefbf · Chao Liu · b64a2860 · cac014f1 · 000eefbf · 000eefbf
Commit 000eefbf authored Aug 13, 2022 by Chao Liu
20 changed files
--- a/example/32_batched_gemm_softmax_gemm/batched_gemm_softmax_gemm_xdl_fp16.cpp
+++ b/example/32_batched_gemm_softmax_gemm/batched_gemm_softmax_gemm_xdl_fp16.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+/*
+Gemm + Gemm fused operation. Computes C_m_o = A_m_k * B0_k_n * B1_n_o
+                                              |------------|
+                                                   Gemm0
+                                              |---------------------|
+                                                       Gemm1
+*/
+
+#include <iostream>
+#include <numeric>
+#include <initializer_list>
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+
+using F16 = ck::half_t;
+using F32 = float;
+
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+
+using ADataType        = F16;
+using B0DataType       = F16;
+using B1DataType       = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using CDataType        = F16;
+
+using ALayout  = Row;
+using B0Layout = Col;
+using B1Layout = Row;
+using CLayout  = Row;
+
+using AElementOp    = PassThrough;
+using B0ElementOp   = PassThrough;
+using Acc0ElementOp = PassThrough;
+using B1ElementOp   = PassThrough;
+using CElementOp    = PassThrough;
+
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+
+using DeviceGemmInstance = ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemm_Xdl_CShuffle<
+    ALayout,
+    B0Layout,
+    B1Layout,
+    CLayout,
+    ADataType,
+    B0DataType,
+    B1DataType,
+    CDataType,
+    AccDataType,
+    CShuffleDataType,
+    AElementOp,
+    B0ElementOp,
+    Acc0ElementOp,
+    B1ElementOp,
+    CElementOp,
+    GemmDefault,
+    1,
+    256,
+    128,         // MPerBlock
+    128,         // NPerBlock
+    32,          // KPerBlock
+    64,          // Gemm1NPerBlock
+    32,          // Gemm1KPerBlock
+    8,           // AK1
+    8,           // BK1
+    2,           // B1K1
+    32,          // MPerXDL
+    32,          // NPerXDL
+    1,           // MXdlPerWave
+    4,           // NXdlPerWave
+    2,           // Gemm1NXdlPerWave
+    S<4, 64, 1>, // ABlockTransfer
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    2,
+    8,
+    8,
+    true,
+    S<4, 64, 1>, // BBlockTransfer
+    S<1, 0, 2>,
+    S<1, 0, 2>,
+    2,
+    8,
+    8,
+    true,
+    S<16, 16, 1>, // B1BlockTransfer
+    S<0, 2, 1>,
+    S<0, 2, 1>,
+    1,
+    4,
+    2,
+    false,
+    1,              // CShuffleMXdlPerWavePerShuffle
+    2,              // CShuffleNXdlPerWavePerShuffle
+    S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+    8>;             // CShuffleBlockTransferScalarPerVector_NPerBlock
+
+// Ref Gemm0: fp16 in, fp32 out
+using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                                                B0DataType,
+                                                                                AccDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                B0ElementOp,
+                                                                                CElementOp>;
+
+// Ref Softmax: fp32 in, fp16 out
+using ReferenceSoftmaxInstance =
+    ck::tensor_operation::host::ReferenceSoftmax<AccDataType, ADataType, AccDataType>;
+
+// Ref Gemm1: fp16 in, fp16 out
+using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                                                B1DataType,
+                                                                                CDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                B1ElementOp,
+                                                                                CElementOp>;
+
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+
+    // GEMM shape
+    ck::index_t M             = 1024;
+    ck::index_t N             = 1024;
+    ck::index_t K             = 64;
+    ck::index_t O             = 128;
+    ck::index_t BatchCount    = 4;
+    ck::index_t StrideA       = -1;
+    ck::index_t StrideB0      = -1;
+    ck::index_t StrideB1      = -1;
+    ck::index_t StrideC       = -1;
+    ck::index_t BatchStrideA  = -1;
+    ck::index_t BatchStrideB0 = -1;
+    ck::index_t BatchStrideB1 = -1;
+    ck::index_t BatchStrideC  = -1;
+
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 9)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+        O = std::stoi(argv[7]);
+
+        BatchCount = std::stoi(argv[8]);
+    }
+    else if(argc == 17)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+
+        M = std::stoi(argv[4]);
+        N = std::stoi(argv[5]);
+        K = std::stoi(argv[6]);
+        O = std::stoi(argv[7]);
+
+        BatchCount = std::stoi(argv[8]);
+
+        StrideA  = std::stoi(argv[9]);
+        StrideB0 = std::stoi(argv[10]);
+        StrideB1 = std::stoi(argv[11]);
+        StrideC  = std::stoi(argv[12]);
+
+        BatchStrideA  = std::stoi(argv[13]);
+        BatchStrideB0 = std::stoi(argv[14]);
+        BatchStrideB1 = std::stoi(argv[15]);
+        BatchStrideC  = std::stoi(argv[16]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 17: M, N, K, O, Batch, StrideA, StrideB0, StrideB1, StrideC, BatchStrideA, "
+               "BatchStrideB0, BatchStrideB1, BatchStrideC\n");
+        exit(0);
+    }
+
+    const int DefaultStrideA  = ck::is_same_v<ALayout, Row> ? K : M;
+    const int DefaultStrideB0 = ck::is_same_v<B0Layout, Row> ? N : K;
+    const int DefaultStrideB1 = ck::is_same_v<B1Layout, Row> ? O : N;
+    const int DefaultStrideC  = ck::is_same_v<CLayout, Row> ? O : M;
+
+    StrideA  = (StrideA < 0) ? DefaultStrideA : StrideA;
+    StrideB0 = (StrideB0 < 0) ? DefaultStrideB0 : StrideB0;
+    StrideB1 = (StrideB1 < 0) ? DefaultStrideB1 : StrideB1;
+    StrideC  = (StrideC < 0) ? DefaultStrideC : StrideC;
+
+    const int DefaultBatchStrideA  = (ck::is_same_v<ALayout, Col> ? K : M) * StrideA;
+    const int DefaultBatchStrideB0 = (ck::is_same_v<B0Layout, Col> ? N : K) * StrideB0;
+    const int DefaultBatchStrideB1 = (ck::is_same_v<B1Layout, Col> ? O : N) * StrideB1;
+    const int DefaultBatchStrideC  = (ck::is_same_v<CLayout, Col> ? O : M) * StrideC;
+
+    BatchStrideA  = BatchStrideA < 0 ? DefaultBatchStrideA : BatchStrideA;
+    BatchStrideB0 = BatchStrideB0 < 0 ? DefaultBatchStrideB0 : BatchStrideB0;
+    BatchStrideB1 = BatchStrideB1 < 0 ? DefaultBatchStrideB1 : BatchStrideB1;
+    BatchStrideC  = BatchStrideC < 0 ? DefaultBatchStrideC : BatchStrideC;
+
+    auto f_host_tensor_descriptor = [](std::size_t batch_count,
+                                       std::size_t row,
+                                       std::size_t col,
+                                       std::size_t stride,
+                                       std::size_t batch_stride,
+                                       auto layout) {
+        if(std::is_same<decltype(layout), Row>::value)
+        {
+            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
+                                        std::vector<std::size_t>({batch_stride, stride, 1}));
+        }
+        else
+        {
+            return HostTensorDescriptor(std::vector<std::size_t>({batch_count, row, col}),
+                                        std::vector<std::size_t>({batch_stride, 1, stride}));
+        }
+    };
+
+    // C_m_o = A_m_k * B0_k_n * B1_n_o
+    Tensor<ADataType> a_g_m_k(
+        f_host_tensor_descriptor(BatchCount, M, K, StrideA, BatchStrideA, ALayout{}));
+    Tensor<B0DataType> b0_g_k_n(
+        f_host_tensor_descriptor(BatchCount, K, N, StrideB0, BatchStrideB0, B0Layout{}));
+    Tensor<B1DataType> b1_g_n_o(
+        f_host_tensor_descriptor(BatchCount, N, O, StrideB1, BatchStrideB1, B1Layout{}));
+    Tensor<CDataType> c_g_m_o_host_result(
+        f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{}));
+    Tensor<CDataType> c_g_m_o_device_result(
+        f_host_tensor_descriptor(BatchCount, M, O, StrideC, BatchStrideC, CLayout{}));
+
+    std::cout << "a_g_m_k: " << a_g_m_k.mDesc << std::endl;
+    std::cout << "b0_g_k_n: " << b0_g_k_n.mDesc << std::endl;
+    std::cout << "b1_g_n_o: " << b1_g_n_o.mDesc << std::endl;
+    std::cout << "c_g_m_o: " << c_g_m_o_host_result.mDesc << std::endl;
+
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        b0_g_k_n.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-5, 5});
+        b1_g_n_o.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-5, 5});
+        break;
+    case 2:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b0_g_k_n.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
+        b1_g_n_o.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
+        break;
+    case 3:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
+        b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
+        break;
+    default:
+        a_g_m_k.GenerateTensorValue(GeneratorTensor_1<ADataType>{1});
+        b0_g_k_n.GenerateTensorValue(GeneratorTensor_Sequential<1>{});
+        b1_g_n_o.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
+    }
+
+    DeviceMem a_g_m_k_device_buf(sizeof(ADataType) * a_g_m_k.mDesc.GetElementSize());
+    DeviceMem b0_g_k_n_device_buf(sizeof(B0DataType) * b0_g_k_n.mDesc.GetElementSize());
+    DeviceMem b1_g_n_o_device_buf(sizeof(B1DataType) * b1_g_n_o.mDesc.GetElementSize());
+    DeviceMem c_g_m_o_device_buf(sizeof(CDataType) * c_g_m_o_device_result.mDesc.GetElementSize());
+
+    a_g_m_k_device_buf.ToDevice(a_g_m_k.mData.data());
+    b0_g_k_n_device_buf.ToDevice(b0_g_k_n.mData.data());
+    b1_g_n_o_device_buf.ToDevice(b1_g_n_o.mData.data());
+
+    auto a_element_op    = AElementOp{};
+    auto b0_element_op   = B0ElementOp{};
+    auto acc0_element_op = Acc0ElementOp{};
+    auto b1_element_op   = B1ElementOp{};
+    auto c_element_op    = CElementOp{};
+
+    // do GEMM
+    auto gemm    = DeviceGemmInstance{};
+    auto invoker = gemm.MakeInvoker();
+    auto argument =
+        gemm.MakeArgument(static_cast<ADataType*>(a_g_m_k_device_buf.GetDeviceBuffer()),
+                          static_cast<B0DataType*>(b0_g_k_n_device_buf.GetDeviceBuffer()),
+                          static_cast<B1DataType*>(b1_g_n_o_device_buf.GetDeviceBuffer()),
+                          static_cast<CDataType*>(c_g_m_o_device_buf.GetDeviceBuffer()),
+                          M,
+                          N,
+                          K,
+                          O,
+                          BatchCount,
+                          StrideA,
+                          StrideB0,
+                          StrideB1,
+                          StrideC,
+                          BatchStrideA,
+                          BatchStrideB0,
+                          BatchStrideB1,
+                          BatchStrideC,
+                          a_element_op,
+                          b0_element_op,
+                          acc0_element_op,
+                          b1_element_op,
+                          c_element_op);
+
+    if(!gemm.IsSupportedArgument(argument))
+    {
+        std::cout << gemm.GetTypeString() << " does not support this problem" << std::endl;
+
+        return 0;
+    }
+
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+
+    std::size_t flop      = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount;
+    std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N +
+                             sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) *
+                            BatchCount;
+
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << gemm.GetTypeString() << std::endl;
+
+    c_g_m_o_device_buf.FromDevice(c_g_m_o_device_result.mData.data());
+
+    if(do_verification)
+    {
+        // Output of Gemm0 is input A of Gemm1
+        Tensor<AccDataType> acc0_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{}));
+
+        Tensor<ADataType> a1_g_m_n(f_host_tensor_descriptor(BatchCount, M, N, N, M * N, Row{}));
+
+        auto ref_gemm0          = ReferenceGemm0Instance{};
+        auto ref_gemm0_invoker  = ref_gemm0.MakeInvoker();
+        auto ref_gemm0_argument = ref_gemm0.MakeArgument(
+            a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, PassThrough{});
+
+        ref_gemm0_invoker.Run(ref_gemm0_argument);
+
+        auto ref_softmax          = ReferenceSoftmaxInstance{};
+        auto ref_softmax_invoker  = ref_softmax.MakeInvoker();
+        auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2});
+
+        ref_softmax_invoker.Run(ref_softmax_argument);
+
+        auto ref_gemm1          = ReferenceGemm1Instance{};
+        auto ref_gemm1_invoker  = ref_gemm1.MakeInvoker();
+        auto ref_gemm1_argument = ref_gemm1.MakeArgument(
+            a1_g_m_n, b1_g_n_o, c_g_m_o_host_result, PassThrough{}, b1_element_op, c_element_op);
+
+        ref_gemm1_invoker.Run(ref_gemm1_argument);
+
+        return ck::utils::check_err(c_g_m_o_device_result.mData, c_g_m_o_host_result.mData) ? 0 : 1;
+    }
+
+    return 0;
+}
--- a/example/CMakeLists.txt
+++ b/example/CMakeLists.txt
@@ -44,5 +44,6 @@ add_subdirectory(26_contraction)
 add_subdirectory(27_layernorm)
 add_subdirectory(28_grouped_gemm_bias_e_permute)
 add_subdirectory(29_batched_gemm_bias_e_permute)
-add_subdirectory(30_grouped_convnd_fwd_bias_relu)
+add_subdirectory(30_grouped_convnd_fwd_bias_relu_add)
 add_subdirectory(31_batched_gemm_gemm)
+add_subdirectory(32_batched_gemm_softmax_gemm)
--- a/include/ck/tensor_description/tensor_descriptor.hpp
+++ b/include/ck/tensor_description/tensor_descriptor.hpp
@@ -4,6 +4,7 @@
 #pragma once

 #include "ck/utility/common_header.hpp"
+#include "ck/utility/sequence_helper.hpp"
 #include "ck/tensor_description/multi_index_transform.hpp"

 namespace ck {
@@ -159,6 +160,12 @@ struct TensorDescriptor
        return transforms_[Number<itran>{}].GetUpperLengths()[Number<idim_up>{}];
    }

+    __host__ __device__ constexpr auto GetLengths() const
+    {
+        // FIXME: use Tuple of reference instead
+        return generate_sequence_v2([&](auto I) { return GetLength(I); }, Number<ndim_visible_>{});
+    }
+
    __host__ __device__ constexpr auto GetElementSize() const { return element_size_; }

    __host__ __device__ constexpr auto GetElementSpaceSize() const { return element_space_size_; }

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -701,7 +701,9 @@ struct BlockwiseGemmXdlops_v2
        const auto waveId_m = wave_idx[I0];
        const auto waveId_n = wave_idx[I1];

-        const auto blk_idx = xdlops_gemm.GetBeginOfThreadBlk(xdlops_i, blk_i);
+        const auto tmp = xdlops_gemm.GetBeginOfThreadBlk(xdlops_i, blk_i);
+        const auto blk_idx =
+            TransposeC ? make_multi_index(tmp[I1], tmp[I0]) : make_multi_index(tmp[I0], tmp[I1]);

        constexpr auto mrepeat_mwave_mperxdl_to_m_adaptor = make_single_stage_tensor_adaptor(
            make_tuple(make_unmerge_transform(make_tuple(MRepeat, MWaves, MPerXDL))),

--- a/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_softmax.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include "ck/utility/data_type.hpp"
+#include "ck/utility/reduction_common.hpp"
+#include "ck/utility/reduction_operator.hpp"
+#include "ck/utility/reduction_functions_accumulate.hpp"
+#include "ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp"
+#include "ck/tensor_operation/gpu/thread/reduction_functions_threadwise.hpp"
+
+namespace ck {
+
+template <index_t BlockSize,
+          typename AccDataType,
+          typename ThreadMap_M_K, // thread_id to m_k
+          typename ThreadClusterDesc_M_K,
+          typename ThreadSliceDesc_M_K>
+struct BlockwiseSoftmax
+{
+    static constexpr auto I0         = Number<0>{};
+    static constexpr auto I1         = Number<1>{};
+    static constexpr index_t MRepeat = ThreadSliceDesc_M_K{}.GetLength(I0);
+    static constexpr index_t KRepeat = ThreadSliceDesc_M_K{}.GetLength(I1);
+
+    using ThreadSliceDesc_M = decltype(
+        make_naive_tensor_descriptor_packed(make_tuple(ThreadSliceDesc_M_K{}.GetLength(I0))));
+
+    using ThreadwiseMaxReduce = ThreadwiseReduction<AccDataType,
+                                                    ThreadSliceDesc_M_K,
+                                                    ThreadSliceDesc_M,
+                                                    reduce::Max,
+                                                    false>;
+
+    using ThreadClusterLengths_M_K = decltype(ThreadClusterDesc_M_K{}.GetLengths());
+
+    using BlockwiseMaxReduce = PartitionedBlockwiseReduction_v2<AccDataType,
+                                                                BlockSize,
+                                                                ThreadClusterLengths_M_K,
+                                                                ThreadMap_M_K,
+                                                                reduce::Max,
+                                                                false>;
+
+    using BlockwiseSumReduce = PartitionedBlockwiseReduction_v2<AccDataType,
+                                                                BlockSize,
+                                                                ThreadClusterLengths_M_K,
+                                                                ThreadMap_M_K,
+                                                                reduce::Add,
+                                                                false>;
+
+    using ThreadwiseSumReduce = ThreadwiseReduction<AccDataType,
+                                                    ThreadSliceDesc_M_K,
+                                                    ThreadSliceDesc_M,
+                                                    reduce::Add,
+                                                    false>;
+
+    using BufferType = StaticBuffer<AddressSpaceEnum::Vgpr, AccDataType, MRepeat, true>;
+
+    template <typename CThreadBuffer, typename WorkspaceBuffer>
+    __host__ __device__ void Run(CThreadBuffer& in_thread_buf, WorkspaceBuffer& reduce_work_buf)
+    {
+        // find max value
+        static_for<0, MRepeat, 1>{}([&](auto I) {
+            max_value_buf(I) = reduce::Max::template GetIdentityValue<AccDataType>();
+        });
+        ThreadwiseMaxReduce::Reduce(in_thread_buf, max_value_buf);
+        static_for<0, MRepeat, 1>{}([&](auto I) {
+            BlockwiseMaxReduce::Reduce(reduce_work_buf, max_value_buf(I));
+            block_sync_lds();
+        });
+
+        // calculate exp for elements, P=exp(s-max)
+        static_for<0, MRepeat, 1>{}([&](auto iM) {
+            static_for<0, KRepeat, 1>{}([&](auto iK) {
+                auto offset = Number<ThreadSliceDesc_M_K{}.CalculateOffset(make_tuple(iM, iK))>{};
+                in_thread_buf(offset) = math::exp(in_thread_buf[offset] - max_value_buf(iM));
+            });
+        });
+
+        // sum data
+        static_for<0, MRepeat, 1>{}([&](auto I) {
+            sum_value_buf(I) = reduce::Add::template GetIdentityValue<AccDataType>();
+        });
+        ThreadwiseSumReduce::Reduce(in_thread_buf, sum_value_buf);
+        static_for<0, MRepeat, 1>{}([&](auto I) {
+            BlockwiseSumReduce::Reduce(reduce_work_buf, sum_value_buf(I));
+            block_sync_lds();
+        });
+    }
+
+    BufferType max_value_buf;
+    BufferType sum_value_buf;
+};
+
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
+++ b/include/ck/tensor_operation/gpu/block/reduction_functions_blockwise.hpp
@@ -82,6 +82,78 @@ struct PartitionedBlockwiseReduction
    };
 };

+// clang-format off
+// Assume:
+//  1) work_buffer is buffer (typically LDS) allocated outside as workspace, does not include any in/out data
+//  2) work_buffer has AccDataType elements, and space size is no less than BlockSize
+//  3) in_out_value is the input data in vgpr from each thread
+//  4) in_out_value is the over-written reduced output in vgpr for each thread
+// clang-format on
+template <typename AccDataType,
+          index_t BlockSize,
+          typename ThreadClusterLengths_M_K,
+          typename ThreadClusterDesc,
+          typename OpReduce,
+          bool PropagateNan,
+          typename Accumulation =
+              detail::AccumulateWithNanCheck<PropagateNan, OpReduce, AccDataType>>
+struct PartitionedBlockwiseReduction_v2
+{
+    static_assert(BlockSize == ThreadClusterLengths_M_K::At(0) * ThreadClusterLengths_M_K::At(1),
+                  "The product of cluster lengths should be same as BlockSize!");
+
+    static constexpr auto BufferLength_M = ThreadClusterLengths_M_K::At(0);
+    static constexpr auto BufferLength_K = ThreadClusterLengths_M_K::At(1);
+
+    static_assert(BufferLength_K > 1, "Parallel reduction need work on at least two elements");
+
+    static constexpr auto block_buf_desc_m_k = make_naive_tensor_descriptor_packed(
+        make_tuple(Number<BufferLength_M>{}, Number<BufferLength_K>{}));
+
+    static constexpr auto thread_cluster_desc = ThreadClusterDesc{};
+
+    template <typename BufferType>
+    __device__ static void Reduce(BufferType& work_buffer, AccDataType& in_out_value)
+    {
+        static_assert(is_same<typename BufferType::type, AccDataType>{},
+                      "Buffer data type should be consistent as AccDataType!");
+
+        constexpr auto cluster_len_shift = get_shift<BufferLength_K>();
+
+        const auto thread_cluster_idx =
+            thread_cluster_desc.CalculateBottomIndex(make_multi_index(get_thread_local_1d_id()));
+
+        const auto thread_m_cluster_id = thread_cluster_idx[Number<0>{}];
+        const auto thread_k_cluster_id = thread_cluster_idx[Number<1>{}];
+
+        work_buffer(block_buf_desc_m_k.CalculateOffset(thread_cluster_idx)) = in_out_value;
+
+        __syncthreads();
+
+        static_for<0, cluster_len_shift, 1>{}([&](auto I) {
+            constexpr index_t indOffset = 1 << (cluster_len_shift - 1 - I());
+
+            if(thread_k_cluster_id < indOffset)
+            {
+                index_t offset1 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx);
+                index_t offset2 = block_buf_desc_m_k.CalculateOffset(thread_cluster_idx +
+                                                                     make_tuple(0, indOffset));
+
+                AccDataType opData1 = work_buffer[offset1];
+                AccDataType opData2 = work_buffer[offset2];
+                Accumulation::Calculate(opData1, opData2);
+                work_buffer(offset1) = opData1;
+            }
+
+            __syncthreads();
+        });
+
+        index_t offset = block_buf_desc_m_k.CalculateOffset(make_tuple(thread_m_cluster_id, 0));
+
+        in_out_value = work_buffer[offset];
+    };
+};
+
 // clang-format off
 // Assume:
 //  1) work_val_buffer/work_idx_buffer is buffer (typically LDS) allocated outside as workspace, does not include any in/out data

--- a/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_contraction_multiple_d_xdl_cshuffle.hpp
@@ -500,11 +500,8 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
            std::array<long_index_t, NumDTensor> ds_offset;

            static_for<0, NumDTensor, 1>{}([&](auto i) {
-                if constexpr(NumDimG > 0)
                ds_offset[i] =
                    ds_grid_desc_g_m_n_[i].CalculateOffset(make_multi_index(g_idx, 0, 0));
-                else
-                    ds_offset[i] = 0;
            });

            return ds_offset;
@@ -512,10 +509,7 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle

        __host__ __device__ constexpr long_index_t GetEPtrOffset(index_t g_idx) const
        {
-            if constexpr(NumDimG > 0)
            return e_grid_desc_g_m_n_.CalculateOffset(make_multi_index(g_idx, 0, 0));
-            else
-                return 0;
        }

        private:
@@ -634,6 +628,8 @@ struct DeviceBatchedContractionMultipleD_Xdl_CShuffle
              compute_ptr_offset_of_batch_{
                  a_batch_stride_, b_batch_stride_, ds_grid_desc_g_m_n_, e_grid_desc_g_m_n_}
        {
+            static_assert(NumDimG > 0 && NumDimM > 0 && NumDimN > 0 && NumDimK > 0, "");
+
            // populate pointer, batch stride, desc for Ds
            static_for<0, NumDTensor, 1>{}([&](auto i) {
                using DDataType = remove_cvref_t<tuple_element_t<i.value, DsDataType>>;

--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <iostream>
+#include <vector>
+
+#include "device_base.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+
+template <typename ALayout,
+          typename B0Layout,
+          typename B1Layout,
+          typename CLayout,
+          typename ADataType,
+          typename B0DataType,
+          typename B1DataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename B0ElementwiseOperation,
+          typename Acc0ElementwiseOperation,
+          typename B1ElementwiseOperation,
+          typename CElementwiseOperation>
+struct DeviceBatchedGemmSoftmaxGemm : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const void* p_a,
+                        const void* p_b0,
+                        const void* p_b1,
+                        void* p_c,
+                        ck::index_t M,
+                        ck::index_t N,
+                        ck::index_t K,
+                        ck::index_t O,
+                        ck::index_t Batch,
+                        ck::index_t StrideA,
+                        ck::index_t StrideB0,
+                        ck::index_t StrideB1,
+                        ck::index_t StrideC,
+                        ck::index_t BatchStrideA,
+                        ck::index_t BatchStrideB0,
+                        ck::index_t BatchStrideB1,
+                        ck::index_t BatchStrideC,
+                        AElementwiseOperation a_element_op,
+                        B0ElementwiseOperation b0_element_op,
+                        Acc0ElementwiseOperation acc0_element_op,
+                        B1ElementwiseOperation b1_element_op,
+                        CElementwiseOperation c_element_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+template <typename ALayout,
+          typename B0Layout,
+          typename B1Layout,
+          typename CLayout,
+          typename ADataType,
+          typename B0DataType,
+          typename B1DataType,
+          typename CDataType,
+          typename AElementwiseOperation,
+          typename B0ElementwiseOperation,
+          typename Acc0ElementwiseOperation,
+          typename B1ElementwiseOperation,
+          typename CElementwiseOperation>
+using DeviceBatchedGemmSoftmaxGemmPtr =
+    std::unique_ptr<DeviceBatchedGemmSoftmaxGemm<ALayout,
+                                                 B0Layout,
+                                                 B1Layout,
+                                                 CLayout,
+                                                 ADataType,
+                                                 B0DataType,
+                                                 B1DataType,
+                                                 CDataType,
+                                                 AElementwiseOperation,
+                                                 B0ElementwiseOperation,
+                                                 Acc0ElementwiseOperation,
+                                                 B1ElementwiseOperation,
+                                                 CElementwiseOperation>>;
+
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_xdl_cshuffle.hpp
--- a/include/ck/tensor_operation/gpu/device/device_layernorm.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_layernorm.hpp
@@ -7,7 +7,7 @@
 #include <sstream>

 #include "ck/utility/reduction_operator.hpp"
-#include "ck/tensor_operation/gpu/device/device_base.hpp"
+#include "ck/tensor_operation/gpu/device/device_normalization.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce_multiblock.hpp"
 #include "ck/tensor_operation/gpu/device/device_reduce_common.hpp"
@@ -39,7 +39,14 @@ template <typename XDataType,
          index_t GammaSrcVectorSize,
          index_t BetaSrcVectorSize,
          index_t YDstVectorSize>
-struct DeviceLayernorm : public BaseOperator
+struct DeviceLayernorm : public DeviceNormalization2<XDataType,
+                                                     GammaDataType,
+                                                     BetaDataType,
+                                                     AccDataType,
+                                                     YDataType,
+                                                     AccElementwiseOperation,
+                                                     Rank,
+                                                     NumReduceDim>
 {
    static_assert(
        (KThreadSliceSize % GammaSrcVectorSize == 0),
@@ -297,7 +304,8 @@ struct DeviceLayernorm : public BaseOperator
        return true;
    };

-    std::unique_ptr<BaseArgument> MakeArgumentPointer(const std::vector<index_t> lengths,
+    std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const std::vector<index_t> lengths,
                        const std::vector<index_t> xStrides,
                        const std::vector<index_t> gammaStrides,
                        const std::vector<index_t> betaStrides,
@@ -307,7 +315,7 @@ struct DeviceLayernorm : public BaseOperator
                        const void* p_gamma,
                        const void* p_beta,
                        void* p_y,
-                                                      AccElementwiseOperation acc_elementwise_op)
+                        AccElementwiseOperation acc_elementwise_op) override
    {
        return std::make_unique<Argument>(lengths,
                                          xStrides,
@@ -322,7 +330,10 @@ struct DeviceLayernorm : public BaseOperator
                                          static_cast<YDataType*>(p_y));
    };

-    std::unique_ptr<BaseInvoker> MakeInvokerPointer() { return std::make_unique<Invoker>(); };
+    std::unique_ptr<BaseInvoker> MakeInvokerPointer() override
+    {
+        return std::make_unique<Invoker>();
+    };

    std::string GetTypeString() const override
    {
@@ -332,7 +343,6 @@ struct DeviceLayernorm : public BaseOperator
        str << "DeviceLayernorm<" << BlockSize << ",";
        str << "M_C" << MThreadClusterSize << "_S" << MThreadSliceSize << ",";
        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
-        str << "K_C" << KThreadClusterSize << "_S" << KThreadSliceSize << ",";
        str << "XYSrcVectorDim_" << XYSrcVectorDim  << ",";
        str << "VectorSize_X" << XSrcVectorSize << "_Gamma" << GammaSrcVectorSize << "_Beta" << BetaSrcVectorSize << "_Y" << YDstVectorSize << ">";
        // clang-format on

--- a/include/ck/tensor_operation/gpu/device/device_normalization.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_normalization.hpp
@@ -38,6 +38,49 @@ struct DeviceNormalization : public BaseOperator

 using DeviceNormalizationPtr = std::unique_ptr<DeviceNormalization>;

+template <typename XDataType,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename AccDataType,
+          typename YDataType,
+          typename AccElementwiseOperation,
+          index_t Rank,
+          index_t NumReduceDim>
+struct DeviceNormalization2 : public BaseOperator
+{
+    virtual std::unique_ptr<BaseArgument>
+    MakeArgumentPointer(const std::vector<index_t> lengths,
+                        const std::vector<index_t> xStrides,
+                        const std::vector<index_t> gammaStrides,
+                        const std::vector<index_t> betaStrides,
+                        const std::vector<index_t> reduceDims,
+                        AccDataType epsilon,
+                        const void* p_x,
+                        const void* p_gamma,
+                        const void* p_beta,
+                        void* p_y,
+                        AccElementwiseOperation acc_elementwise_op) = 0;
+
+    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
+};
+
+template <typename XDataType,
+          typename GammaDataType,
+          typename BetaDataType,
+          typename AccDataType,
+          typename YDataType,
+          typename AccElementwiseOperation,
+          index_t Rank,
+          index_t NumReduceDim>
+using DeviceNormalization2Ptr = std::unique_ptr<DeviceNormalization2<XDataType,
+                                                                     GammaDataType,
+                                                                     BetaDataType,
+                                                                     AccDataType,
+                                                                     YDataType,
+                                                                     AccElementwiseOperation,
+                                                                     Rank,
+                                                                     NumReduceDim>>;
+
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
--- a/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
+++ b/include/ck/tensor_operation/gpu/element/element_wise_operation.hpp
@@ -78,6 +78,26 @@ struct AddReluAdd
        float c = b + x2;
        y       = c;
    }
+
+    template <>
+    __host__ __device__ constexpr void operator()<bhalf_t, float, bhalf_t, bhalf_t>(
+        bhalf_t& y, const float& x0, const bhalf_t& x1, const bhalf_t& x2) const
+    {
+        float a = x0 + x1;
+        float b = a > 0 ? a : 0;
+        float c = b + x2;
+        y       = c;
+    }
+
+    template <>
+    __host__ __device__ constexpr void operator()<int8_t, int8_t, int8_t, int8_t>(
+        int8_t& y, const int8_t& x0, const int8_t& x1, const int8_t& x2) const
+    {
+        int32_t a = x0 + x1;
+        int32_t b = a > 0 ? a : 0;
+        int32_t c = b + x2;
+        y         = c;
+    }
 };

 struct AddHardswishAdd
@@ -114,28 +134,33 @@ struct AddHardswishAdd
 // E = FastGelu(C + D0 + D1)
 struct AddAddFastGelu
 {
-    template <typename E, typename C, typename D0, typename D1>
-    __host__ __device__ void operator()(E&, const C&, const D0&, const D1&) const;
-
-    template <>
-    __host__ __device__ void operator()<half_t, float, half_t, half_t>(half_t& e,
-                                                                       const float& c,
-                                                                       const half_t& d0,
-                                                                       const half_t& d1) const
-    {
    // Fast GeLU
    // https://paperswithcode.com/method/gelu
    // y = 0.5*x*(1+tanh(sqrt(2/pi)*(x+0.044715*x^3)))
-        const auto fast_gelu = [&](float x) {
-            const float u   = float(2) * x * (float(0.035677) * x * x + float(0.797885));
+    __host__ __device__ static constexpr float GetFastGeLU(float x)
+    {
+        const float u   = 2.f * x * (0.035677f * x * x + 0.797885f);
        const float emu = exp(-u);
-            const float cdf = float(0.5) + float(0.5) * (float(2) / (float(1) + emu) - float(1));
+        const float cdf = 0.5f + 0.5f * (2.f / (1.f + emu) - 1.f);
        return x * cdf;
-        };
+    }
+
+    template <typename T>
+    static inline constexpr bool is_valid_param_type_v =
+        std::is_same_v<T, float> || std::is_same_v<T, half_t> || std::is_same_v<T, bhalf_t> ||
+        std::is_same_v<T, int32_t> || std::is_same_v<T, int8_t>;
+
+    template <typename E, typename C, typename D0, typename D1>
+    __host__ __device__ constexpr void
+    operator()(E& e, const C& c, const D0& d0, const D1& d1) const
+    {
+        static_assert(is_valid_param_type_v<E> && is_valid_param_type_v<C> &&
+                      is_valid_param_type_v<D0> && is_valid_param_type_v<D1>);

-        const float y = fast_gelu(c + float(d0) + float(d1));
+        const float y =
+            GetFastGeLU(type_convert<float>(c) + type_convert<float>(d0) + type_convert<float>(d1));

-        e = type_convert<half_t>(y);
+        e = type_convert<E>(y);
    }
 };


--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_gemm_softmax_gemm_xdl_cshuffle_v1.hpp
--- a/include/ck/utility/data_type.hpp
+++ b/include/ck/utility/data_type.hpp
@@ -934,6 +934,8 @@ using int8x64_t = typename vector_type<int8_t, 64>::type;
 template <typename Y, typename X>
 __host__ __device__ constexpr Y type_convert(X x)
 {
+    static_assert(!std::is_reference_v<Y> && !std::is_reference_v<X>);
+
    return static_cast<Y>(x);
 }


--- a/include/ck/utility/static_buffer.hpp
+++ b/include/ck/utility/static_buffer.hpp
@@ -20,6 +20,29 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>

    __host__ __device__ constexpr StaticBuffer() : base{} {}

+    __host__ __device__ constexpr StaticBuffer& operator=(StaticBuffer& y)
+    {
+        StaticBuffer& x = *this;
+        static_for<0, base::Size(), 1>{}([&](auto i) { x(i) = y[i]; });
+        return x;
+    }
+
+    template <typename... Ys>
+    __host__ __device__ constexpr StaticBuffer& operator=(const Tuple<Ys...>& y)
+    {
+        static_assert(base::Size() == sizeof...(Ys), "wrong! size not the same");
+        StaticBuffer& x = *this;
+        static_for<0, base::Size(), 1>{}([&](auto i) { x(i) = y[i]; });
+        return x;
+    }
+
+    __host__ __device__ constexpr StaticBuffer& operator=(const T& y)
+    {
+        StaticBuffer& x = *this;
+        static_for<0, base::Size(), 1>{}([&](auto i) { x(i) = y; });
+        return x;
+    }
+
    __host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpace; }

    __host__ __device__ static constexpr bool IsStaticBuffer() { return true; }
@@ -40,10 +63,12 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
        return base::operator()(i);
    }

-    __host__ __device__ void Clear()
+    __host__ __device__ void Set(T x)
    {
-        static_for<0, N, 1>{}([&](auto i) { operator()(i) = T{0}; });
+        static_for<0, N, 1>{}([&](auto i) { operator()(i) = T{x}; });
    }
+
+    __host__ __device__ void Clear() { Set(T{0}); }
 };

 // static buffer for vector

--- a/include/ck/utility/statically_indexed_array_multi_index.hpp
+++ b/include/ck/utility/statically_indexed_array_multi_index.hpp
@@ -34,7 +34,10 @@ __host__ __device__ constexpr auto to_multi_index(const T& x)
 // is the alias of the latter. This is because compiler cannot infer the NSize if
 // using MultiIndex<NSize>
 // TODO: how to fix this?
-template <typename... Ys, typename X>
+template <
+    typename... Ys,
+    typename X,
+    enable_if_t<!std::is_integral<X>::value && !std::is_floating_point<X>::value, bool> = false>
 __host__ __device__ constexpr auto operator+=(Tuple<Ys...>& y, const X& x)
 {
    static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
@@ -43,7 +46,10 @@ __host__ __device__ constexpr auto operator+=(Tuple<Ys...>& y, const X& x)
    return y;
 }

-template <typename... Ys, typename X>
+template <
+    typename... Ys,
+    typename X,
+    enable_if_t<!std::is_integral<X>::value && !std::is_floating_point<X>::value, bool> = false>
 __host__ __device__ constexpr auto operator-=(Tuple<Ys...>& y, const X& x)
 {
    static_assert(X::Size() == sizeof...(Ys), "wrong! size not the same");
@@ -52,7 +58,10 @@ __host__ __device__ constexpr auto operator-=(Tuple<Ys...>& y, const X& x)
    return y;
 }

-template <typename... Xs, typename Y>
+template <
+    typename... Xs,
+    typename Y,
+    enable_if_t<!std::is_integral<Y>::value && !std::is_floating_point<Y>::value, bool> = false>
 __host__ __device__ constexpr auto operator+(const Tuple<Xs...>& x, const Y& y)
 {
    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
@@ -63,7 +72,10 @@ __host__ __device__ constexpr auto operator+(const Tuple<Xs...>& x, const Y& y)
    return r;
 }

-template <typename... Xs, typename Y>
+template <
+    typename... Xs,
+    typename Y,
+    enable_if_t<!std::is_integral<Y>::value && !std::is_floating_point<Y>::value, bool> = false>
 __host__ __device__ constexpr auto operator-(const Tuple<Xs...>& x, const Y& y)
 {
    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
@@ -74,7 +86,10 @@ __host__ __device__ constexpr auto operator-(const Tuple<Xs...>& x, const Y& y)
    return r;
 }

-template <typename... Xs, typename Y>
+template <
+    typename... Xs,
+    typename Y,
+    enable_if_t<!std::is_integral<Y>::value && !std::is_floating_point<Y>::value, bool> = false>
 __host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Y& y)
 {
    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
@@ -85,9 +100,11 @@ __host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, const Y& y)
    return r;
 }

-// MultiIndex = index_t * MultiIndex
-template <typename... Xs>
-__host__ __device__ constexpr auto operator*(index_t a, const Tuple<Xs...>& x)
+// MultiIndex = scalar * MultiIndex
+template <typename... Xs,
+          typename Y,
+          enable_if_t<std::is_integral<Y>::value || std::is_floating_point<Y>::value, bool> = false>
+__host__ __device__ constexpr auto operator*(Y a, const Tuple<Xs...>& x)
 {
    constexpr index_t NSize = sizeof...(Xs);

@@ -96,13 +113,40 @@ __host__ __device__ constexpr auto operator*(index_t a, const Tuple<Xs...>& x)
    return r;
 }

-// MultiIndex = MultiIndex * index_t
-template <typename... Xs>
-__host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, index_t a)
+// MultiIndex = MultiIndex * scalar
+template <typename... Xs,
+          typename Y,
+          enable_if_t<std::is_integral<Y>::value || std::is_floating_point<Y>::value, bool> = false>
+__host__ __device__ constexpr auto operator*(const Tuple<Xs...>& x, Y a)
 {
    return a * x;
 }

+namespace mathext {
+
+template <typename... Xs>
+__host__ __device__ constexpr auto exp(const Tuple<Xs...>& x)
+{
+    constexpr index_t NSize = sizeof...(Xs);
+
+    Tuple<Xs...> r;
+    static_for<0, NSize, 1>{}([&](auto i) { r(i) = math::exp(x[i]); });
+    return r;
+}
+
+template <typename... Xs, typename Y>
+__host__ __device__ constexpr auto max(const Tuple<Xs...>& x, const Y& y)
+{
+    static_assert(Y::Size() == sizeof...(Xs), "wrong! size not the same");
+    constexpr index_t NSize = sizeof...(Xs);
+
+    Tuple<Xs...> r;
+    static_for<0, NSize, 1>{}([&](auto i) { r(i) = math::max(x[i], y[i]); });
+    return r;
+}
+
+} // namespace mathext
+
 template <typename... Xs>
 __host__ __device__ void print_multi_index(const Tuple<Xs...>& x)
 {

--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+#include <cstdlib>
+
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+
+#include "ck/library/tensor_operation_instance/device_operation_instance_factory.hpp"
+
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+
+void add_device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
+    std::vector<std::unique_ptr<DeviceBatchedGemmSoftmaxGemm<Row,
+                                                             Col,
+                                                             Row,
+                                                             Row,
+                                                             F16,
+                                                             F16,
+                                                             F16,
+                                                             F16,
+                                                             PassThrough,
+                                                             PassThrough,
+                                                             PassThrough,
+                                                             PassThrough,
+                                                             PassThrough>>>& instances);
+
+template <typename ALayout,
+          typename B0Layout,
+          typename B1Layout,
+          typename CLayout,
+          typename ADataType,
+          typename B0DataType,
+          typename B1DataType,
+          typename CDataType>
+struct DeviceOperationInstanceFactory<
+    ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemm<ALayout,
+                                                               B0Layout,
+                                                               B1Layout,
+                                                               CLayout,
+                                                               ADataType,
+                                                               B0DataType,
+                                                               B1DataType,
+                                                               CDataType,
+                                                               PassThrough,
+                                                               PassThrough,
+                                                               PassThrough,
+                                                               PassThrough,
+                                                               PassThrough>>
+{
+    using DeviceOp = DeviceBatchedGemmSoftmaxGemm<ALayout,
+                                                  B0Layout,
+                                                  B1Layout,
+                                                  CLayout,
+                                                  ADataType,
+                                                  B0DataType,
+                                                  B1DataType,
+                                                  CDataType,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough,
+                                                  PassThrough>;
+
+    static auto GetInstances()
+    {
+        std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
+
+        if constexpr(is_same_v<ADataType, half_t> && is_same_v<B0DataType, half_t> &&
+                     is_same_v<B1DataType, half_t> && is_same_v<CDataType, half_t>)
+        {
+            if constexpr(is_same_v<ALayout, Row> && is_same_v<B0Layout, Col> &&
+                         is_same_v<B1Layout, Row> && is_same_v<CLayout, Row>)
+            {
+                add_device_batched_gemm_softmax_gemm_xdl_cshuffle_f16_f16_f16_f16_gmk_gnk_gno_gmo_instance(
+                    op_ptrs);
+            }
+        }
+        return op_ptrs;
+    }
+};
+
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/include/ck/library/utility/host_tensor.hpp
+++ b/library/include/ck/library/utility/host_tensor.hpp
@@ -77,38 +77,36 @@ struct HostTensorDescriptor

    void CalculateStrides();

-    template <typename X>
+    template <typename X, typename = std::enable_if_t<std::is_convertible_v<X, std::size_t>>>
    HostTensorDescriptor(const std::initializer_list<X>& lens) : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }

-    template <typename X>
-    HostTensorDescriptor(const std::vector<X>& lens) : mLens(lens.begin(), lens.end())
-    {
-        this->CalculateStrides();
-    }
-
-    template <typename Range>
+    template <typename Range,
+              typename = std::enable_if_t<
+                  std::is_convertible_v<decltype(*std::begin(std::declval<Range>())), std::size_t>>>
    HostTensorDescriptor(const Range& lens) : mLens(lens.begin(), lens.end())
    {
        this->CalculateStrides();
    }

-    template <typename X, typename Y>
+    template <typename X,
+              typename Y,
+              typename = std::enable_if_t<std::is_convertible_v<X, std::size_t> &&
+                                          std::is_convertible_v<Y, std::size_t>>>
    HostTensorDescriptor(const std::initializer_list<X>& lens,
                         const std::initializer_list<Y>& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {
    }

-    template <typename X, typename Y>
-    HostTensorDescriptor(const std::vector<X>& lens, const std::vector<Y>& strides)
-        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
-    {
-    }
-
-    template <typename Range1, typename Range2>
+    template <
+        typename Range1,
+        typename Range2,
+        typename = std::enable_if_t<
+            std::is_convertible_v<decltype(*std::begin(std::declval<Range1>())), std::size_t> &&
+            std::is_convertible_v<decltype(*std::begin(std::declval<Range2>())), std::size_t>>>
    HostTensorDescriptor(const Range1& lens, const Range2& strides)
        : mLens(lens.begin(), lens.end()), mStrides(strides.begin(), strides.end())
    {

--- a/library/include/ck/library/utility/literals.hpp
+++ b/library/include/ck/library/utility/literals.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+
+#pragma once
+
+namespace ck {
+namespace literals {
+// [P0330] Literal Suffix for (signed) size_t (C++23)
+// ref: https://wg21.link/p0330r8
+inline constexpr std::size_t operator""_uz(unsigned long long size)
+{
+    return static_cast<std::size_t>(size);
+}
+
+inline constexpr std::size_t operator""_zu(unsigned long long size)
+{
+    return static_cast<std::size_t>(size);
+}
+} // namespace literals
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/CMakeLists.txt
+++ b/library/src/tensor_operation_instance/gpu/CMakeLists.txt
@@ -14,6 +14,7 @@ add_subdirectory(gemm_bias_add_reduce)
 add_subdirectory(batched_gemm)
 add_subdirectory(batched_gemm_reduce)
 add_subdirectory(batched_gemm_gemm)
+add_subdirectory(batched_gemm_softmax_gemm)
 add_subdirectory(grouped_gemm)
 add_subdirectory(contraction_scale)
 add_subdirectory(contraction_bilinear)