Merge branch 'develop' into aosewski/ggemm

458df691 · Adam Osewski · GitHub · 6c9bdbad · f7d28f3e · 458df691
Unverified Commit 458df691 authored Feb 10, 2023 by Adam Osewski Committed by GitHub Feb 10, 2023
20 changed files
--- a/CHANGELOG.md
+++ b/CHANGELOG.md
+# Change Log for Composable Kernel
+Full documentation for Composable Kernel is not yet available.
+## CK 0.1.1 for ROCm 5.5.0
+### Fixed
+- Fixed a bug in 6-dimensional kernels (#555).
+- Fixed grouped ConvBwdWeight test case failure (#524).
+### Optimizations
+- Optimized ...
+### Added
+- Added user tutorial (#563).
+- Added more instances for irregular GEMM sizes (#560).
+- Added inter-wave consumer-producer programming model for GEMM kernels (#310).
+- Added multi-D GEMM client APIs (#534).
+- Added multi-embeddings support (#542).
+- Added Navi3x blockwise GEMM and real GEMM support (#541).
+### Changed
+- Changed ...
--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -19,7 +19,14 @@ def runShell(String command){
 }
 def getDockerImageName(){
-    def img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+    def img
+    if (params.COMPILER_COMMIT == ""){
+        img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}"
+    }
+    else{
+        def commit = "${params.COMPILER_COMMIT}"[0..6]
+        img = "${env.CK_DOCKERHUB}:ck_ub20.04_rocm${params.ROCMVERSION}_${params.COMPILER_VERSION}_${commit}"
+    }
    return img
 }
@@ -551,8 +558,8 @@ def process_results(Map conf=[:]){
 //launch develop branch daily at 23:00 UT in FULL_QA mode and at 19:00 UT with latest staging compiler version
 CRON_SETTINGS = BRANCH_NAME == "develop" ? '''0 23 * * * % RUN_FULL_QA=true
-                                              0 21 * * * % RUN_FULL_QA=false;COMPILER_VERSION=release;COMPILER_COMMIT=""
+                                              0 21 * * * % RUN_FULL_QA=false;COMPILER_VERSION=release;COMPILER_COMMIT=
-                                              0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=""''' : ""
+                                              0 19 * * * % BUILD_DOCKER=true;COMPILER_VERSION=amd-stg-open;COMPILER_COMMIT=''' : ""
 pipeline {
    agent none

--- a/client_example/01_gemm/gemm.cpp
+++ b/client_example/01_gemm/gemm.cpp
@@ -83,7 +83,7 @@ int main(int argc, char* argv[])
        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
            using Layout = decltype(layout);
-            if(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
            {
                return (nRow - 1) * stride + nCol;
            }

--- a/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_add_fastgelu.cpp
@@ -92,7 +92,7 @@ int main(int argc, char* argv[])
        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
            using Layout = decltype(layout);
-            if(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
            {
                return (nRow - 1) * stride + nCol;
            }

--- a/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_add_fastgelu.cpp
@@ -88,7 +88,7 @@ int main(int argc, char* argv[])
        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
            using Layout = decltype(layout);
-            if(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
            {
                return (nRow - 1) * stride + nCol;
            }

--- a/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
+++ b/client_example/02_gemm_add_add_fastgelu/gemm_fastgelu.cpp
@@ -84,7 +84,7 @@ int main(int argc, char* argv[])
        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
            using Layout = decltype(layout);
-            if(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
            {
                return (nRow - 1) * stride + nCol;
            }

--- a/client_example/03_gemm_layernorm/CMakeLists.txt
+++ b/client_example/03_gemm_layernorm/CMakeLists.txt
-add_executable(client_gemm_add_add_reduce_normalize gemm_add_add_layernorm.cpp)
+add_executable(client_gemm_add_add_layernorm_naive gemm_add_add_layernorm_naive.cpp)
-target_link_libraries(client_gemm_add_add_reduce_normalize PRIVATE composable_kernel::device_operations)
+target_link_libraries(client_gemm_add_add_layernorm_naive PRIVATE composable_kernel::device_operations)
+add_executable(client_gemm_add_relu_add_layernorm_welford gemm_add_relu_add_layernorm_welford.cpp)
+target_link_libraries(client_gemm_add_relu_add_layernorm_welford PRIVATE composable_kernel::device_operations)
--- a/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
+++ b/client_example/03_gemm_layernorm/gemm_add_add_layernorm.cpp
@@ -190,7 +190,7 @@ int main()
        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
            using Layout = decltype(layout);
-            if(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
            {
                return (nRow - 1) * stride + nCol;
            }

--- a/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp
+++ b/client_example/03_gemm_layernorm/gemm_add_relu_add_layernorm_welford.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iomanip>
+#include <iostream>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/gemm_add_relu_add_layernorm.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_gemm_multiple_d_layernorm.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+using F16 = ck::half_t;
+using Row = ck::tensor_layout::gemm::RowMajor;
+using Col = ck::tensor_layout::gemm::ColumnMajor;
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AddReluAdd  = ck::tensor_operation::element_wise::AddReluAdd;
+// DataType
+using ADataType     = F16;
+using BDataType     = F16;
+using D0DataType    = F16;
+using D1DataType    = F16;
+using GammaDataType = F16;
+using BetaDataType  = F16;
+using HDataType     = F16;
+// Layout
+using ALayout  = Row;
+using BLayout  = Col;
+using D0Layout = Row;
+using D1Layout = Row;
+using HLayout  = Row;
+using AElementOp   = PassThrough;
+using BElementOp   = PassThrough;
+using CDEElementOp = AddReluAdd;
+using HElementOp   = PassThrough;
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}, mMemSize_(mem_size)
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    void SetZero() const { (void)hipMemset(p_mem_, 0, mMemSize_); }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+    std::size_t mMemSize_;
+};
+int main(int argc, char* argv[])
+{
+    // GEMM shape
+    ck::index_t M = 1024;
+    ck::index_t N = 1024;
+    ck::index_t K = 1024;
+    ck::index_t StrideA  = K;
+    ck::index_t StrideB  = K;
+    ck::index_t StrideD0 = 0;
+    ck::index_t StrideD1 = N;
+    ck::index_t StrideH  = N;
+    float epsilon = 1e-5;
+    auto f_matrix_space_size =
+        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
+            using Layout = decltype(layout);
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            {
+                return (nRow - 1) * stride + nCol;
+            }
+            else
+            {
+                return (nCol - 1) * stride + nRow;
+            }
+        };
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * f_matrix_space_size(M, K, StrideA, ALayout{}));
+    SimpleDeviceMem b_device_buf(sizeof(BDataType) * f_matrix_space_size(K, N, StrideB, BLayout{}));
+    SimpleDeviceMem d0_device_buf(sizeof(D0DataType) *
+                                  f_matrix_space_size(M, N, StrideD0, D0Layout{}));
+    SimpleDeviceMem d1_device_buf(sizeof(D1DataType) *
+                                  f_matrix_space_size(M, N, StrideD1, D1Layout{}));
+    SimpleDeviceMem gamma_device_buf(sizeof(GammaDataType) * N);
+    SimpleDeviceMem beta_device_buf(sizeof(BetaDataType) * N);
+    SimpleDeviceMem h_device_buf(sizeof(HDataType) * f_matrix_space_size(M, N, StrideH, HLayout{}));
+    using DeviceOp = ck::tensor_operation::device::DeviceGemmMultipleDLayernorm<
+        ALayout,
+        BLayout,
+        ck::Tuple<D0Layout, D1Layout>,
+        HLayout,
+        ADataType,
+        BDataType,
+        ck::Tuple<D0DataType, D1DataType>,
+        GammaDataType,
+        BetaDataType,
+        HDataType,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::PassThrough,
+        ck::tensor_operation::element_wise::AddReluAdd,
+        ck::tensor_operation::element_wise::PassThrough>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    const auto a_element_op   = AElementOp{};
+    const auto b_element_op   = BElementOp{};
+    const auto cde_element_op = CDEElementOp{};
+    const auto h_element_op   = HElementOp{};
+    std::string best_op_name;
+    bool found            = false;
+    int best_op_id        = -1;
+    float best_ave_time   = std::numeric_limits<float>::max();
+    float best_gb_per_sec = 0;
+    // profile device operation instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            a_device_buf.GetDeviceBuffer(),
+            b_device_buf.GetDeviceBuffer(),
+            {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
+            gamma_device_buf.GetDeviceBuffer(),
+            beta_device_buf.GetDeviceBuffer(),
+            h_device_buf.GetDeviceBuffer(),
+            M,
+            N,
+            K,
+            StrideA,
+            StrideB,
+            {StrideD0, StrideD1},
+            StrideH,
+            epsilon,
+            a_element_op,
+            b_element_op,
+            cde_element_op,
+            h_element_op);
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace_dev(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
+            h_device_buf.SetZero();
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            std::size_t num_byte =
+                sizeof(ADataType) * M * K + sizeof(BDataType) * K * N +
+                (sizeof(D0DataType) + sizeof(D1DataType) + sizeof(HDataType)) * M * N +
+                (sizeof(GammaDataType) + sizeof(BetaDataType)) * N;
+            float gb_per_sec = num_byte / 1.E6 / ave_time;
+            std::cout << "Perf: " << std::setw(10) << ave_time << " ms, " << gb_per_sec << " GB/s, "
+                      << op_name << std::endl;
+            if(ave_time < best_ave_time)
+            {
+                found           = true;
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_gb_per_sec << " GB/s, "
+              << best_op_name << std::endl;
+    // run the best intance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            a_device_buf.GetDeviceBuffer(),
+            b_device_buf.GetDeviceBuffer(),
+            {d0_device_buf.GetDeviceBuffer(), d1_device_buf.GetDeviceBuffer()},
+            gamma_device_buf.GetDeviceBuffer(),
+            beta_device_buf.GetDeviceBuffer(),
+            h_device_buf.GetDeviceBuffer(),
+            M,
+            N,
+            K,
+            StrideA,
+            StrideB,
+            {StrideD0, StrideD1},
+            StrideH,
+            epsilon,
+            a_element_op,
+            b_element_op,
+            cde_element_op,
+            h_element_op);
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            size_t workspace_sz = op_ptr->GetWorkSpaceSize(argument_ptr.get());
+            SimpleDeviceMem workspace_dev(workspace_sz);
+            op_ptr->SetWorkSpacePointer(argument_ptr.get(), workspace_dev.GetDeviceBuffer());
+            h_device_buf.SetZero();
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+    return 0;
+}
\ No newline at end of file
--- a/client_example/08_fused_attention/CMakeLists.txt
+++ b/client_example/08_fused_attention/CMakeLists.txt
 add_executable(client_fused_attention fused_attention.cpp)
 target_link_libraries(client_fused_attention PRIVATE composable_kernel::device_operations)
+add_executable(client_fused_attention_bias fused_attention_bias.cpp)
+target_link_libraries(client_fused_attention_bias PRIVATE composable_kernel::device_operations)
--- a/client_example/08_fused_attention/fused_attention_bias.cpp
+++ b/client_example/08_fused_attention/fused_attention_bias.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_bias_softmax_gemm_permute.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+using AElementOp    = ck::tensor_operation::element_wise::PassThrough;
+using B0ElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using Acc0ElementOp = ck::tensor_operation::element_wise::ScaleAdd;
+using B1ElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CElementOp    = ck::tensor_operation::element_wise::PassThrough;
+constexpr static auto MaskingSpec =
+    ck::tensor_operation::device::MaskingSpecialization::MaskDisabled;
+using ADataType   = ck::half_t;
+using B0DataType  = ck::half_t;
+using B1DataType  = ck::half_t;
+using CDataType   = ck::half_t;
+using D0DataType  = ck::half_t;
+using AccDataType = float;
+struct SimpleDeviceMem
+{
+    SimpleDeviceMem() = delete;
+    SimpleDeviceMem(std::size_t mem_size) : p_mem_{}
+    {
+        (void)hipMalloc(static_cast<void**>(&p_mem_), mem_size);
+    }
+    void* GetDeviceBuffer() { return p_mem_; }
+    ~SimpleDeviceMem() { (void)hipFree(p_mem_); }
+    void* p_mem_;
+};
+int main(int argc, char* argv[])
+{
+    int G0 = 48;
+    int G1 = 16;
+    int M  = 1024;
+    int N  = 1024;
+    int K  = 64;
+    int O  = 64;
+    // A layout [G0, M, G1, K]
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1};
+    // B0 layout [G0, N, G1, K]
+    std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
+    std::vector<ck::index_t> b0_gs_ns_ks_strides{N * G1 * K, K, G1 * K, 1};
+    // B1 layout [G0, N, G1, O]
+    std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, O, N};
+    std::vector<ck::index_t> b1_gs_os_ns_strides{N * G1 * O, O, 1, G1 * O};
+    // C layout [G0, M, G1, O]
+    std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
+    std::vector<ck::index_t> c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1};
+    // D layout [G0, M, G1, N]
+    std::vector<ck::index_t> d0_gs_ms_ns_lengths{G0, G1, M, N};
+    std::vector<ck::index_t> d0_gs_ms_ns_strides{M * G1 * N, N, G1 * N, 1};
+    SimpleDeviceMem a_device_buf(sizeof(ADataType) * G0 * G1 * M * K);
+    SimpleDeviceMem b0_device_buf(sizeof(B0DataType) * G0 * G1 * N * K);
+    SimpleDeviceMem d0_device_buf(sizeof(D0DataType) * G0 * G1 * M * N);
+    SimpleDeviceMem b1_device_buf(sizeof(B1DataType) * G0 * G1 * O * N);
+    SimpleDeviceMem c_device_buf(sizeof(CDataType) * G0 * G1 * M * O);
+    using DeviceOp =
+        ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute<2,
+                                                                          1,
+                                                                          1,
+                                                                          1,
+                                                                          1,
+                                                                          ADataType,
+                                                                          B0DataType,
+                                                                          B1DataType,
+                                                                          CDataType,
+                                                                          ck::Tuple<D0DataType>,
+                                                                          ck::Tuple<>,
+                                                                          AElementOp,
+                                                                          B0ElementOp,
+                                                                          Acc0ElementOp,
+                                                                          B1ElementOp,
+                                                                          CElementOp,
+                                                                          MaskingSpec>;
+    // get device op instances
+    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+        DeviceOp>::GetInstances();
+    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
+    std::string best_op_name;
+    int best_op_id        = -1;
+    float best_ave_time   = 0;
+    float best_tflops     = 0;
+    float best_gb_per_sec = 0;
+    // profile device op instances
+    std::cout << "Run all instances and do timing" << std::endl;
+    for(int i = 0; i < op_ptrs.size(); ++i)
+    {
+        auto& op_ptr      = op_ptrs[i];
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            a_device_buf.GetDeviceBuffer(),
+            b0_device_buf.GetDeviceBuffer(),
+            b1_device_buf.GetDeviceBuffer(),
+            c_device_buf.GetDeviceBuffer(),
+            std::array<void*, 1>{d0_device_buf.GetDeviceBuffer()}, // p_acc0_biases
+            {},                                                    // p_acc1_biases
+            a_gs_ms_ks_lengths,
+            a_gs_ms_ks_strides,
+            b0_gs_ns_ks_lengths,
+            b0_gs_ns_ks_strides,
+            b1_gs_os_ns_lengths,
+            b1_gs_os_ns_strides,
+            c_gs_ms_os_lengths,
+            c_gs_ms_os_strides,
+            std::array<std::vector<ck::index_t>, 1>{
+                d0_gs_ms_ns_lengths}, // acc0_biases_gs_ms_ns_lengths
+            std::array<std::vector<ck::index_t>, 1>{
+                d0_gs_ms_ns_strides}, // acc0_biases_gs_ms_ns_strides
+            {},                       // acc1_biases_gs_ms_os_lengths
+            {},                       // acc1_biases_gs_ms_os_strides
+            AElementOp{},
+            B0ElementOp{},
+            Acc0ElementOp{1 / sqrtf(K)},
+            B1ElementOp{},
+            CElementOp{});
+        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
+        std::string op_name = op_ptr->GetTypeString();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            float ave_time = invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, true});
+            std::size_t flop      = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * G0 * G1;
+            std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N +
+                                     sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O +
+                                     sizeof(D0DataType) * M * N) *
+                                    G0 * G1;
+            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+            float gb_per_sec = num_btype / 1.E6 / ave_time;
+            std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                      << " GB/s, " << op_name << std::endl;
+            if(tflops > best_tflops)
+            {
+                best_op_id      = i;
+                best_op_name    = op_name;
+                best_tflops     = tflops;
+                best_ave_time   = ave_time;
+                best_gb_per_sec = gb_per_sec;
+            }
+        }
+        else
+        {
+            std::cout << op_name << " does not support this problem" << std::endl;
+        }
+    }
+    std::cout << "Best Perf: " << best_ave_time << " ms, " << best_tflops << " TFlops, "
+              << best_gb_per_sec << " GB/s, " << best_op_name << std::endl;
+    // run the best instance
+    {
+        auto& op_ptr = op_ptrs[best_op_id];
+        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
+                  << std::endl;
+        auto argument_ptr = op_ptr->MakeArgumentPointer(
+            a_device_buf.GetDeviceBuffer(),
+            b0_device_buf.GetDeviceBuffer(),
+            b1_device_buf.GetDeviceBuffer(),
+            c_device_buf.GetDeviceBuffer(),
+            std::array<void*, 1>{d0_device_buf.GetDeviceBuffer()}, // p_acc0_biases
+            {},                                                    // p_acc1_biases
+            a_gs_ms_ks_lengths,
+            a_gs_ms_ks_strides,
+            b0_gs_ns_ks_lengths,
+            b0_gs_ns_ks_strides,
+            b1_gs_os_ns_lengths,
+            b1_gs_os_ns_strides,
+            c_gs_ms_os_lengths,
+            c_gs_ms_os_strides,
+            std::array<std::vector<ck::index_t>, 1>{
+                d0_gs_ms_ns_lengths}, // acc0_biases_gs_ms_ns_lengths
+            std::array<std::vector<ck::index_t>, 1>{
+                d0_gs_ms_ns_strides}, // acc0_biases_gs_ms_ns_strides
+            {},                       // acc1_biases_gs_ms_os_lengths
+            {},                       // acc1_biases_gs_ms_os_strides
+            AElementOp{},
+            B0ElementOp{},
+            Acc0ElementOp{1 / sqrtf(K)},
+            B1ElementOp{},
+            CElementOp{});
+        auto invoker_ptr = op_ptr->MakeInvokerPointer();
+        if(op_ptr->IsSupportedArgument(argument_ptr.get()))
+        {
+            invoker_ptr->Run(argument_ptr.get(), StreamConfig{nullptr, false});
+        }
+        std::cout << "Done" << std::endl;
+    }
+    return 0;
+}
--- a/client_example/15_gemm_add_multiply/gemm_add_multiply.cpp
+++ b/client_example/15_gemm_add_multiply/gemm_add_multiply.cpp
@@ -92,7 +92,7 @@ int main(int argc, char* argv[])
        [](std::size_t nRow, std::size_t nCol, std::size_t stride, auto layout) {
            using Layout = decltype(layout);
-            if(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
+            if constexpr(std::is_same<Layout, ck::tensor_layout::gemm::RowMajor>::value)
            {
                return (nRow - 1) * stride + nCol;
            }

--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_naive_fp16.cpp
@@ -4,7 +4,6 @@
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
-#include <cstdlib>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -116,7 +115,7 @@ auto f_host_tensor_descriptor2d =
    [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
        using namespace ck::literals;
-        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        if constexpr(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
            return HostTensorDescriptor({row, col}, {stride, 1_uz});
        }

--- a/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_bias_relu_add_layernorm_xdl_welford_fp16.cpp
@@ -4,7 +4,6 @@
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
-#include <cstdlib>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -15,6 +14,7 @@
 #include "ck/library/utility/device_memory.hpp"
 #include "ck/library/utility/host_tensor.hpp"
 #include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_gemm.hpp"
 #include "ck/library/reference_tensor_operation/cpu/reference_layernorm.hpp"
 #include "ck/library/utility/check_err.hpp"
@@ -69,21 +69,20 @@ using DeviceOpInstance = ck::tensor_operation::device::DeviceGemmMultipleDLayern
 // clang-format on
 auto f_host_tensor_descriptor1d = [](std::size_t len, std::size_t stride) {
-    return HostTensorDescriptor(std::vector<std::size_t>({len}),
+    return HostTensorDescriptor({len}, {stride});
-                                std::vector<std::size_t>({stride}));
 };
 auto f_host_tensor_descriptor2d =
    [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
-        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        using namespace ck::literals;
+        if constexpr(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
-            return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+            return HostTensorDescriptor({row, col}, {stride, 1_uz});
-                                        std::vector<std::size_t>({stride, 1}));
        }
        else
        {
-            return HostTensorDescriptor(std::vector<std::size_t>({row, col}),
+            return HostTensorDescriptor({row, col}, {1_uz, stride});
-                                        std::vector<std::size_t>({1, stride}));
        }
    };
@@ -97,6 +96,7 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n,
                         AElementOp a_element_op,
                         BElementOp b_element_op,
                         CDEElementOp cde_element_op,
+                         HElementOp h_element_op,
                         int M,
                         int N,
                         AccDataType epsilon = 1e-5)
@@ -145,7 +145,7 @@ void host_gemm_layernorm(Tensor<HDataType>& h_m_n,
    auto ref_layernorm_invoker = ref_layernorm.MakeInvoker();
    auto ref_layernorm_argument = ref_layernorm.MakeArgument(
-        e_m_n, gamma_n, beta_n, h_m_n, HElementOp{}, {M, N}, {1}, epsilon);
+        e_m_n, gamma_n, beta_n, h_m_n, h_element_op, {M, N}, {1}, epsilon);
    ref_layernorm_invoker.Run(ref_layernorm_argument);
 }
@@ -249,6 +249,7 @@ int main()
                            a_element_op,
                            b_element_op,
                            cde_element_op,
+                            h_element_op,
                            M,
                            N,
                            epsilon);

--- a/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_layernorm_xdl_naive_fp16.cpp
@@ -4,7 +4,6 @@
 #include <iostream>
 #include <numeric>
 #include <initializer_list>
-#include <cstdlib>
 #include "ck/ck.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
@@ -115,7 +114,7 @@ auto f_host_tensor_descriptor2d =
    [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
        using namespace ck::literals;
-        if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+        if constexpr(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
        {
            return HostTensorDescriptor({row, col}, {stride, 1_uz});
        }

--- a/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
+++ b/example/21_gemm_layernorm/gemm_xdl_layernorm_naive_single_kernel_fp16.cpp
@@ -135,7 +135,7 @@ int main(int argc, char* argv[])
        [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
            using namespace ck::literals;
-            if(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
+            if constexpr(std::is_same<decltype(layout), ck::tensor_layout::gemm::RowMajor>::value)
            {
                return HostTensorDescriptor({row, col}, {stride, 1_uz});
            }

--- a/example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
+++ b/example/47_gemm_bias_softmax_gemm_permute/CMakeLists.txt
+add_example_executable(example_gemm_bias_softmax_gemm_permute gemm_bias_softmax_gemm_permute.cpp)
--- a/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
+++ b/example/47_gemm_bias_softmax_gemm_permute/gemm_bias_softmax_gemm_permute.cpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <iostream>
+#include <vector>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
+#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
+#include "ck/library/utility/check_err.hpp"
+#include "ck/library/utility/device_memory.hpp"
+#include "ck/library/utility/host_tensor.hpp"
+#include "ck/library/utility/host_tensor_generator.hpp"
+#include "ck/library/utility/literals.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
+#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+using PassThrough = ck::tensor_operation::element_wise::PassThrough;
+using AElementOp    = ck::tensor_operation::element_wise::PassThrough;
+using B0ElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using C0DEElementOp = ck::tensor_operation::element_wise::ScaleAdd;
+using Acc0ElementOp = ck::tensor_operation::element_wise::PassThrough;
+using B1ElementOp   = ck::tensor_operation::element_wise::PassThrough;
+using CElementOp    = ck::tensor_operation::element_wise::PassThrough;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+constexpr static auto MaskingSpec =
+    ck::tensor_operation::device::MaskingSpecialization::MaskDisabled;
+static constexpr auto TensorSpecA  = ck::tensor_operation::device::TensorSpecialization::Default;
+static constexpr auto TensorSpecB0 = ck::tensor_operation::device::TensorSpecialization::Default;
+static constexpr auto TensorSpecB1 = ck::tensor_operation::device::TensorSpecialization::Default;
+static constexpr auto TensorSpecC  = ck::tensor_operation::device::TensorSpecialization::Default;
+using F16              = ck::half_t;
+using F32              = float;
+using ADataType        = F16;
+using B0DataType       = F16;
+using B1DataType       = F16;
+using AccDataType      = F32;
+using CShuffleDataType = F32;
+using CDataType        = F16;
+using D0DataType       = F16;
+using Acc0BiasDataType = ck::Tuple<D0DataType>;
+using Acc1BiasDataType = ck::Tuple<>;
+static constexpr ck::index_t NumDimG = 2;
+static constexpr ck::index_t NumDimM = 1;
+static constexpr ck::index_t NumDimN = 1;
+static constexpr ck::index_t NumDimK = 1;
+static constexpr ck::index_t NumDimO = 1;
+using DeviceOpInstance =
+    ck::tensor_operation::device::DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<
+        NumDimG,
+        NumDimM,
+        NumDimN,
+        NumDimK,
+        NumDimO,
+        ADataType,
+        B0DataType,
+        B1DataType,
+        CDataType,
+        Acc0BiasDataType,
+        Acc1BiasDataType,
+        AccDataType,
+        CShuffleDataType,
+        AElementOp,
+        B0ElementOp,
+        C0DEElementOp,
+        B1ElementOp,
+        CElementOp,
+        GemmSpec,
+        TensorSpecA,
+        TensorSpecB0,
+        TensorSpecB1,
+        TensorSpecC,
+        1,
+        256,
+        128,         // MPerBlock
+        128,         // NPerBlock
+        32,          // KPerBlock
+        64,          // Gemm1NPerBlock
+        32,          // Gemm1KPerBlock
+        8,           // AK1
+        8,           // BK1
+        2,           // B1K1
+        32,          // MPerXDL
+        32,          // NPerXDL
+        1,           // MXdlPerWave
+        4,           // NXdlPerWave
+        2,           // Gemm1NXdlPerWave
+        S<4, 64, 1>, // ABlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<4, 64, 1>, // BBlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<16, 16, 1>, // B1BlockTransfer
+        S<0, 2, 1>,
+        S<0, 2, 1>,
+        1,
+        4,
+        2,
+        false,
+        1,              // CShuffleMXdlPerWavePerShuffle
+        2,              // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+        MaskingSpec>;   // MaskingSpecialization
+// Ref Gemm0: fp16 in, fp32 out
+using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                                                B0DataType,
+                                                                                AccDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                B0ElementOp,
+                                                                                Acc0ElementOp>;
+// Ref Softmax: fp32 in, fp16 out
+using ReferenceSoftmaxInstance =
+    ck::tensor_operation::host::ReferenceSoftmax<AccDataType, ADataType, AccDataType>;
+// Ref Gemm1: fp16 in, fp16 out
+using ReferenceGemm1Instance = ck::tensor_operation::host::ReferenceBatchedGemm<ADataType,
+                                                                                B1DataType,
+                                                                                CDataType,
+                                                                                AccDataType,
+                                                                                AElementOp,
+                                                                                B1ElementOp,
+                                                                                CElementOp>;
+int main(int argc, char* argv[])
+{
+    bool do_verification = true;
+    int init_method      = 1;
+    bool time_kernel     = false;
+    int G0      = 3;
+    int G1      = 2;
+    int M       = 1024;
+    int N       = 1024;
+    int K       = 64;
+    int O       = 64;
+    float alpha = 1;
+    if(argc == 1)
+    {
+        // use default case
+    }
+    else if(argc == 4)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+    }
+    else if(argc == 11)
+    {
+        do_verification = std::stoi(argv[1]);
+        init_method     = std::stoi(argv[2]);
+        time_kernel     = std::stoi(argv[3]);
+        M  = std::stoi(argv[4]);
+        N  = std::stoi(argv[5]);
+        K  = std::stoi(argv[6]);
+        O  = std::stoi(argv[7]);
+        G0 = std::stoi(argv[8]);
+        G1 = std::stoi(argv[9]);
+        alpha = std::stof(argv[10]);
+    }
+    else
+    {
+        printf("arg1: verification (0=no, 1=yes)\n");
+        printf("arg2: initialization (0=no init, 1=integer value, 2=decimal value)\n");
+        printf("arg3: time kernel (0=no, 1=yes)\n");
+        printf("arg4 to 11: M, N, K, O, G0, G1\n");
+        printf("arg10: scale (alpha)\n");
+        exit(0);
+    }
+    std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
+    std::vector<ck::index_t> a_gs_ms_ks_strides{
+        M * G1 * K, K, G1 * K, 1}; // A layout [G0, M, G1, K]
+    std::vector<ck::index_t> b0_gs_ns_ks_lengths{G0, G1, N, K};
+    std::vector<ck::index_t> b0_gs_ns_ks_strides{
+        N * G1 * K, K, G1 * K, 1}; // B0 layout [G0, N, G1, K]
+    std::vector<ck::index_t> b1_gs_os_ns_lengths{G0, G1, O, N};
+    std::vector<ck::index_t> b1_gs_os_ns_strides{
+        N * G1 * O, O, 1, G1 * O}; // B1 layout [G0, N, G1, O]
+    std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
+    std::vector<ck::index_t> c_gs_ms_os_strides{
+        M * G1 * O, O, G1 * O, 1}; // C layout [G0, M, G1, O]
+    // D layout [G0, M, G1, N]
+    std::vector<ck::index_t> d0_gs_ms_ns_lengths{G0, G1, M, N};
+    std::vector<ck::index_t> d0_gs_ms_ns_strides{M * G1 * N, N, G1 * N, 1};
+    Tensor<ADataType> a_gs_ms_ks(a_gs_ms_ks_lengths, a_gs_ms_ks_strides);
+    Tensor<B0DataType> b0_gs_ns_ks(b0_gs_ns_ks_lengths, b0_gs_ns_ks_strides);
+    Tensor<B1DataType> b1_gs_os_ns(b1_gs_os_ns_lengths, b1_gs_os_ns_strides);
+    Tensor<D0DataType> d0_gs_ms_ns(d0_gs_ms_ns_lengths, d0_gs_ms_ns_strides);
+    Tensor<CDataType> c_gs_ms_os_host_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+    Tensor<CDataType> c_gs_ms_os_device_result(c_gs_ms_os_lengths, c_gs_ms_os_strides);
+    std::cout << "a_gs_ms_ks: " << a_gs_ms_ks.mDesc << std::endl;
+    std::cout << "b0_gs_ns_ks: " << b0_gs_ns_ks.mDesc << std::endl;
+    std::cout << "b1_gs_os_ns: " << b1_gs_os_ns.mDesc << std::endl;
+    std::cout << "c_gs_ms_os: " << c_gs_ms_os_host_result.mDesc << std::endl;
+    switch(init_method)
+    {
+    case 0: break;
+    case 1:
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_2<B0DataType>{-2, 2});
+        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_2<B1DataType>{-2, 2});
+        d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-2, 2});
+        break;
+    case 2:
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_3<B0DataType>{0.0, 1.0});
+        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_3<B1DataType>{-0.5, 0.5});
+        d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_2<D0DataType>{-1, 1});
+        break;
+    case 3:
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_2<ADataType>{-2, 2});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
+        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
+        d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1<D0DataType>{1});
+        break;
+    default:
+        a_gs_ms_ks.GenerateTensorValue(GeneratorTensor_Sequential<2>{});
+        b0_gs_ns_ks.GenerateTensorValue(GeneratorTensor_Diagonal<B0DataType>{});
+        b1_gs_os_ns.GenerateTensorValue(GeneratorTensor_Diagonal<B1DataType>{});
+        d0_gs_ms_ns.GenerateTensorValue(GeneratorTensor_1<D0DataType>{1});
+    }
+    DeviceMem a_device_buf(sizeof(ADataType) * G0 * G1 * M * K);
+    DeviceMem b0_device_buf(sizeof(B0DataType) * G0 * G1 * N * K);
+    DeviceMem d0_device_buf(sizeof(D0DataType) * G0 * G1 * M * N);
+    DeviceMem b1_device_buf(sizeof(B1DataType) * G0 * G1 * O * N);
+    DeviceMem c_device_buf(sizeof(CDataType) * G0 * G1 * M * O);
+    a_device_buf.ToDevice(a_gs_ms_ks.mData.data());
+    b0_device_buf.ToDevice(b0_gs_ns_ks.mData.data());
+    b1_device_buf.ToDevice(b1_gs_os_ns.mData.data());
+    d0_device_buf.ToDevice(d0_gs_ms_ns.mData.data());
+    auto device_op = DeviceOpInstance{};
+    auto invoker   = device_op.MakeInvoker();
+    auto a_element_op    = AElementOp{};
+    auto b0_element_op   = B0ElementOp{};
+    auto c0de_element_op = C0DEElementOp{alpha};
+    auto acc0_element_op = Acc0ElementOp{};
+    auto b1_element_op   = B1ElementOp{};
+    auto c_element_op    = CElementOp{};
+    auto argument = device_op.MakeArgument(
+        static_cast<const ADataType*>(a_device_buf.GetDeviceBuffer()),
+        static_cast<const B0DataType*>(b0_device_buf.GetDeviceBuffer()),
+        static_cast<const B1DataType*>(b1_device_buf.GetDeviceBuffer()),
+        static_cast<CDataType*>(c_device_buf.GetDeviceBuffer()),
+        std::array<void*, 1>{d0_device_buf.GetDeviceBuffer()}, // p_acc0_biases
+        {},                                                    // p_acc1_biases
+        a_gs_ms_ks_lengths,
+        a_gs_ms_ks_strides,
+        b0_gs_ns_ks_lengths,
+        b0_gs_ns_ks_strides,
+        b1_gs_os_ns_lengths,
+        b1_gs_os_ns_strides,
+        c_gs_ms_os_lengths,
+        c_gs_ms_os_strides,
+        std::array<std::vector<ck::index_t>, 1>{
+            d0_gs_ms_ns_lengths}, // acc0_biases_gs_ms_ns_lengths
+        std::array<std::vector<ck::index_t>, 1>{
+            d0_gs_ms_ns_strides}, // acc0_biases_gs_ms_ns_strides
+        {},                       // acc1_biases_gs_ms_os_lengths
+        {},                       // acc1_biases_gs_ms_os_strides
+        a_element_op,
+        b0_element_op,
+        c0de_element_op,
+        b1_element_op,
+        c_element_op);
+    if(!device_op.IsSupportedArgument(argument))
+    {
+        throw std::runtime_error("wrong! this device_op instance does not support this problem");
+    }
+    float ave_time = invoker.Run(argument, StreamConfig{nullptr, time_kernel});
+    ck::index_t BatchCount = G0 * G1;
+    std::size_t flop       = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * BatchCount;
+    std::size_t num_btype =
+        (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N + sizeof(B1DataType) * N * O +
+         sizeof(CDataType) * M * O + sizeof(D0DataType) * M * N) *
+        BatchCount;
+    float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
+    float gb_per_sec = num_btype / 1.E6 / ave_time;
+    std::cout << "Perf: " << ave_time << " ms, " << tflops << " TFlops, " << gb_per_sec << " GB/s, "
+              << std::endl;
+    if(do_verification)
+    {
+        c_device_buf.FromDevice(c_gs_ms_os_device_result.mData.data());
+        Tensor<ADataType> a_g_m_k({BatchCount, M, K});
+        Tensor<B0DataType> b0_g_k_n({BatchCount, K, N});
+        Tensor<B1DataType> b1_g_n_o({BatchCount, N, O});
+        Tensor<AccDataType> acc0_g_m_n({BatchCount, M, N});        // scratch object after gemm0
+        Tensor<ADataType> a1_g_m_n({BatchCount, M, N});            // scratch object after softmax
+        Tensor<CDataType> c_g_m_o_host_result({BatchCount, M, O}); // scratch object after gemm1
+        Tensor<D0DataType> d0_g_m_n({BatchCount, M, N});
+        // permute
+        a_gs_ms_ks.ForEach([&](auto& self, auto idx) {
+            a_g_m_k(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx);
+        });
+        b0_gs_ns_ks.ForEach([&](auto& self, auto idx) {
+            b0_g_k_n(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx);
+        });
+        b1_gs_os_ns.ForEach([&](auto& self, auto idx) {
+            b1_g_n_o(idx[0] * G1 + idx[1], idx[3], idx[2]) = self(idx);
+        });
+        d0_gs_ms_ns.ForEach([&](auto& self, auto idx) {
+            d0_g_m_n(idx[0] * G1 + idx[1], idx[2], idx[3]) = self(idx);
+        });
+        // gemm 0
+        auto ref_gemm0          = ReferenceGemm0Instance{};
+        auto ref_gemm0_invoker  = ref_gemm0.MakeInvoker();
+        auto ref_gemm0_argument = ref_gemm0.MakeArgument(
+            a_g_m_k, b0_g_k_n, acc0_g_m_n, a_element_op, b0_element_op, acc0_element_op);
+        ref_gemm0_invoker.Run(ref_gemm0_argument);
+        acc0_g_m_n.ForEach([&](auto&, auto idx) {
+            c0de_element_op(acc0_g_m_n(idx), acc0_g_m_n(idx), d0_g_m_n(idx));
+        });
+        // masking
+        const auto mask = DeviceOpInstance::C0MatrixMask(N);
+        acc0_g_m_n.ForEach([&](auto& self, auto idx) {
+            if(mask.IsMaskedElement(idx[1], idx[2]))
+                self(idx) = -ck::NumericLimits<float>::Infinity();
+        });
+        // softmax
+        auto ref_softmax          = ReferenceSoftmaxInstance{};
+        auto ref_softmax_invoker  = ref_softmax.MakeInvoker();
+        auto ref_softmax_argument = ref_softmax.MakeArgument(acc0_g_m_n, a1_g_m_n, 1, 0, {2});
+        ref_softmax_invoker.Run(ref_softmax_argument);
+        // gemm1
+        auto ref_gemm1          = ReferenceGemm1Instance{};
+        auto ref_gemm1_invoker  = ref_gemm1.MakeInvoker();
+        auto ref_gemm1_argument = ref_gemm1.MakeArgument(
+            a1_g_m_n, b1_g_n_o, c_g_m_o_host_result, PassThrough{}, b1_element_op, c_element_op);
+        ref_gemm1_invoker.Run(ref_gemm1_argument);
+        // permute
+        c_gs_ms_os_host_result.ForEach([&](auto& self, auto idx) {
+            const size_t& g0 = idx[0];
+            const size_t& g1 = idx[1];
+            const size_t g = g0 * G1 + g1;
+            self(idx) = c_g_m_o_host_result(g, idx[2], idx[3]);
+        });
+        // default absolute error and relative error is 0.001
+        double rtol = 1e-3;
+        double atol = 1e-3;
+        return ck::utils::check_err(c_gs_ms_os_device_result.mData,
+                                    c_gs_ms_os_host_result.mData,
+                                    "Error: Incorrect results!",
+                                    rtol,
+                                    atol)
+                   ? 0
+                   : 1;
+    }
+    return 0;
+}
--- a/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
+++ b/include/ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp
@@ -26,9 +26,9 @@ template <index_t NumDimG,
          typename Acc1BiasDataType,
          typename AElementwiseOperation,
          typename B0ElementwiseOperation,
-          typename Acc0ElementwiseOperation,
+          typename C0DEElementwiseOperation,
          typename B1ElementwiseOperation,
-          typename CElementwiseOperation,
+          typename C1DEElementwiseOperation,
          MaskingSpecialization MaskingSpec>
 struct DeviceBatchedGemmSoftmaxGemmPermute : public BaseOperator
 {
@@ -58,9 +58,9 @@ struct DeviceBatchedGemmSoftmaxGemmPermute : public BaseOperator
            acc1_biases_gs_ms_gemm1ns_strides, // acc1_biases_gs_ms_os_strides
        AElementwiseOperation a_element_op,
        B0ElementwiseOperation b0_element_op,
-        Acc0ElementwiseOperation acc0_element_op,
+        C0DEElementwiseOperation c0de_element_op,
        B1ElementwiseOperation b1_element_op,
-        CElementwiseOperation c_element_op) = 0;
+        C1DEElementwiseOperation c1de_element_op) = 0;
    virtual std::unique_ptr<BaseInvoker> MakeInvokerPointer() = 0;
 };

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp