expose device instance interface

77404e83 · ltqin · bb673452 · 77404e83 · 77404e83 · 77404e83
Commit 77404e83 authored Mar 22, 2023 by ltqin
4 changed files
--- a/client_example/08_fused_attention/CMakeLists.txt
+++ b/client_example/08_fused_attention/CMakeLists.txt
@@ -4,8 +4,7 @@ target_link_libraries(client_fused_attention PRIVATE composable_kernel::device_o
 add_executable(client_fused_attention_bias fused_attention_bias.cpp)
 target_link_libraries(client_fused_attention_bias PRIVATE composable_kernel::device_operations)
-add_executable(client_fused_attention_mask fused_attention_mask.cpp)
-target_link_libraries(client_fused_attention_mask PRIVATE composable_kernel::device_operations)
 add_executable(client_fused_attention_bias_mask fused_attention_bias_mask.cpp)
 target_link_libraries(client_fused_attention_bias_mask PRIVATE composable_kernel::device_operations)
+add_executable(client_fused_attention_no_lib fused_attention_no_lib.cpp)
--- a/client_example/08_fused_attention/fused_attention_mask.cpp
+++ b/client_example/08_fused_attention/fused_attention_mask.cpp
@@ -5,14 +5,14 @@
 #include <vector>
 #include "ck/ck.hpp"
-#include "ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute_general.hpp"
+#include "ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_multiple_d_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instance.hpp"
 #include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
 #include "ck/tensor_operation/gpu/device/device_batched_gemm_softmax_gemm_permute.hpp"
 #include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
 using AElementOp    = ck::tensor_operation::element_wise::PassThrough;
 using B0ElementOp   = ck::tensor_operation::element_wise::PassThrough;
-using Acc0ElementOp = ck::tensor_operation::element_wise::ScaleMask;
+using Acc0ElementOp = ck::tensor_operation::element_wise::Scale;
 using B1ElementOp   = ck::tensor_operation::element_wise::PassThrough;
 using CElementOp    = ck::tensor_operation::element_wise::PassThrough;
@@ -23,7 +23,6 @@ using ADataType   = ck::half_t;
 using B0DataType  = ck::half_t;
 using B1DataType  = ck::half_t;
 using CDataType   = ck::half_t;
-using D0DataType  = int32_t;
 using AccDataType = float;
 struct SimpleDeviceMem
@@ -42,7 +41,7 @@ struct SimpleDeviceMem
    void* p_mem_;
 };
-int main(int argc, char* argv[])
+int main()
 {
    int G0 = 48;
    int G1 = 16;
@@ -50,7 +49,7 @@ int main(int argc, char* argv[])
    int N  = 1024;
    int K  = 64;
    int O  = 64;
    // A layout [G0, M, G1, K]
    std::vector<ck::index_t> a_gs_ms_ks_lengths{G0, G1, M, K};
    std::vector<ck::index_t> a_gs_ms_ks_strides{M * G1 * K, K, G1 * K, 1};
@@ -67,13 +66,8 @@ int main(int argc, char* argv[])
    std::vector<ck::index_t> c_gs_ms_os_lengths{G0, G1, M, O};
    std::vector<ck::index_t> c_gs_ms_os_strides{M * G1 * O, O, G1 * O, 1};
-    // D layout [G0, M, G1, N]
-    std::vector<ck::index_t> d0_gs_ms_ns_lengths{G0, G1, M, N};
-    std::vector<ck::index_t> d0_gs_ms_ns_strides{M * G1 * N, N, G1 * N, 1};
    SimpleDeviceMem a_device_buf(sizeof(ADataType) * G0 * G1 * M * K);
    SimpleDeviceMem b0_device_buf(sizeof(B0DataType) * G0 * G1 * N * K);
-    SimpleDeviceMem d0_device_buf(sizeof(D0DataType) * G0 * G1 * M * N);
    SimpleDeviceMem b1_device_buf(sizeof(B1DataType) * G0 * G1 * O * N);
    SimpleDeviceMem c_device_buf(sizeof(CDataType) * G0 * G1 * M * O);
@@ -87,7 +81,7 @@ int main(int argc, char* argv[])
                                                                          B0DataType,
                                                                          B1DataType,
                                                                          CDataType,
-                                                                          ck::Tuple<D0DataType>,
+                                                                          ck::Tuple<>,
                                                                          ck::Tuple<>,
                                                                          AElementOp,
                                                                          B0ElementOp,
@@ -95,10 +89,11 @@ int main(int argc, char* argv[])
                                                                          B1ElementOp,
                                                                          CElementOp,
                                                                          MaskingSpec>;
    // get device op instances
-    const auto op_ptrs = ck::tensor_operation::device::instance::DeviceOperationInstanceFactory<
+    std::vector<std::unique_ptr<DeviceOp>> op_ptrs;
-        DeviceOp>::GetInstances();
+    ck::tensor_operation::device::instance::
+        add_device_batched_gemm_mutiple_d_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instances(
+            op_ptrs);
    std::cout << "found " << op_ptrs.size() << " instances" << std::endl;
@@ -111,35 +106,32 @@ int main(int argc, char* argv[])
    // profile device op instances
    std::cout << "Run all instances and do timing" << std::endl;
-    for(int i = 0; i < op_ptrs.size(); ++i)
+    for(size_t i = 0; i < op_ptrs.size(); ++i)
    {
        auto& op_ptr      = op_ptrs[i];
-        auto argument_ptr = op_ptr->MakeArgumentPointer(
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
-            a_device_buf.GetDeviceBuffer(),
+                                                        b0_device_buf.GetDeviceBuffer(),
-            b0_device_buf.GetDeviceBuffer(),
+                                                        b1_device_buf.GetDeviceBuffer(),
-            b1_device_buf.GetDeviceBuffer(),
+                                                        c_device_buf.GetDeviceBuffer(),
-            c_device_buf.GetDeviceBuffer(),
+                                                        {}, // p_acc0_biases
-            std::array<void*, 1>{d0_device_buf.GetDeviceBuffer()}, // p_acc0_biases
+                                                        {}, // p_acc1_biases
-            {},                                                    // p_acc1_biases
+                                                        a_gs_ms_ks_lengths,
-            a_gs_ms_ks_lengths,
+                                                        a_gs_ms_ks_strides,
-            a_gs_ms_ks_strides,
+                                                        b0_gs_ns_ks_lengths,
-            b0_gs_ns_ks_lengths,
+                                                        b0_gs_ns_ks_strides,
-            b0_gs_ns_ks_strides,
+                                                        b1_gs_os_ns_lengths,
-            b1_gs_os_ns_lengths,
+                                                        b1_gs_os_ns_strides,
-            b1_gs_os_ns_strides,
+                                                        c_gs_ms_os_lengths,
-            c_gs_ms_os_lengths,
+                                                        c_gs_ms_os_strides,
-            c_gs_ms_os_strides,
+                                                        {}, // acc0_biases_gs_ms_ns_lengths
-            std::array<std::vector<ck::index_t>, 1>{
+                                                        {}, // acc0_biases_gs_ms_ns_strides
-                d0_gs_ms_ns_lengths}, // acc0_biases_gs_ms_ns_lengths
+                                                        {}, // acc1_biases_gs_ms_os_lengths
-            std::array<std::vector<ck::index_t>, 1>{
+                                                        {}, // acc1_biases_gs_ms_os_strides
-                d0_gs_ms_ns_strides}, // acc0_biases_gs_ms_ns_strides
+                                                        AElementOp{},
-            {},                       // acc1_biases_gs_ms_os_lengths
+                                                        B0ElementOp{},
-            {},                       // acc1_biases_gs_ms_os_strides
+                                                        Acc0ElementOp{1 / sqrtf(K)},
-            AElementOp{},
+                                                        B1ElementOp{},
-            B0ElementOp{},
+                                                        CElementOp{});
-            Acc0ElementOp{1 / sqrtf(K), 0.1},
-            B1ElementOp{},
-            CElementOp{});
        auto invoker_ptr    = op_ptr->MakeInvokerPointer();
        std::string op_name = op_ptr->GetTypeString();
@@ -151,8 +143,7 @@ int main(int argc, char* argv[])
            std::size_t flop      = (size_t(M) * N * K * 2 + size_t(M) * N * O * 2) * G0 * G1;
            std::size_t num_btype = (sizeof(ADataType) * M * K + sizeof(B0DataType) * K * N +
-                                     sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O +
+                                     sizeof(B1DataType) * N * O + sizeof(CDataType) * M * O) *
-                                     sizeof(D0DataType) * M * N) *
                                    G0 * G1;
            float tflops = static_cast<float>(flop) / 1.E9 / ave_time;
@@ -185,32 +176,29 @@ int main(int argc, char* argv[])
        auto& op_ptr = op_ptrs[best_op_id];
        std::cout << "Run the best instance without timing: " << op_ptr->GetTypeString()
                  << std::endl;
-        auto argument_ptr = op_ptr->MakeArgumentPointer(
+        auto argument_ptr = op_ptr->MakeArgumentPointer(a_device_buf.GetDeviceBuffer(),
-            a_device_buf.GetDeviceBuffer(),
+                                                        b0_device_buf.GetDeviceBuffer(),
-            b0_device_buf.GetDeviceBuffer(),
+                                                        b1_device_buf.GetDeviceBuffer(),
-            b1_device_buf.GetDeviceBuffer(),
+                                                        c_device_buf.GetDeviceBuffer(),
-            c_device_buf.GetDeviceBuffer(),
+                                                        {}, // p_acc0_biases
-            std::array<void*, 1>{d0_device_buf.GetDeviceBuffer()}, // p_acc0_biases
+                                                        {}, // p_acc1_biases
-            {},                                                    // p_acc1_biases
+                                                        a_gs_ms_ks_lengths,
-            a_gs_ms_ks_lengths,
+                                                        a_gs_ms_ks_strides,
-            a_gs_ms_ks_strides,
+                                                        b0_gs_ns_ks_lengths,
-            b0_gs_ns_ks_lengths,
+                                                        b0_gs_ns_ks_strides,
-            b0_gs_ns_ks_strides,
+                                                        b1_gs_os_ns_lengths,
-            b1_gs_os_ns_lengths,
+                                                        b1_gs_os_ns_strides,
-            b1_gs_os_ns_strides,
+                                                        c_gs_ms_os_lengths,
-            c_gs_ms_os_lengths,
+                                                        c_gs_ms_os_strides,
-            c_gs_ms_os_strides,
+                                                        {}, // acc0_biases_gs_ms_ns_lengths
-            std::array<std::vector<ck::index_t>, 1>{
+                                                        {}, // acc0_biases_gs_ms_ns_strides
-                d0_gs_ms_ns_lengths}, // acc0_biases_gs_ms_ns_lengths
+                                                        {}, // acc1_biases_gs_ms_os_lengths
-            std::array<std::vector<ck::index_t>, 1>{
+                                                        {}, // acc1_biases_gs_ms_os_strides
-                d0_gs_ms_ns_strides}, // acc0_biases_gs_ms_ns_strides
+                                                        AElementOp{},
-            {},                       // acc1_biases_gs_ms_os_lengths
+                                                        B0ElementOp{},
-            {},                       // acc1_biases_gs_ms_os_strides
+                                                        Acc0ElementOp{1 / sqrtf(K)},
-            AElementOp{},
+                                                        B1ElementOp{},
-            B0ElementOp{},
+                                                        CElementOp{});
-            Acc0ElementOp{1 / sqrtf(K), 0.1},
-            B1ElementOp{},
-            CElementOp{});
        auto invoker_ptr = op_ptr->MakeInvokerPointer();

--- a/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_multiple_d_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instance.hpp
+++ b/library/include/ck/library/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_multiple_d_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instance.hpp
+// SPDX-License-Identifier: MIT
+// Copyright (c) 2018-2022, Advanced Micro Devices, Inc. All rights reserved.
+#include <cstdlib>
+#include "ck/ck.hpp"
+#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
+#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
+#include "ck/tensor_operation/gpu/device/impl/device_batched_gemm_softmax_gemm_permute_xdl_cshuffle.hpp"
+#include "ck/library/tensor_operation_instance/add_device_operation_instance.hpp"
+namespace ck {
+namespace tensor_operation {
+namespace device {
+namespace instance {
+using F16 = ck::half_t;
+using F32 = float;
+template <ck::index_t... Is>
+using S = ck::Sequence<Is...>;
+using PassThrough   = ck::tensor_operation::element_wise::PassThrough;
+using ScaleMask     = ck::tensor_operation::element_wise::ScaleMask;
+using ScaleBiasMask = ck::tensor_operation::element_wise::ScaleBiasMask;
+static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecialization::Default;
+static constexpr auto GemmPadded  = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
+static constexpr auto TensorDefault = ck::tensor_operation::device::TensorSpecialization::Default;
+// c[g, m, n] = a[g, m, k] * b[g, n, k]
+template <index_t NumDimG,
+          index_t NumDimM,
+          index_t NumDimN,
+          index_t NumDimK,
+          index_t NumDimO,
+          typename DataType,
+          typename AccDataType,
+          typename D0DataTypes,
+          typename AD0ElementwiseOp,
+          MaskingSpecialization MaskingSpec>
+using device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instances =
+    std::tuple<
+        // clang-format off
+        // #############################################|  NumDimG| NumDimM| NumDimN| NumDimK| NumDimO|    AData|    B0Data|    B1Data|     CData| Acc0BiasData| Acc1BiasData|     AccData| CShuffle|           A|          B0|             Acc0|          B1|           C|           GEMM|   ATensorSpec|  B0TensorSpec|  B1TensorSpec|   CTensorSpec| NumGemmK| Block| Gemm01| Gemm0| Gemm0| Gemm1| Gemm1| AK1| BK1| B1K1| MPer| NPer| Gemm0| Gemm0| Gemm1|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockTransfer| B0BlockLds|  B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockTransfer| B1BlockLds|    CShuffle|    CShuffle| CBlockTransferClusterLengths|  CBlockTransfer| MaskingSpec|
+        // #############################################|         |        |        |        |        |     Type|      Type|      Type|      Type|         Type|         Type|        Type| DataType| Elementwise| Elementwise|      Elementwise| Elementwise| Elementwise| Specialization|              |              |              |              | Prefetch|  Size|   MPer|  NPer|  KPer|  NPer|  KPer|    |    |     |  XDL|  XDL|  MXdl|  NXdl|  NXdl|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN|    ThreadCluster|   ThreadCluster|  SrcAccessOrder|    SrcVectorDim|       SrcScalar|       DstScalar|  AddExtraN| MXdlPerWave| NXdlPerWave|         _MBlock_MWaveMPerXdl| ScalarPerVector|            |
+        // #############################################|         |        |        |        |        |         |          |          |          |             |             |            |         |   Operation|   Operation|        Operation|   Operation|   Operation|               |              |              |              |              |    Stage|      |  Block| Block| Block| Block| Block|    |    |     |     |     |   Per|   Per|   Per| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  Lengths_K0_N_K1|    ArrangeOrder|                |                |       PerVector|    PerVector_K1|           |  PerShuffle|  PerShuffle|         _NBlock_NWaveNPerXdl|   _NWaveNPerXdl|            |
+        // #############################################|         |        |        |        |        |         |          |          |          |             |             |            |         |            |            |                 |            |            |               |              |              |              |              |         |      |       |      |      |      |      |    |    |     |     |     |  Wave|  Wave|  Wave|                |               |               |               |               |               |          |                 |                |                |                |                |                |           |                 |                |                |                |                |                |           |            |            |                             |                |            |
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, DataType,  DataType,  DataType,  DataType,  D0DataTypes,  ck::Tuple<>, AccDataType, DataType, PassThrough, PassThrough, AD0ElementwiseOp, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,    64,    32,   8,   8,    2,   32,   32,     2,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, DataType,  DataType,  DataType,  DataType,  D0DataTypes,  ck::Tuple<>, AccDataType, DataType, PassThrough, PassThrough, AD0ElementwiseOp, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    256,   128,    32,   128,    32,   8,   8,    2,   32,   32,     2,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
+#if CK_WORKAROUND_SWDEV_388832
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, DataType,  DataType,  DataType,  DataType,  D0DataTypes,  ck::Tuple<>, AccDataType, DataType, PassThrough, PassThrough, AD0ElementwiseOp, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   256,    32,    64,    32,   8,   8,    2,   32,   32,     1,     8,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
+#endif
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, DataType,  DataType,  DataType,  DataType,  D0DataTypes,  ck::Tuple<>, AccDataType, DataType, PassThrough, PassThrough, AD0ElementwiseOp, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   256,    32,   128,    32,   8,   8,    2,   32,   32,     1,     8,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, DataType,  DataType,  DataType,  DataType,  D0DataTypes,  ck::Tuple<>, AccDataType, DataType, PassThrough, PassThrough, AD0ElementwiseOp, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,    64,    32,   8,   8,    2,   32,   32,     1,     4,     2,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, DataType,  DataType,  DataType,  DataType,  D0DataTypes,  ck::Tuple<>, AccDataType, DataType, PassThrough, PassThrough, AD0ElementwiseOp, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    32,    64,    32,   8,   8,    2,   32,   32,     1,     4,     2,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, DataType,  DataType,  DataType,  DataType,  D0DataTypes,  ck::Tuple<>, AccDataType, DataType, PassThrough, PassThrough, AD0ElementwiseOp, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, DataType,  DataType,  DataType,  DataType,  D0DataTypes,  ck::Tuple<>, AccDataType, DataType, PassThrough, PassThrough, AD0ElementwiseOp, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    32,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, DataType,  DataType,  DataType,  DataType,  D0DataTypes,  ck::Tuple<>, AccDataType, DataType, PassThrough, PassThrough, AD0ElementwiseOp, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    32,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           8,               S<1, 16, 1,16>,               8, MaskingSpec>,
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, DataType,  DataType,  DataType,  DataType,  D0DataTypes,  ck::Tuple<>, AccDataType, DataType, PassThrough, PassThrough, AD0ElementwiseOp, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    32,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           4,               S<1, 32, 1, 8>,               8, MaskingSpec>,
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, DataType,  DataType,  DataType,  DataType,  D0DataTypes,  ck::Tuple<>, AccDataType, DataType, PassThrough, PassThrough, AD0ElementwiseOp, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,   128,    32,   8,   8,    2,   16,   16,     1,    16,     8,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           8,               S<1, 16, 1,16>,               8, MaskingSpec>,
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, DataType,  DataType,  DataType,  DataType,  D0DataTypes,  ck::Tuple<>, AccDataType, DataType, PassThrough, PassThrough, AD0ElementwiseOp, PassThrough, PassThrough,    GemmDefault, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,     64,   256,    64,    64,    32,   8,   8,    2,   16,   16,     1,    16,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S<16, 16, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           4,               S<1, 32, 1, 8>,               8, MaskingSpec>,
+        // Padded fallback kernel
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, DataType,  DataType,  DataType,  DataType,  D0DataTypes,  ck::Tuple<>, AccDataType, DataType, PassThrough, PassThrough, AD0ElementwiseOp, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,   128,    64,   128,    32,   8,   8,    2,   32,   32,     1,     4,     4,     S<8, 32, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,     false,      S<8, 32, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,      false,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>,
+        DeviceBatchedGemmSoftmaxGemmPermute_Xdl_CShuffle<  NumDimG, NumDimM, NumDimN, NumDimK, NumDimO, DataType,  DataType,  DataType,  DataType,  D0DataTypes,  ck::Tuple<>, AccDataType, DataType, PassThrough, PassThrough, AD0ElementwiseOp, PassThrough, PassThrough,     GemmPadded, TensorDefault, TensorDefault, TensorDefault, TensorDefault,        1,   256,    128,    64,    32,   128,    32,   8,   8,    2,   32,   32,     1,     2,     4,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<4, 64, 1>,      S<1, 0, 2>,      S<1, 0, 2>,               2,               8,               8,       true,     S< 8, 32, 1>,      S<0, 2, 1>,      S<0, 2, 1>,               1,               4,               2,      false,           1,           2,               S<1, 32, 1, 8>,               8, MaskingSpec>
+        // clang-format on
+        >;
+template <index_t NumDimG,
+          index_t NumDimM,
+          index_t NumDimN,
+          index_t NumDimK,
+          index_t NumDimO,
+          typename ADataType,
+          typename B0DataType,
+          typename B1DataType,
+          typename CDataType,
+          typename Acc0BiasDataType,
+          typename Acc1BiasDataType,
+          typename AElementwiseOperation,
+          typename B0ElementwiseOperation,
+          typename C0DEElementwiseOperation,
+          typename B1ElementwiseOperation,
+          typename C1DEElementwiseOperation,
+          MaskingSpecialization MaskingSpec>
+void add_device_batched_gemm_mutiple_d_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instances(
+    std::vector<std::unique_ptr<DeviceBatchedGemmSoftmaxGemmPermute<NumDimG,
+                                                                      NumDimM,
+                                                                      NumDimN,
+                                                                      NumDimK,
+                                                                      NumDimO,
+                                                                      ADataType,
+                                                                      B0DataType,
+                                                                      B1DataType,
+                                                                      CDataType,
+                                                                      Acc0BiasDataType,
+                                                                      Acc1BiasDataType,
+                                                                      AElementwiseOperation,
+                                                                      B0ElementwiseOperation,
+                                                                      C0DEElementwiseOperation,
+                                                                      B1ElementwiseOperation,
+                                                                      C1DEElementwiseOperation,
+                                                                      MaskingSpec>>>& instances)
+{
+    add_device_operation_instances(
+        instances,
+        device_batched_gemm_bias_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instances<
+            NumDimG,
+            NumDimM,
+            NumDimN,
+            NumDimK,
+            NumDimO,
+            ADataType,
+            F32,
+            Acc0BiasDataType,
+            C0DEElementwiseOperation,
+            MaskingSpec>{});
+}
+} // namespace instance
+} // namespace device
+} // namespace tensor_operation
+} // namespace ck
--- a/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_multiple_d_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instance.cpp
+++ b/library/src/tensor_operation_instance/gpu/batched_gemm_softmax_gemm_permute/device_batched_gemm_multiple_d_softmax_gemm_permute_xdl_cshuffle_gmk_gnk_gno_gmo_instance.cpp