clean up

681ede91 · Chao Liu · edb1d2c3
Commit 681ede91 authored Dec 19, 2021 by Chao Liu
6 changed files
--- a/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v3r1.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v3r1.hpp
@@ -83,6 +83,7 @@ template <index_t BlockSize,
          index_t ABlockTransferSrcScalarPerVector,
          index_t ABlockTransferDstScalarPerVector_K1,
          bool AThreadTransferSrcResetCoordinateAfterRun,
+          bool ABlockLdsExtraM,
          typename BBlockTransferThreadSliceLengths_K0_N_K1,
          typename BBlockTransferThreadClusterLengths_K0_N_K1,
          typename BBlockTransferThreadClusterArrangeOrder,
@@ -91,11 +92,14 @@ template <index_t BlockSize,
          index_t BBlockTransferSrcScalarPerVector,
          index_t BBlockTransferDstScalarPerVector_K1,
          bool BThreadTransferSrcResetCoordinateAfterRun,
-          typename CThreadTransferSrcDstAccessOrder,
+          bool BBlockLdsExtraN,
-          index_t CThreadTransferSrcDstVectorDim,
+          index_t MRepeatPerShuffle_CCopy,
-          index_t CThreadTransferDstScalarPerVector,
+          index_t NRepeatPerShuffle_CCopy,
-          bool ABlockLdsExtraM,
+          index_t MRepeatThread_CCopy,
-          bool BBlockLdsExtraN>
+          index_t MThread_CCopy,
+          index_t NRepeatThread_CCopy,
+          index_t NThread_CCopy,
+          index_t NScalarPerVector_CCopy>
 struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
 {
    static constexpr auto I0 = Number<0>{};
@@ -498,7 +502,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
        // shuffle and write out
        {
-#if 1
+#if 0
            // TODO: make it tunable
            constexpr index_t MRepeatPerShuffle_CCopy = 1;
            constexpr index_t NRepeatPerShuffle_CCopy = 1;
@@ -511,7 +515,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
            // vector length for blockwise copy from LDS to global
            constexpr index_t NScalarPerVector_CCopy = 8;
-#else
+#elif 0
            // TODO: make it tunable
            constexpr index_t MRepeatPerShuffle_CCopy = 1;
            constexpr index_t NRepeatPerShuffle_CCopy = 2;
@@ -654,53 +658,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
                                       n_thread_data_on_block_idx[I2]),
                      ck::tensor_operation::element_wise::PassThrough{}};
-#if 0
+            auto c_block_copy_lds_to_global = BlockwiseTensorSliceTransfer_v6r1<
-            auto c_block_copy_lds_to_global = BlockwiseTensorSliceTransfer_v4r1<
-                BlockSize,                                       // index_t BlockSize,
-                ck::tensor_operation::element_wise::PassThrough, // SrcElementwiseOperation,
-                CElementwiseOperation,                           // DstElementwiseOperation,
-                CGlobalMemoryDataOperation,                      // DstInMemOp,
-                Sequence<1,
-                         MRepeatPerShuffle_CCopy,
-                         MPerBlock_CCopy,
-                         1,
-                         NRepeatPerShuffle_CCopy,
-                         NPerBlock_CCopy>, // BlockSliceLengths,
-                Sequence<1,
-                         MRepeatPerShuffle_CCopy,
-                         MPerThread_CCopy,
-                         1,
-                         NRepeatPerShuffle_CCopy,
-                         NPerThread_CCopy>, // ThreadSliceLengths,
-                Sequence<1,
-                         MRepeatPerThread_CCopy,
-                         MThread_CCopy,
-                         1,
-                         NRepeatPerThread_CCopy,
-                         NThread_CCopy>,    // ThreadClusterLengths,
-                Sequence<0, 1, 2, 3, 4, 5>, // typename ThreadClusterArrangeOrder,
-                FloatC,                     // typename SrcData,
-                FloatC,                     // typename DstData,
-                decltype(c_block_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl),
-                decltype(c_grid_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl),
-                Sequence<0, 1, 2, 3, 4, 5>, // typename SrcDimAccessOrder,
-                Sequence<0, 1, 2, 3, 4, 5>, // typename DstDimAccessOrder,
-                5,                          // index_t SrcVectorDim,
-                5,                          // index_t DstVectorDim,
-                NScalarPerVector_CCopy,     // index_t SrcScalarPerVector,
-                NScalarPerVector_CCopy,     // index_t DstScalarPerVector,
-                1,                          // index_t SrcScalarStrideInVector,
-                1,                          // index_t DstScalarStrideInVector,
-                true,                       // bool ThreadTransferSrcResetCoordinateAfterRun,
-                false>                      // bool ThreadTransferDstResetCoordinateAfterRun>
-                {c_block_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl,
-                 make_multi_index(0, 0, 0, 0, 0, 0),
-                 ck::tensor_operation::element_wise::PassThrough{},
-                 c_grid_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl,
-                 make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0),
-                 c_element_op};
-#else
-            auto c_block_copy_lds_to_global          = BlockwiseTensorSliceTransfer_v6r1<
                BlockSize,                  // index_t BlockSize,
                CElementwiseOperation,      // ElementwiseOperation,
                CGlobalMemoryDataOperation, // DstInMemOp,
@@ -730,14 +688,13 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r1
                Sequence<0, 1, 2, 3, 4, 5>, // typename DimAccessOrder,
                5,                          // index_t VectorDim,
                NScalarPerVector_CCopy,     // index_t ScalarPerVector,
-                true,  // bool ThreadTransferSrcResetCoordinateAfterRun,
+                true,                       // bool ThreadTransferSrcResetCoordinateAfterRun,
-                false> // bool ThreadTransferDstResetCoordinateAfterRun>
+                false>                      // bool ThreadTransferDstResetCoordinateAfterRun>
                {c_block_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl,
                 make_multi_index(0, 0, 0, 0, 0, 0),
                 c_grid_desc_mblock_mrepeat_mwavemperxdl_nblock_nrepeat_nwavenperxdl,
                 make_multi_index(block_work_idx[I0], 0, 0, block_work_idx[I1], 0, 0),
                 c_element_op};
-#endif
            constexpr auto mrepeat_forward_step =
                make_multi_index(0, MRepeatPerShuffle_CCopy, 0, 0, 0, 0);

--- a/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v3r3.hpp
+++ b/composable_kernel/include/tensor_operation/gridwise_gemm_xdlops_v3r3.hpp
@@ -97,6 +97,7 @@ template <index_t BlockSize,
          index_t ABlockTransferSrcScalarPerVector,
          index_t ABlockTransferDstScalarPerVector_K1,
          bool AThreadTransferSrcResetCoordinateAfterRun,
+          bool ABlockLdsExtraM,
          typename BBlockTransferThreadSliceLengths_K0_N_K1,
          typename BBlockTransferThreadClusterLengths_K0_N_K1,
          typename BBlockTransferThreadClusterArrangeOrder,
@@ -105,11 +106,14 @@ template <index_t BlockSize,
          index_t BBlockTransferSrcScalarPerVector,
          index_t BBlockTransferDstScalarPerVector_K1,
          bool BThreadTransferSrcResetCoordinateAfterRun,
-          typename CThreadTransferSrcDstAccessOrder,
+          bool BBlockLdsExtraN,
-          index_t CThreadTransferSrcDstVectorDim,
+          index_t MRepeatPerShuffle_CCopy,
-          index_t CThreadTransferDstScalarPerVector,
+          index_t NRepeatPerShuffle_CCopy,
-          bool ABlockLdsExtraM,
+          index_t MRepeatThread_CCopy,
-          bool BBlockLdsExtraN>
+          index_t MThread_CCopy,
+          index_t NRepeatThread_CCopy,
+          index_t NThread_CCopy,
+          index_t NScalarPerVector_CCopy>
 struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
 {
    static constexpr auto I0 = Number<0>{};
@@ -537,7 +541,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
        // shuffle and write out
        {
-#if 1
+#if 0
            // TODO: make it tunable
            constexpr index_t MRepeatPerShuffle_CCopy = 1;
            constexpr index_t NRepeatPerShuffle_CCopy = 1;
@@ -550,7 +554,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v3r3
            // vector length for blockwise copy from LDS to global
            constexpr index_t NScalarPerVector_CCopy = 8;
-#else
+#elif 0
            // TODO: make it tunable
            constexpr index_t MRepeatPerShuffle_CCopy = 1;
            constexpr index_t NRepeatPerShuffle_CCopy = 2;

--- a/device_operation/include/device_conv2d_fwd_xdl_output_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
+++ b/device_operation/include/device_conv2d_fwd_xdl_output_shuffle_bias_activation_add_nhwc_kyxc_nhwk.hpp
@@ -41,6 +41,7 @@ template <typename InDataType,
          ck::index_t ABlockTransferSrcVectorDim,
          ck::index_t ABlockTransferSrcScalarPerVector,
          ck::index_t ABlockTransferDstScalarPerVector_K1,
+          bool ABlockLdsAddExtraM,
          typename BBlockTransferThreadSliceLengths_K0_N_K1,
          typename BBlockTransferThreadClusterLengths_K0_N_K1,
          typename BBlockTransferThreadClusterArrangeOrder,
@@ -48,10 +49,14 @@ template <typename InDataType,
          ck::index_t BBlockTransferSrcVectorDim,
          ck::index_t BBlockTransferSrcScalarPerVector,
          ck::index_t BBlockTransferDstScalarPerVector_K1,
-          ck::index_t CThreadTransferSrcDstVectorDim,
+          bool BBlockLdsAddExtraN,
-          ck::index_t CThreadTransferDstScalarPerVector,
+          index_t MRepeatPerShuffle_CCopy,
-          bool ABlockLdsAddExtraM,
+          index_t NRepeatPerShuffle_CCopy,
-          bool BBlockLdsAddExtraN>
+          index_t MRepeatThread_CCopy,
+          index_t MThread_CCopy,
+          index_t NRepeatThread_CCopy,
+          index_t NThread_CCopy,
+          index_t NScalarPerVector_CCopy>
 struct
    DeviceConv2dFwdXdl_Output_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
    : public DeviceConvFwdBiasActivationAdd<InElementwiseOperation,
@@ -260,6 +265,7 @@ struct
        ABlockTransferSrcScalarPerVector,
        ABlockTransferDstScalarPerVector_K1,
        false, // AThreadTransferSrcResetCoordinateAfterRun,
+        ABlockLdsAddExtraM,
        BBlockTransferThreadSliceLengths_K0_N_K1,
        BBlockTransferThreadClusterLengths_K0_N_K1,
        Sequence<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder,
@@ -267,12 +273,15 @@ struct
        2,                 // BBlockTransferSrcVectorDim,
        BBlockTransferSrcScalarPerVector,
        BBlockTransferDstScalarPerVector_K1,
-        false,                            // BThreadTransferSrcResetCoordinateAfterRun,
+        false, // BThreadTransferSrcResetCoordinateAfterRun,
-        Sequence<2, 3, 0, 1, 7, 5, 4, 6>, // CThreadTransferSrcDstAccessOrder,
+        BBlockLdsAddExtraN,
-        7,                                // CThreadTransferSrcDstVectorDim,
+        MRepeatPerShuffle_CCopy,
-        CThreadTransferDstScalarPerVector,
+        NRepeatPerShuffle_CCopy,
-        ABlockLdsAddExtraM,
+        MRepeatThread_CCopy,
-        BBlockLdsAddExtraN>;
+        MThread_CCopy,
+        NRepeatThread_CCopy,
+        NThread_CCopy,
+        NScalarPerVector_CCopy>;
    // Argument
    struct Argument : public BaseArgument

--- a/device_operation/include/device_conv2d_fwd_xdl_output_shuffle_nhwc_kyxc_nhwk.hpp
+++ b/device_operation/include/device_conv2d_fwd_xdl_output_shuffle_nhwc_kyxc_nhwk.hpp
@@ -40,6 +40,7 @@ template <typename InDataType,
          ck::index_t ABlockTransferSrcVectorDim,
          ck::index_t ABlockTransferSrcScalarPerVector,
          ck::index_t ABlockTransferDstScalarPerVector_K1,
+          bool ABlockLdsAddExtraM,
          typename BBlockTransferThreadSliceLengths_K0_N_K1,
          typename BBlockTransferThreadClusterLengths_K0_N_K1,
          typename BBlockTransferThreadClusterArrangeOrder,
@@ -47,10 +48,14 @@ template <typename InDataType,
          ck::index_t BBlockTransferSrcVectorDim,
          ck::index_t BBlockTransferSrcScalarPerVector,
          ck::index_t BBlockTransferDstScalarPerVector_K1,
-          ck::index_t CThreadTransferSrcDstVectorDim,
+          bool BBlockLdsAddExtraN,
-          ck::index_t CThreadTransferDstScalarPerVector,
+          index_t MRepeatPerShuffle_CCopy,
-          bool ABlockLdsAddExtraM,
+          index_t NRepeatPerShuffle_CCopy,
-          bool BBlockLdsAddExtraN>
+          index_t MRepeatThread_CCopy,
+          index_t MThread_CCopy,
+          index_t NRepeatThread_CCopy,
+          index_t NThread_CCopy,
+          index_t NScalarPerVector_CCopy>
 struct DeviceConv2dFwdXdl_Output_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
    : public DeviceConvFwd<InElementwiseOperation, WeiElementwiseOperation, OutElementwiseOperation>
 {
@@ -241,6 +246,7 @@ struct DeviceConv2dFwdXdl_Output_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N
        ABlockTransferSrcScalarPerVector,
        ABlockTransferDstScalarPerVector_K1,
        false, // AThreadTransferSrcResetCoordinateAfterRun,
+        ABlockLdsAddExtraM,
        BBlockTransferThreadSliceLengths_K0_N_K1,
        BBlockTransferThreadClusterLengths_K0_N_K1,
        Sequence<1, 0, 2>, // BBlockTransferThreadClusterArrangeOrder,
@@ -248,12 +254,15 @@ struct DeviceConv2dFwdXdl_Output_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N
        2,                 // BBlockTransferSrcVectorDim,
        BBlockTransferSrcScalarPerVector,
        BBlockTransferDstScalarPerVector_K1,
-        false,                            // BThreadTransferSrcResetCoordinateAfterRun,
+        false, // BThreadTransferSrcResetCoordinateAfterRun,
-        Sequence<2, 3, 0, 1, 7, 5, 4, 6>, // CThreadTransferSrcDstAccessOrder,
+        BBlockLdsAddExtraN,
-        7,                                // CThreadTransferSrcDstVectorDim,
+        MRepeatPerShuffle_CCopy,
-        CThreadTransferDstScalarPerVector,
+        NRepeatPerShuffle_CCopy,
-        ABlockLdsAddExtraM,
+        MRepeatThread_CCopy,
-        BBlockLdsAddExtraN>;
+        MThread_CCopy,
+        NRepeatThread_CCopy,
+        NThread_CCopy,
+        NScalarPerVector_CCopy>;
    // Argument
    struct Argument : public BaseArgument

--- a/example/4_conv2d_fwd_xdl_output_shuffle/conv2d_fwd_xdl_output_shuffle.cpp
+++ b/example/4_conv2d_fwd_xdl_output_shuffle/conv2d_fwd_xdl_output_shuffle.cpp
@@ -33,11 +33,11 @@ using OutElementOp = ck::tensor_operation::element_wise::PassThrough_v2;
 using DeviceConvFwdInstance = ck::tensor_operation::device::
    DeviceConv2dFwdXdl_Output_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
    // clang-format off
-//##|    InData|     WeiData|     OutData|     AccData|          In|         Wei|           Out| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer|  BBlockTransfer|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds|
+//      |    InData|     WeiData|     OutData|     AccData|          In|         Wei|           Out| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| MRepeatPer| NRepeatPer| MRepeat| MThread| NRepeat| NThread| NScalarPer|
-//##|      Type|        Type|        Type|        Type| Elementwise| Elementwise|   Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| SrcDstVectorDim|       DstScalar| AddExtraM| AddExtraN|
+//      |      Type|        Type|        Type|        Type| Elementwise| Elementwise|   Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|    Shuffle|    Shuffle|  Thread|  _CCopy|  Thread|  _CCopy|     Vector|
-//##|          |            |            |            |   Operation|   Operation|     Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|                |       PerVector|          |          |
+//      |          |            |            |            |   Operation|   Operation|     Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1| Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |     _CCopy|     _CCopy|  _CCopy|        |  _CCopy|        |     _CCopy|
-//##|          |            |            |            |            |            |              |      |      |      |      |   |     |     |     |     |                |                |               |               |               |               |               |                |                |               |               |              |               |               |                |                |          |          |
+//      |          |            |            |            |            |            |              |      |      |      |      |   |     |     |     |     |                |                |               |               |               |               |               |          |                |                |               |               |              |               |               |          |           |           |        |        |        |        |           |
-    <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp,   256,   128,   256,     4,  8,   32,   32,    2,    4,      S<1, 2, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      S<1, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,               7,               1,      true,      true>;
+        <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp,   256,   128,   256,     4,  8,   32,   32,    2,    4,      S<1, 2, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<1, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,          1,          1,       1,      32,       1,       8,          8>;
 // clang-format on
 template <typename TIn,

--- a/example/6_conv2d_fwd_xdl_output_shuffle_bias_relu_add/conv2d_fwd_xdl_output_shuffle_bias_relu_add.cpp
+++ b/example/6_conv2d_fwd_xdl_output_shuffle_bias_relu_add/conv2d_fwd_xdl_output_shuffle_bias_relu_add.cpp
@@ -33,11 +33,11 @@ using OutElementOp = ck::tensor_operation::element_wise::AddReluAdd_v2;
 // clang-format off
 using DeviceConvFwdInstance = ck::tensor_operation::device::
   DeviceConv2dFwdXdl_Output_Shuffle_Bias_Activation_Add_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K 
-//      |    InData|     WeiData|     OutData|     AccData|          In|         Wei|           Out| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer|  BBlockTransfer|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| CThreadTransfer| CThreadTransfer| ABlockLds| BBlockLds|
+//      |    InData|     WeiData|     OutData|     AccData|          In|         Wei|           Out| Block|  MPer|  NPer| K0Per| K1| MPer| NPer| MXdl| NXdl|  ABlockTransfer|  ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds|  BBlockTransfer|  BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| MRepeatPer| NRepeatPer| MRepeat| MThread| NRepeat| NThread| NScalarPer|
-//      |      Type|        Type|        Type|        Type| Elementwise| Elementwise|   Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| SrcDstVectorDim|       DstScalar| AddExtraM| AddExtraN|
+//      |      Type|        Type|        Type|        Type| Elementwise| Elementwise|   Elementwise|  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar| AddExtraM|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| AddExtraN|    Shuffle|    Shuffle|  Thread|  _CCopy|  Thread|  _CCopy|     Vector|
-//      |          |            |            |            |   Operation|   Operation|     Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|                |       PerVector|          |          |
+//      |          |            |            |            |   Operation|   Operation|     Operation|      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1|          | Lengths_K0_N_K1| Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|          |     _CCopy|     _CCopy|  _CCopy|        |  _CCopy|        |     _CCopy|
-//      |          |            |            |            |            |            |              |      |      |      |      |   |     |     |     |     |                |                |               |               |               |               |               |                |                |               |               |              |               |               |                |                |          |          |
+//      |          |            |            |            |            |            |              |      |      |      |      |   |     |     |     |     |                |                |               |               |               |               |               |          |                |                |               |               |              |               |               |          |           |           |        |        |        |        |           |
-        <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp,   256,   128,   256,     4,  8,   32,   32,    2,    4,      S<1, 2, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      S<1, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,               7,               1,      true,      true>;
+        <InDataType, WeiDataType, OutDataType, AccDataType, InElementOp, WeiElementOp, OutElementOp,   256,   128,   256,     4,  8,   32,   32,    2,    4,      S<1, 2, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,              2,              8,              8,      true,      S<1, 4, 8>,     S<4, 64, 1>,     S<1, 0, 2>,     S<1, 0, 2>,             2,              8,              8,      true,          1,          1,       1,      32,       1,       8,          8>;
 // clang-format on
 template <typename TIn,