gaoqiong / composable_kernel · Commits

Commit 73156add, authored Jul 25, 2022 by ltqin

    implement one block

Parent: 848ceeb3
Showing 2 changed files with 8 additions and 10 deletions:

  example/01_gemm/gemm_xdl_fp16_flash_attention.cpp               (+3, -3)
  include/ck/tensor_operation/gpu/block/blockwise_softmax_v1.hpp  (+5, -7)
example/01_gemm/gemm_xdl_fp16_flash_attention.cpp (view file @ 73156add)
@@ -51,7 +51,7 @@ using DeviceGemmInstance0 = ck::tensor_operation::device::DeviceGemmXdl
 //######| Type| Type| Type| Type| | | | Elementwise| Elementwise| Elementwise|Spacialization| Size| Block| Block| Block| | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| SrcDstVectorDim| DstScalar|
 //######| | | | | | | | Operation| Operation| Operation| | | | | | | | | Wave| Wave| Lengths_K0_M_K1| ArrangeOrder| | | PerVector| PerVector_K1| | Lengths_K0_N_K1| ArrangeOrder| | | PerVector| PerVector_K1| | | PerVector|
 //######| | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | | |
-        < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 64, 16, 16, 4, 8, 16, 16, 1, 1, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 16, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>;
+        < ADataType, BDataType, CDataType, AccDataType, ALayout, BLayout, CLayout, AElementOp, BElementOp, CElementOp, GemmDefault, 128, 32, 32, 4, 8, 16, 16, 1, 2, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, S<4, 32, 1>, S<1, 0, 2>, S<1, 0, 2>, 2, 8, 8, true, 7, 1>;
 // clang-format on

 using DeviceGemmInstance1 = ck::tensor_operation::device::DeviceGemm_Xdl_CShuffle
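For orientation (an editorial note, not part of the commit): reading the arguments against the column header above, the instance goes from a 64-thread block computing a 16x16 C tile to a 128-thread block computing a 32x32 tile, with NXdlPerWave raised from 1 to 2 and the A/B transfer clusters widened from S<4, 16, 1> to S<4, 32, 1>. A minimal compile-time sketch, assuming the usual 64-lane AMD wavefront and the parameter meanings shown in the header comment, checks that the wave layout matches the thread count:

```cpp
// Hypothetical sanity check, not repo code: the new tile/wave parameters.
constexpr int BlockSize   = 128; // was 64
constexpr int MPerBlock   = 32;  // was 16
constexpr int NPerBlock   = 32;  // was 16
constexpr int MPerXDL     = 16;
constexpr int NPerXDL     = 16;
constexpr int MXdlPerWave = 1;
constexpr int NXdlPerWave = 2;   // was 1
constexpr int WaveSize    = 64;  // assumed AMD wavefront size

// Waves along M and N needed to cover the block tile:
constexpr int MWaves = MPerBlock / (MXdlPerWave * MPerXDL); // 2
constexpr int NWaves = NPerBlock / (NXdlPerWave * NPerXDL); // 1
static_assert(MWaves * NWaves == BlockSize / WaveSize,
              "wave grid must equal the number of waves in the block");
```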
@@ -92,8 +92,8 @@ int main(int argc, char* argv[])
     bool time_kernel = false;

     // GEMM shape
-    ck::index_t M = 16;
-    ck::index_t N = 16;
+    ck::index_t M = 32;
+    ck::index_t N = 32;
     ck::index_t K = 64;
     ck::index_t StrideA = K;
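Raising M and N from 16 to 32 makes the problem exactly one MPerBlock x NPerBlock tile of the new instance, which is presumably what the commit message's "one block" refers to. A small sketch, assuming the grid is sized by tiling C with MPerBlock x NPerBlock blocks:

```cpp
// Hypothetical estimate, not repo code: workgroups needed if each block
// computes one MPerBlock x NPerBlock tile of C.
#include <cstdio>

int main()
{
    constexpr int M = 32, N = 32;                 // new problem size
    constexpr int MPerBlock = 32, NPerBlock = 32; // new tile size
    constexpr int grid_size =
        ((M + MPerBlock - 1) / MPerBlock) * ((N + NPerBlock - 1) / NPerBlock);
    std::printf("grid_size = %d\n", grid_size);   // 1 -> "one block"
}
```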
include/ck/tensor_operation/gpu/block/blockwise_softmax_v1.hpp (view file @ 73156add)
@@ -26,16 +26,14 @@ struct BlockwiseSoftmax_V1
 struct BlockToMKMap_M0_K_M1Adapt
 {
-    using ThreadClusterLengths_M_K  = Sequence<MPerXDL, WaveSize / MPerXDL>;
-    using ThreadClusterArrangeOrder = Sequence<1, 0>;
-
     __host__ __device__ BlockToMKMap_M0_K_M1Adapt() = default;

     template <typename TopIdx>
     __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const
     {
-        constexpr auto thread_cluster_desc =
-            make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
-        return thread_cluster_desc.CalculateBottomIndex(idx_top);
+        const auto index = idx_top[I0];
+        const auto m     = (index / WaveSize) * MPerXDL + index % MPerXDL;
+        const auto k     = (index % WaveSize) / MPerXDL;
+        return make_tuple(m, k);
     }
 };

 static constexpr auto I0 = Number<0>{};
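The removed mapping went through a cluster descriptor built from Sequence<MPerXDL, WaveSize / MPerXDL>, which only spans the MPerXDL rows one wave can own; the replacement computes (m, k) directly, offsetting each wave to its own band of MPerXDL rows. A standalone sketch of the new formula (the constants are assumptions matching the 128-thread example above):

```cpp
// Hypothetical demo, not repo code: the new wave-aware thread -> (m, k) map.
#include <cstdio>

int main()
{
    constexpr int WaveSize = 64; // assumed AMD wavefront size
    constexpr int MPerXDL  = 16;

    for (int index : {0, 15, 16, 63, 64, 79, 127})
    {
        const int m = (index / WaveSize) * MPerXDL + index % MPerXDL;
        const int k = (index % WaveSize) / MPerXDL;
        std::printf("thread %3d -> (m = %2d, k = %d)\n", index, m, k);
    }
    // Wave 0 (threads 0..63) lands on rows 0..15 with k in 0..3;
    // wave 1 (threads 64..127) lands on rows 16..31.
}
```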
@@ -59,7 +57,7 @@ struct BlockwiseSoftmax_V1
                                    false, // param ignored
                                    detail::AccumulateWithNanIgnore<reduce::Max, AccDataType>>;

-    using ThreadClusterLengths_M_K = Sequence<MPerXDL, WaveSize / MPerXDL>;
+    using ThreadClusterLengths_M_K = Sequence<MPerXDL * BlockSize / WaveSize, WaveSize / MPerXDL>;

     using BlockwiseMaxReduce = PartitionedBlockwiseReduction2<AccDataType,
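With the adapter now spreading waves along M, the reduction's thread cluster is widened the same way: its M extent scales by the number of waves (BlockSize / WaveSize). A one-line compile-time check under the same assumed constants:

```cpp
// Hypothetical check, not repo code: the widened cluster covers every thread.
constexpr int BlockSize = 128, WaveSize = 64, MPerXDL = 16;
constexpr int ClusterM = MPerXDL * BlockSize / WaveSize; // 32 rows
constexpr int ClusterK = WaveSize / MPerXDL;             // 4 lanes per row
static_assert(ClusterM * ClusterK == BlockSize,
              "ThreadClusterLengths_M_K must cover the whole block");
```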