add PartitionedBlockwiseReduction2

848ceeb3 · ltqin · 9bfe6591 · 848ceeb3
Commit 848ceeb3 authored Jul 25, 2022 by ltqin
Show whitespace changes
Inline Side-by-side

Showing with 27 additions and 24 deletions

include/ck/tensor_operation/gpu/block/blockwise_softmax_v1.hpp +27 -24

No files found.
--- a/include/ck/tensor_operation/gpu/block/blockwise_softmax_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_softmax_v1.hpp
@@ -22,20 +22,26 @@ struct BlockwiseSoftmax_V1
 {
    static_assert(MRepeat == 1, "Now MRepeat must equal 1");
-    struct BlockToMKMap_M0_K_M1Adapt{
+    static constexpr index_t WaveSize = 64;
-        __host__ __device__ BlockToMKMap_M0_K_M1Adapt() = default;
-        __host__ __device__ constexpr auto CalculateBottomIndex(const index_t& idx_top) const{
+    struct BlockToMKMap_M0_K_M1Adapt
+    {
        using ThreadClusterLengths_M_K                  = Sequence<MPerXDL, WaveSize / MPerXDL>;
        using ThreadClusterArrangeOrder                 = Sequence<1, 0>;
-    static constexpr auto thread_cluster_desc =
+        __host__ __device__ BlockToMKMap_M0_K_M1Adapt() = default;
+        template <typename TopIdx>
+        __host__ __device__ constexpr auto CalculateBottomIndex(const TopIdx& idx_top) const
+        {
+            constexpr auto thread_cluster_desc =
                make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
+            return thread_cluster_desc.CalculateBottomIndex(idx_top);
        }
-    }
+    };
    static constexpr auto I0                  = Number<0>{};
    static constexpr auto I1                  = Number<1>{};
    static constexpr auto I2                  = Number<2>{};
    static constexpr index_t MThreadSliceSize = 1;
-    static constexpr index_t WaveSize         = 64;
    constexpr static auto in_thread_desc = make_naive_tensor_descriptor_packed(
        make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, Number<RegSizePerXdlops>{}));
@@ -54,15 +60,12 @@ struct BlockwiseSoftmax_V1
                            detail::AccumulateWithNanIgnore<reduce::Max, AccDataType>>;
    using ThreadClusterLengths_M_K = Sequence<MPerXDL, WaveSize / MPerXDL>;
-    using ThreadClusterArrangeOrder = Sequence<1, 0>;
-    static constexpr auto thread_cluster_desc =
-        make_cluster_descriptor(ThreadClusterLengths_M_K{}, ThreadClusterArrangeOrder{});
    using BlockwiseMaxReduce =
        PartitionedBlockwiseReduction2<AccDataType,
                                       BlockSize,
                                       ThreadClusterLengths_M_K,
-                                      decltype(thread_cluster_desc),
+                                       BlockToMKMap_M0_K_M1Adapt,
                                       reduce::Max,
                                       false, // param ignored
                                       detail::AccumulateWithNanIgnore<reduce::Max, AccDataType>>;
@@ -71,7 +74,7 @@ struct BlockwiseSoftmax_V1
        PartitionedBlockwiseReduction2<AccDataType,
                                       BlockSize,
                                       ThreadClusterLengths_M_K,
-                                      decltype(thread_cluster_desc),
+                                       BlockToMKMap_M0_K_M1Adapt,
                                       reduce::Add,
                                       false, // ignored
                                       detail::AccumulateWithNanIgnore<reduce::Add, AccDataType>>;