max and sum save to register

d4368d77 · ltqin · 2717e60d · d4368d77 · d4368d77
Commit d4368d77 authored Jul 27, 2022 by ltqin
2 changed files
--- a/include/ck/tensor_operation/gpu/block/blockwise_softmax_v1.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_softmax_v1.hpp
@@ -43,9 +43,6 @@ struct BlockwiseSoftmax_V1
        }
    };
-    static constexpr auto softmax_buf_desc_m_k =
-        make_naive_tensor_descriptor_packed(make_tuple(Number<MPerBlock>{}, Number<2>{}));
    constexpr static auto in_thread_desc = make_naive_tensor_descriptor_packed(
        make_tuple(Number<MRepeat>{}, Number<NRepeat>{}, Number<RegSizePerXdlops>{}));
@@ -91,19 +88,10 @@ struct BlockwiseSoftmax_V1
                            detail::AccumulateWithNanIgnore<reduce::Add, AccDataType>>;
    template <typename CThreadBuffer>
    __host__ __device__ static void
-    Run(CThreadBuffer& in_thread_buf, void* __restrict__ p_reduce, void* __restrict__ p_softmax)
+    Run(CThreadBuffer& in_thread_buf, float& f_sum, float& f_max, void* __restrict__ p_reduce)
    {
        auto reduce_work_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
            static_cast<AccDataType*>(p_reduce), BlockSize);
-        auto softmax_lds_buffer = make_dynamic_buffer<AddressSpaceEnum::Lds>(
-            static_cast<AccDataType*>(p_softmax), MPerBlock * 2);
-        // thread id map to thread layout
-        const index_t thread_local_id = get_thread_local_1d_id();
-        const auto thread_cluster_idx =
-            BlockToMKMap_M0_K_M1Adapt::CalculateBottomIndex(make_multi_index(thread_local_id));
-        const auto thread_m_cluster_id = thread_cluster_idx[Number<0>{}];
-        const auto thread_k_cluster_id = thread_cluster_idx[Number<1>{}];
        //
        // find max value
        //
@@ -123,12 +111,8 @@ struct BlockwiseSoftmax_V1
        // block reduce for max
        BlockwiseMaxReduce::Reduce(reduce_work_buf, max_value_buf(I0));
        block_sync_lds();
-        // save max value to lds
+        // save max
-        if(0 == thread_k_cluster_id)
+        f_max = max_value_buf(I0);
-        {
-            softmax_lds_buffer(softmax_buf_desc_m_k.CalculateOffset(
-                make_tuple(thread_m_cluster_id, 1))) = max_value_buf(I0);
-        }
        //
        // softmax
@@ -158,12 +142,8 @@ struct BlockwiseSoftmax_V1
        BlockwiseSumReduce::Reduce(reduce_work_buf, accu_value_buf(I0));
        block_sync_lds();
-        // save sum to lds
+        // save sum
-        if(0 == thread_k_cluster_id)
+        f_sum = accu_value_buf(I0);
-        {
-            softmax_lds_buffer(softmax_buf_desc_m_k.CalculateOffset(
-                make_tuple(thread_m_cluster_id, 0))) = accu_value_buf(I0);
-        }
    }
 }; // namespace ck

--- a/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_gemm_xdlops_v2r3.hpp
@@ -474,7 +474,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                                                          num_k_block_main_loop);
        {
            __shared__ AccDataType p_reduce_work_buffer[BlockSize];
-            __shared__ AccDataType p_lex[MPerBlock * 2];
+            float f_sum, f_max;
            using BlockwiseSoftmax = BlockwiseSoftmax_V1<BlockSize,
                                                         FloatAcc,
@@ -484,7 +484,7 @@ struct GridwiseGemm_k0mk1_k0nk1_mn_xdlops_v2r3
                                                         blockwise_gemm.GetRegSizePerXdlops(),
                                                         MXdlPerWave,
                                                         NXdlPerWave>;
-            BlockwiseSoftmax::Run(c_thread_buf, p_reduce_work_buffer, p_lex);
+            BlockwiseSoftmax::Run(c_thread_buf, f_sum, f_max, p_reduce_work_buffer);
        }
        // output: register to global memory