add other version ApplyDropout

cc974f0f · ltqin · 06ad7791 · cc974f0f · cc974f0f · cc974f0f
Commit cc974f0f authored Jan 31, 2023 by ltqin
3 changed files
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_fp16_dropout.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_fp16_dropout.cpp
@@ -239,13 +239,12 @@ void run_attention_fwd_host(const TensorQ& q_g_m_k,
    ref_softmax_invoker.Run(ref_softmax_argument);
-    // P_dropout
+    // P_dropped
    auto ref_dropout         = ReferenceDropoutInstance{};
    auto ref_dropout_invoker = ref_dropout.MakeInvoker();
    auto ref_dropout_argment =
        ref_dropout.MakeArgument(z_g_m_n, p_g_m_n, p_drop_g_m_n, p_dropout_in_16bits, rp_dropout);
    ref_dropout_invoker.Run(ref_dropout_argment);
-    // std::cout << "p_drop_g_m_n ref:\n" << p_drop_g_m_n;
    // Y = P_dropout * V
    auto ref_gemm1          = ReferenceGemm1Instance{};

--- a/include/ck/tensor_operation/gpu/block/blockwise_dropout.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_dropout.hpp
@@ -16,6 +16,40 @@ struct BlockwiseDropout
    static constexpr index_t MRepeat = ThreadSliceDesc_M_K{}.GetLength(I0);
    static constexpr index_t KRepeat = ThreadSliceDesc_M_K{}.GetLength(I1);
+    // Applies dropout in place to a thread-local buffer, without storing the random
+    // values that were drawn (the three-argument overload takes an extra z buffer).
+    //
+    // One 16-bit random value is consumed per element of the MRepeat x KRepeat thread
+    // slice, in row-major (iM outer, iK inner) order. An element is kept when its random
+    // value is strictly below p_dropout_16bits.
+    //   using_sign_bit == false: kept elements are multiplied by p_dropout_rescale,
+    //                            dropped elements become 0.
+    //   using_sign_bit == true : kept elements are left unchanged, dropped elements are
+    //                            negated, so the keep/drop decision is carried by the sign.
+    //
+    // in_thread_buf: buffer indexed through ThreadSliceDesc_M_K offsets; modified in place.
+    // ph:            Philox generator, taken by value, so the caller's generator state is
+    //                not advanced by this call.
+    template <typename CThreadBuffer, bool using_sign_bit = false>
+    __host__ __device__ void ApplyDropout(CThreadBuffer& in_thread_buf, ck::philox ph)
+    {
+        auto execute_dropout = [&](bool keep, DataType val) {
+            if constexpr(using_sign_bit)
+                return keep ? val : -val;
+            else
+                return keep ? val * p_dropout_rescale : float(0);
+        };
+        constexpr int tmp_size = MRepeat * KRepeat;
+        // Random values are requested in groups of 8 (presumably what get_random_8x16
+        // writes per call -- TODO confirm). Round the number of calls up and pad the
+        // scratch array to a whole number of groups, so that every entry read below has
+        // been written even when tmp_size is not a multiple of 8. With the previous
+        // truncating division the tail entries were left uninitialized in that case.
+        constexpr int philox_calls = (tmp_size + 7) / 8;
+        ushort tmp[philox_calls * 8];
+        for(int i = 0; i < philox_calls; i++)
+        {
+            ph.get_random_8x16((tmp + i * 8));
+        }
+        // NOTE(review): barrier kept exactly as in the original; the reason it is needed
+        // at this point is not visible from this function alone.
+        block_sync_lds();
+        // Running index into tmp; advances once per visited element.
+        int tmp_index = 0;
+        static_for<0, MRepeat, 1>{}([&](auto iM) {
+            static_for<0, KRepeat, 1>{}([&](auto iK) {
+                auto offset = Number<ThreadSliceDesc_M_K{}.CalculateOffset(make_tuple(iM, iK))>{};
+                in_thread_buf(offset) =
+                    execute_dropout(tmp[tmp_index] < p_dropout_16bits, in_thread_buf(offset));
+                tmp_index = tmp_index + 1;
+            });
+        });
+    }
    template <typename CThreadBuffer, typename ZThreadBuffer, bool using_sign_bit = false>
    __host__ __device__ void
    ApplyDropout(CThreadBuffer& in_thread_buf, ck::philox ph, ZThreadBuffer& z_thread_buf)

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_backward_xdl_cshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_backward_xdl_cshuffle_v2.hpp
@@ -1846,20 +1846,27 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V2
            // scaling is already performed in the preceding statements with s_element_op
            blockwise_softmax.RunWithPreCalcStats(s_slash_p_thread_buf, lse_thread_buf);
+            // save z to global
+            if(p_z_grid)
+            {
                // P_dropped
                blockwise_dropout.template ApplyDropout<decltype(s_slash_p_thread_buf),
                                                        decltype(z_tenor_buffer),
-                                                    true>(s_slash_p_thread_buf, ph, z_tenor_buffer);
+                                                        true>(
+                    s_slash_p_thread_buf, ph, z_tenor_buffer);
-            // save z to global
-            if(p_z_grid)
-            {
                z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
                                                 make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
                                                 z_tenor_buffer,
                                                 z_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
                                                 z_grid_buf);
            }
+            else
+            {
+                // P_dropped
+                blockwise_dropout.template ApplyDropout<decltype(s_slash_p_thread_buf), true>(
+                    s_slash_p_thread_buf, ph);
+            }
            block_sync_lds(); // wait for gemm1 LDS read