Skip dropout when the dropout probability (p_drop) is 0

e1287b9a · fsx950223 · f5c70413 · e1287b9a · e1287b9a · e1287b9a
Commit e1287b9a authored Jul 14, 2023 by fsx950223
4 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
@@ -48,6 +48,7 @@ template <typename GridwiseGemm,
          typename ComputeBasePtrOfStridedBatch,
          typename C0MatrixMask,
          bool HasMainKBlockLoop,
+          bool IsDropout,
          bool Deterministic>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -119,7 +120,7 @@ __global__ void
    {
        for(index_t i = 0; i < nblock; i++)
        {
-            GridwiseGemm::template Run<HasMainKBlockLoop>(
+            GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
                p_a_grid + a_batch_offset,
                p_b_grid + b_batch_offset,
                z_matrix_ptr,
@@ -154,7 +155,7 @@ __global__ void
    }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset,
                                                        p_b_grid + b_batch_offset,
                                                        z_matrix_ptr,
                                                        p_b1_grid + b1_batch_offset,
@@ -932,7 +933,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
            float ave_time = 0;
-            auto launch_kernel = [&](auto has_main_k_block_loop_) {
+            auto launch_kernel = [&](auto has_main_k_block_loop_, auto is_dropout_) {
                const auto kernel =
                    kernel_batched_multihead_attention_backward_qloop_xdl_cshuffle_v1<
                        GridwiseGemm,
@@ -956,6 +957,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                        ComputeBasePtrOfStridedBatch,
                        C0MatrixMask,
                        has_main_k_block_loop_,
+                        is_dropout_,
                        Deterministic>;
                return launch_and_time_kernel(
@@ -997,9 +999,11 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                    arg.m_raw_padded_,
                    arg.n_raw_padded_);
            };
+            if(arg.p_drop_ > 0.0){
-            ave_time = launch_kernel(integral_constant<bool, false>{});
+                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+            }else{
+                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
+            }
            return ave_time;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp
@@ -47,6 +47,7 @@ template <typename GridwiseGemm,
          typename ComputeBasePtrOfStridedBatch,
          typename C0MatrixMask,
          bool HasMainKBlockLoop,
+          bool IsDropout,
          bool Deterministic>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -118,7 +119,7 @@ __global__ void
    {
        for(index_t i = 0; i < nblock; i++)
        {
-            GridwiseGemm::template Run<HasMainKBlockLoop>(
+            GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
                p_a_grid + a_batch_offset,
                p_b_grid + b_batch_offset,
                z_matrix_ptr,
@@ -153,7 +154,7 @@ __global__ void
    }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset,
                                                      p_b_grid + b_batch_offset,
                                                      z_matrix_ptr,
                                                      p_b1_grid + b1_batch_offset,
@@ -949,7 +950,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
            float ave_time = 0;
-            auto launch_kernel = [&](auto has_main_k_block_loop_) {
+            auto launch_kernel = [&](auto has_main_k_block_loop_, auto is_dropout_) {
                const auto kernel =
                    kernel_batched_multihead_attention_backward_qloop_xdl_cshuffle_v2<
                        GridwiseGemm,
@@ -973,6 +974,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
                        ComputeBasePtrOfStridedBatch,
                        C0MatrixMask,
                        has_main_k_block_loop_,
+                        is_dropout_,
                        Deterministic>;
                return launch_and_time_kernel(
@@ -1020,11 +1022,17 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
            {
-                ave_time = launch_kernel(integral_constant<bool, true>{});
+                if(arg.p_drop_ > 0.0)
+                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+                else
+                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
            }
            else
            {
-                ave_time = launch_kernel(integral_constant<bool, false>{});
+                if(arg.p_drop_ > 0.0)
+                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+                else
+                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
            }
            return ave_time;

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
@@ -1222,6 +1222,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
    }
    template <bool HasMainKBlockLoop,
+              bool IsDropout,
              typename Block2CTileMap,
              typename C0MatrixMask,
              typename YGradGridDesc_O0_M_O1>
@@ -1947,6 +1948,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
            constexpr auto position_offset = M3 * M4;
            // save z to global
+            if constexpr(IsDropout){
                if(p_z_grid)
                {
@@ -1996,7 +1998,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V1
                                                                true>(
                        s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
                }
+            }
            block_sync_lds(); // wait for gemm1 LDS read
            // dS = P * (dP - Y_dot_dY)

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
@@ -1154,6 +1154,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V2
    }
    template <bool HasMainKBlockLoop,
+              bool IsDropout,
              typename Block2CTileMap,
              typename C0MatrixMask,
              typename YGradGridDesc_M0_O_M1>
@@ -1863,6 +1864,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V2
            constexpr auto position_offset = M3 * M4;
            // save z to global
+            if constexpr(IsDropout){
                if(p_z_grid)
                {
@@ -1911,7 +1913,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V2
                                                                true>(
                        s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
                }
+            }
            block_sync_lds(); // wait for gemm1 LDS read
            // gemm dV