"vscode:/vscode.git/clone" did not exist on "f9560180690f7548d8bbd36bd607166dd81b8d12"
Unverified commit 6d6520ab authored by Dan Yao, committed by GitHub

Merge pull request #798 from ROCmSoftwarePlatform/skip_dropout

Skip dropout
parents 30e10f80 1970d162
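
This merge threads a compile-time IsDropout flag from the device-level launchers down into the gridwise backward kernels, so that when the dropout probability is zero the dropout work (in particular, writing the random Z matrix back to global memory) is compiled out instead of being branched over at runtime. It also adds a round-toward-zero float-to-bfloat16 conversion guarded by FLASH_ATTENTION_INTERNAL_USE_RTZ. Brief notes on the patterns involved follow the relevant hunks below.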
@@ -48,6 +48,7 @@ template <typename GridwiseGemm,
           typename ComputeBasePtrOfStridedBatch,
           typename C0MatrixMask,
           bool HasMainKBlockLoop,
+          bool IsDropout,
           bool Deterministic>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -120,7 +121,7 @@ __global__ void
     {
         for(index_t i = 0; i < nblock; i++)
         {
-            GridwiseGemm::template Run<HasMainKBlockLoop>(
+            GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
                 p_a_grid + a_batch_offset,
                 p_b_grid + b_batch_offset,
                 z_matrix_ptr,
@@ -155,7 +156,7 @@ __global__ void
     }
     else
     {
-        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset,
                                                       p_b_grid + b_batch_offset,
                                                       z_matrix_ptr,
                                                       p_b1_grid + b1_batch_offset,
@@ -933,7 +934,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
         float ave_time = 0;
-        auto launch_kernel = [&](auto has_main_k_block_loop_) {
+        auto launch_kernel = [&](auto has_main_k_block_loop_, auto is_dropout_) {
             const auto kernel =
                 kernel_batched_multihead_attention_backward_qloop_xdl_cshuffle_v1<
                     GridwiseGemm,
@@ -957,6 +958,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                     ComputeBasePtrOfStridedBatch,
                     C0MatrixMask,
                     has_main_k_block_loop_,
+                    is_dropout_,
                     Deterministic>;
             return launch_and_time_kernel(
@@ -998,9 +1000,11 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                 arg.m_raw_padded_,
                 arg.n_raw_padded_);
         };
-        ave_time = launch_kernel(integral_constant<bool, false>{});
+        if(arg.p_drop_ > 0.0){
+            ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+        }else{
+            ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
+        }
         return ave_time;
     }
......
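
The launcher change above is the usual integral_constant dispatch idiom: a runtime condition (arg.p_drop_ > 0.0) picks one of two calls, each passing a distinct integral_constant<bool, ...> type, and the lambda turns that type's ::value back into a compile-time template argument of the kernel. A minimal self-contained sketch of the idiom, using std::integral_constant and illustrative names rather than the repository's own types:

#include <cstdio>
#include <type_traits>

// Kernel stand-in: one instantiation per compile-time flag value.
template <bool IsDropout>
void run_kernel()
{
    if constexpr(IsDropout)
        std::printf("dropout path compiled in\n");
    else
        std::printf("dropout path compiled out\n");
}

int main()
{
    float p_drop = 0.1f; // runtime value, playing the role of arg.p_drop_

    // The lambda's auto parameter deduces std::integral_constant<bool, B>;
    // its ::value is a constant expression, so it can select the template.
    auto launch_kernel = [](auto is_dropout_) {
        run_kernel<decltype(is_dropout_)::value>();
    };

    if(p_drop > 0.0f)
        launch_kernel(std::integral_constant<bool, true>{});
    else
        launch_kernel(std::integral_constant<bool, false>{});
    return 0;
}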
@@ -47,6 +47,7 @@ template <typename GridwiseGemm,
           typename ComputeBasePtrOfStridedBatch,
           typename C0MatrixMask,
           bool HasMainKBlockLoop,
+          bool IsDropout,
           bool Deterministic>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -119,7 +120,7 @@ __global__ void
     {
         for(index_t i = 0; i < nblock; i++)
         {
-            GridwiseGemm::template Run<HasMainKBlockLoop>(
+            GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
                 p_a_grid + a_batch_offset,
                 p_b_grid + b_batch_offset,
                 z_matrix_ptr,
@@ -154,7 +155,7 @@ __global__ void
     }
     else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset,
                                                       p_b_grid + b_batch_offset,
                                                       z_matrix_ptr,
                                                       p_b1_grid + b1_batch_offset,
@@ -950,7 +951,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
         float ave_time = 0;
-        auto launch_kernel = [&](auto has_main_k_block_loop_) {
+        auto launch_kernel = [&](auto has_main_k_block_loop_, auto is_dropout_) {
             const auto kernel =
                 kernel_batched_multihead_attention_backward_qloop_xdl_cshuffle_v2<
                     GridwiseGemm,
@@ -974,6 +975,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
                     ComputeBasePtrOfStridedBatch,
                     C0MatrixMask,
                     has_main_k_block_loop_,
+                    is_dropout_,
                     Deterministic>;
             return launch_and_time_kernel(
@@ -1021,11 +1023,17 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
         if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
         {
-            ave_time = launch_kernel(integral_constant<bool, true>{});
+            if(arg.p_drop_ > 0.0)
+                ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+            else
+                ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
         }
         else
         {
-            ave_time = launch_kernel(integral_constant<bool, false>{});
+            if(arg.p_drop_ > 0.0)
+                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+            else
+                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
         }
         return ave_time;
......
@@ -35,6 +35,7 @@ template <typename GridwiseGemm,
           typename B1ElementwiseOperation,
           typename CElementwiseOperation,
           bool HasMainKBlockLoop,
+          bool IsDropout,
           bool Deterministic>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -105,7 +106,7 @@ __global__ void
     {
         for(index_t i = 0; i < num_blocks_per_batch; i++)
         {
-            GridwiseGemm::template Run<HasMainKBlockLoop>(
+            GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
                 arg_ptr[group_id].p_a_grid_ + a_batch_offset,
                 arg_ptr[group_id].p_b_grid_ + b_batch_offset,
                 z_matrix_ptr,
@@ -141,7 +142,7 @@ __global__ void
     }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop>(
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
             arg_ptr[group_id].p_a_grid_ + a_batch_offset,
             arg_ptr[group_id].p_b_grid_ + b_batch_offset,
             z_matrix_ptr,
@@ -961,7 +962,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
         float ave_time = 0;
-        auto launch_kernel = [&](auto has_main_k_block_loop_) {
+        auto launch_kernel = [&](auto has_main_k_block_loop_, auto is_dropout_) {
             const auto kernel =
                 kernel_grouped_multihead_attention_backward_qloop_xdl_cshuffle_v1<
                     GridwiseGemm,
@@ -972,6 +973,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                     B1ElementwiseOperation,
                     CElementwiseOperation,
                     has_main_k_block_loop_,
+                    is_dropout_,
                     Deterministic>;
             return launch_and_time_kernel(
@@ -996,11 +998,17 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
         // to concern Gemm0's loop
         if(all_has_main_k_block_loop)
         {
-            ave_time = launch_kernel(integral_constant<bool, true>{});
+            if(arg.p_dropout_ > 0.0)
+                ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+            else
+                ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
         }
         else if(!some_has_main_k_block_loop)
         {
-            ave_time = launch_kernel(integral_constant<bool, false>{});
+            if(arg.p_dropout_ > 0.0)
+                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+            else
+                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
         }
         else
         {
......
@@ -35,6 +35,7 @@ template <typename GridwiseGemm,
           typename B1ElementwiseOperation,
           typename CElementwiseOperation,
           bool HasMainKBlockLoop,
+          bool IsDropout,
           bool Deterministic>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -105,7 +106,7 @@ __global__ void
     {
         for(index_t i = 0; i < num_blocks_per_batch; i++)
         {
-            GridwiseGemm::template Run<HasMainKBlockLoop>(
+            GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
                 arg_ptr[group_id].p_a_grid_ + a_batch_offset,
                 arg_ptr[group_id].p_b_grid_ + b_batch_offset,
                 z_matrix_ptr,
@@ -141,7 +142,7 @@ __global__ void
     }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop>(
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
             arg_ptr[group_id].p_a_grid_ + a_batch_offset,
             arg_ptr[group_id].p_b_grid_ + b_batch_offset,
             z_matrix_ptr,
@@ -968,7 +969,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
         float ave_time = 0;
-        auto launch_kernel = [&](auto has_main_k_block_loop_) {
+        auto launch_kernel = [&](auto has_main_k_block_loop_, auto is_dropout_) {
             const auto kernel =
                 kernel_grouped_multihead_attention_backward_qloop_xdl_cshuffle_v2<
                     GridwiseGemm,
@@ -979,6 +980,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
                     B1ElementwiseOperation,
                     CElementwiseOperation,
                     has_main_k_block_loop_,
+                    is_dropout_,
                     Deterministic>;
             return launch_and_time_kernel(
@@ -1003,11 +1005,17 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
         // to concern Gemm0's loop
        if(all_has_main_k_block_loop)
        {
-            ave_time = launch_kernel(integral_constant<bool, true>{});
+            if(arg.p_dropout_ > 0.0)
+                ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+            else
+                ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
        }
        else if(!some_has_main_k_block_loop)
        {
-            ave_time = launch_kernel(integral_constant<bool, false>{});
+            if(arg.p_dropout_ > 0.0)
+                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+            else
+                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
        }
        else
        {
......
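
Note that the grouped device ops above test arg.p_dropout_ while the batched ones test arg.p_drop_; presumably the two Argument structs simply name the dropout probability differently. The trailing else branch of the grouped dispatch (the mixed case where only some groups have a main K-block loop) is collapsed in this view.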
@@ -1222,6 +1222,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
     }
     template <bool HasMainKBlockLoop,
+              bool IsDropout,
               typename Block2CTileMap,
               typename C0MatrixMask,
               typename YGradGridDesc_O0_M_O1>
@@ -1947,6 +1948,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                 constexpr auto position_offset = M3 * M4;
                 // save z to global
+                if constexpr(IsDropout){
                 if(p_z_grid)
                 {
@@ -1996,7 +1998,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                             true>(
                             s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
                 }
+                }
                 block_sync_lds(); // wait for gemm1 LDS read
                 // dS = P * (dP - Y_dot_dY)
......
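
In the gridwise kernels, the "save z to global" block is wrapped in if constexpr(IsDropout), so when the device layer dispatches IsDropout = false the Z-matrix store is eliminated during template instantiation rather than merely skipped at runtime. A small sketch of why if constexpr (rather than a plain runtime if) is the right tool here; the names are illustrative, not the kernel's real interface:

#include <cstdio>

template <bool IsDropout>
void backward_step(unsigned short* p_z_grid, unsigned short z_value)
{
    // `if constexpr` discards the false branch at instantiation time:
    // no z store is emitted and none of its operands need to be live,
    // which a runtime `if` could not guarantee.
    if constexpr(IsDropout)
    {
        if(p_z_grid != nullptr)
        {
            *p_z_grid = z_value; // stands in for the kernel's z-matrix store
        }
    }
    std::printf("remaining backward work runs either way\n");
}

int main()
{
    unsigned short z = 0;
    backward_step<true>(&z, 42u);      // dropout on: store happens
    backward_step<false>(nullptr, 0u); // dropout off: branch compiled out
    return 0;
}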
@@ -1154,6 +1154,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
     }
     template <bool HasMainKBlockLoop,
+              bool IsDropout,
               typename Block2CTileMap,
               typename C0MatrixMask,
               typename YGradGridDesc_M0_O_M1>
@@ -1863,6 +1864,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
                 constexpr auto position_offset = M3 * M4;
                 // save z to global
+                if constexpr(IsDropout){
                 if(p_z_grid)
                 {
@@ -1911,7 +1913,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
                             true>(
                             s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
                 }
+                }
                 block_sync_lds(); // wait for gemm1 LDS read
                 // gemm dV
......
@@ -965,6 +965,19 @@ inline __host__ __device__ constexpr float type_convert<float, bhalf_t>(bhalf_t
 }
 // convert fp32 to bfp16
+#if FLASH_ATTENTION_INTERNAL_USE_RTZ
+template <>
+inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, float>(float x)
+{
+    union
+    {
+        float fp32;
+        uint32_t int32;
+    } u = {static_cast<float>(x)};
+    return uint16_t(u.int32 >> 16);
+}
+#else
 template <>
 inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, float>(float x)
 {
@@ -1007,6 +1020,7 @@ inline __host__ __device__ constexpr bhalf_t type_convert<bhalf_t, float>(float
     return uint16_t(u.int32 >> 16);
 }
+#endif
 // convert fp16 to bf16
 template <>
......
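
The new #if branch converts float to bf16 by plain truncation of the low 16 bits of the IEEE-754 encoding, i.e. round toward zero, which is cheap and easy to reproduce bit-exactly elsewhere. Judging by the shared return uint16_t(u.int32 >> 16); line in the second hunk, the pre-existing #else path ends with the same truncation, presumably after applying a rounding bias, though its middle lines are collapsed here. A standalone sketch of the truncating conversion (my own harness, not part of the commit):

#include <cstdint>
#include <cstdio>
#include <cstring>

using bhalf_t = uint16_t; // bf16 carried as raw bits, as in the diff

// Round-toward-zero float -> bf16: keep the sign, exponent, and top seven
// mantissa bits of the IEEE-754 encoding; drop the low sixteen bits.
bhalf_t float_to_bf16_rtz(float x)
{
    uint32_t bits;
    std::memcpy(&bits, &x, sizeof(bits));
    return static_cast<bhalf_t>(bits >> 16);
}

float bf16_to_float(bhalf_t h)
{
    uint32_t bits = static_cast<uint32_t>(h) << 16;
    float x;
    std::memcpy(&x, &bits, sizeof(x));
    return x;
}

int main()
{
    float v = 1.00390625f; // 1 + 2^-8: below bf16 precision near 1.0
    bhalf_t h = float_to_bf16_rtz(v);
    std::printf("%.8f -> 0x%04x -> %.8f\n", v, h, bf16_to_float(h));
    return 0;
}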