Merge branch 'attn-train-develop-qloop' into mha-train-develop

0999070d · danyao12 · 5ba30232 · 68e3bb6d · 0999070d · 0999070d
Commit 0999070d authored Jul 15, 2023 by danyao12
9 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_kloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_kloop_v1.hpp
@@ -1047,7 +1047,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V1

    static bool IsSupportedArgument(const Argument& arg)
    {
-#if 0
+#if DEBUG_LOG
        arg.Print();
 #endif


--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_kloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_kloop_v2.hpp
@@ -1048,7 +1048,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V2

    static bool IsSupportedArgument(const Argument& arg)
    {
-#if 0
+#if DEBUG_LOG
        arg.Print();
 #endif


--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_phased_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_phased_v1.hpp
@@ -1041,7 +1041,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Phased_Xdl_CShuffle_V1

    static bool IsSupportedArgument(const Argument& arg)
    {
-#if 0
+#if DEBUG_LOG
        arg.Print();
 #endif


--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
@@ -48,6 +48,7 @@ template <typename GridwiseGemm,
          typename ComputeBasePtrOfStridedBatch,
          typename C0MatrixMask,
          bool HasMainKBlockLoop,
+          bool IsDropout,
          bool Deterministic>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -120,7 +121,7 @@ __global__ void
    {
        for(index_t i = 0; i < nblock; i++)
        {
-            GridwiseGemm::template Run<HasMainKBlockLoop>(
+            GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
                p_a_grid + a_batch_offset,
                p_b_grid + b_batch_offset,
                z_matrix_ptr,
@@ -155,36 +156,36 @@ __global__ void
    }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
-                                                      p_b_grid + b_batch_offset,
-                                                      z_matrix_ptr,
-                                                      p_b1_grid + b1_batch_offset,
-                                                      p_c_grid + c_batch_offset,
-                                                      p_lse_grid + lse_batch_offset,
-                                                      p_ygrad_grid + c_batch_offset,
-                                                      p_qgrad_grid + a_batch_offset,
-                                                      p_kgrad_grid + b_batch_offset,
-                                                      p_vgrad_grid + b1_batch_offset,
-                                                      p_shared,
-                                                      a_element_op,
-                                                      b_element_op,
-                                                      acc_element_op,
-                                                      b1_element_op,
-                                                      c_element_op,
-                                                      a_grid_desc_ak0_m_ak1,
-                                                      b_grid_desc_bk0_n_bk1,
-                                                      c_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
-                                                      b1_grid_desc_bk0_n_bk1,
-                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                      lse_grid_desc_m,
-                                                      ygrad_grid_desc_o0_m_o1,
-                                                      block_2_ctile_map,
-                                                      c0_matrix_mask,
-                                                      p_drop,
-                                                      ph,
-                                                      z_random_matrix_offset,
-                                                      raw_n_padded,
-                                                      0);
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset,
+                                                        p_b_grid + b_batch_offset,
+                                                        z_matrix_ptr,
+                                                        p_b1_grid + b1_batch_offset,
+                                                        p_c_grid + c_batch_offset,
+                                                        p_lse_grid + lse_batch_offset,
+                                                        p_ygrad_grid + c_batch_offset,
+                                                        p_qgrad_grid + a_batch_offset,
+                                                        p_kgrad_grid + b_batch_offset,
+                                                        p_vgrad_grid + b1_batch_offset,
+                                                        p_shared,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        acc_element_op,
+                                                        b1_element_op,
+                                                        c_element_op,
+                                                        a_grid_desc_ak0_m_ak1,
+                                                        b_grid_desc_bk0_n_bk1,
+                                                        c_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
+                                                        b1_grid_desc_bk0_n_bk1,
+                                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                        lse_grid_desc_m,
+                                                        ygrad_grid_desc_o0_m_o1,
+                                                        block_2_ctile_map,
+                                                        c0_matrix_mask,
+                                                        p_drop,
+                                                        ph,
+                                                        z_random_matrix_offset,
+                                                        raw_n_padded,
+                                                        0);
    }
 #else
    ignore = p_a_grid;
@@ -933,7 +934,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1

            float ave_time = 0;

-            auto launch_kernel = [&](auto has_main_k_block_loop_) {
+            auto launch_kernel = [&](auto has_main_k_block_loop_, auto is_dropout_) {
                const auto kernel =
                    kernel_batched_multihead_attention_backward_qloop_xdl_cshuffle_v1<
                        GridwiseGemm,
@@ -957,6 +958,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                        ComputeBasePtrOfStridedBatch,
                        C0MatrixMask,
                        has_main_k_block_loop_,
+                        is_dropout_,
                        Deterministic>;

                return launch_and_time_kernel(
@@ -998,9 +1000,11 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                    arg.m_raw_padded_,
                    arg.n_raw_padded_);
            };
-
-            ave_time = launch_kernel(integral_constant<bool, false>{});
-
+            if(arg.p_drop_ > 0.0){
+                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+            }else{
+                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
+            }
            return ave_time;
        }

@@ -1020,6 +1024,9 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1

    static bool IsSupportedArgument(const Argument& arg)
    {
+#if DEBUG_LOG
+        arg.Print();
+#endif

        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp
@@ -47,6 +47,7 @@ template <typename GridwiseGemm,
          typename ComputeBasePtrOfStridedBatch,
          typename C0MatrixMask,
          bool HasMainKBlockLoop,
+          bool IsDropout,
          bool Deterministic>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -119,7 +120,7 @@ __global__ void
    {
        for(index_t i = 0; i < nblock; i++)
        {
-            GridwiseGemm::template Run<HasMainKBlockLoop>(
+            GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
                p_a_grid + a_batch_offset,
                p_b_grid + b_batch_offset,
                z_matrix_ptr,
@@ -154,7 +155,7 @@ __global__ void
    }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset,
                                                      p_b_grid + b_batch_offset,
                                                      z_matrix_ptr,
                                                      p_b1_grid + b1_batch_offset,
@@ -950,7 +951,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2

            float ave_time = 0;

-            auto launch_kernel = [&](auto has_main_k_block_loop_) {
+            auto launch_kernel = [&](auto has_main_k_block_loop_, auto is_dropout_) {
                const auto kernel =
                    kernel_batched_multihead_attention_backward_qloop_xdl_cshuffle_v2<
                        GridwiseGemm,
@@ -974,6 +975,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
                        ComputeBasePtrOfStridedBatch,
                        C0MatrixMask,
                        has_main_k_block_loop_,
+                        is_dropout_,
                        Deterministic>;

                return launch_and_time_kernel(
@@ -1021,11 +1023,17 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2

            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
            {
-                ave_time = launch_kernel(integral_constant<bool, true>{});
+                if(arg.p_drop_ > 0.0)
+                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+                else
+                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
            }
            else
            {
-                ave_time = launch_kernel(integral_constant<bool, false>{});
+                if(arg.p_drop_ > 0.0)
+                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+                else
+                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
            }

            return ave_time;
@@ -1047,6 +1055,9 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2

    static bool IsSupportedArgument(const Argument& arg)
    {
+#if DEBUG_LOG
+        arg.Print();
+#endif

        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp
@@ -35,6 +35,7 @@ template <typename GridwiseGemm,
          typename B1ElementwiseOperation,
          typename CElementwiseOperation,
          bool HasMainKBlockLoop,
+          bool IsDropout,
          bool Deterministic>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -105,7 +106,7 @@ __global__ void
    {
        for(index_t i = 0; i < num_blocks_per_batch; i++)
        {
-            GridwiseGemm::template Run<HasMainKBlockLoop>(
+            GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
                arg_ptr[group_id].p_a_grid_ + a_batch_offset,
                arg_ptr[group_id].p_b_grid_ + b_batch_offset,
                z_matrix_ptr,
@@ -141,7 +142,7 @@ __global__ void
    }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop>(
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
            arg_ptr[group_id].p_a_grid_ + a_batch_offset,
            arg_ptr[group_id].p_b_grid_ + b_batch_offset,
            z_matrix_ptr,
@@ -961,7 +962,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1

            float ave_time = 0;

-            auto launch_kernel = [&](auto has_main_k_block_loop_) {
+            auto launch_kernel = [&](auto has_main_k_block_loop_, auto is_dropout_) {
                const auto kernel =
                    kernel_grouped_multihead_attention_backward_qloop_xdl_cshuffle_v1<
                        GridwiseGemm,
@@ -972,6 +973,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                        B1ElementwiseOperation,
                        CElementwiseOperation,
                        has_main_k_block_loop_,
+                        is_dropout_,
                        Deterministic>;

                return launch_and_time_kernel(
@@ -996,11 +998,17 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
            // to concern Gemm0's loop
            if(all_has_main_k_block_loop)
            {
-                ave_time = launch_kernel(integral_constant<bool, true>{});
+                if(arg.p_dropout_ > 0.0)
+                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+                else
+                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
            }
            else if(!some_has_main_k_block_loop)
            {
-                ave_time = launch_kernel(integral_constant<bool, false>{});
+                if(arg.p_dropout_ > 0.0)
+                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+                else
+                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
            }
            else
            {

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp
@@ -35,6 +35,7 @@ template <typename GridwiseGemm,
          typename B1ElementwiseOperation,
          typename CElementwiseOperation,
          bool HasMainKBlockLoop,
+          bool IsDropout,
          bool Deterministic>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -105,7 +106,7 @@ __global__ void
    {
        for(index_t i = 0; i < num_blocks_per_batch; i++)
        {
-            GridwiseGemm::template Run<HasMainKBlockLoop>(
+            GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
                arg_ptr[group_id].p_a_grid_ + a_batch_offset,
                arg_ptr[group_id].p_b_grid_ + b_batch_offset,
                z_matrix_ptr,
@@ -141,7 +142,7 @@ __global__ void
    }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop>(
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
            arg_ptr[group_id].p_a_grid_ + a_batch_offset,
            arg_ptr[group_id].p_b_grid_ + b_batch_offset,
            z_matrix_ptr,
@@ -968,7 +969,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2

            float ave_time = 0;

-            auto launch_kernel = [&](auto has_main_k_block_loop_) {
+            auto launch_kernel = [&](auto has_main_k_block_loop_, auto is_dropout_) {
                const auto kernel =
                    kernel_grouped_multihead_attention_backward_qloop_xdl_cshuffle_v2<
                        GridwiseGemm,
@@ -979,6 +980,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
                        B1ElementwiseOperation,
                        CElementwiseOperation,
                        has_main_k_block_loop_,
+                        is_dropout_,
                        Deterministic>;

                return launch_and_time_kernel(
@@ -1003,11 +1005,17 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
            // to concern Gemm0's loop
            if(all_has_main_k_block_loop)
            {
-                ave_time = launch_kernel(integral_constant<bool, true>{});
+                if(arg.p_dropout_ > 0.0)
+                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+                else
+                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
            }
            else if(!some_has_main_k_block_loop)
            {
-                ave_time = launch_kernel(integral_constant<bool, false>{});
+                if(arg.p_dropout_ > 0.0)
+                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+                else
+                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
            }
            else
            {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
@@ -1222,6 +1222,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
    }

    template <bool HasMainKBlockLoop,
+              bool IsDropout,
              typename Block2CTileMap,
              typename C0MatrixMask,
              typename YGradGridDesc_O0_M_O1>
@@ -1947,56 +1948,57 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1

            constexpr auto position_offset = M3 * M4;
            // save z to global
-            if(p_z_grid)
-            {
+            if constexpr(IsDropout){
+                if(p_z_grid)
+                {

-                auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
-                auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
-                auto m_global = m_local + m_block_data_idx_on_grid;
-                auto n_global = n_local + n_block_data_idx_on_grid;
-
-                auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-                                          n_global; // unique element global 1d id
-
-                auto global_elem_id =
-                    (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
-
-                blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
-                                                                    decltype(z_tenor_buffer),
-                                                                    decltype(position_offset),
-                                                                    true>(
-                    s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
-
-                z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                 make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                                                 z_tenor_buffer,
-                                                 z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                 z_grid_buf);
-            }
-            else
-            {
-                ignore = z_grid_buf;
+                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
+                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                    auto m_global = m_local + m_block_data_idx_on_grid;
+                    auto n_global = n_local + n_block_data_idx_on_grid;
+
+                    auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
+                                            n_global; // unique element global 1d id
+
+                    auto global_elem_id =
+                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
+
+                    blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
+                                                                        decltype(z_tenor_buffer),
+                                                                        decltype(position_offset),
+                                                                        true>(
+                        s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
+
+                    z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                                                    make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
+                                                    z_tenor_buffer,
+                                                    z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                                                    z_grid_buf);
+                }
+                else
+                {
+                    ignore = z_grid_buf;

-                auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
-                auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
-                auto m_global = m_local + m_block_data_idx_on_grid;
-                auto n_global = n_local + n_block_data_idx_on_grid;
+                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
+                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                    auto m_global = m_local + m_block_data_idx_on_grid;
+                    auto n_global = n_local + n_block_data_idx_on_grid;

-                auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-                                          n_global; // unique element global 1d id
+                    auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
+                                            n_global; // unique element global 1d id

-                auto global_elem_id =
-                    (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
+                    auto global_elem_id =
+                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;

-                // P_dropped
-                blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
-                                                               decltype(position_offset),
-                                                               true>(
-                    s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
+                    // P_dropped
+                    blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
+                                                                decltype(position_offset),
+                                                                true>(
+                        s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
+                }
            }
-
            block_sync_lds(); // wait for gemm1 LDS read

            // dS = P * (dP - Y_dot_dY)

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
@@ -1154,6 +1154,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
    }

    template <bool HasMainKBlockLoop,
+              bool IsDropout,
              typename Block2CTileMap,
              typename C0MatrixMask,
              typename YGradGridDesc_M0_O_M1>
@@ -1863,55 +1864,56 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2

            constexpr auto position_offset = M3 * M4;
            // save z to global
-            if(p_z_grid)
-            {
+            if constexpr(IsDropout){
+                if(p_z_grid)
+                {

-                auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
-                auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
-                auto m_global = m_local + m_block_data_idx_on_grid;
-                auto n_global = n_local + n_block_data_idx_on_grid;
-
-                auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-                                          n_global; // unique element global 1d id
-
-                auto global_elem_id =
-                    (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
-
-                blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
-                                                                    decltype(z_tenor_buffer),
-                                                                    decltype(position_offset),
-                                                                    true>(
-                    s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
-
-                z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                 make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                                                 z_tenor_buffer,
-                                                 z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                 z_grid_buf);
-            }
-            else
-            {
-                ignore = z_grid_buf;
-
-                auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
-                auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
-                auto m_global = m_local + m_block_data_idx_on_grid;
-                auto n_global = n_local + n_block_data_idx_on_grid;
-
-                auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-                                          n_global; // unique element global 1d id
-
-                auto global_elem_id =
-                    (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
-                // P_dropped
-                blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
-                                                               decltype(position_offset),
-                                                               true>(
-                    s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
+                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
+                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                    auto m_global = m_local + m_block_data_idx_on_grid;
+                    auto n_global = n_local + n_block_data_idx_on_grid;
+
+                    auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
+                                            n_global; // unique element global 1d id
+
+                    auto global_elem_id =
+                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
+
+                    blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
+                                                                        decltype(z_tenor_buffer),
+                                                                        decltype(position_offset),
+                                                                        true>(
+                        s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
+
+                    z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                                                    make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
+                                                    z_tenor_buffer,
+                                                    z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                                                    z_grid_buf);
+                }
+                else
+                {
+                    ignore = z_grid_buf;
+
+                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
+                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                    auto m_global = m_local + m_block_data_idx_on_grid;
+                    auto n_global = n_local + n_block_data_idx_on_grid;
+
+                    auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
+                                            n_global; // unique element global 1d id
+
+                    auto global_elem_id =
+                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
+                    // P_dropped
+                    blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
+                                                                decltype(position_offset),
+                                                                true>(
+                        s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
+                }
            }
-
            block_sync_lds(); // wait for gemm1 LDS read

            // gemm dV