Merge branch 'attn-train-develop-qloop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into attn-train-develop-qloop

Merge branch 'attn-train-develop-qloop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into attn-train-develop-qloop

Merge branch 'attn-train-develop-qloop' of...
Merge branch 'attn-train-develop-qloop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into attn-train-develop-qloop
Commit 41c659bb · authored Jul 15, 2023 by danyao12 · other referenced commits: 5571be9d, 68e3bb6d (presumably the merge parents)
20 changed files
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_train_v1.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_train_v1.cpp
@@ -1382,7 +1382,7 @@ int run(int argc, char* argv[])
        }

        std::cout << "Checking z:\n";
-        pass &= ck::utils::check_err(z_fwd_gs_ms_ns.mData, z_bwd_gs_ms_ns.mData, 1);
+        pass &= ck::utils::check_integer_err(z_fwd_gs_ms_ns.mData, z_bwd_gs_ms_ns.mData, 1);

        std::cout << "Checking y:\n";
        pass &= ck::utils::check_err(

--- a/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_train_v2.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_train_v2.cpp
@@ -969,7 +969,7 @@ int run(int argc, char* argv[])
        }

        std::cout << "Checking z:\n";
-        pass &= ck::utils::check_err(z_fwd_gs_ms_ns.mData, z_bwd_gs_ms_ns.mData, 1);
+        pass &= ck::utils::check_integer_err(z_fwd_gs_ms_ns.mData, z_bwd_gs_ms_ns.mData, 1);

        std::cout << "Checking y:\n";
        pass &= ck::utils::check_err(

--- a/example/32_batched_gemm_scale_softmax_gemm/grouped_multihead_attention_train_v1.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_multihead_attention_train_v1.cpp
@@ -1406,7 +1406,7 @@ int run(int argc, char* argv[])
            }

            std::cout << "Checking z:\n";
-            pass &= ck::utils::check_err(z_fwd_tensors[i].mData, z_bwd_tensors[i].mData, 1);
+            pass &= ck::utils::check_integer_err(z_fwd_tensors[i].mData, z_bwd_tensors[i].mData, 1);

            std::cout << "Checking y:\n";
            pass &= ck::utils::check_err(

--- a/example/32_batched_gemm_scale_softmax_gemm/grouped_multihead_attention_train_v2.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_multihead_attention_train_v2.cpp
@@ -994,7 +994,7 @@ int run(int argc, char* argv[])
            }

            std::cout << "Checking z:\n";
-            pass &= ck::utils::check_err(z_fwd_tensors[i].mData, z_bwd_tensors[i].mData, 1);
+            pass &= ck::utils::check_integer_err(z_fwd_tensors[i].mData, z_bwd_tensors[i].mData, 1);

            std::cout << "Checking y:\n";
            pass &= ck::utils::check_err(

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_kloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_kloop_v1.hpp
@@ -89,7 +89,8 @@ __global__ void
            const unsigned long long seed,
            const unsigned long long offset)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -1031,11 +1032,13 @@ struct DeviceBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V1

    static bool IsSupportedArgument(const Argument& arg)
    {
-#if 0
+#if DEBUG_LOG
        arg.Print();
 #endif

-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_kloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_kloop_v2.hpp
@@ -88,7 +88,8 @@ __global__ void
            const unsigned long long seed,
            const unsigned long long offset)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -1047,11 +1048,13 @@ struct DeviceBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V2

    static bool IsSupportedArgument(const Argument& arg)
    {
-#if 0
+#if DEBUG_LOG
        arg.Print();
 #endif

-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_phased_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_phased_v1.hpp
@@ -87,7 +87,8 @@ __global__ void
            const unsigned long long seed,
            const unsigned long long offset)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -1040,11 +1041,13 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Phased_Xdl_CShuffle_V1

    static bool IsSupportedArgument(const Argument& arg)
    {
-#if 0
+#if DEBUG_LOG
        arg.Print();
 #endif

-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
@@ -48,6 +48,7 @@ template <typename GridwiseGemm,
          typename ComputeBasePtrOfStridedBatch,
          typename C0MatrixMask,
          bool HasMainKBlockLoop,
+          bool IsDropout,
          bool Deterministic>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -89,7 +90,8 @@ __global__ void
            const index_t raw_m_padded,
            const index_t raw_n_padded)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -119,7 +121,7 @@ __global__ void
    {
        for(index_t i = 0; i < nblock; i++)
        {
-            GridwiseGemm::template Run<HasMainKBlockLoop>(
+            GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
                p_a_grid + a_batch_offset,
                p_b_grid + b_batch_offset,
                z_matrix_ptr,
@@ -154,36 +156,36 @@ __global__ void
    }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
-                                                      p_b_grid + b_batch_offset,
-                                                      z_matrix_ptr,
-                                                      p_b1_grid + b1_batch_offset,
-                                                      p_c_grid + c_batch_offset,
-                                                      p_lse_grid + lse_batch_offset,
-                                                      p_ygrad_grid + c_batch_offset,
-                                                      p_qgrad_grid + a_batch_offset,
-                                                      p_kgrad_grid + b_batch_offset,
-                                                      p_vgrad_grid + b1_batch_offset,
-                                                      p_shared,
-                                                      a_element_op,
-                                                      b_element_op,
-                                                      acc_element_op,
-                                                      b1_element_op,
-                                                      c_element_op,
-                                                      a_grid_desc_ak0_m_ak1,
-                                                      b_grid_desc_bk0_n_bk1,
-                                                      c_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
-                                                      b1_grid_desc_bk0_n_bk1,
-                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                      lse_grid_desc_m,
-                                                      ygrad_grid_desc_o0_m_o1,
-                                                      block_2_ctile_map,
-                                                      c0_matrix_mask,
-                                                      p_drop,
-                                                      ph,
-                                                      z_random_matrix_offset,
-                                                      raw_n_padded,
-                                                      0);
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset,
+                                                        p_b_grid + b_batch_offset,
+                                                        z_matrix_ptr,
+                                                        p_b1_grid + b1_batch_offset,
+                                                        p_c_grid + c_batch_offset,
+                                                        p_lse_grid + lse_batch_offset,
+                                                        p_ygrad_grid + c_batch_offset,
+                                                        p_qgrad_grid + a_batch_offset,
+                                                        p_kgrad_grid + b_batch_offset,
+                                                        p_vgrad_grid + b1_batch_offset,
+                                                        p_shared,
+                                                        a_element_op,
+                                                        b_element_op,
+                                                        acc_element_op,
+                                                        b1_element_op,
+                                                        c_element_op,
+                                                        a_grid_desc_ak0_m_ak1,
+                                                        b_grid_desc_bk0_n_bk1,
+                                                        c_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
+                                                        b1_grid_desc_bk0_n_bk1,
+                                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+                                                        lse_grid_desc_m,
+                                                        ygrad_grid_desc_o0_m_o1,
+                                                        block_2_ctile_map,
+                                                        c0_matrix_mask,
+                                                        p_drop,
+                                                        ph,
+                                                        z_random_matrix_offset,
+                                                        raw_n_padded,
+                                                        0);
    }
 #else
    ignore = p_a_grid;
@@ -932,7 +934,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1

            float ave_time = 0;

-            auto launch_kernel = [&](auto has_main_k_block_loop_) {
+            auto launch_kernel = [&](auto has_main_k_block_loop_, auto is_dropout_) {
                const auto kernel =
                    kernel_batched_multihead_attention_backward_qloop_xdl_cshuffle_v1<
                        GridwiseGemm,
@@ -956,6 +958,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                        ComputeBasePtrOfStridedBatch,
                        C0MatrixMask,
                        has_main_k_block_loop_,
+                        is_dropout_,
                        Deterministic>;

                return launch_and_time_kernel(
@@ -997,9 +1000,11 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                    arg.m_raw_padded_,
                    arg.n_raw_padded_);
            };
-
-            ave_time = launch_kernel(integral_constant<bool, false>{});
-
+            if(arg.p_drop_ > 0.0){
+                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+            }else{
+                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
+            }
            return ave_time;
        }

@@ -1019,8 +1024,13 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1

    static bool IsSupportedArgument(const Argument& arg)
    {
+#if DEBUG_LOG
+        arg.Print();
+#endif

-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp
@@ -47,6 +47,7 @@ template <typename GridwiseGemm,
          typename ComputeBasePtrOfStridedBatch,
          typename C0MatrixMask,
          bool HasMainKBlockLoop,
+          bool IsDropout,
          bool Deterministic>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -88,7 +89,8 @@ __global__ void
            const index_t raw_m_padded,
            const index_t raw_n_padded)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -118,7 +120,7 @@ __global__ void
    {
        for(index_t i = 0; i < nblock; i++)
        {
-            GridwiseGemm::template Run<HasMainKBlockLoop>(
+            GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
                p_a_grid + a_batch_offset,
                p_b_grid + b_batch_offset,
                z_matrix_ptr,
@@ -153,7 +155,7 @@ __global__ void
    }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop>(p_a_grid + a_batch_offset,
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset,
                                                      p_b_grid + b_batch_offset,
                                                      z_matrix_ptr,
                                                      p_b1_grid + b1_batch_offset,
@@ -949,7 +951,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2

            float ave_time = 0;

-            auto launch_kernel = [&](auto has_main_k_block_loop_) {
+            auto launch_kernel = [&](auto has_main_k_block_loop_, auto is_dropout_) {
                const auto kernel =
                    kernel_batched_multihead_attention_backward_qloop_xdl_cshuffle_v2<
                        GridwiseGemm,
@@ -973,6 +975,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
                        ComputeBasePtrOfStridedBatch,
                        C0MatrixMask,
                        has_main_k_block_loop_,
+                        is_dropout_,
                        Deterministic>;

                return launch_and_time_kernel(
@@ -1020,11 +1023,17 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2

            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
            {
-                ave_time = launch_kernel(integral_constant<bool, true>{});
+                if(arg.p_drop_ > 0.0)
+                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+                else
+                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
            }
            else
            {
-                ave_time = launch_kernel(integral_constant<bool, false>{});
+                if(arg.p_drop_ > 0.0)
+                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+                else
+                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
            }

            return ave_time;
@@ -1046,8 +1055,13 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2

    static bool IsSupportedArgument(const Argument& arg)
    {
+#if DEBUG_LOG
+        arg.Print();
+#endif

-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_fwd_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_fwd_xdl_cshuffle_v1.hpp
@@ -81,7 +81,8 @@ __global__ void
            const unsigned long long seed,
            const unsigned long long offset)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -898,7 +899,9 @@ struct DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V1
        arg.Print();
 #endif

-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_fwd_xdl_cshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_fwd_xdl_cshuffle_v2.hpp
@@ -83,7 +83,8 @@ __global__ void
            const index_t raw_m_padded,
            const index_t raw_n_padded)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    const index_t num_blocks_per_batch =
        __builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
@@ -915,7 +916,9 @@ struct DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2
        arg.Print();
 #endif

-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_kloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_kloop_v1.hpp
@@ -52,7 +52,8 @@ __global__ void
            const unsigned long long seed,
            const unsigned long long offset)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    const index_t block_id = get_block_1d_id();
    const auto arg_ptr     = reinterpret_cast<const GroupKernelArg*>(
@@ -1005,7 +1006,9 @@ struct DeviceGroupedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V1

    static bool IsSupportedArgument(const Argument& arg)
    {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_kloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_kloop_v2.hpp
@@ -52,7 +52,8 @@ __global__ void
            const unsigned long long seed,
            const unsigned long long offset)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    const index_t block_id = get_block_1d_id();
    const auto arg_ptr     = reinterpret_cast<const GroupKernelArg*>(
@@ -1012,7 +1013,9 @@ struct DeviceGroupedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V2

    static bool IsSupportedArgument(const Argument& arg)
    {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp
@@ -35,6 +35,7 @@ template <typename GridwiseGemm,
          typename B1ElementwiseOperation,
          typename CElementwiseOperation,
          bool HasMainKBlockLoop,
+          bool IsDropout,
          bool Deterministic>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -52,7 +53,8 @@ __global__ void
            const unsigned long long seed,
            const unsigned long long offset)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    const index_t block_id = get_block_1d_id();
    const auto arg_ptr     = reinterpret_cast<const GroupKernelArg*>(
@@ -104,7 +106,7 @@ __global__ void
    {
        for(index_t i = 0; i < num_blocks_per_batch; i++)
        {
-            GridwiseGemm::template Run<HasMainKBlockLoop>(
+            GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
                arg_ptr[group_id].p_a_grid_ + a_batch_offset,
                arg_ptr[group_id].p_b_grid_ + b_batch_offset,
                z_matrix_ptr,
@@ -140,7 +142,7 @@ __global__ void
    }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop>(
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
            arg_ptr[group_id].p_a_grid_ + a_batch_offset,
            arg_ptr[group_id].p_b_grid_ + b_batch_offset,
            z_matrix_ptr,
@@ -960,7 +962,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1

            float ave_time = 0;

-            auto launch_kernel = [&](auto has_main_k_block_loop_) {
+            auto launch_kernel = [&](auto has_main_k_block_loop_, auto is_dropout_) {
                const auto kernel =
                    kernel_grouped_multihead_attention_backward_qloop_xdl_cshuffle_v1<
                        GridwiseGemm,
@@ -971,6 +973,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                        B1ElementwiseOperation,
                        CElementwiseOperation,
                        has_main_k_block_loop_,
+                        is_dropout_,
                        Deterministic>;

                return launch_and_time_kernel(
@@ -995,11 +998,17 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
            // to concern Gemm0's loop
            if(all_has_main_k_block_loop)
            {
-                ave_time = launch_kernel(integral_constant<bool, true>{});
+                if(arg.p_dropout_ > 0.0)
+                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+                else
+                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
            }
            else if(!some_has_main_k_block_loop)
            {
-                ave_time = launch_kernel(integral_constant<bool, false>{});
+                if(arg.p_dropout_ > 0.0)
+                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+                else
+                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
            }
            else
            {
@@ -1025,7 +1034,9 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1

    static bool IsSupportedArgument(const Argument& arg)
    {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp
@@ -35,6 +35,7 @@ template <typename GridwiseGemm,
          typename B1ElementwiseOperation,
          typename CElementwiseOperation,
          bool HasMainKBlockLoop,
+          bool IsDropout,
          bool Deterministic>
 __global__ void
 #if CK_USE_LAUNCH_BOUNDS
@@ -52,7 +53,8 @@ __global__ void
            const unsigned long long seed,
            const unsigned long long offset)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];
    const index_t block_id = get_block_1d_id();
    const auto arg_ptr     = reinterpret_cast<const GroupKernelArg*>(
@@ -104,7 +106,7 @@ __global__ void
    {
        for(index_t i = 0; i < num_blocks_per_batch; i++)
        {
-            GridwiseGemm::template Run<HasMainKBlockLoop>(
+            GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
                arg_ptr[group_id].p_a_grid_ + a_batch_offset,
                arg_ptr[group_id].p_b_grid_ + b_batch_offset,
                z_matrix_ptr,
@@ -140,7 +142,7 @@ __global__ void
    }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop>(
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
            arg_ptr[group_id].p_a_grid_ + a_batch_offset,
            arg_ptr[group_id].p_b_grid_ + b_batch_offset,
            z_matrix_ptr,
@@ -967,7 +969,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2

            float ave_time = 0;

-            auto launch_kernel = [&](auto has_main_k_block_loop_) {
+            auto launch_kernel = [&](auto has_main_k_block_loop_, auto is_dropout_) {
                const auto kernel =
                    kernel_grouped_multihead_attention_backward_qloop_xdl_cshuffle_v2<
                        GridwiseGemm,
@@ -978,6 +980,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
                        B1ElementwiseOperation,
                        CElementwiseOperation,
                        has_main_k_block_loop_,
+                        is_dropout_,
                        Deterministic>;

                return launch_and_time_kernel(
@@ -1002,11 +1005,17 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
            // to concern Gemm0's loop
            if(all_has_main_k_block_loop)
            {
-                ave_time = launch_kernel(integral_constant<bool, true>{});
+                if(arg.p_dropout_ > 0.0)
+                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+                else
+                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
            }
            else if(!some_has_main_k_block_loop)
            {
-                ave_time = launch_kernel(integral_constant<bool, false>{});
+                if(arg.p_dropout_ > 0.0)
+                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+                else
+                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
            }
            else
            {
@@ -1032,7 +1041,9 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2

    static bool IsSupportedArgument(const Argument& arg)
    {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_fwd_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_fwd_xdl_cshuffle_v1.hpp
@@ -52,7 +52,8 @@ __global__ void
            const unsigned long long seed,
            const unsigned long long offset)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    const index_t block_id         = get_block_1d_id();
@@ -938,7 +939,9 @@ struct DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V1

    static bool IsSupportedArgument(const Argument& arg)
    {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_fwd_xdl_cshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_fwd_xdl_cshuffle_v2.hpp
@@ -52,7 +52,8 @@ __global__ void
            const unsigned long long seed,
            const unsigned long long offset)
 {
-#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__))
+#if(!defined(__HIP_DEVICE_COMPILE__) || defined(__gfx908__) || defined(__gfx90a__) || \
+    defined(__gfx940__) || defined(__gfx941__) || defined(__gfx942__))
    __shared__ char p_shared[GridwiseGemm::GetSharedMemoryNumberOfByte()];

    const index_t block_id         = get_block_1d_id();
@@ -960,7 +961,9 @@ struct DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V2

    static bool IsSupportedArgument(const Argument& arg)
    {
-        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a"))
+        if(!(ck::get_device_name() == "gfx908" || ck::get_device_name() == "gfx90a" ||
+             ck::get_device_name() == "gfx940" || ck::get_device_name() == "gfx941" ||
+             ck::get_device_name() == "gfx942"))
        {
            return false;
        }

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_kloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_kloop_v1.hpp
@@ -1937,10 +1937,14 @@ struct GridwiseBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V1
                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
                    auto m_global    = m_local + m_block_data_idx_on_grid;
                    auto n_global    = n_local + n_block_data_idx_on_grid;
-                    bool masked_flag = c0_matrix_mask.IsMaskedElement(m_global, n_global);
-                    s_element_op(s_slash_p_thread_buf(i),
-                                 masked_flag ? -ck::NumericLimits<float>::Infinity()
-                                             : s_slash_p_thread_buf[i]);
+                    if(c0_matrix_mask.IsMaskedElement(m_global, n_global))
+                    {
+                        s_slash_p_thread_buf(i) = -ck::NumericLimits<float>::Infinity();
+                    }
+                    else
+                    {
+                        s_element_op(s_slash_p_thread_buf(i), s_slash_p_thread_buf[i]);
+                    }
                });
            }
            else
@@ -2003,11 +2007,17 @@ struct GridwiseBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V1
                constexpr auto m =
                    pgrad_thread_idx_to_m_n_adaptor.CalculateBottomIndex(pgrad_thread_idx)[I0];
                // dS and P has same thread buf layout
-                bool undropped_flag = s_slash_p_thread_buf[i] >= 0;
-                sgrad_thread_buf(i) =
-                    s_slash_p_thread_buf[i] *
-                    (undropped_flag ? (pgrad_thread_buf[i] - y_dot_ygrad_thread_buf[Number<m>{}])
-                                    : y_dot_ygrad_thread_buf[Number<m>{}]);
+                if(s_slash_p_thread_buf[i] >= 0)
+                {
+                    sgrad_thread_buf(i) =
+                        s_slash_p_thread_buf[i] *
+                        (pgrad_thread_buf[i] - y_dot_ygrad_thread_buf[Number<m>{}]);
+                }
+                else
+                {
+                    sgrad_thread_buf(i) =
+                        s_slash_p_thread_buf[i] * y_dot_ygrad_thread_buf[Number<m>{}];
+                }
            });

            // gemm dQ

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
@@ -1222,6 +1222,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
    }

    template <bool HasMainKBlockLoop,
+              bool IsDropout,
              typename Block2CTileMap,
              typename C0MatrixMask,
              typename YGradGridDesc_O0_M_O1>
@@ -1947,56 +1948,57 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1

            constexpr auto position_offset = M3 * M4;
            // save z to global
-            if(p_z_grid)
-            {
+            if constexpr(IsDropout){
+                if(p_z_grid)
+                {

-                auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
-                auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
-                auto m_global = m_local + m_block_data_idx_on_grid;
-                auto n_global = n_local + n_block_data_idx_on_grid;
-
-                auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-                                          n_global; // unique element global 1d id
-
-                auto global_elem_id =
-                    (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
-
-                blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
-                                                                    decltype(z_tenor_buffer),
-                                                                    decltype(position_offset),
-                                                                    true>(
-                    s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
-
-                z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                 make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                                                 z_tenor_buffer,
-                                                 z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                 z_grid_buf);
-            }
-            else
-            {
-                ignore = z_grid_buf;
+                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
+                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                    auto m_global = m_local + m_block_data_idx_on_grid;
+                    auto n_global = n_local + n_block_data_idx_on_grid;
+
+                    auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
+                                            n_global; // unique element global 1d id
+
+                    auto global_elem_id =
+                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
+
+                    blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
+                                                                        decltype(z_tenor_buffer),
+                                                                        decltype(position_offset),
+                                                                        true>(
+                        s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
+
+                    z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                                                    make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
+                                                    z_tenor_buffer,
+                                                    z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                                                    z_grid_buf);
+                }
+                else
+                {
+                    ignore = z_grid_buf;

-                auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
-                auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
-                auto m_global = m_local + m_block_data_idx_on_grid;
-                auto n_global = n_local + n_block_data_idx_on_grid;
+                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
+                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                    auto m_global = m_local + m_block_data_idx_on_grid;
+                    auto n_global = n_local + n_block_data_idx_on_grid;

-                auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-                                          n_global; // unique element global 1d id
+                    auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
+                                            n_global; // unique element global 1d id

-                auto global_elem_id =
-                    (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
+                    auto global_elem_id =
+                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;

-                // P_dropped
-                blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
-                                                               decltype(position_offset),
-                                                               true>(
-                    s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
+                    // P_dropped
+                    blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
+                                                                decltype(position_offset),
+                                                                true>(
+                        s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
+                }
            }
-
            block_sync_lds(); // wait for gemm1 LDS read

            // dS = P * (dP - Y_dot_dY)

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
@@ -1154,6 +1154,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
    }

    template <bool HasMainKBlockLoop,
+              bool IsDropout,
              typename Block2CTileMap,
              typename C0MatrixMask,
              typename YGradGridDesc_M0_O_M1>
@@ -1863,55 +1864,56 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2

            constexpr auto position_offset = M3 * M4;
            // save z to global
-            if(p_z_grid)
-            {
+            if constexpr(IsDropout){
+                if(p_z_grid)
+                {

-                auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
-                auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
-                auto m_global = m_local + m_block_data_idx_on_grid;
-                auto n_global = n_local + n_block_data_idx_on_grid;
-
-                auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-                                          n_global; // unique element global 1d id
-
-                auto global_elem_id =
-                    (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
-
-                blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
-                                                                    decltype(z_tenor_buffer),
-                                                                    decltype(position_offset),
-                                                                    true>(
-                    s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
-
-                z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                 make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                                                 z_tenor_buffer,
-                                                 z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                 z_grid_buf);
-            }
-            else
-            {
-                ignore = z_grid_buf;
-
-                auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
-                auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
-                auto m_global = m_local + m_block_data_idx_on_grid;
-                auto n_global = n_local + n_block_data_idx_on_grid;
-
-                auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-                                          n_global; // unique element global 1d id
-
-                auto global_elem_id =
-                    (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
-                // P_dropped
-                blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
-                                                               decltype(position_offset),
-                                                               true>(
-                    s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
+                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
+                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                    auto m_global = m_local + m_block_data_idx_on_grid;
+                    auto n_global = n_local + n_block_data_idx_on_grid;
+
+                    auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
+                                            n_global; // unique element global 1d id
+
+                    auto global_elem_id =
+                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
+
+                    blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
+                                                                        decltype(z_tenor_buffer),
+                                                                        decltype(position_offset),
+                                                                        true>(
+                        s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
+
+                    z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                                                    make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
+                                                    z_tenor_buffer,
+                                                    z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                                                    z_grid_buf);
+                }
+                else
+                {
+                    ignore = z_grid_buf;
+
+                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
+                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                    auto m_global = m_local + m_block_data_idx_on_grid;
+                    auto n_global = n_local + n_block_data_idx_on_grid;
+
+                    auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
+                                            n_global; // unique element global 1d id
+
+                    auto global_elem_id =
+                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
+                    // P_dropped
+                    blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
+                                                                decltype(position_offset),
+                                                                true>(
+                        s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
+                }
            }
-
            block_sync_lds(); // wait for gemm1 LDS read

            // gemm dV