add from botton right mask

92b9b046 · ltqin · 41c659bb · 92b9b046 · 92b9b046 · 92b9b046
Commit 92b9b046 authored Jul 24, 2023 by ltqin
10 changed files
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_v2.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_v2.cpp
@@ -24,7 +24,7 @@ Kernel outputs:
 */
 #define PRINT_HOST 0
-#define USING_MASK 0
+#define USING_MASK 1
 #define DIM 128 // DIM should be a multiple of 8.
 #include <iostream>
@@ -85,7 +85,7 @@ static constexpr ck::index_t CShuffleBlockTransferScalarPerVector_NPerBlock = 8;
 static constexpr auto GemmSpec = ck::tensor_operation::device::GemmSpecialization::MNKOPadding;
 #if USING_MASK
 static constexpr auto MaskingSpec =
-    ck::tensor_operation::device::MaskingSpecialization::MaskOutUpperTriangle;
+    ck::tensor_operation::device::MaskingSpecialization::MaskUpperTringleFromBottonRight;
 #else
 static constexpr auto MaskingSpec =
    ck::tensor_operation::device::MaskingSpecialization::MaskDisabled;
@@ -227,8 +227,9 @@ void run_attention_fwd_host(const TensorQ& q_g_m_k,
    ref_gemm0_invoker.Run(ref_gemm0_argument);
    // masking
+    auto M          = s_g_m_n.GetLengths()[1];
    auto N          = s_g_m_n.GetLengths()[2];
-    const auto mask = DeviceGemmInstance::C0MatrixMask(N);
+    const auto mask = DeviceGemmInstance::C0MatrixMask(M, N);
    s_g_m_n.ForEach([&](auto& self, auto idx) {
        if(mask.IsMaskedElement(idx[1], idx[2]))
            self(idx) = -ck::NumericLimits<float>::Infinity();
@@ -267,7 +268,7 @@ int run(int argc, char* argv[])
    // y_g_m_o = Softmax(alpha * Q_g_m_k * K_g_k_n) * V_g_n_o
    // y_g0_g1_m_o = reshape(y_g_m_o, [G0, G1, M, O])
    // y_g0_m_g1_o = permute(y_g0_g1_m_o, [0, 2, 1, 3])
-    ck::index_t M  = 512;
+    ck::index_t M  = 123;
    ck::index_t N  = 512;
    ck::index_t K  = DIM;
    ck::index_t O  = DIM;

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
@@ -156,36 +156,37 @@ __global__ void
    }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset,
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
-                                                        p_b_grid + b_batch_offset,
+            p_a_grid + a_batch_offset,
-                                                        z_matrix_ptr,
+            p_b_grid + b_batch_offset,
-                                                        p_b1_grid + b1_batch_offset,
+            z_matrix_ptr,
-                                                        p_c_grid + c_batch_offset,
+            p_b1_grid + b1_batch_offset,
-                                                        p_lse_grid + lse_batch_offset,
+            p_c_grid + c_batch_offset,
-                                                        p_ygrad_grid + c_batch_offset,
+            p_lse_grid + lse_batch_offset,
-                                                        p_qgrad_grid + a_batch_offset,
+            p_ygrad_grid + c_batch_offset,
-                                                        p_kgrad_grid + b_batch_offset,
+            p_qgrad_grid + a_batch_offset,
-                                                        p_vgrad_grid + b1_batch_offset,
+            p_kgrad_grid + b_batch_offset,
-                                                        p_shared,
+            p_vgrad_grid + b1_batch_offset,
-                                                        a_element_op,
+            p_shared,
-                                                        b_element_op,
+            a_element_op,
-                                                        acc_element_op,
+            b_element_op,
-                                                        b1_element_op,
+            acc_element_op,
-                                                        c_element_op,
+            b1_element_op,
-                                                        a_grid_desc_ak0_m_ak1,
+            c_element_op,
-                                                        b_grid_desc_bk0_n_bk1,
+            a_grid_desc_ak0_m_ak1,
-                                                        c_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
+            b_grid_desc_bk0_n_bk1,
-                                                        b1_grid_desc_bk0_n_bk1,
+            c_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
-                                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+            b1_grid_desc_bk0_n_bk1,
-                                                        lse_grid_desc_m,
+            c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                        ygrad_grid_desc_o0_m_o1,
+            lse_grid_desc_m,
-                                                        block_2_ctile_map,
+            ygrad_grid_desc_o0_m_o1,
-                                                        c0_matrix_mask,
+            block_2_ctile_map,
-                                                        p_drop,
+            c0_matrix_mask,
-                                                        ph,
+            p_drop,
-                                                        z_random_matrix_offset,
+            ph,
-                                                        raw_n_padded,
+            z_random_matrix_offset,
-                                                        0);
+            raw_n_padded,
+            0);
    }
 #else
    ignore = p_a_grid;
@@ -1000,10 +1001,15 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                    arg.m_raw_padded_,
                    arg.n_raw_padded_);
            };
-            if(arg.p_drop_ > 0.0){
+            if(arg.p_drop_ > 0.0)
-                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+            {
-            }else{
+                ave_time = launch_kernel(integral_constant<bool, false>{},
-                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
+                                         integral_constant<bool, true>{});
+            }
+            else
+            {
+                ave_time = launch_kernel(integral_constant<bool, false>{},
+                                         integral_constant<bool, false>{});
            }
            return ave_time;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp
@@ -155,36 +155,37 @@ __global__ void
    }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset,
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
-                                                      p_b_grid + b_batch_offset,
+            p_a_grid + a_batch_offset,
-                                                      z_matrix_ptr,
+            p_b_grid + b_batch_offset,
-                                                      p_b1_grid + b1_batch_offset,
+            z_matrix_ptr,
-                                                      p_c_grid + c_batch_offset,
+            p_b1_grid + b1_batch_offset,
-                                                      p_lse_grid + lse_batch_offset,
+            p_c_grid + c_batch_offset,
-                                                      p_ygrad_grid + c_batch_offset,
+            p_lse_grid + lse_batch_offset,
-                                                      p_qgrad_grid + a_batch_offset,
+            p_ygrad_grid + c_batch_offset,
-                                                      p_kgrad_grid + b_batch_offset,
+            p_qgrad_grid + a_batch_offset,
-                                                      p_vgrad_grid + b1_batch_offset,
+            p_kgrad_grid + b_batch_offset,
-                                                      p_shared,
+            p_vgrad_grid + b1_batch_offset,
-                                                      a_element_op,
+            p_shared,
-                                                      b_element_op,
+            a_element_op,
-                                                      acc_element_op,
+            b_element_op,
-                                                      b1_element_op,
+            acc_element_op,
-                                                      c_element_op,
+            b1_element_op,
-                                                      a_grid_desc_ak0_m_ak1,
+            c_element_op,
-                                                      b_grid_desc_bk0_n_bk1,
+            a_grid_desc_ak0_m_ak1,
-                                                      c_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+            b_grid_desc_bk0_n_bk1,
-                                                      b1_grid_desc_bk0_n_bk1,
+            c_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
+            b1_grid_desc_bk0_n_bk1,
-                                                      lse_grid_desc_m,
+            c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                      ygrad_grid_desc_m0_o_m1,
+            lse_grid_desc_m,
-                                                      block_2_ctile_map,
+            ygrad_grid_desc_m0_o_m1,
-                                                      c0_matrix_mask,
+            block_2_ctile_map,
-                                                      p_drop,
+            c0_matrix_mask,
-                                                      ph,
+            p_drop,
-                                                      z_random_matrix_offset,
+            ph,
-                                                      raw_n_padded,
+            z_random_matrix_offset,
-                                                      0);
+            raw_n_padded,
+            0);
    }
 #else
    ignore = p_a_grid;
@@ -574,6 +575,10 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
        {
            return MaskOutUpperTrianglePredicate{};
        }
+        else if constexpr(MaskingSpec == MaskingSpecialization::MaskUpperTringleFromBottonRight)
+        {
+            return MaskUpperTringleFromBottonRightPredicate{};
+        }
    }
    using C0MatrixMask = C0MatrixMask_impl<decltype(make_MaskOutPredicate())>;
@@ -786,7 +791,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
              acc_element_op_{acc_element_op},
              b1_element_op_{b1_element_op},
              c_element_op_{c_element_op},
-              c0_matrix_mask_{b_grid_desc_g_n_k_.GetLength(I1)},
+              c0_matrix_mask_{a_grid_desc_g_m_k_.GetLength(I1), b_grid_desc_g_n_k_.GetLength(I1)},
              raw_lengths_mz_nz_kz_gemm1nz_{a_gs_ms_ks_lengths[NumDimG + NumDimM - 1],
                                            b_gs_ns_ks_lengths[NumDimG + NumDimN - 1],
                                            b_gs_ns_ks_lengths[NumDimG + NumDimN + NumDimK - 1],
@@ -1024,16 +1029,20 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
            {
                if(arg.p_drop_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, false>{});
            }
            else
            {
                if(arg.p_drop_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, false>{});
            }
            return ave_time;

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp
@@ -999,16 +999,20 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
            if(all_has_main_k_block_loop)
            {
                if(arg.p_dropout_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, false>{});
            }
            else if(!some_has_main_k_block_loop)
            {
                if(arg.p_dropout_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, false>{});
            }
            else
            {

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp
@@ -1006,16 +1006,20 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
            if(all_has_main_k_block_loop)
            {
                if(arg.p_dropout_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, false>{});
            }
            else if(!some_has_main_k_block_loop)
            {
                if(arg.p_dropout_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, false>{});
            }
            else
            {

--- a/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
+++ b/include/ck/tensor_operation/gpu/device/masking_specialization.hpp
@@ -10,7 +10,8 @@ namespace device {
 enum struct MaskingSpecialization
 {
    MaskDisabled,
-    MaskOutUpperTriangle
+    MaskOutUpperTriangle,
+    MaskUpperTringleFromBottonRight
 };
 inline std::string getMaskingSpecializationString(const MaskingSpecialization& s)
@@ -19,6 +20,8 @@ inline std::string getMaskingSpecializationString(const MaskingSpecialization& s
    {
    case MaskingSpecialization::MaskDisabled: return "MaskDisabled";
    case MaskingSpecialization::MaskOutUpperTriangle: return "MaskOutUpperTriangle";
+    case MaskingSpecialization::MaskUpperTringleFromBottonRight:
+        return "MaskUpperTringleFromBottonRight";
    default: return "Unrecognized specialization!";
    }
 }
@@ -47,13 +50,37 @@ struct MaskOutUpperTrianglePredicate
        return operator()(m + m_tile - 1, n);
    }
 };
+struct MaskUpperTringleFromBottonRightPredicate
+{
+    __host__ __device__ void SetOffset(const index_t offset) { offset_ = offset; }
+    __host__ __device__ constexpr bool operator()(index_t m, index_t n) const
+    {
+        return n > m + offset_;
+    }
+    __host__ __device__ constexpr bool
+    IsTileSkippable(index_t m, index_t n, index_t m_tile, index_t /*n_tile*/) const
+    {
+        return operator()(m + m_tile - 1, n);
+    }
+    private:
+    index_t offset_;
+};
 // to track the points which need to be set to -inf on C0
 // Note: no need to reset M padding value, because they will not be stored out.
 template <typename MaskOutPredicate>
 struct C0MatrixMask_impl
 {
-    C0MatrixMask_impl(index_t NRaw) : NRaw_(NRaw), predicate_(MaskOutPredicate{}) {}
+    C0MatrixMask_impl(index_t MRaw, index_t NRaw) : NRaw_(NRaw), predicate_(MaskOutPredicate{})
+    {
+        if constexpr(std::is_same<MaskOutPredicate,
+                                  MaskUpperTringleFromBottonRightPredicate>::value)
+        {
+            predicate_.SetOffset(NRaw - MRaw);
+        }
+    }
    __host__ __device__ constexpr bool IsNOutOfBound(/*index_t m, */ index_t n) const
    {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_kloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_kloop_v1.hpp
@@ -1935,8 +1935,8 @@ struct GridwiseBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V1
                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
                    auto n_local =
                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
-                    auto m_global    = m_local + m_block_data_idx_on_grid;
+                    auto m_global = m_local + m_block_data_idx_on_grid;
-                    auto n_global    = n_local + n_block_data_idx_on_grid;
+                    auto n_global = n_local + n_block_data_idx_on_grid;
                    if(c0_matrix_mask.IsMaskedElement(m_global, n_global))
                    {
                        s_slash_p_thread_buf(i) = -ck::NumericLimits<float>::Infinity();

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
@@ -1948,54 +1948,61 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
            constexpr auto position_offset = M3 * M4;
            // save z to global
-            if constexpr(IsDropout){
+            if constexpr(IsDropout)
+            {
                if(p_z_grid)
                {
                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto m_local =
-                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
                    auto m_global = m_local + m_block_data_idx_on_grid;
                    auto n_global = n_local + n_block_data_idx_on_grid;
                    auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-                                            n_global; // unique element global 1d id
+                                              n_global; // unique element global 1d id
                    auto global_elem_id =
                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
-                    blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
+                    blockwise_dropout
-                                                                        decltype(z_tenor_buffer),
+                        .template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
-                                                                        decltype(position_offset),
+                                                           decltype(z_tenor_buffer),
-                                                                        true>(
+                                                           decltype(position_offset),
-                        s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
+                                                           true>(
+                            s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
-                    z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                    make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
+                    z_thread_copy_vgpr_to_global.Run(
-                                                    z_tenor_buffer,
+                        z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                    z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                        make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                                                    z_grid_buf);
+                        z_tenor_buffer,
+                        z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                        z_grid_buf);
                }
                else
                {
                    ignore = z_grid_buf;
                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto m_local =
-                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
                    auto m_global = m_local + m_block_data_idx_on_grid;
                    auto n_global = n_local + n_block_data_idx_on_grid;
                    auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-                                            n_global; // unique element global 1d id
+                                              n_global; // unique element global 1d id
                    auto global_elem_id =
                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
                    // P_dropped
                    blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
-                                                                decltype(position_offset),
+                                                                   decltype(position_offset),
-                                                                true>(
+                                                                   true>(
                        s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
                }
            }

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
@@ -1864,53 +1864,60 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
            constexpr auto position_offset = M3 * M4;
            // save z to global
-            if constexpr(IsDropout){
+            if constexpr(IsDropout)
+            {
                if(p_z_grid)
                {
                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto m_local =
-                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
                    auto m_global = m_local + m_block_data_idx_on_grid;
                    auto n_global = n_local + n_block_data_idx_on_grid;
                    auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-                                            n_global; // unique element global 1d id
+                                              n_global; // unique element global 1d id
                    auto global_elem_id =
                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
-                    blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
+                    blockwise_dropout
-                                                                        decltype(z_tenor_buffer),
+                        .template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
-                                                                        decltype(position_offset),
+                                                           decltype(z_tenor_buffer),
-                                                                        true>(
+                                                           decltype(position_offset),
-                        s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
+                                                           true>(
+                            s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
-                    z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                    make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
+                    z_thread_copy_vgpr_to_global.Run(
-                                                    z_tenor_buffer,
+                        z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                    z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                        make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                                                    z_grid_buf);
+                        z_tenor_buffer,
+                        z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                        z_grid_buf);
                }
                else
                {
                    ignore = z_grid_buf;
                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto m_local =
-                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
                    auto m_global = m_local + m_block_data_idx_on_grid;
                    auto n_global = n_local + n_block_data_idx_on_grid;
                    auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-                                            n_global; // unique element global 1d id
+                                              n_global; // unique element global 1d id
                    auto global_elem_id =
                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
                    // P_dropped
                    blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
-                                                                decltype(position_offset),
+                                                                   decltype(position_offset),
-                                                                true>(
+                                                                   true>(
                        s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
                }
            }

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_fwd_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_fwd_xdl_cshuffle_v1.hpp
@@ -917,7 +917,7 @@ struct GridwiseBatchedMultiheadAttentionForward_Xdl_CShuffle_V1
        {
            block_sync_lds();
        }
        do
        {
            auto n_block_data_idx_on_grid =