Commit ec2ad713 authored by letaoqin

Merge branch 'mha-train-develop' into mha-train-bias-bwd-type2

parents e3eb4381 e296ee56
@@ -1533,8 +1533,8 @@ struct GridwiseBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V1
 unsigned short,
 z_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5.GetElementSpaceSize(),
 true>
-z_tenor_buffer;
-z_tenor_buffer.Clear();
+z_tensor_buffer;
+z_tensor_buffer.Clear();
 // z matrix global desc
 /*const auto M = q_grid_desc_k0_m_k1.GetLength(I1);
 const auto N = k_grid_desc_k0_n_k1.GetLength(I1);
@@ -1966,16 +1966,16 @@ struct GridwiseBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V1
 // P_dropped
 static_for<0, n0, 1>{}([&](auto i) {
 blockwise_dropout.template ApplyDropout<decltype(s_slash_p_thread_buf),
-decltype(z_tenor_buffer),
+decltype(z_tensor_buffer),
 true,
 decltype(n0),
 decltype(i)>(
-s_slash_p_thread_buf, ph, z_tenor_buffer);
+s_slash_p_thread_buf, ph, z_tensor_buffer);
 z_thread_copy_vgpr_to_global.Run(
 z_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
 make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-z_tenor_buffer,
+z_tensor_buffer,
 z_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
 z_grid_buf);
 z_thread_copy_vgpr_to_global.MoveDstSliceWindow(

@@ -1473,8 +1473,8 @@ struct GridwiseBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V2
 unsigned short,
 z_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5.GetElementSpaceSize(),
 true>
-z_tenor_buffer;
-z_tenor_buffer.Clear();
+z_tensor_buffer;
+z_tensor_buffer.Clear();
 // z matrix global desc
 /*const auto M = q_grid_desc_k0_m_k1.GetLength(I1);
 const auto N = k_grid_desc_k0_n_k1.GetLength(I1);
@@ -1865,16 +1865,16 @@ struct GridwiseBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V2
 // P_dropped
 static_for<0, n0, 1>{}([&](auto i) {
 blockwise_dropout.template ApplyDropout<decltype(s_slash_p_thread_buf),
-decltype(z_tenor_buffer),
+decltype(z_tensor_buffer),
 true,
 decltype(n0),
 decltype(i)>(
-s_slash_p_thread_buf, ph, z_tenor_buffer);
+s_slash_p_thread_buf, ph, z_tensor_buffer);
 z_thread_copy_vgpr_to_global.Run(
 z_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
 make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-z_tenor_buffer,
+z_tensor_buffer,
 z_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
 z_grid_buf);
 z_thread_copy_vgpr_to_global.MoveDstSliceWindow(

@@ -110,6 +110,11 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V1
 static constexpr auto Gemm0MWaves = MPerBlock / (MPerXdl * MXdlPerWave);
 static constexpr auto Gemm0NWaves = NPerBlock / (NPerXdl * NXdlPerWave);
+static constexpr auto mfma = MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma;
+static constexpr auto DropoutNThread = mfma.num_input_blks; // 2
+// get_random_8x16() generates 8 random numbers each time
+static constexpr auto DropoutTile = Number<DropoutNThread * 8>{}; // 16
 using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 // C desc for source in blockwise copy
@@ -119,7 +124,6 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V1
 const auto M = z_grid_desc_m_n.GetLength(I0);
 const auto N = z_grid_desc_m_n.GetLength(I1);
-constexpr auto mfma = MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma;
 constexpr auto M3 = mfma.num_groups_per_blk;
 constexpr auto M4 = mfma.num_input_blks;
 constexpr auto M5 = mfma.group_size;
@@ -136,9 +140,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V1
 __host__ __device__ static constexpr auto GetPaddedSize(const index_t size)
 {
-constexpr auto mfma = MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma;
-constexpr auto group_size = mfma.group_size;
-return math::integer_divide_ceil(size, group_size) * group_size;
+return math::integer_divide_ceil(size, DropoutTile) * DropoutTile;
 }
 __device__ static auto GetGemm0WaveIdx()
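The new class-scope constants above tie the dropout layout to the MFMA geometry: DropoutNThread comes from mfma.num_input_blks (2 for the selected instruction, per the inline comment) and get_random_8x16() yields 8 random values per call, so the Z matrix is handled in DropoutTile = 16-wide tiles and GetPaddedSize now rounds the raw sequence length up to that tile instead of to group_size. A minimal host-side sketch of the padding rule, with the constants hard-coded to the values noted in the diff comments (the real kernel reads them from MfmaSelector at compile time):

#include <cstdint>
#include <cstdio>

// Stand-ins for the kernel's compile-time constants (assumed values taken
// from the "// 2" and "// 16" comments in the diff, not queried from MFMA).
constexpr int64_t DropoutNThread = 2;                  // mfma.num_input_blks
constexpr int64_t DropoutTile    = DropoutNThread * 8; // 16

// Mirrors GetPaddedSize(): round a raw length up to a whole number of
// dropout tiles so every tile consumes a full block of random numbers.
constexpr int64_t GetPaddedSize(int64_t size)
{
    return (size + DropoutTile - 1) / DropoutTile * DropoutTile;
}

static_assert(GetPaddedSize(1) == 16, "partial tile pads up");
static_assert(GetPaddedSize(16) == 16, "already aligned");
static_assert(GetPaddedSize(17) == 32, "spills into a second tile");

int main()
{
    std::printf("GetPaddedSize(100) = %lld\n",
                static_cast<long long>(GetPaddedSize(100))); // 112
    return 0;
}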
@@ -542,9 +544,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V1
 BBlockDesc_BK0_N_BK1{});
 }
-static constexpr index_t KPack =
-math::max(math::lcm(AK1, BK1),
-MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+static constexpr index_t KPack = math::max(math::lcm(AK1, BK1), mfma.k_per_blk);
 // Blockwise gemm with transposed XDL output
 using BlockwiseGemm = BlockwiseGemmXdlops_v2<
@@ -646,8 +646,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V1
 // with 'group_size' amount of contiguous elements. Having Gemm1KPack greater than A1K1 will
 // cause mismatch in summation index for example c[0:7] = a1[[0:3, 8:11]] * b1[0:7].
 // therefore we may just as well assign Gemm1KPack = group_size
-static constexpr index_t GemmKPack =
-MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma.group_size;
+static constexpr index_t GemmKPack = mfma.group_size;
 static constexpr index_t GemmMWave = Gemm0NWaves; // 4 // 4
 static constexpr index_t GemmNWave = Gemm0MWaves; // 1 // 1
@@ -770,8 +769,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V1
 static constexpr index_t GemmNRepeat = Gemm2NXdlPerWave; // 1 // 1
 static constexpr index_t GemmMRepeat = Gemm2_M / GemmMWave / MPerXdl; // 1 // 1
 static constexpr index_t GemmKLoop = Gemm2_K / Sum_K; // 2 // 2
-static constexpr index_t GemmKPack =
-math::max(A_K1, MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+static constexpr index_t GemmKPack = math::max(A_K1, mfma.k_per_blk);
 static constexpr index_t B_K3 = GemmKPack; // 8
 static constexpr index_t B_K2 =
 XdlopsGemm<GemmDataType, MPerXdl, NPerXdl, GemmKPack, false>{}.K0PerXdlops; // 2
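The KPack/GemmKPack edits in these hunks are refactors rather than behavior changes: each use site now reads the class-scope mfma constant instead of re-instantiating MfmaSelector, and the computed values stay the same. As a quick sanity check of the formula, a standalone snippet with illustrative numbers (AK1 = BK1 = 8 and k_per_blk = 4 are assumptions chosen for the example, not values taken from this kernel configuration):

#include <algorithm> // std::max
#include <cstdio>
#include <numeric>   // std::lcm

int main()
{
    // Assumed illustrative parameters.
    constexpr int AK1       = 8;
    constexpr int BK1       = 8;
    constexpr int k_per_blk = 4; // stands in for mfma.k_per_blk

    // Same shape as: KPack = math::max(math::lcm(AK1, BK1), mfma.k_per_blk);
    constexpr int KPack = std::max(std::lcm(AK1, BK1), k_per_blk);

    std::printf("KPack = %d\n", KPack); // lcm(8, 8) = 8 > 4, so KPack = 8
    return 0;
}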
@@ -1570,8 +1568,8 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V1
 ushort,
 z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3.GetElementSpaceSize(),
 true>
-z_tenor_buffer;
-z_tenor_buffer.Clear();
+z_tensor_buffer;
+z_tensor_buffer.Clear();
 auto z_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
 p_z_grid, z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3.GetElementSpaceSize());
@@ -1759,7 +1757,6 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V1
 // scaling is already performed in the preceding statements with s_element_op
 blockwise_softmax.RunWithPreCalcStats(s_slash_p_thread_buf, lse_thread_buf);
-constexpr auto position_offset = M3 * M4;
 // save z to global
 if constexpr(IsDropout)
 {
@@ -1774,23 +1771,27 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V1
 auto m_global = m_local + m_block_data_idx_on_grid;
 auto n_global = n_local + n_block_data_idx_on_grid;
-auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-n_global; // unique element global 1d id
-auto global_elem_id =
-(global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
+auto global_tile_id = z_random_matrix_offset +
+(m_global / DropoutTile) * DropoutTile * raw_n_padded +
+(n_global / DropoutTile) * DropoutTile;
+auto global_elem_id = global_tile_id + (wave_m_n_id[I0] * M4) +
+(n_global % DropoutTile) * raw_n_padded;
 blockwise_dropout
 .template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
-decltype(z_tenor_buffer),
-decltype(position_offset),
-true>(
-s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
+decltype(z_tensor_buffer),
+decltype(DropoutTile),
+true>(s_slash_p_thread_buf,
+ph,
+global_elem_id,
+z_tensor_buffer,
+raw_n_padded);
 z_thread_copy_vgpr_to_global.Run(
 z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
 make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-z_tenor_buffer,
+z_tensor_buffer,
 z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
 z_grid_buf);
 }
@@ -1806,15 +1807,16 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V1
 auto m_global = m_local + m_block_data_idx_on_grid;
 auto n_global = n_local + n_block_data_idx_on_grid;
-auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-n_global; // unique element global 1d id
-auto global_elem_id =
-(global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
+auto global_tile_id = z_random_matrix_offset +
+(m_global / DropoutTile) * DropoutTile * raw_n_padded +
+(n_global / DropoutTile) * DropoutTile;
+auto global_elem_id = global_tile_id + (wave_m_n_id[I0] * M4) +
+(n_global % DropoutTile) * raw_n_padded;
 // P_dropped
 blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
-decltype(position_offset),
+decltype(DropoutTile),
 true>(
 s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
 }
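The substantive change in the two hunks above is how each thread derives its random-number offset. Previously the offset was a per-element linear id (m_global * raw_n_padded + n_global) reshuffled with % M4 and / M4; now the thread first snaps its (m, n) coordinate to the top-left corner of the enclosing DropoutTile x DropoutTile tile (global_tile_id) and then adds its row position within that tile, so the random values consumed for one tile come from the same aligned block regardless of which thread touches it. A host-side sketch of just this index arithmetic, with assumed geometry (wave_row stands in for wave_m_n_id[I0], and M4 = mfma.num_input_blks = 2 as in the comments):

#include <cstdint>
#include <cstdio>

int main()
{
    // Assumed illustrative geometry; the kernel takes these from the MFMA
    // selector and the launch configuration.
    constexpr int64_t DropoutTile = 16; // DropoutNThread * 8
    constexpr int64_t M4          = 2;  // mfma.num_input_blks
    const int64_t raw_n_padded           = 128;
    const int64_t z_random_matrix_offset = 0;

    const int64_t m_global = 37; // element row handled by this thread
    const int64_t n_global = 21; // element column handled by this thread
    const int64_t wave_row = 1;  // stand-in for wave_m_n_id[I0]

    // Top-left corner of the DropoutTile x DropoutTile tile containing (m, n).
    const int64_t global_tile_id = z_random_matrix_offset +
                                   (m_global / DropoutTile) * DropoutTile * raw_n_padded +
                                   (n_global / DropoutTile) * DropoutTile;

    // Offset of this thread's rows inside the tile, matching the new code:
    // global_elem_id = global_tile_id + wave_m_n_id[I0] * M4
    //                + (n_global % DropoutTile) * raw_n_padded;
    const int64_t global_elem_id = global_tile_id + wave_row * M4 +
                                   (n_global % DropoutTile) * raw_n_padded;

    std::printf("tile id = %lld, elem id = %lld\n",
                static_cast<long long>(global_tile_id),
                static_cast<long long>(global_elem_id));
    return 0;
}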
@@ -121,6 +121,11 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V2
 static constexpr auto B1K0 = Number<Gemm1KPerBlock / B1K1Value>{};
 static constexpr auto B1K1 = Number<B1K1Value>{};
+static constexpr auto mfma = MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma;
+static constexpr auto DropoutNThread = mfma.num_input_blks; // 2
+// get_random_8x16() generates 8 random numbers each time
+static constexpr auto DropoutTile = Number<DropoutNThread * 8>{}; // 16
 using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 using GridwiseGemmPipe = remove_cvref_t<decltype(
@@ -133,7 +138,6 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V2
 const auto M = z_grid_desc_m_n.GetLength(I0);
 const auto N = z_grid_desc_m_n.GetLength(I1);
-constexpr auto mfma = MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma;
 constexpr auto M3 = mfma.num_groups_per_blk;
 constexpr auto M4 = mfma.num_input_blks;
 constexpr auto M5 = mfma.group_size;
@@ -150,9 +154,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V2
 __host__ __device__ static constexpr auto GetPaddedSize(const index_t size)
 {
-constexpr auto mfma = MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma;
-constexpr auto group_size = mfma.group_size;
-return math::integer_divide_ceil(size, group_size) * group_size;
+return math::integer_divide_ceil(size, DropoutTile) * DropoutTile;
 }
 __device__ static auto GetGemm0WaveIdx()
@@ -522,9 +524,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V2
 true, // DstResetCoord
 NumGemmKPrefetchStage>;
-static constexpr index_t KPack =
-math::max(math::lcm(AK1, BK1),
-MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+static constexpr index_t KPack = math::max(math::lcm(AK1, BK1), mfma.k_per_blk);
 // Blockwise gemm with transposed XDL output
 using BlockwiseGemm = BlockwiseGemmXdlops_v2<
@@ -657,8 +657,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V2
 // with 'group_size' amount of contiguous elements. Having Gemm1KPack greater than A1K1 will
 // cause mismatch in summation index for example c[0:7] = a1[[0:3, 8:11]] * b1[0:7].
 // therefore we may just as well assign Gemm1KPack = group_size
-static constexpr index_t GemmKPack =
-MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma.group_size;
+static constexpr index_t GemmKPack = mfma.group_size;
 using BlockwiseGemm = BlockwiseGemmXdlops_v2<
 BlockSize,
@@ -709,9 +708,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V2
 static constexpr index_t GemmMWave = BlockSize / get_warp_size() / GemmNWave;
 static constexpr index_t GemmNRepeat = Gemm2NXdlPerWave;
 static constexpr index_t GemmMRepeat = Gemm2_M / GemmMWave / MPerXdl;
-static constexpr index_t GemmKPack =
-math::max(math::lcm(A_K1, B_K1),
-MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+static constexpr index_t GemmKPack = math::max(math::lcm(A_K1, B_K1), mfma.k_per_blk);
 using BBlockSliceLengths = Sequence<B_K0, Gemm2_N, B_K1>;
 using BThreadClusterLengths =
@@ -1554,8 +1551,8 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V2
 ushort,
 z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3.GetElementSpaceSize(),
 true>
-z_tenor_buffer;
-z_tenor_buffer.Clear();
+z_tensor_buffer;
+z_tensor_buffer.Clear();
 auto z_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
 p_z_grid, z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3.GetElementSpaceSize());
@@ -1722,7 +1719,6 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V2
 // scaling is already performed in the preceding statements with s_element_op
 blockwise_softmax.RunWithPreCalcStats(s_slash_p_thread_buf, lse_thread_buf);
-constexpr auto position_offset = M3 * M4;
 // save z to global
 if constexpr(IsDropout)
 {
@@ -1737,23 +1733,27 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V2
 auto m_global = m_local + m_block_data_idx_on_grid;
 auto n_global = n_local + n_block_data_idx_on_grid;
-auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-n_global; // unique element global 1d id
-auto global_elem_id =
-(global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
+auto global_tile_id = z_random_matrix_offset +
+(m_global / DropoutTile) * DropoutTile * raw_n_padded +
+(n_global / DropoutTile) * DropoutTile;
+auto global_elem_id = global_tile_id + (wave_m_n_id[I0] * M4) +
+(n_global % DropoutTile) * raw_n_padded;
 blockwise_dropout
 .template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
-decltype(z_tenor_buffer),
-decltype(position_offset),
-true>(
-s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
+decltype(z_tensor_buffer),
+decltype(DropoutTile),
+true>(s_slash_p_thread_buf,
+ph,
+global_elem_id,
+z_tensor_buffer,
+raw_n_padded);
 z_thread_copy_vgpr_to_global.Run(
 z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
 make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-z_tenor_buffer,
+z_tensor_buffer,
 z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
 z_grid_buf);
 }
@@ -1769,14 +1769,16 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V2
 auto m_global = m_local + m_block_data_idx_on_grid;
 auto n_global = n_local + n_block_data_idx_on_grid;
-auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-n_global; // unique element global 1d id
-auto global_elem_id =
-(global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
+auto global_tile_id = z_random_matrix_offset +
+(m_global / DropoutTile) * DropoutTile * raw_n_padded +
+(n_global / DropoutTile) * DropoutTile;
+auto global_elem_id = global_tile_id + (wave_m_n_id[I0] * M4) +
+(n_global % DropoutTile) * raw_n_padded;
 // P_dropped
 blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
-decltype(position_offset),
+decltype(DropoutTile),
 true>(
 s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
 }

@@ -109,6 +109,11 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
 static constexpr auto Gemm0MWaves = MPerBlock / (MPerXdl * MXdlPerWave);
 static constexpr auto Gemm0NWaves = NPerBlock / (NPerXdl * NXdlPerWave);
+static constexpr auto mfma = MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma;
+static constexpr auto DropoutNThread = mfma.num_input_blks; // 2
+// get_random_8x16() generates 8 random numbers each time
+static constexpr auto DropoutTile = Number<DropoutNThread * 8>{}; // 16
 using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 // C desc for source in blockwise copy
@@ -118,7 +123,6 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
 const auto M = z_grid_desc_m_n.GetLength(I0);
 const auto N = z_grid_desc_m_n.GetLength(I1);
-constexpr auto mfma = MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma;
 constexpr auto M3 = mfma.num_groups_per_blk;
 constexpr auto M4 = mfma.num_input_blks;
 constexpr auto M5 = mfma.group_size;
@@ -135,9 +139,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
 __host__ __device__ static constexpr auto GetPaddedSize(const index_t size)
 {
-constexpr auto mfma = MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma;
-constexpr auto group_size = mfma.group_size;
-return math::integer_divide_ceil(size, group_size) * group_size;
+return math::integer_divide_ceil(size, DropoutTile) * DropoutTile;
 }
 __device__ static auto GetGemm0WaveIdx()
@@ -563,9 +565,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
 BBlockDesc_BK0_N_BK1{});
 }
-static constexpr index_t KPack =
-math::max(math::lcm(AK1, BK1),
-MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+static constexpr index_t KPack = math::max(math::lcm(AK1, BK1), mfma.k_per_blk);
 // Blockwise gemm with transposed XDL output
 using BlockwiseGemm = BlockwiseGemmXdlops_v2<
@@ -667,8 +667,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
 // with 'group_size' amount of contiguous elements. Having Gemm1KPack greater than A1K1 will
 // cause mismatch in summation index for example c[0:7] = a1[[0:3, 8:11]] * b1[0:7].
 // therefore we may just as well assign Gemm1KPack = group_size
-static constexpr index_t GemmKPack =
-MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma.group_size;
+static constexpr index_t GemmKPack = mfma.group_size;
 static constexpr index_t GemmMWave = Gemm0NWaves; // 4 // 4
 static constexpr index_t GemmNWave = Gemm0MWaves; // 1 // 1
@@ -791,8 +790,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
 static constexpr index_t GemmNRepeat = Gemm2NXdlPerWave; // 1 // 1
 static constexpr index_t GemmMRepeat = Gemm2_M / GemmMWave / MPerXdl; // 1 // 1
 static constexpr index_t GemmKLoop = Gemm2_K / Sum_K; // 2 // 2
-static constexpr index_t GemmKPack =
-math::max(A_K1, MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+static constexpr index_t GemmKPack = math::max(A_K1, mfma.k_per_blk);
 static constexpr index_t B_K3 = GemmKPack; // 8
 static constexpr index_t B_K2 =
 XdlopsGemm<GemmDataType, MPerXdl, NPerXdl, GemmKPack, false>{}.K0PerXdlops; // 2
@@ -1621,8 +1619,8 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
 ushort,
 z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3.GetElementSpaceSize(),
 true>
-z_tenor_buffer;
-z_tenor_buffer.Clear();
+z_tensor_buffer;
+z_tensor_buffer.Clear();
 auto z_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
 p_z_grid, z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3.GetElementSpaceSize());
@@ -1946,7 +1944,6 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
 // scaling is already performed in the preceding statements with s_element_op
 blockwise_softmax.RunWithPreCalcStats(s_slash_p_thread_buf, lse_thread_buf);
-constexpr auto position_offset = M3 * M4;
 // save z to global
 if constexpr(IsDropout)
 {
@@ -1961,23 +1958,27 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
 auto m_global = m_local + m_block_data_idx_on_grid;
 auto n_global = n_local + n_block_data_idx_on_grid;
-auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-n_global; // unique element global 1d id
-auto global_elem_id =
-(global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
+auto global_tile_id = z_random_matrix_offset +
+(m_global / DropoutTile) * DropoutTile * raw_n_padded +
+(n_global / DropoutTile) * DropoutTile;
+auto global_elem_id = global_tile_id + (wave_m_n_id[I0] * M4) +
+(n_global % DropoutTile) * raw_n_padded;
 blockwise_dropout
 .template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
-decltype(z_tenor_buffer),
-decltype(position_offset),
-true>(
-s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
+decltype(z_tensor_buffer),
+decltype(DropoutTile),
+true>(s_slash_p_thread_buf,
+ph,
+global_elem_id,
+z_tensor_buffer,
+raw_n_padded);
 z_thread_copy_vgpr_to_global.Run(
 z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
 make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-z_tenor_buffer,
+z_tensor_buffer,
 z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
 z_grid_buf);
 }
@@ -1993,15 +1994,16 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
 auto m_global = m_local + m_block_data_idx_on_grid;
 auto n_global = n_local + n_block_data_idx_on_grid;
-auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-n_global; // unique element global 1d id
-auto global_elem_id =
-(global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
+auto global_tile_id = z_random_matrix_offset +
+(m_global / DropoutTile) * DropoutTile * raw_n_padded +
+(n_global / DropoutTile) * DropoutTile;
+auto global_elem_id = global_tile_id + (wave_m_n_id[I0] * M4) +
+(n_global % DropoutTile) * raw_n_padded;
 // P_dropped
 blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
-decltype(position_offset),
+decltype(DropoutTile),
 true>(
 s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
 }

@@ -127,6 +127,11 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
 static constexpr auto D0M2 = Number<MPerXdl / D0M3.value>{};
 static constexpr auto D0M1 = Number<MPerBlock / MPerXdl>{};
+static constexpr auto mfma = MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma;
+static constexpr auto DropoutNThread = mfma.num_input_blks; // 2
+// get_random_8x16() generates 8 random numbers each time
+static constexpr auto DropoutTile = Number<DropoutNThread * 8>{}; // 16
 using ThisThreadBlock = ThisThreadBlock<BlockSize>;
 using GridwiseGemmPipe = remove_cvref_t<decltype(
@@ -139,7 +144,6 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
 const auto M = z_grid_desc_m_n.GetLength(I0);
 const auto N = z_grid_desc_m_n.GetLength(I1);
-constexpr auto mfma = MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma;
 constexpr auto M3 = mfma.num_groups_per_blk;
 constexpr auto M4 = mfma.num_input_blks;
 constexpr auto M5 = mfma.group_size;
@@ -156,9 +160,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
 __host__ __device__ static constexpr auto GetPaddedSize(const index_t size)
 {
-constexpr auto mfma = MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma;
-constexpr auto group_size = mfma.group_size;
-return math::integer_divide_ceil(size, group_size) * group_size;
+return math::integer_divide_ceil(size, DropoutTile) * DropoutTile;
 }
 __device__ static auto GetGemm0WaveIdx()
@@ -550,9 +552,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
 true, // DstResetCoord
 NumGemmKPrefetchStage>;
-static constexpr index_t KPack =
-math::max(math::lcm(AK1, BK1),
-MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+static constexpr index_t KPack = math::max(math::lcm(AK1, BK1), mfma.k_per_blk);
 // Blockwise gemm with transposed XDL output
 using BlockwiseGemm = BlockwiseGemmXdlops_v2<
@@ -685,8 +685,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
 // with 'group_size' amount of contiguous elements. Having Gemm1KPack greater than A1K1 will
 // cause mismatch in summation index for example c[0:7] = a1[[0:3, 8:11]] * b1[0:7].
 // therefore we may just as well assign Gemm1KPack = group_size
-static constexpr index_t GemmKPack =
-MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma.group_size;
+static constexpr index_t GemmKPack = mfma.group_size;
 using BlockwiseGemm = BlockwiseGemmXdlops_v2<
 BlockSize,
@@ -737,9 +736,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
 static constexpr index_t GemmMWave = BlockSize / get_warp_size() / GemmNWave;
 static constexpr index_t GemmNRepeat = Gemm2NXdlPerWave;
 static constexpr index_t GemmMRepeat = Gemm2_M / GemmMWave / MPerXdl;
-static constexpr index_t GemmKPack =
-math::max(math::lcm(A_K1, B_K1),
-MfmaSelector<GemmDataType, MPerXdl, NPerXdl>::selected_mfma.k_per_blk);
+static constexpr index_t GemmKPack = math::max(math::lcm(A_K1, B_K1), mfma.k_per_blk);
 using BBlockSliceLengths = Sequence<B_K0, Gemm2_N, B_K1>;
 using BThreadClusterLengths =
@@ -1666,8 +1663,8 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
 ushort,
 z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3.GetElementSpaceSize(),
 true>
-z_tenor_buffer;
-z_tenor_buffer.Clear();
+z_tensor_buffer;
+z_tensor_buffer.Clear();
 auto z_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
 p_z_grid, z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3.GetElementSpaceSize());
@@ -1948,7 +1945,6 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
 // scaling is already performed in the preceding statements with s_element_op
 blockwise_softmax.RunWithPreCalcStats(s_slash_p_thread_buf, lse_thread_buf);
-constexpr auto position_offset = M3 * M4;
 // save z to global
 if constexpr(IsDropout)
 {
@@ -1963,23 +1959,27 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
 auto m_global = m_local + m_block_data_idx_on_grid;
 auto n_global = n_local + n_block_data_idx_on_grid;
-auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-n_global; // unique element global 1d id
-auto global_elem_id =
-(global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
+auto global_tile_id = z_random_matrix_offset +
+(m_global / DropoutTile) * DropoutTile * raw_n_padded +
+(n_global / DropoutTile) * DropoutTile;
+auto global_elem_id = global_tile_id + (wave_m_n_id[I0] * M4) +
+(n_global % DropoutTile) * raw_n_padded;
 blockwise_dropout
 .template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
-decltype(z_tenor_buffer),
-decltype(position_offset),
-true>(
-s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
+decltype(z_tensor_buffer),
+decltype(DropoutTile),
+true>(s_slash_p_thread_buf,
+ph,
+global_elem_id,
+z_tensor_buffer,
+raw_n_padded);
 z_thread_copy_vgpr_to_global.Run(
 z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
 make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-z_tenor_buffer,
+z_tensor_buffer,
 z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
 z_grid_buf);
 }
@@ -1995,14 +1995,16 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
 auto m_global = m_local + m_block_data_idx_on_grid;
 auto n_global = n_local + n_block_data_idx_on_grid;
-auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-n_global; // unique element global 1d id
-auto global_elem_id =
-(global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
+auto global_tile_id = z_random_matrix_offset +
+(m_global / DropoutTile) * DropoutTile * raw_n_padded +
+(n_global / DropoutTile) * DropoutTile;
+auto global_elem_id = global_tile_id + (wave_m_n_id[I0] * M4) +
+(n_global % DropoutTile) * raw_n_padded;
 // P_dropped
 blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
-decltype(position_offset),
+decltype(DropoutTile),
 true>(
 s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
 }

@@ -873,8 +873,8 @@ struct GridwiseBatchedMultiheadAttentionForward_Xdl_CShuffle_V1
 unsigned short,
 z_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5.GetElementSpaceSize(),
 true>
-z_tenor_buffer;
-z_tenor_buffer.Clear();
+z_tensor_buffer;
+z_tensor_buffer.Clear();
 // z matrix global desc
 auto z_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
@@ -1022,16 +1022,16 @@ struct GridwiseBatchedMultiheadAttentionForward_Xdl_CShuffle_V1
 {
 static_for<0, n0, 1>{}([&](auto i) {
 blockwise_dropout.template ApplyDropout<decltype(acc_thread_buf),
-decltype(z_tenor_buffer),
+decltype(z_tensor_buffer),
 false,
 decltype(n0),
 decltype(i)>(
-acc_thread_buf, ph, z_tenor_buffer);
+acc_thread_buf, ph, z_tensor_buffer);
 z_thread_copy_vgpr_to_global.Run(
 z_thread_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
 make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-z_tenor_buffer,
+z_tensor_buffer,
 z_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
 z_grid_buf);
 z_thread_copy_vgpr_to_global.MoveDstSliceWindow(
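For context on the buffer being renamed throughout this commit: z_tensor_buffer is the per-thread StaticBuffer of unsigned short values that ApplyDropout fills from the Philox-style generator ph; the same values decide which attention scores are dropped and, when the Z grid is written, let the backward pass replay the same mask. A simplified host-side sketch of the per-element rule (a generic inverted-dropout formulation under assumed threshold semantics, not the exact BlockwiseDropout implementation):

#include <cstdint>
#include <cstdio>

// Generic inverted-dropout rule on a single score, given the 16-bit random
// value that would be stored in z_tensor_buffer for this element.
// p_dropout and the uint16 threshold mapping are assumptions for illustration.
float apply_dropout(float score, uint16_t z_value, float p_dropout)
{
    const uint16_t threshold = static_cast<uint16_t>(p_dropout * 65535.0f);
    const float    rescale   = 1.0f / (1.0f - p_dropout);
    // Drop the element when its random value falls below the threshold,
    // otherwise rescale so the row's expected value is preserved.
    return (z_value < threshold) ? 0.0f : score * rescale;
}

int main()
{
    std::printf("%f\n", apply_dropout(0.25f, 1000, 0.1f));  // dropped -> 0
    std::printf("%f\n", apply_dropout(0.25f, 60000, 0.1f)); // kept -> ~0.2778
    return 0;
}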