Rename d_thread_desc_mblock_mrepeat_mwave_mperxdl to d_thread_desc_mblock_m1 and reformat the MHA backward device/gridwise headers

52478ac3 · ltqin · 2aa9cbee · 52478ac3 · 52478ac3 · 52478ac3
Commit 52478ac3 authored Jul 21, 2023 by ltqin
8 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
@@ -156,7 +156,8 @@ __global__ void
    }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset,
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
+            p_a_grid + a_batch_offset,
            p_b_grid + b_batch_offset,
            z_matrix_ptr,
            p_b1_grid + b1_batch_offset,
@@ -1000,10 +1001,15 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                    arg.m_raw_padded_,
                    arg.n_raw_padded_);
            };
-            if(arg.p_drop_ > 0.0){
-                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
-            }else{
-                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
+            if(arg.p_drop_ > 0.0)
+            {
+                ave_time = launch_kernel(integral_constant<bool, false>{},
+                                         integral_constant<bool, true>{});
+            }
+            else
+            {
+                ave_time = launch_kernel(integral_constant<bool, false>{},
+                                         integral_constant<bool, false>{});
            }
            return ave_time;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp
@@ -155,7 +155,8 @@ __global__ void
    }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset,
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
+            p_a_grid + a_batch_offset,
            p_b_grid + b_batch_offset,
            z_matrix_ptr,
            p_b1_grid + b1_batch_offset,
@@ -1024,16 +1025,20 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
            {
                if(arg.p_drop_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, false>{});
            }
            else
            {
                if(arg.p_drop_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, false>{});
            }

            return ave_time;

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp
@@ -999,16 +999,20 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
            if(all_has_main_k_block_loop)
            {
                if(arg.p_dropout_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, false>{});
            }
            else if(!some_has_main_k_block_loop)
            {
                if(arg.p_dropout_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, false>{});
            }
            else
            {

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp
@@ -1006,16 +1006,20 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
            if(all_has_main_k_block_loop)
            {
                if(arg.p_dropout_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, false>{});
            }
            else if(!some_has_main_k_block_loop)
            {
                if(arg.p_dropout_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, false>{});
            }
            else
            {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_kloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_kloop_v1.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
@@ -1948,13 +1948,16 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1

            constexpr auto position_offset = M3 * M4;
            // save z to global
-            if constexpr(IsDropout){
+            if constexpr(IsDropout)
+            {
                if(p_z_grid)
                {

                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
-                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                    auto m_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
                    auto m_global = m_local + m_block_data_idx_on_grid;
                    auto n_global = n_local + n_block_data_idx_on_grid;

@@ -1964,13 +1967,15 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                    auto global_elem_id =
                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;

-                    blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
+                    blockwise_dropout
+                        .template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
                                                           decltype(z_tenor_buffer),
                                                           decltype(position_offset),
                                                           true>(
                            s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);

-                    z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                    z_thread_copy_vgpr_to_global.Run(
+                        z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
                        make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
                        z_tenor_buffer,
                        z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
@@ -1981,8 +1986,10 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                    ignore = z_grid_buf;

                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
-                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                    auto m_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
                    auto m_global = m_local + m_block_data_idx_on_grid;
                    auto n_global = n_local + n_block_data_idx_on_grid;


--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
@@ -1864,13 +1864,16 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2

            constexpr auto position_offset = M3 * M4;
            // save z to global
-            if constexpr(IsDropout){
+            if constexpr(IsDropout)
+            {
                if(p_z_grid)
                {

                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
-                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                    auto m_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
                    auto m_global = m_local + m_block_data_idx_on_grid;
                    auto n_global = n_local + n_block_data_idx_on_grid;

@@ -1880,13 +1883,15 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
                    auto global_elem_id =
                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;

-                    blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
+                    blockwise_dropout
+                        .template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
                                                           decltype(z_tenor_buffer),
                                                           decltype(position_offset),
                                                           true>(
                            s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);

-                    z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                    z_thread_copy_vgpr_to_global.Run(
+                        z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
                        make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
                        z_tenor_buffer,
                        z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
@@ -1897,8 +1902,10 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
                    ignore = z_grid_buf;

                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
-                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                    auto m_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
                    auto m_global = m_local + m_block_data_idx_on_grid;
                    auto n_global = n_local + n_block_data_idx_on_grid;


--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_ydotygrad.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_ydotygrad.hpp
@@ -165,7 +165,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_YDotYGrad

        const index_t block_work_idx_m = block_work_idx[I0];

-        constexpr auto d_thread_desc_mblock_mrepeat_mwave_mperxdl =
+        constexpr auto d_thread_desc_mblock_m1 =
            make_naive_tensor_descriptor_packed(make_tuple(I1, I1));

        constexpr auto y_thread_desc_m0_m1_n0_n1 = make_naive_tensor_descriptor_packed(make_tuple(
@@ -244,7 +244,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_YDotYGrad
        auto d_thread_copy_vgpr_to_global =
            ThreadwiseTensorSliceTransfer_v1r3<FloatD,
                                               FloatD,
-                                               decltype(d_thread_desc_mblock_mrepeat_mwave_mperxdl),
+                                               decltype(d_thread_desc_mblock_m1),
                                               decltype(d_grid_desc_mblock_mperblock),
                                               ck::tensor_operation::element_wise::PassThrough,
                                               Sequence<1, 1>,
@@ -260,7 +260,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_YDotYGrad
                ck::tensor_operation::element_wise::PassThrough{}};

        // copy from VGPR to Global
-        d_thread_copy_vgpr_to_global.Run(d_thread_desc_mblock_mrepeat_mwave_mperxdl,
+        d_thread_copy_vgpr_to_global.Run(d_thread_desc_mblock_m1,
                                         make_tuple(I0, I0),
                                         y_dot_ygrad_thread_accum_buf,
                                         d_grid_desc_mblock_mperblock,