format

34b1c320 · ltqin · b5a3ea2d · 34b1c320 · 34b1c320 · 34b1c320
Commit 34b1c320 authored Jul 25, 2023 by ltqin
9 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
@@ -156,36 +156,37 @@ __global__ void
    }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset,
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
-                                                        p_b_grid + b_batch_offset,
+            p_a_grid + a_batch_offset,
-                                                        z_matrix_ptr,
+            p_b_grid + b_batch_offset,
-                                                        p_b1_grid + b1_batch_offset,
+            z_matrix_ptr,
-                                                        p_c_grid + c_batch_offset,
+            p_b1_grid + b1_batch_offset,
-                                                        p_lse_grid + lse_batch_offset,
+            p_c_grid + c_batch_offset,
-                                                        p_ygrad_grid + c_batch_offset,
+            p_lse_grid + lse_batch_offset,
-                                                        p_qgrad_grid + a_batch_offset,
+            p_ygrad_grid + c_batch_offset,
-                                                        p_kgrad_grid + b_batch_offset,
+            p_qgrad_grid + a_batch_offset,
-                                                        p_vgrad_grid + b1_batch_offset,
+            p_kgrad_grid + b_batch_offset,
-                                                        p_shared,
+            p_vgrad_grid + b1_batch_offset,
-                                                        a_element_op,
+            p_shared,
-                                                        b_element_op,
+            a_element_op,
-                                                        acc_element_op,
+            b_element_op,
-                                                        b1_element_op,
+            acc_element_op,
-                                                        c_element_op,
+            b1_element_op,
-                                                        a_grid_desc_ak0_m_ak1,
+            c_element_op,
-                                                        b_grid_desc_bk0_n_bk1,
+            a_grid_desc_ak0_m_ak1,
-                                                        c_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
+            b_grid_desc_bk0_n_bk1,
-                                                        b1_grid_desc_bk0_n_bk1,
+            c_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
-                                                        c_grid_desc_mblock_mperblock_nblock_nperblock,
+            b1_grid_desc_bk0_n_bk1,
-                                                        lse_grid_desc_m,
+            c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                        ygrad_grid_desc_o0_m_o1,
+            lse_grid_desc_m,
-                                                        block_2_ctile_map,
+            ygrad_grid_desc_o0_m_o1,
-                                                        c0_matrix_mask,
+            block_2_ctile_map,
-                                                        p_drop,
+            c0_matrix_mask,
-                                                        ph,
+            p_drop,
-                                                        z_random_matrix_offset,
+            ph,
-                                                        raw_n_padded,
+            z_random_matrix_offset,
-                                                        0);
+            raw_n_padded,
+            0);
    }
 #else
    ignore = p_a_grid;
@@ -994,10 +995,15 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                    arg.m_raw_padded_,
                    arg.n_raw_padded_);
            };
-            if(arg.p_drop_ > 0.0){
+            if(arg.p_drop_ > 0.0)
-                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+            {
-            }else{
+                ave_time = launch_kernel(integral_constant<bool, false>{},
-                ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
+                                         integral_constant<bool, true>{});
+            }
+            else
+            {
+                ave_time = launch_kernel(integral_constant<bool, false>{},
+                                         integral_constant<bool, false>{});
            }
            return ave_time;
        }

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp
@@ -155,36 +155,37 @@ __global__ void
    }
    else
    {
-        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(p_a_grid + a_batch_offset,
+        GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout>(
-                                                      p_b_grid + b_batch_offset,
+            p_a_grid + a_batch_offset,
-                                                      z_matrix_ptr,
+            p_b_grid + b_batch_offset,
-                                                      p_b1_grid + b1_batch_offset,
+            z_matrix_ptr,
-                                                      p_c_grid + c_batch_offset,
+            p_b1_grid + b1_batch_offset,
-                                                      p_lse_grid + lse_batch_offset,
+            p_c_grid + c_batch_offset,
-                                                      p_ygrad_grid + c_batch_offset,
+            p_lse_grid + lse_batch_offset,
-                                                      p_qgrad_grid + a_batch_offset,
+            p_ygrad_grid + c_batch_offset,
-                                                      p_kgrad_grid + b_batch_offset,
+            p_qgrad_grid + a_batch_offset,
-                                                      p_vgrad_grid + b1_batch_offset,
+            p_kgrad_grid + b_batch_offset,
-                                                      p_shared,
+            p_vgrad_grid + b1_batch_offset,
-                                                      a_element_op,
+            p_shared,
-                                                      b_element_op,
+            a_element_op,
-                                                      acc_element_op,
+            b_element_op,
-                                                      b1_element_op,
+            acc_element_op,
-                                                      c_element_op,
+            b1_element_op,
-                                                      a_grid_desc_ak0_m_ak1,
+            c_element_op,
-                                                      b_grid_desc_bk0_n_bk1,
+            a_grid_desc_ak0_m_ak1,
-                                                      c_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+            b_grid_desc_bk0_n_bk1,
-                                                      b1_grid_desc_bk0_n_bk1,
+            c_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                      c_grid_desc_mblock_mperblock_nblock_nperblock,
+            b1_grid_desc_bk0_n_bk1,
-                                                      lse_grid_desc_m,
+            c_grid_desc_mblock_mperblock_nblock_nperblock,
-                                                      ygrad_grid_desc_m0_o_m1,
+            lse_grid_desc_m,
-                                                      block_2_ctile_map,
+            ygrad_grid_desc_m0_o_m1,
-                                                      c0_matrix_mask,
+            block_2_ctile_map,
-                                                      p_drop,
+            c0_matrix_mask,
-                                                      ph,
+            p_drop,
-                                                      z_random_matrix_offset,
+            ph,
-                                                      raw_n_padded,
+            z_random_matrix_offset,
-                                                      0);
+            raw_n_padded,
+            0);
    }
 #else
    ignore = p_a_grid;
@@ -1018,16 +1019,20 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
            if(GridwiseGemm::CalculateHasMainKBlockLoop(K))
            {
                if(arg.p_drop_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, false>{});
            }
            else
            {
                if(arg.p_drop_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, false>{});
            }
            return ave_time;

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp
@@ -993,16 +993,20 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
            if(all_has_main_k_block_loop)
            {
                if(arg.p_dropout_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, false>{});
            }
            else if(!some_has_main_k_block_loop)
            {
                if(arg.p_dropout_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, false>{});
            }
            else
            {

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp
@@ -1000,16 +1000,20 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
            if(all_has_main_k_block_loop)
            {
                if(arg.p_dropout_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, true>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, true>{},
+                                             integral_constant<bool, false>{});
            }
            else if(!some_has_main_k_block_loop)
            {
                if(arg.p_dropout_ > 0.0)
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, true>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, true>{});
                else
-                    ave_time = launch_kernel(integral_constant<bool, false>{}, integral_constant<bool, false>{});
+                    ave_time = launch_kernel(integral_constant<bool, false>{},
+                                             integral_constant<bool, false>{});
            }
            else
            {

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_kloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_kloop_v1.hpp
@@ -1935,8 +1935,8 @@ struct GridwiseBatchedMultiheadAttentionBackward_Kloop_Xdl_CShuffle_V1
                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
                    auto n_local =
                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
-                    auto m_global    = m_local + m_block_data_idx_on_grid;
+                    auto m_global = m_local + m_block_data_idx_on_grid;
-                    auto n_global    = n_local + n_block_data_idx_on_grid;
+                    auto n_global = n_local + n_block_data_idx_on_grid;
                    if(c0_matrix_mask.IsMaskedElement(m_global, n_global))
                    {
                        s_slash_p_thread_buf(i) = -ck::NumericLimits<float>::Infinity();

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
@@ -1948,54 +1948,61 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
            constexpr auto position_offset = M3 * M4;
            // save z to global
-            if constexpr(IsDropout){
+            if constexpr(IsDropout)
+            {
                if(p_z_grid)
                {
                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto m_local =
-                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
                    auto m_global = m_local + m_block_data_idx_on_grid;
                    auto n_global = n_local + n_block_data_idx_on_grid;
                    auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-                                            n_global; // unique element global 1d id
+                                              n_global; // unique element global 1d id
                    auto global_elem_id =
                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
-                    blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
+                    blockwise_dropout
-                                                                        decltype(z_tenor_buffer),
+                        .template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
-                                                                        decltype(position_offset),
+                                                           decltype(z_tenor_buffer),
-                                                                        true>(
+                                                           decltype(position_offset),
-                        s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
+                                                           true>(
+                            s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
-                    z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                    make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
+                    z_thread_copy_vgpr_to_global.Run(
-                                                    z_tenor_buffer,
+                        z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                    z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                        make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                                                    z_grid_buf);
+                        z_tenor_buffer,
+                        z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                        z_grid_buf);
                }
                else
                {
                    ignore = z_grid_buf;
                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto m_local =
-                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
                    auto m_global = m_local + m_block_data_idx_on_grid;
                    auto n_global = n_local + n_block_data_idx_on_grid;
                    auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-                                            n_global; // unique element global 1d id
+                                              n_global; // unique element global 1d id
                    auto global_elem_id =
                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
                    // P_dropped
                    blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
-                                                                decltype(position_offset),
+                                                                   decltype(position_offset),
-                                                                true>(
+                                                                   true>(
                        s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
                }
            }

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
@@ -1864,53 +1864,60 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
            constexpr auto position_offset = M3 * M4;
            // save z to global
-            if constexpr(IsDropout){
+            if constexpr(IsDropout)
+            {
                if(p_z_grid)
                {
                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto m_local =
-                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
                    auto m_global = m_local + m_block_data_idx_on_grid;
                    auto n_global = n_local + n_block_data_idx_on_grid;
                    auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-                                            n_global; // unique element global 1d id
+                                              n_global; // unique element global 1d id
                    auto global_elem_id =
                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
-                    blockwise_dropout.template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
+                    blockwise_dropout
-                                                                        decltype(z_tenor_buffer),
+                        .template ApplyDropoutAttnBwdSaveZ<decltype(s_slash_p_thread_buf),
-                                                                        decltype(position_offset),
+                                                           decltype(z_tenor_buffer),
-                                                                        true>(
+                                                           decltype(position_offset),
-                        s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
+                                                           true>(
+                            s_slash_p_thread_buf, ph, global_elem_id, z_tenor_buffer, raw_n_padded);
-                    z_thread_copy_vgpr_to_global.Run(z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                    make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
+                    z_thread_copy_vgpr_to_global.Run(
-                                                    z_tenor_buffer,
+                        z_thread_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                                                    z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                        make_tuple(I0, I0, I0, I0, I0, I0, I0, I0, I0, I0),
-                                                    z_grid_buf);
+                        z_tenor_buffer,
+                        z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+                        z_grid_buf);
                }
                else
                {
                    ignore = z_grid_buf;
                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(I0) + acc0_thread_origin;
-                    auto m_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto m_local =
-                    auto n_local  = block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
                    auto m_global = m_local + m_block_data_idx_on_grid;
                    auto n_global = n_local + n_block_data_idx_on_grid;
                    auto global_elem_id_raw = z_random_matrix_offset + m_global * raw_n_padded +
-                                            n_global; // unique element global 1d id
+                                              n_global; // unique element global 1d id
                    auto global_elem_id =
                        (global_elem_id_raw % M4) * raw_n_padded + (global_elem_id_raw / M4) * M4;
                    // P_dropped
                    blockwise_dropout.template ApplyDropoutAttnBwd<decltype(s_slash_p_thread_buf),
-                                                                decltype(position_offset),
+                                                                   decltype(position_offset),
-                                                                true>(
+                                                                   true>(
                        s_slash_p_thread_buf, ph, global_elem_id, raw_n_padded);
                }
            }

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_fwd_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_fwd_xdl_cshuffle_v1.hpp
@@ -921,7 +921,7 @@ struct GridwiseBatchedMultiheadAttentionForward_Xdl_CShuffle_V1
        {
            block_sync_lds();
        }
        do
        {
            auto n_block_data_idx_on_grid =

--- a/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp
+++ b/library/include/ck/library/reference_tensor_operation/cpu/reference_softmax.hpp
@@ -28,15 +28,12 @@ struct ReferenceSoftmax : public device::BaseOperator
                 double beta,
                 const std::vector<index_t> sm_reduce_dims,
                 Tensor<AccDataType>* sm_stats_ptr = nullptr)
-            : in_(in),
+            : in_(in), out_(out), sm_reduce_dims_(sm_reduce_dims), sm_stats_ptr_(sm_stats_ptr)
-              out_(out),
-              sm_reduce_dims_(sm_reduce_dims),
-              sm_stats_ptr_(sm_stats_ptr)
        {
            alpha_ = static_cast<AccDataType>(alpha);
            beta_  = static_cast<AccDataType>(beta);
-            // std::cout << "debug: scalar dims: ";  
+            // std::cout << "debug: scalar dims: ";
            for(size_t i = 0; i < in.mDesc.GetNumOfDimension(); i++)
            {
                if(std::find(sm_reduce_dims.begin(), sm_reduce_dims.end(), i) ==