"llama/llama-vocab.cpp" did not exist on "217903ab5111124041b3bb9d7a04d9101f2f32b2"
Commit e87ddb0e authored by letaoqin

Merge branch 'mha-train-develop' into mha-train-develop-bias-shfl

parents 13129772 5ff2d646
@@ -47,8 +47,7 @@ template <typename GridwiseGemm,
typename C0MatrixMask,
bool HasMainKBlockLoop,
bool IsDropout,
bool IsLseStoring,
bool Deterministic>
bool IsLseStoring>
__global__ void
#if CK_USE_LAUNCH_BOUNDS
__launch_bounds__(CK_MAX_THREAD_PER_BLOCK, CK_MIN_BLOCK_PER_CU)
@@ -78,7 +77,7 @@ __global__ void
const LSEGridDescriptor_M lse_grid_desc_m,
const Block2CTileMap block_2_ctile_map,
const index_t batch_count,
const index_t mblock,
const index_t h_ratio,
const ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch,
const C0MatrixMask c0_matrix_mask,
const uint8_t p_dropout_in_uint8_t,
@@ -94,13 +93,14 @@ __global__ void
const index_t num_blocks_per_batch =
__builtin_amdgcn_readfirstlane(get_grid_size() / batch_count);
const index_t g_idx = __builtin_amdgcn_readfirstlane(get_block_1d_id() / num_blocks_per_batch);
const index_t gkv_idx = __builtin_amdgcn_readfirstlane(g_idx / h_ratio);
const long_index_t a_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetABasePtr(g_idx)));
const long_index_t b_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetBBasePtr(g_idx)));
static_cast<long_index_t>(compute_base_ptr_of_batch.GetBBasePtr(gkv_idx)));
const long_index_t b1_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetB1BasePtr(g_idx)));
static_cast<long_index_t>(compute_base_ptr_of_batch.GetB1BasePtr(gkv_idx)));
const long_index_t c_batch_offset = __builtin_amdgcn_readfirstlane(
static_cast<long_index_t>(compute_base_ptr_of_batch.GetCBasePtr(g_idx)));
const long_index_t z_batch_offset = __builtin_amdgcn_readfirstlane(
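For context, the new h_ratio argument and the derived gkv_idx implement grouped-query/multi-query attention batching: several query heads share one K/V head, so the B (K) and B1 (V) base pointers are looked up with g_idx / h_ratio instead of g_idx. A minimal, self-contained sketch of that mapping (the head counts below are illustrative, not taken from the commit):

#include <cstdio>

int main()
{
    // Illustrative sizes: 8 query heads sharing 2 K/V heads, so h_ratio = 4.
    const int num_q_heads  = 8;
    const int num_kv_heads = 2;
    const int h_ratio      = num_q_heads / num_kv_heads;

    for(int g_idx = 0; g_idx < num_q_heads; ++g_idx)
    {
        // Same arithmetic as the kernel: consecutive query heads reuse one K/V head.
        const int gkv_idx = g_idx / h_ratio;
        std::printf("query head %d -> kv head %d\n", g_idx, gkv_idx);
    }
    return 0;
}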
@@ -122,43 +122,6 @@ __global__ void
const index_t z_random_matrix_offset = g_idx * raw_m_padded * raw_n_padded;
if constexpr(Deterministic)
{
for(index_t i = 0; i < mblock; i++)
{
GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout, IsLseStoring>(
p_a_grid + a_batch_offset,
p_b_grid + b_batch_offset,
tmp_p_d0_grid,
p_b1_grid + b1_batch_offset,
p_c_grid + c_batch_offset,
p_z_grid == nullptr ? nullptr : p_z_grid + z_batch_offset,
p_lse_grid == nullptr ? nullptr : p_lse_grid + lse_batch_offset,
p_shared,
a_element_op,
b_element_op,
acc_element_op,
b1_element_op,
c_element_op,
a_grid_desc_ak0_m_ak1,
b_grid_desc_bk0_n_bk1,
d0_griddesc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
b1_grid_desc_bk0_n_bk1,
c_grid_desc_mblock_mperblock_nblock_nperblock,
z_grid_desc_m0_n0_m1_n1_m2_n2_m3_n3_n4_n5,
lse_grid_desc_m,
block_2_ctile_map,
c0_matrix_mask,
p_dropout_in_uint8_t,
p_dropout_rescale,
ph,
z_random_matrix_offset,
raw_n_padded,
i);
}
}
else
{
GridwiseGemm::template Run<HasMainKBlockLoop, IsDropout, IsLseStoring>(
p_a_grid + a_batch_offset,
p_b_grid + b_batch_offset,
@@ -186,9 +149,7 @@ __global__ void
p_dropout_rescale,
ph,
z_random_matrix_offset,
raw_n_padded,
0);
}
raw_n_padded);
#else
ignore = p_a_grid;
ignore = p_b_grid;
@@ -211,7 +172,7 @@ __global__ void
ignore = lse_grid_desc_m;
ignore = block_2_ctile_map;
ignore = batch_count;
ignore = mblock;
ignore = h_ratio;
ignore = compute_base_ptr_of_batch;
ignore = c0_matrix_mask;
ignore = p_dropout_in_uint8_t;
@@ -296,7 +257,6 @@ template <index_t NumDimG,
index_t CShuffleBlockTransferScalarPerVector_NPerBlock,
index_t Acc1BiasTransferSrcScalarPerVector,
MaskingSpecialization MaskingSpec,
bool Deterministic,
LoopScheduler LoopSched = LoopScheduler::Default>
struct DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2
: public DeviceBatchedMultiheadAttentionForward<NumDimG,
@@ -576,8 +536,7 @@ struct DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2
Acc1BiasTransferSrcScalarPerVector,
LoopSched,
Transform::matrix_padder.PadN,
MaskingSpec != MaskingSpecialization::MaskDisabled,
Deterministic>;
MaskingSpec != MaskingSpecialization::MaskDisabled>;
// Argument
// FIXME: constness
@@ -662,7 +621,8 @@ struct DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2
b1_gs_gemm1ns_gemm1ks_strides[NumDimG + NumDimO + NumDimN - 1]},
c_mz_gemm1nz_strides_{c_gs_ms_gemm1ns_strides[NumDimG + NumDimM - 1],
c_gs_ms_gemm1ns_strides[NumDimG + NumDimM + NumDimO - 1]},
batch_count_{c_grid_desc_g_m_n_.GetLength(I0)}
batch_count_{c_grid_desc_g_m_n_.GetLength(I0)},
h_ratio_{c_grid_desc_g_m_n_.GetLength(I0) / b_grid_desc_g_n_k_.GetLength(I0)}
{
// TODO ANT: implement bias addition
ignore = p_acc1_biases;
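The h_ratio_ member is computed from the unpadded G-dimension lengths of the output and K descriptors; since the G dimension flattens batch and heads, dividing the two lengths recovers the number of query heads per K/V head. A hedged worked example with made-up shapes (not values from the commit):

#include <cassert>

int main()
{
    // Made-up descriptor lengths: batch 4, 32 query heads, 8 K/V heads.
    const int c_g_len = 4 * 32;   // output/Q G-dim length -> batch_count_
    const int b_g_len = 4 * 8;    // K G-dim length
    const int h_ratio = c_g_len / b_g_len;
    assert(h_ratio == 4);         // 4 query heads share each K/V head
    return 0;
}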
@@ -736,10 +696,8 @@ struct DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2
std::cout << "d0_grid_desc_g_m_n_: " << d0_grid_desc_g_m_n_.GetLength(I0) << ", "
<< d0_grid_desc_g_m_n_.GetLength(I1) << ", "
<< d0_grid_desc_g_m_n_.GetLength(I2) << '\n';
std::cout << "d0_grid_desc_m_n_: " << d0_grid_desc_m_n_.GetLength(I0) << ", "
<< d0_grid_desc_m_n_.GetLength(I1) << '\n';
std::cout << "b1_grid_desc_g_n_k_: " << b1_grid_desc_g_n_k_.GetLength(I0) << ", "
<< b1_grid_desc_g_n_k_.GetLength(I1) << ", "
<< b1_grid_desc_g_n_k_.GetLength(I2) << '\n';
@@ -802,6 +760,7 @@ struct DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2
std::vector<index_t> c_mz_gemm1nz_strides_;
index_t batch_count_;
index_t h_ratio_;
ComputeBasePtrOfStridedBatch compute_base_ptr_of_batch_;
float p_dropout_;
@@ -833,9 +792,7 @@ struct DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2
}
const index_t grid_size =
(Deterministic ? 1
: arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_)) *
arg.batch_count_;
arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_) * arg.batch_count_;
// Gemm0_K
const auto K =
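With the Deterministic specialization removed, the launch grid always covers every output tile of every (batch, head) slice. A small sketch of the grid-size arithmetic with illustrative numbers (the tile count would normally come from block_2_ctile_map_.CalculateGridSize):

// Illustrative values only; not taken from the commit.
int compute_grid_size()
{
    const int tiles_per_slice = 16;  // output tiles per (batch, head) slice
    const int batch_count     = 128; // flattened batch * query heads
    return tiles_per_slice * batch_count; // one workgroup per tile per slice
}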
@@ -843,8 +800,9 @@ struct DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2
float ave_time = 0;
auto launch_kernel =
[&](auto has_main_k_block_loop_, auto is_dropout_, auto is_lse_storing_) {
auto launch_kernel = [&](auto has_main_k_block_loop_,
auto is_dropout_,
auto is_lse_storing_) {
const auto kernel = kernel_batched_multiheadattention_forward_xdl_cshuffle_v2<
GridwiseGemm,
ADataType, // TODO: distinguish A/B datatype
@@ -870,11 +828,9 @@ struct DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2
C0MatrixMask,
has_main_k_block_loop_,
is_dropout_,
is_lse_storing_,
Deterministic>;
is_lse_storing_>;
return launch_and_time_kernel(
stream_config,
return launch_and_time_kernel(stream_config,
kernel,
dim3(grid_size),
dim3(BlockSize),
@@ -900,7 +856,7 @@ struct DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2
arg.lse_grid_desc_m_,
arg.block_2_ctile_map_,
arg.batch_count_,
arg.block_2_ctile_map_.CalculateGridSize(arg.c_grid_desc_m_n_),
arg.h_ratio_,
arg.compute_base_ptr_of_batch_,
arg.c0_matrix_mask_,
arg.p_dropout_in_uint8_t_,
@@ -1014,12 +970,13 @@ struct DeviceBatchedMultiheadAttentionForward_Xdl_CShuffle_V2
// Check if C permute dimension matches GEMM + GEMM shape
const index_t c_g = arg.c_grid_desc_g_m_n_.GetLength(I0); // unpadded
const index_t b_g = arg.b_grid_desc_g_n_k_.GetLength(I0);
const index_t c_m = arg.c_grid_desc_m_n_.GetLength(I0);
const index_t c_gemm1n = arg.c_grid_desc_m_n_.GetLength(I1);
const index_t a_m = arg.a_grid_desc_ak0_m_ak1_.GetLength(I1);
const index_t b1_gemm1n = arg.b1_grid_desc_bk0_n_bk1_.GetLength(I1);
if(!(c_g == arg.batch_count_ && c_m == a_m && c_gemm1n == b1_gemm1n))
if(!(c_g == arg.batch_count_ && c_m == a_m && c_gemm1n == b1_gemm1n && c_g % b_g == 0))
{
return false;
}
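The added c_g % b_g == 0 condition rejects shapes where the flattened Q/output batch dimension is not an integer multiple of the K/V batch dimension, i.e. where no whole-number head ratio exists. A standalone sketch of that guard (function name is illustrative):

#include <iostream>

// True when a grouped-query head ratio is well defined.
bool head_ratio_is_valid(int c_g, int b_g)
{
    return b_g > 0 && c_g % b_g == 0;
}

int main()
{
    std::cout << head_ratio_is_valid(128, 32) << '\n'; // 1: ratio of 4
    std::cout << head_ratio_is_valid(128, 48) << '\n'; // 0: not divisible
    return 0;
}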
@@ -694,7 +694,7 @@ struct DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V1
b1_grid_desc_g_n_k,
c_grid_desc_g_m_n,
z_grid_desc_g_m_n,
type_convert<index_t>(lse_grid_desc_m.GetElementSpaceSize()));
type_convert<index_t>(lse_gs_ms_strides[NumDimG - 1]));
// C0 mask
const auto c0_matrix_mask =