Commit 24f96b50 authored by fsx950223

add grouped backward

parent ac07f2cd
@@ -248,8 +248,8 @@ int run(int argc, char* argv[])
ck::index_t N = 512;
ck::index_t K = 64;
ck::index_t O = 64;
-ck::index_t G0 = 54; //54
-ck::index_t G1 = 16; //16
+ck::index_t G0 = 2; //54
+ck::index_t G1 = 2; //16
float alpha = 1.f / std::sqrt(K);
......
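Editor's note on the scale just above (not part of the commit): `alpha = 1.f / std::sqrt(K)` is the standard softmax temperature of scaled dot-product attention, with this example's head dimension K playing the role of d_k:

```latex
% Scaled dot-product attention as the example computes it.
Y = \mathrm{softmax}\!\left(\alpha \, Q K^{\top}\right) V,
\qquad \alpha = \frac{1}{\sqrt{d_k}}, \quad d_k = K
```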
@@ -35,6 +35,7 @@ Kernel outputs:
#include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"
#include "ck/tensor_operation/gpu/device/tensor_specialization.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_multihead_attention_backward_xdl_cshuffle.hpp"
#include "ck/tensor_operation/gpu/device/impl/device_grouped_multihead_attention_backward_xdl_cshuffle_v2.hpp"
#include "ck/tensor_operation/gpu/element/element_wise_operation.hpp"
#include "ck/library/utility/check_err.hpp"
@@ -44,6 +45,8 @@ Kernel outputs:
#include "ck/library/reference_tensor_operation/cpu/reference_batched_gemm.hpp"
#include "ck/library/reference_tensor_operation/cpu/reference_softmax.hpp"
+#define FLASH_ATTN_IMPLEMENTATION 0
template <ck::index_t... Is>
using S = ck::Sequence<Is...>;
@@ -83,6 +86,7 @@ static constexpr auto TensorSpecK = ck::tensor_operation::device::TensorSpecialization::Default;
static constexpr auto TensorSpecV = ck::tensor_operation::device::TensorSpecialization::Default;
static constexpr auto TensorSpecY = ck::tensor_operation::device::TensorSpecialization::Default;
+#if FLASH_ATTN_IMPLEMENTATION
using DeviceGemmInstance =
ck::tensor_operation::device::DeviceGroupedMultiheadAttentionBackward_Xdl_CShuffle<
NumDimG,
@@ -147,7 +151,72 @@ using DeviceGemmInstance =
        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
        MaskingSpec>;   // MaskingSpecialization
+#else
+using DeviceGemmInstance =
+    ck::tensor_operation::device::DeviceGroupedMultiheadAttentionBackward_Xdl_CShuffle_V2<
+        NumDimG,
+        NumDimM,
+        NumDimN,
+        NumDimK,
+        NumDimO,
+        DataType,
+        LSEDataType,
+        Acc0BiasDataType,
+        Acc1BiasDataType,
+        AccDataType,
+        ShuffleDataType,
+        QKVElementOp,
+        QKVElementOp,
+        Scale,
+        QKVElementOp,
+        YElementOp,
+        GemmSpec,
+        TensorSpecQ,
+        TensorSpecK,
+        TensorSpecV,
+        TensorSpecY,
+        1,
+        256,
+        128,         // MPerBlock
+        128,         // NPerBlock
+        64,          // KPerBlock
+        64,          // Gemm1NPerBlock
+        32,          // Gemm1KPerBlock
+        8,           // AK1
+        8,           // BK1
+        2,           // B1K1
+        32,          // MPerXDL
+        32,          // NPerXDL
+        1,           // MXdlPerWave
+        4,           // NXdlPerWave
+        2,           // Gemm1NXdlPerWave
+        S<4, 64, 1>, // ABlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<4, 64, 1>, // BBlockTransfer
+        S<1, 0, 2>,
+        S<1, 0, 2>,
+        2,
+        8,
+        8,
+        true,
+        S<8, 32, 1>, // B1BlockTransfer
+        S<0, 2, 1>,
+        S<0, 2, 1>,
+        1,
+        4,
+        2,
+        false,
+        1,              // CShuffleMXdlPerWavePerShuffle
+        2,              // CShuffleNXdlPerWavePerShuffle
+        S<1, 32, 1, 8>, // CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock
+        8,              // CShuffleBlockTransferScalarPerVector_NPerBlock
+        MaskingSpec>;   // MaskingSpecialization
+#endif
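A hedged sanity check on the V2 tuning block (editor's sketch; the wave-decomposition formula below is the usual CK convention, stated here as an assumption rather than quoted from this commit):

```cpp
// Checks that the V2 instance's tile parameters are mutually consistent:
// BlockSize 256 on gfx9 is 256 / 64 = 4 waves, and the XDL tiling of the
// 128x128 block tile must account for exactly those 4 waves.
#include <cassert>

int main()
{
    const int BlockSize   = 256, WaveSize    = 64;
    const int MPerBlock   = 128, NPerBlock   = 128;
    const int MPerXDL     = 32,  NPerXDL     = 32;
    const int MXdlPerWave = 1,   NXdlPerWave = 4;

    const int waves_per_block = BlockSize / WaveSize;        // 4
    const int m_waves = MPerBlock / (MXdlPerWave * MPerXDL); // 128 / 32  = 4
    const int n_waves = NPerBlock / (NXdlPerWave * NPerXDL); // 128 / 128 = 1
    assert(m_waves * n_waves == waves_per_block);            // 4 * 1 == 4
    return 0;
}
```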
// Ref Gemm0: S = alpha * Q * K^T
// fp16 in, fp32 out
using ReferenceGemm0Instance = ck::tensor_operation::host::ReferenceBatchedGemm<DataType,
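The declaration above is cut off by the diff view; for readers without the full file, here is a hedged host-side sketch (not the commit's code) of what Ref Gemm0 computes per batch, with hypothetical plain-float buffers standing in for CK tensors:

```cpp
// S[m][n] = alpha * sum_k Q[m][k] * K[n][k]  -- row-major K, hence Q * K^T.
#include <cstddef>
#include <vector>

std::vector<float> ref_gemm0(const std::vector<float>& Q,
                             const std::vector<float>& Kmat,
                             int M, int N, int Kdim, float alpha)
{
    std::vector<float> S(static_cast<std::size_t>(M) * N, 0.f);
    for(int m = 0; m < M; ++m)
        for(int n = 0; n < N; ++n)
        {
            float acc = 0.f; // fp32 accumulation, matching "fp16 in, fp32 out"
            for(int k = 0; k < Kdim; ++k)
                acc += Q[m * Kdim + k] * Kmat[n * Kdim + k];
            S[static_cast<std::size_t>(m) * N + n] = alpha * acc;
        }
    return S;
}
```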
@@ -327,11 +396,10 @@ int run(int argc, char* argv[])
for(std::size_t i=0; i<group_count; i++){
int M = 128 * (rand() % 4 + 1);
int N = 128 * (rand() % 4 + 1);
-int K = 128;
-int O = 128;
+int K = 64;
+int O = 64;
int G0 = rand() % 3 + 1;
int G1 = rand() % 2 + 1;
std::vector<ck::index_t> q_gs_ms_ks_lengths{G0, G1, M, K};
std::vector<ck::index_t> q_gs_ms_ks_strides =
input_permute
......
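The stride setup is elided by the diff; as an illustration only (an assumption, not the elided code), packed row-major strides for the non-permuted [G0, G1, M, K] layout described by q_gs_ms_ks_lengths would look like the sketch below, while the input_permute branch interleaves G1 with M instead:

```cpp
// Hypothetical helper: strides of a densely packed [G0, G1, M, K] tensor.
#include <array>

std::array<int, 4> packed_strides_g0_g1_m_k(int G1, int M, int K)
{
    return {G1 * M * K, // stride between G0 slices
            M * K,      // stride between G1 slices
            K,          // stride between M rows
            1};         // K is contiguous
}
```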
@@ -15,7 +15,7 @@
#include "ck/tensor_operation/gpu/device/masking_specialization.hpp"
#include "ck/tensor_operation/gpu/device/matrix_padder.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_backward_xdl_cshuffle_v3.hpp"
#include "ck/tensor_operation/gpu/grid/gridwise_batched_multihead_attention_backward_xdl_cshuffle_v2.hpp"
#include "ck/tensor_operation/operator_transform/transform_contraction_to_gemm.hpp"
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
@@ -554,7 +554,7 @@ struct DeviceBatchedMultiheadAttentionBackward_Xdl_CShuffle_V2
};
// GridwiseGemm
-using GridwiseGemm = GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle<
+using GridwiseGemm = GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V2<
DataType, // TODO: distinguish A/B datatype
LSEDataType,
GemmAccDataType,
......
@@ -952,8 +952,6 @@ struct DeviceGroupedMultiheadAttentionBackward_Xdl_CShuffle
{
return false;
}
-bool all_has_main_k_block_loop = true;
-bool some_has_main_k_block_loop = false;
for(std::size_t i = 0; i < arg.group_count_; i++)
{
@@ -972,12 +970,6 @@ struct DeviceGroupedMultiheadAttentionBackward_Xdl_CShuffle
return false;
}
-// Check if having main loop
-const auto K = kernel_arg.a_grid_desc_ak0_m_ak1_.GetLength(I0) *
-               kernel_arg.a_grid_desc_ak0_m_ak1_.GetLength(I2);
-const bool y = GridwiseGemm::CalculateHasMainKBlockLoop(K);
-all_has_main_k_block_loop &= y;
-some_has_main_k_block_loop |= y;
// Note: we need raw lengths since threadwise copy cannot handle vector load
// when part of the vector is out of bounds.
// Note: we need the lowest dims in Ms/Ns/Ks/Os, not the merged M/N/K/O.
const auto MzRaw = device_arg.raw_lengths_mz_nz_kz_gemm1nz_[0];
......
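For context on the deletion above (editor's note): the removed bookkeeping recorded whether every group, or any group, needed the kernel's main K loop; the V2 grouped path evidently no longer dispatches on that flag here. The predicate below is a hedged sketch modeled on CK's usual convention, not code from this commit:

```cpp
// True when K spans more than one KPerBlock tile, so the main K loop runs.
// KPerBlock = 64 matches the V2 instance above; the exact CK formula may differ.
bool calculate_has_main_k_block_loop(int K, int KPerBlock = 64)
{
    const int num_k_tiles = (K + KPerBlock - 1) / KPerBlock; // ceil division
    return num_k_tiles > 1;
}
```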
@@ -80,7 +80,7 @@ template <typename DataType,
bool PadN,
bool MaskOutUpperTriangle,
PipelineVersion PipelineVer = PipelineVersion::v1>
-struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle
+struct GridwiseBatchedMultiheadAttentionBackward_Xdl_CShuffle_V2
{
static_assert(LoopSched == LoopScheduler::Default,
"Non-default loop scheduler is currently not supported");
......