Merge pull request #870 from ROCmSoftwarePlatform/mha-train-bias-bwd-type2

Add bias to flash attention bwd

Merge pull request #870 from ROCmSoftwarePlatform/mha-train-bias-bwd-type2
Add bias to flash attention bwd
4c8b47c0 · Dan Yao · GitHub · 226355e7 · 882b3328 · 4c8b47c0
Unverified Commit 4c8b47c0 authored Aug 29, 2023 by Dan Yao Committed by GitHub Aug 29, 2023
15 changed files
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_v2.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_backward_v2.cpp
--- a/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_train_v2.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/batched_multihead_attention_train_v2.cpp
--- a/example/32_batched_gemm_scale_softmax_gemm/grouped_multihead_attention_backward_v2.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_multihead_attention_backward_v2.cpp
--- a/example/32_batched_gemm_scale_softmax_gemm/grouped_multihead_attention_train_v2.cpp
+++ b/example/32_batched_gemm_scale_softmax_gemm/grouped_multihead_attention_train_v2.cpp
--- a/example/52_flash_atten_bias/CMakeLists.txt
+++ b/example/52_flash_atten_bias/CMakeLists.txt
 add_example_executable(example_batched_multihead_attention_bias_forward_v2 batched_multihead_attention_bias_forward_v2.cpp)
-add_example_executable(example_grouped_multihead_attention_bias_forward_v2 grouped_multihead_attention_bias_forward_v2.cpp)
\ No newline at end of file
+add_example_executable(example_grouped_multihead_attention_bias_forward_v2 grouped_multihead_attention_bias_forward_v2.cpp)
+
+add_example_executable(example_batched_multihead_attention_bias_backward_v2 batched_multihead_attention_bias_backward_v2.cpp)
+add_example_executable(example_grouped_multihead_attention_bias_backward_v2 grouped_multihead_attention_bias_backward_v2.cpp)
\ No newline at end of file
--- a/example/52_flash_atten_bias/batched_multihead_attention_bias_backward_v2.cpp
+++ b/example/52_flash_atten_bias/batched_multihead_attention_bias_backward_v2.cpp
--- a/example/52_flash_atten_bias/grouped_multihead_attention_bias_backward_v2.cpp
+++ b/example/52_flash_atten_bias/grouped_multihead_attention_bias_backward_v2.cpp
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -769,6 +769,7 @@ struct BlockwiseGemmXdlops_v2
        c_thread_buf_;

    __host__ __device__ constexpr auto& GetCThreadBuffer() { return c_thread_buf_; }
+    __host__ __device__ constexpr auto& GetCThreadDesc() { return c_thread_desc_; }

    __device__ static auto GetWaveIdx()
    {

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v2.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp
--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp
--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v1.hpp
@@ -21,6 +21,7 @@
 namespace ck {

 template <typename InputDataType,
+          typename D0DataType,
          typename OutputDataType,
          typename ZDataType,
          typename GemmDataType,
@@ -36,6 +37,7 @@ template <typename InputDataType,
          typename QGridDesc_K0_M_K1,
          typename KGridDesc_K0_N_K1,
          typename KGridDesc_N_K,
+          typename D0GridDesc_M_N,
          typename ZGridDesc_M_N,
          typename VGridDesc_O0_N_O1,
          typename YGridDesc_M_O,
@@ -72,6 +74,7 @@ template <typename InputDataType,
          index_t BBlockTransferDstScalarPerVector_BK1,
          bool BThreadTransferSrcResetCoordinateAfterRun, // ignored
          index_t BBlockLdsExtraN,
+          index_t D0BlockTransferSrcScalarPerVector,
          index_t CShuffleMXdlPerWavePerShuffle,
          index_t CShuffleNXdlPerWavePerShuffle,
          typename CShuffleBlockTransferClusterLengths_MBlock_MPerBlock_NBlock_NPerBlock,
@@ -1219,43 +1222,159 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
        return math::max(p_slash_sgrad_bytes_end, softmax_bytes_end, c_block_bytes_end);
    }

+    // D0
+    static constexpr auto D0M2 = Number<4>{};
+    static constexpr auto D0M1 = Number<MPerXdl>{} / D0M2;
+    static constexpr auto D0M0 = Number<MPerBlock>{} / Number<MPerXdl>{};
+
+    __host__ __device__ static constexpr auto
+    MakeD0GridDescriptor_M0_N0_M1_M2_N1_M3(const D0GridDesc_M_N& d0_grid_desc_m_n)
+    {
+        const auto M = d0_grid_desc_m_n.GetLength(I0);
+        const auto N = d0_grid_desc_m_n.GetLength(I1);
+
+        const auto MBlock = M / MPerBlock;
+        const auto NBlock = N / NPerBlock;
+
+        const auto d0_grid_desc_m0_n0_m1_m2_n1_m3 = transform_tensor_descriptor(
+            d0_grid_desc_m_n,
+            make_tuple(make_unmerge_transform(make_tuple(MBlock, D0M0, D0M1, D0M2)),
+                       make_unmerge_transform(make_tuple(NBlock, Number<NPerBlock>{}))),
+            make_tuple(Sequence<0>{}, Sequence<1>{}),
+            make_tuple(Sequence<0, 2, 3, 5>{}, Sequence<1, 4>{}));
+
+        return d0_grid_desc_m0_n0_m1_m2_n1_m3;
+    }
+
+    using D0GridDescriptor_M0_N0_M1_M2_N1_M3 =
+        remove_cvref_t<decltype(MakeD0GridDescriptor_M0_N0_M1_M2_N1_M3(D0GridDesc_M_N{}))>;
+
+    struct D0Loader
+    {
+        template <typename DataType>
+        struct TypeTransform
+        {
+            using Type = DataType;
+        };
+        template <>
+        struct TypeTransform<void>
+        {
+            using Type = ck::half_t;
+        };
+        static constexpr index_t NThreadClusterLengths = MPerXdl;
+        static_assert(MPerXdl <= KPerBlock);
+        static_assert(D0BlockTransferSrcScalarPerVector * NThreadClusterLengths <= NPerBlock,
+                      "D0BlockTransferSrcScalarPerVector * NThreadClusterLengths <= NPerBlock");
+        __host__ __device__ static constexpr auto GetD0BlockWriteDescriptor_M0_N0_M1_M2_N1_M3()
+        {
+            // B1 matrix in LDS memory, dst of blockwise copy
+            return make_naive_tensor_descriptor_packed(
+                make_tuple(I1, I1, I1, D0M1, Number<NPerBlock>{}, D0M2));
+        }
+        __host__ __device__ static constexpr auto GetD0BlockReadDescriptor_N0_N1_M0_M1_M2()
+        {
+            constexpr auto d0_raw_m0_n_m1 =
+                make_naive_tensor_descriptor_packed(make_tuple(D0M1, Number<NPerBlock>{}, D0M2));
+
+            constexpr auto d0_n0_n1_m0_m1_m2 = transform_tensor_descriptor(
+                d0_raw_m0_n_m1,
+                make_tuple(make_unmerge_transform(make_tuple(D0M1 / I2, I2)),
+                           make_unmerge_transform(
+                               make_tuple(Number<NPerBlock / NPerXdl>{}, Number<NPerXdl>{})),
+                           make_pass_through_transform(D0M2)),
+                make_tuple(Sequence<0>{}, Sequence<1>{}, Sequence<2>{}),
+                make_tuple(Sequence<2, 3>{}, Sequence<0, 1>{}, Sequence<4>{}));
+            return d0_n0_n1_m0_m1_m2;
+        }
+        static constexpr auto d0_block_write_desc_m0_n0_m1_m2_n1_m3 =
+            GetD0BlockWriteDescriptor_M0_N0_M1_M2_N1_M3();
+        static constexpr auto d0_block_read_desc_n0_n1_m0_m1_m2 =
+            GetD0BlockReadDescriptor_N0_N1_M0_M1_M2();
+
+        static constexpr auto d0_thread_desc_ =
+            make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I4, I1, D0M2));
+
+        using D0BlockwiseCopy = ThreadGroupTensorSliceTransfer_v4r1<
+            ThisThreadBlock,
+            tensor_operation::element_wise::PassThrough,
+            tensor_operation::element_wise::PassThrough,
+            InMemoryDataOperationEnum::Set,
+            Sequence<I1, I1, I1, D0M1, NPerBlock, D0M2>, // BlockSliceLengths
+            Sequence<1,
+                     1,
+                     1,
+                     BlockSize / NThreadClusterLengths,
+                     NThreadClusterLengths,
+                     1>,                                     // ThreadClusterLengths
+            Sequence<0, 1, 2, 3, 5, 4>,                      // ThreadClusterArrangeOrder
+            typename TypeTransform<D0DataType>::Type,        // SrcData
+            typename TypeTransform<D0DataType>::Type,        // DstData
+            D0GridDescriptor_M0_N0_M1_M2_N1_M3,              // SrcDesc
+            decltype(d0_block_write_desc_m0_n0_m1_m2_n1_m3), // DstDesc
+            Sequence<0, 1, 2, 3, 5, 4>,                      // SrcDimAccessOrder
+            Sequence<0, 1, 2, 4, 3, 5>,                      // DstDimAccessOrder
+            4,                                               // SrcVectorDim
+            5,                                               // DstVectorDim
+            D0BlockTransferSrcScalarPerVector,               // SrcScalarPerVector
+            4,                                               // DstScalarPerVector
+            1,
+            1,
+            true,
+            true, // DstResetCoord
+            1>;
+
+        using D0ThreadCopy =
+            ThreadwiseTensorSliceTransfer_v4<typename TypeTransform<D0DataType>::Type,    // SrcData
+                                             typename TypeTransform<D0DataType>::Type,    // DstData
+                                             decltype(d0_block_read_desc_n0_n1_m0_m1_m2), // SrcDesc
+                                             decltype(d0_thread_desc_),                   // DstDesc
+                                             Sequence<1, 1, 4, 1, 4>, // SliceLengths
+                                             Sequence<0, 1, 2, 3, 4>, // DimAccessOrder
+                                             4,                       // SrcVectorDim
+                                             2,                       // SrcScalarPerVector
+                                             2>;
+    };
+
    template <bool HasMainKBlockLoop,
              bool IsDropout,
              typename Block2CTileMap,
              typename C0MatrixMask,
              typename YGradGridDesc_O0_M_O1>
-    __device__ static void Run(const InputDataType* __restrict__ p_q_grid,
-                               const InputDataType* __restrict__ p_k_grid,
-                               ZDataType* __restrict__ p_z_grid,
-                               const InputDataType* __restrict__ p_v_grid,
-                               const InputDataType* __restrict__ p_y_grid,
-                               const FloatLSE* __restrict__ p_lse_grid,
-                               const InputDataType* __restrict__ p_ygrad_grid,
-                               OutputDataType* __restrict__ p_qgrad_grid,
-                               OutputDataType* __restrict__ p_kgrad_grid,
-                               OutputDataType* __restrict__ p_vgrad_grid,
-                               void* __restrict__ p_shared,
-                               const AElementwiseOperation& a_element_op,
-                               const BElementwiseOperation& b_element_op,
-                               const SElementwiseOperation& s_element_op,
-                               const B1ElementwiseOperation& b1_element_op,
-                               const CElementwiseOperation& c_element_op,
-                               const QGridDesc_K0_M_K1& q_grid_desc_k0_m_k1,
-                               const KGridDesc_K0_N_K1& k_grid_desc_k0_n_k1,
-                               const ZGridDescriptor_M0_N0_M1_N1_M2_N2_M3_M4_M5_N3&
-                                   z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
-                               const VGridDesc_O0_N_O1& v_grid_desc_o0_n_o1,
-                               const YGridDescriptor_MBlock_MPerBlock_OBlock_OPerBlock&
-                                   y_grid_desc_mblock_mperblock_oblock_operblock,
-                               const LSEGridDesc_M& lse_grid_desc_m,
-                               const YGradGridDesc_O0_M_O1& ygrad_grid_desc_o0_m_o1,
-                               const Block2CTileMap& block_2_ctile_map,
-                               const C0MatrixMask& c0_matrix_mask,
-                               const float p_drop,
-                               ck::philox& ph,
-                               const index_t z_random_matrix_offset,
-                               const index_t raw_n_padded,
-                               const index_t block_idx_n)
+    __device__ static void
+    Run(const InputDataType* __restrict__ p_q_grid,
+        const InputDataType* __restrict__ p_k_grid,
+        const D0DataType* __restrict__ p_d0_grid,
+        ZDataType* __restrict__ p_z_grid,
+        const InputDataType* __restrict__ p_v_grid,
+        const InputDataType* __restrict__ p_y_grid,
+        const FloatLSE* __restrict__ p_lse_grid,
+        const InputDataType* __restrict__ p_ygrad_grid,
+        OutputDataType* __restrict__ p_qgrad_grid,
+        OutputDataType* __restrict__ p_kgrad_grid,
+        OutputDataType* __restrict__ p_vgrad_grid,
+        void* __restrict__ p_shared,
+        const AElementwiseOperation& a_element_op,
+        const BElementwiseOperation& b_element_op,
+        const SElementwiseOperation& s_element_op,
+        const B1ElementwiseOperation& b1_element_op,
+        const CElementwiseOperation& c_element_op,
+        const QGridDesc_K0_M_K1& q_grid_desc_k0_m_k1,
+        const KGridDesc_K0_N_K1& k_grid_desc_k0_n_k1,
+        const D0GridDescriptor_M0_N0_M1_M2_N1_M3& d0_grid_desc_m0_n0_m1_m2_n1_m3,
+        const ZGridDescriptor_M0_N0_M1_N1_M2_N2_M3_M4_M5_N3&
+            z_grid_desc_m0_n0_m1_n1_m2_n2_m3_m4_m5_n3,
+        const VGridDesc_O0_N_O1& v_grid_desc_o0_n_o1,
+        const YGridDescriptor_MBlock_MPerBlock_OBlock_OPerBlock&
+            y_grid_desc_mblock_mperblock_oblock_operblock,
+        const LSEGridDesc_M& lse_grid_desc_m,
+        const YGradGridDesc_O0_M_O1& ygrad_grid_desc_o0_m_o1,
+        const Block2CTileMap& block_2_ctile_map,
+        const C0MatrixMask& c0_matrix_mask,
+        const float p_drop,
+        ck::philox& ph,
+        const index_t z_random_matrix_offset,
+        const index_t raw_n_padded,
+        const index_t block_idx_n)
    {
        const FloatGemmAcc p_dropout  = type_convert<FloatGemmAcc>(1.0f - p_drop);
        const FloatGemmAcc rp_dropout = type_convert<FloatGemmAcc>(1.0f / p_dropout);
@@ -1792,6 +1911,18 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1

        // gemm0 M loop
        index_t gemm0_m_block_outer_index = num_gemm0_m_block_outer_loop - 1;
+        // D0
+        auto d0_block_copy_global_to_lds = typename D0Loader::D0BlockwiseCopy(
+            d0_grid_desc_m0_n0_m1_m2_n1_m3,
+            make_multi_index(gemm0_m_block_outer_index, block_work_idx_n, 0, 0, 0, 0),
+            tensor_operation::element_wise::PassThrough{},
+            D0Loader::d0_block_write_desc_m0_n0_m1_m2_n1_m3,
+            make_multi_index(0, 0, 0, 0, 0, 0),
+            tensor_operation::element_wise::PassThrough{});
+
+        auto d0_thread_copy_lds_to_vgpr = typename D0Loader::D0ThreadCopy(
+            make_tuple(wave_id[I1], wave_m_n_id[I1], 0, wave_m_n_id[I0], 0));
+
        do
        {
            auto m_block_data_idx_on_grid =
@@ -1939,6 +2070,57 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
            }

            block_sync_lds(); // wait for lds read in gemm0 blockwise gemm
+            // add bias
+            if constexpr(!is_same<D0DataType, void>::value)
+            {
+                static constexpr auto& c_thread_desc = s_blockwise_gemm.GetCThreadDesc();
+
+                const auto d0_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
+                    p_d0_grid, d0_grid_desc_m0_n0_m1_m2_n1_m3.GetElementSpaceSize());
+
+                auto d0_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
+                    static_cast<D0DataType*>(p_shared) + SharedMemTrait::k_block_space_offset,
+                    D0Loader::d0_block_write_desc_m0_n0_m1_m2_n1_m3.GetElementSpaceSize());
+
+                auto d0_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, D0DataType>(
+                    D0Loader::d0_thread_desc_.GetElementSpaceSize());
+
+                static_for<0, D0M0, 1>{}([&](auto mr) {
+                    // load data to lds
+                    d0_block_copy_global_to_lds.RunRead(d0_grid_desc_m0_n0_m1_m2_n1_m3,
+                                                        d0_grid_buf);
+
+                    d0_block_copy_global_to_lds.MoveSrcSliceWindow(
+                        d0_grid_desc_m0_n0_m1_m2_n1_m3, make_multi_index(0, 0, 1, 0, 0, 0));
+
+                    d0_block_copy_global_to_lds.RunWrite(
+                        D0Loader::d0_block_write_desc_m0_n0_m1_m2_n1_m3, d0_block_buf);
+                    block_sync_lds();
+                    // read data form lds
+                    d0_thread_copy_lds_to_vgpr.Run(D0Loader::d0_block_read_desc_n0_n1_m0_m1_m2,
+                                                   make_tuple(I0, I0, I0, I0, I0),
+                                                   d0_block_buf,
+                                                   D0Loader::d0_thread_desc_,
+                                                   make_tuple(I0, I0, I0, I0, I0),
+                                                   d0_thread_buf);
+
+                    // bias add
+                    static_for<0, d0_thread_buf.Size(), 1>{}([&](auto i) {
+                        constexpr index_t c_offset =
+                            c_thread_desc.CalculateOffset(make_tuple(mr, I0, i));
+
+                        s_slash_p_thread_buf(Number<c_offset>{}) +=
+                            ck::type_convert<FloatGemmAcc>(d0_thread_buf[i]);
+                    });
+                });
+
+                // load k
+                gemm_tile_k_blockwise_copy.RunWrite(GemmBlockwiseCopy::k_block_desc_k0_n_k1,
+                                                    k_block_buf);
+
+                d0_block_copy_global_to_lds.MoveSrcSliceWindow(
+                    d0_grid_desc_m0_n0_m1_m2_n1_m3, make_multi_index(-1, 0, -D0M0.value, 0, 0, 0));
+            }

            // P_i: = softmax(scalar * S_i:)
            // scaling is already performed in the preceding statements with s_element_op

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
--- a/include/ck/utility/static_buffer.hpp
+++ b/include/ck/utility/static_buffer.hpp
@@ -111,6 +111,11 @@ struct StaticBufferTupleOfVector

        return base::operator()(i_v).template AsType<S>()(i_s);
    }
+    template <index_t I>
+    __host__ __device__ constexpr S& operator()(Number<I> i_v, Number<I> i_s)
+    {
+        return base::operator()(i_v).template AsType<S>()(i_s);
+    }

    // Get X
    // i is offset of S, not X. i should be aligned to X