Commit b15eecba authored by letaoqin


Merge branch 'mha-train-develop' of https://github.com/ROCmSoftwarePlatform/composable_kernel into mha-train-develop
parents 87d1e073 04c206da
@@ -87,6 +87,10 @@ template <typename InputDataType,
PipelineVersion PipelineVer = PipelineVersion::v1>
struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
{
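// Tile-size requirements: KPerBlock must equal Gemm1NPerBlock, and the Gemm1/Gemm2 K tiles must evenly divide MPerBlock/NPerBlock.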
static_assert(KPerBlock == Gemm1NPerBlock);
static_assert(MPerBlock % Gemm1KPerBlock == 0);
static_assert(NPerBlock % Gemm2KPerBlock == 0);
static_assert(LoopSched == LoopScheduler::Default,
"Non-default loop scheduler is currently not supported");
@@ -1281,7 +1285,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
using D0GridDescriptor_M0_N0_M1_M2_N1_M3 =
remove_cvref_t<decltype(MakeD0GridDescriptor_M0_N0_M1_M2_N1_M3(D0GridDesc_M_N{}))>;
struct D0Operator
{
template <typename DataType>
struct TypeTransform
@@ -1297,17 +1301,16 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
static constexpr index_t Size0 = 0;
static constexpr index_t Size = sizeof(ck::half_t);
};
static constexpr index_t NThreadClusterLengths = 32;
static_assert(NPerXdl == 32);
static_assert(D0BlockTransferSrcScalarPerVector * NThreadClusterLengths <= NPerBlock,
"D0BlockTransferSrcScalarPerVector * NThreadClusterLengths <= NPerBlock");
__host__ __device__ static constexpr auto GetD0BlockGlobalDescriptor_M0_N0_M1_M2_N1_M3()
{
return make_naive_tensor_descriptor_packed(
make_tuple(I1, I1, I1, D0M1, Number<NPerBlock>{}, D0M2));
}
__host__ __device__ static constexpr auto GetD0BlockVgprDescriptor_N0_N1_M0_M1_M2()
{
constexpr auto d0_raw_m0_n_m1 =
make_naive_tensor_descriptor_packed(make_tuple(D0M1, Number<NPerBlock>{}, D0M2));
@@ -1322,15 +1325,20 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
make_tuple(Sequence<2, 3>{}, Sequence<0, 1>{}, Sequence<4>{}));
return d0_n0_n1_m0_m1_m2;
}
static constexpr auto d0_block_dst_desc_m0_n0_m1_m2_n1_m3 =
GetD0BlockGlobalDescriptor_M0_N0_M1_M2_N1_M3();
static constexpr auto d0_block_src_desc_n0_n1_m0_m1_m2 =
GetD0BlockVgprDescriptor_N0_N1_M0_M1_M2();
static constexpr auto d0_thread_desc_ =
make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I4, I1, D0M2));
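// The dD0 (bias-grad) path reuses the D0 LDS tile layouts: the (n0, n1, m0, m1, m2) view is the destination of the per-thread VGPR-to-LDS write, and the (m0, n0, m1, m2, n1, m3) view is the source of the LDS-to-global copy.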
static constexpr auto& d0grad_block_dst_desc_n0_n1_m0_m1_m2 =
d0_block_src_desc_n0_n1_m0_m1_m2;
static constexpr auto& d0grad_block_src_desc_m0_n0_m1_m2_n1_m3 =
d0_block_dst_desc_m0_n0_m1_m2_n1_m3;
using D0BlockwiseCopyGlobalToLds = ThreadGroupTensorSliceTransfer_v4r1<
ThisThreadBlock,
tensor_operation::element_wise::PassThrough,
tensor_operation::element_wise::PassThrough,
@@ -1341,34 +1349,77 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
1,
BlockSize / NThreadClusterLengths,
NThreadClusterLengths,
1>, // ThreadClusterLengths
Sequence<0, 1, 2, 3, 5, 4>, // ThreadClusterArrangeOrder
typename TypeTransform<D0DataType>::Type, // SrcData
typename TypeTransform<D0DataType>::Type, // DstData
D0GridDescriptor_M0_N0_M1_M2_N1_M3, // SrcDesc
decltype(d0_block_dst_desc_m0_n0_m1_m2_n1_m3), // DstDesc
Sequence<0, 1, 2, 3, 5, 4>, // SrcDimAccessOrder
Sequence<0, 1, 2, 4, 3, 5>, // DstDimAccessOrder
4, // SrcVectorDim
5, // DstVectorDim
4, // SrcScalarPerVector
4, // DstScalarPerVector
1,
1,
true,
true, // DstResetCoord
1>;
using D0ThreadwiseCopyLdsToVgpr =
ThreadwiseTensorSliceTransfer_v4<typename TypeTransform<D0DataType>::Type, // SrcData
typename TypeTransform<D0DataType>::Type, // DstData
decltype(d0_block_src_desc_n0_n1_m0_m1_m2), // SrcDesc
decltype(d0_thread_desc_), // DstDesc
Sequence<1, 1, 4, 1, 4>, // SliceLengths
Sequence<0, 1, 2, 3, 4>, // DimAccessOrder
4, // SrcVectorDim
4, // SrcScalarPerVector
2>;
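// Per-thread copy of the dD0 accumulators from VGPR into the LDS tile; the Scale element-wise op lets the caller fold a rescale factor (rp_dropout at the call site) into the write.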
using D0GradThreadwiseCopyVgprToLds = ThreadwiseTensorSliceTransfer_v1r3<
FloatGemmAcc,
typename TypeTransform<D0DataType>::Type,
decltype(d0_thread_desc_),
decltype(d0grad_block_dst_desc_n0_n1_m0_m1_m2),
tensor_operation::element_wise::Scale, // CElementwiseOperation
Sequence<1, 1, 4, 1, 4>, // SliceLengths
Sequence<0, 1, 2, 3, 4>, // AccessOrder
4, // VectorDim
4, // ScalarPerVector
InMemoryDataOperationEnum::Set, // GlobalMemoryDataOperation
1, // DstScalarStrideInVector
true>;
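// Block-wide copy that streams the assembled dD0 tile from LDS back to the global bias-grad buffer, mirroring D0BlockwiseCopyGlobalToLds with the source and destination roles swapped.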
using D0GradBlockwiseCopyLdsToGlobal = ThreadGroupTensorSliceTransfer_v4r1<
ThisThreadBlock,
tensor_operation::element_wise::PassThrough,
tensor_operation::element_wise::PassThrough,
InMemoryDataOperationEnum::Set,
Sequence<I1, I1, I1, D0M1, NPerBlock, D0M2>, // BlockSliceLengths
Sequence<1,
1,
1,
BlockSize / NThreadClusterLengths,
NThreadClusterLengths,
1>, // ThreadClusterLengths
Sequence<0, 1, 2, 3, 5, 4>, // ThreadClusterArrangeOrder
typename TypeTransform<D0DataType>::Type, // SrcData
typename TypeTransform<D0DataType>::Type, // DstData
decltype(d0grad_block_src_desc_m0_n0_m1_m2_n1_m3), // SrcDesc
D0GridDescriptor_M0_N0_M1_M2_N1_M3, // DstDesc
Sequence<0, 1, 2, 4, 3, 5>, // SrcDimAccessOrder
Sequence<0, 1, 2, 3, 5, 4>, // DstDimAccessOrder
5, // SrcVectorDim
4, // DstVectorDim
4, // SrcScalarPerVector
D0BlockTransferSrcScalarPerVector, // DstScalarPerVector
1,
1,
true,
true, // DstResetCoord
1>;
};
struct SharedMemTrait
@@ -1412,11 +1463,11 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
sizeof(GemmDataType) / sizeof(FloatGemmAcc);
static constexpr auto d0_block_space_size_aligned = math::integer_least_multiple(
D0Operator::d0_block_dst_desc_m0_n0_m1_m2_n1_m3.GetElementSpaceSize(), max_lds_align);
static constexpr auto d0_block_space_offset =
(k_block_space_size_aligned.value + ygrad_block_space_size_aligned.value +
q_block_space_size_aligned.value) *
sizeof(GemmDataType) / D0Operator::template TypeTransform<D0DataType>::Size;
// LDS allocation for C shuffle in LDS
static constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
@@ -1436,7 +1487,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
sizeof(FloatGemmAcc);
const index_t d0_bytes_end =
(SharedMemTrait::d0_block_space_offset + SharedMemTrait::d0_block_space_size_aligned) *
D0Operator::template TypeTransform<D0DataType>::Size0;
const index_t c_block_bytes_end =
SharedMemTrait::c_block_space_size * sizeof(FloatCShuffle);
@@ -1460,6 +1511,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
const InputDataType* __restrict__ p_ygrad_grid,
OutputDataType* __restrict__ p_qgrad_grid,
OutputDataType* __restrict__ p_kgrad_grid,
D0DataType* __restrict__ p_d0grad_grid,
OutputDataType* __restrict__ p_vgrad_grid,
void* __restrict__ p_shared,
const AElementwiseOperation& a_element_op,
@@ -2006,18 +2058,33 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
// gemm0 M loop
index_t gemm0_m_block_outer_index = num_gemm0_m_block_outer_loop - 1;
// D0
auto d0_block_copy_global_to_lds = typename D0Operator::D0BlockwiseCopyGlobalToLds(
d0_grid_desc_m0_n0_m1_m2_n1_m3,
make_multi_index(gemm0_m_block_outer_index, block_work_idx_n, 0, 0, 0, 0),
tensor_operation::element_wise::PassThrough{},
D0Operator::d0_block_dst_desc_m0_n0_m1_m2_n1_m3,
make_multi_index(0, 0, 0, 0, 0, 0),
tensor_operation::element_wise::PassThrough{});
auto d0_thread_copy_lds_to_vgpr = typename D0Operator::D0ThreadwiseCopyLdsToVgpr(
make_tuple(wave_id[I1], wave_m_n_id[I1], 0, wave_m_n_id[I0], 0));
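// dD0 output path: the grad tensor shares the D0 grid descriptor; sgrad is written into LDS with an rp_dropout rescale, then flushed to global memory.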
auto& d0grad_grid_desc_m0_n0_m1_m2_n1_m3 = d0_grid_desc_m0_n0_m1_m2_n1_m3;
auto d0grad_thread_copy_vgpr_to_lds = typename D0Operator::D0GradThreadwiseCopyVgprToLds(
D0Operator::d0grad_block_dst_desc_n0_n1_m0_m1_m2,
make_tuple(wave_id[I1], wave_m_n_id[I1], 0, wave_m_n_id[I0], 0),
tensor_operation::element_wise::Scale{rp_dropout});
auto d0grad_block_copy_lds_to_global = typename D0Operator::D0GradBlockwiseCopyLdsToGlobal(
D0Operator::d0grad_block_src_desc_m0_n0_m1_m2_n1_m3,
make_multi_index(0, 0, 0, 0, 0, 0),
tensor_operation::element_wise::PassThrough{},
d0grad_grid_desc_m0_n0_m1_m2_n1_m3,
make_multi_index(gemm0_m_block_outer_index, block_work_idx_n, 0, 0, 0, 0),
tensor_operation::element_wise::PassThrough{});
if constexpr(Deterministic)
{
block_sync_lds();
@@ -2192,49 +2259,53 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
// add bias
if constexpr(!is_same<D0DataType, void>::value)
{
if(p_d0_grid != nullptr)
{
static constexpr auto& c_thread_desc = s_blockwise_gemm.GetCThreadDesc();

const auto d0_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_d0_grid, d0_grid_desc_m0_n0_m1_m2_n1_m3.GetElementSpaceSize());

auto d0_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
static_cast<D0DataType*>(p_shared) + SharedMemTrait::d0_block_space_offset,
D0Operator::d0_block_dst_desc_m0_n0_m1_m2_n1_m3.GetElementSpaceSize());

auto d0_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, D0DataType>(
D0Operator::d0_thread_desc_.GetElementSpaceSize());

static_for<0, D0M0, 1>{}([&](auto mr) {
// load data to lds
d0_block_copy_global_to_lds.RunRead(d0_grid_desc_m0_n0_m1_m2_n1_m3,
d0_grid_buf);

d0_block_copy_global_to_lds.MoveSrcSliceWindow(
d0_grid_desc_m0_n0_m1_m2_n1_m3, make_multi_index(0, 0, 1, 0, 0, 0));

d0_block_copy_global_to_lds.RunWrite(
D0Operator::d0_block_dst_desc_m0_n0_m1_m2_n1_m3, d0_block_buf);
block_sync_lds();
// read data from lds
d0_thread_copy_lds_to_vgpr.Run(D0Operator::d0_block_src_desc_n0_n1_m0_m1_m2,
make_tuple(I0, I0, I0, I0, I0),
d0_block_buf,
D0Operator::d0_thread_desc_,
make_tuple(I0, I0, I0, I0, I0),
d0_thread_buf);

// bias add
static_for<0, d0_thread_buf.Size(), 1>{}([&](auto i) {
constexpr index_t c_offset =
c_thread_desc.CalculateOffset(make_tuple(mr, I0, i));

s_slash_p_thread_buf(Number<c_offset>{}) +=
ck::type_convert<FloatGemmAcc>(d0_thread_buf[i]);
});
});

d0_block_copy_global_to_lds.MoveSrcSliceWindow(
d0_grid_desc_m0_n0_m1_m2_n1_m3,
make_multi_index(-1, 0, -D0M0.value, 0, 0, 0));
}
}
// P_i: = softmax(scalar * S_i:)
@@ -2325,6 +2396,45 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
: y_dot_ygrad_thread_buf[Number<m>{}]);
});
// output bias grad
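// For each M sub-tile: stage dS (sgrad) from VGPR into the LDS dD0 tile, write the tile to p_d0grad_grid, advance the destination window, and finally rewind it for the next outer iteration.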
if constexpr(!is_same<D0DataType, void>::value)
{
if(p_d0grad_grid != nullptr)
{
auto d0grad_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_d0grad_grid, d0grad_grid_desc_m0_n0_m1_m2_n1_m3.GetElementSpaceSize());
auto d0grad_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
static_cast<D0DataType*>(p_shared) + SharedMemTrait::d0_block_space_offset,
D0Operator::d0grad_block_src_desc_m0_n0_m1_m2_n1_m3.GetElementSpaceSize());
static_for<0, D0M0, 1>{}([&](auto mr) {
d0grad_thread_copy_vgpr_to_lds.Run(
D0Operator::d0_thread_desc_,
make_tuple(mr, I0, I0, I0, I0),
sgrad_thread_buf,
D0Operator::d0grad_block_dst_desc_n0_n1_m0_m1_m2,
d0grad_block_buf);
block_sync_lds();
// write data from lds to global
d0grad_block_copy_lds_to_global.Run(
D0Operator::d0grad_block_src_desc_m0_n0_m1_m2_n1_m3,
d0grad_block_buf,
d0grad_grid_desc_m0_n0_m1_m2_n1_m3,
d0grad_grid_buf,
I0);
d0grad_block_copy_lds_to_global.MoveDstSliceWindow(
d0grad_grid_desc_m0_n0_m1_m2_n1_m3, make_multi_index(0, 0, 1, 0, 0, 0));
});
d0grad_block_copy_lds_to_global.MoveDstSliceWindow(
d0grad_grid_desc_m0_n0_m1_m2_n1_m3,
make_multi_index(-1, 0, -D0M0.value, 0, 0, 0));
}
}
// gemm dV
// dV = P_drop^T * dY
{
...
@@ -95,6 +95,10 @@ template <typename InputDataType,
PipelineVersion PipelineVer = PipelineVersion::v1>
struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
{
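// Tile-size requirements; unlike the V1 kernel above, Gemm1NPerBlock only needs to be a multiple of KPerBlock rather than equal to it.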
static_assert(Gemm1NPerBlock % KPerBlock == 0);
static_assert(MPerBlock % Gemm1KPerBlock == 0);
static_assert(NPerBlock % Gemm2KPerBlock == 0);
static_assert(LoopSched == LoopScheduler::Default,
"Non-default loop scheduler is currently not supported");
@@ -1337,7 +1341,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
using D0GridDescriptor_M0_N0_M1_M2_N1_M3 =
remove_cvref_t<decltype(MakeD0GridDescriptor_M0_N0_M1_M2_N1_M3(D0GridDesc_M_N{}))>;
struct D0Operator
{
template <typename DataType>
struct TypeTransform
@@ -1357,13 +1361,12 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
static_assert(NPerXdl == 32);
static_assert(D0BlockTransferSrcScalarPerVector * NThreadClusterLengths <= NPerBlock,
"D0BlockTransferSrcScalarPerVector * NThreadClusterLengths <= NPerBlock");
__host__ __device__ static constexpr auto GetD0BlockGlobalDescriptor_M0_N0_M1_M2_N1_M3()
{
return make_naive_tensor_descriptor_packed(
make_tuple(I1, I1, I1, D0M1, Number<NPerBlock>{}, D0M2));
}
__host__ __device__ static constexpr auto GetD0BlockVgprDescriptor_N0_N1_M0_M1_M2()
{
constexpr auto d0_raw_m0_n_m1 =
make_naive_tensor_descriptor_packed(make_tuple(D0M1, Number<NPerBlock>{}, D0M2));
@@ -1378,15 +1381,20 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
make_tuple(Sequence<2, 3>{}, Sequence<0, 1>{}, Sequence<4>{}));
return d0_n0_n1_m0_m1_m2;
}
static constexpr auto d0_block_dst_desc_m0_n0_m1_m2_n1_m3 =
GetD0BlockGlobalDescriptor_M0_N0_M1_M2_N1_M3();
static constexpr auto d0_block_src_desc_n0_n1_m0_m1_m2 =
GetD0BlockVgprDescriptor_N0_N1_M0_M1_M2();
static constexpr auto d0_thread_desc_ =
make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I4, I1, D0M2));
static constexpr auto& d0grad_block_dst_desc_n0_n1_m0_m1_m2 =
d0_block_src_desc_n0_n1_m0_m1_m2;
static constexpr auto& d0grad_block_src_desc_m0_n0_m1_m2_n1_m3 =
d0_block_dst_desc_m0_n0_m1_m2_n1_m3;
using D0BlockwiseCopyGlobalToLds = ThreadGroupTensorSliceTransfer_v4r1<
ThisThreadBlock,
tensor_operation::element_wise::PassThrough,
tensor_operation::element_wise::PassThrough,
@@ -1397,34 +1405,77 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
1,
BlockSize / NThreadClusterLengths,
NThreadClusterLengths,
1>, // ThreadClusterLengths
Sequence<0, 1, 2, 3, 5, 4>, // ThreadClusterArrangeOrder
typename TypeTransform<D0DataType>::Type, // SrcData
typename TypeTransform<D0DataType>::Type, // DstData
D0GridDescriptor_M0_N0_M1_M2_N1_M3, // SrcDesc
decltype(d0_block_dst_desc_m0_n0_m1_m2_n1_m3), // DstDesc
Sequence<0, 1, 2, 3, 5, 4>, // SrcDimAccessOrder
Sequence<0, 1, 2, 4, 3, 5>, // DstDimAccessOrder
4, // SrcVectorDim
5, // DstVectorDim
4, // SrcScalarPerVector
4, // DstScalarPerVector
1,
1,
true,
true, // DstResetCoord
1>;
using D0ThreadwiseCopyLdsToVgpr =
ThreadwiseTensorSliceTransfer_v4<typename TypeTransform<D0DataType>::Type, // SrcData
typename TypeTransform<D0DataType>::Type, // DstData
decltype(d0_block_src_desc_n0_n1_m0_m1_m2), // SrcDesc
decltype(d0_thread_desc_), // DstDesc
Sequence<1, 1, 4, 1, 4>, // SliceLengths
Sequence<0, 1, 2, 3, 4>, // DimAccessOrder
4, // SrcVectorDim
4, // SrcScalarPerVector
2>;
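// dD0 copy types below follow the same VGPR -> LDS -> global scheme as in the V1 kernel.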
using D0GradThreadwiseCopyVgprToLds = ThreadwiseTensorSliceTransfer_v1r3<
FloatGemmAcc,
typename TypeTransform<D0DataType>::Type,
decltype(d0_thread_desc_),
decltype(d0grad_block_dst_desc_n0_n1_m0_m1_m2),
tensor_operation::element_wise::Scale, // CElementwiseOperation
Sequence<1, 1, 4, 1, 4>, // SliceLengths
Sequence<0, 1, 2, 3, 4>, // AccessOrder
4, // VectorDim
4, // ScalarPerVector
InMemoryDataOperationEnum::Set, // GlobalMemoryDataOperation
1, // DstScalarStrideInVector
true>;
using D0GradBlockwiseCopyLdsToGlobal = ThreadGroupTensorSliceTransfer_v4r1<
ThisThreadBlock,
tensor_operation::element_wise::PassThrough,
tensor_operation::element_wise::PassThrough,
InMemoryDataOperationEnum::Set,
Sequence<I1, I1, I1, D0M1, NPerBlock, D0M2>, // BlockSliceLengths
Sequence<1,
1,
1,
BlockSize / NThreadClusterLengths,
NThreadClusterLengths,
1>, // ThreadClusterLengths
Sequence<0, 1, 2, 3, 5, 4>, // ThreadClusterArrangeOrder
typename TypeTransform<D0DataType>::Type, // SrcData
typename TypeTransform<D0DataType>::Type, // DstData
decltype(d0grad_block_src_desc_m0_n0_m1_m2_n1_m3), // SrcDesc
D0GridDescriptor_M0_N0_M1_M2_N1_M3, // DstDesc
Sequence<0, 1, 2, 4, 3, 5>, // SrcDimAccessOrder
Sequence<0, 1, 2, 3, 5, 4>, // DstDimAccessOrder
5, // SrcVectorDim
4, // DstVectorDim
4, // SrcScalarPerVector
D0BlockTransferSrcScalarPerVector, // DstScalarPerVector
1,
1,
true,
true, // DstResetCoord
1>;
};
struct SharedMemTrait
@@ -1466,10 +1517,10 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
sizeof(GemmDataType) / sizeof(FloatGemmAcc);
static constexpr auto d0_block_space_size_aligned = math::integer_least_multiple(
D0Operator::d0_block_dst_desc_m0_n0_m1_m2_n1_m3.GetElementSpaceSize(), max_lds_align);
static constexpr auto d0_block_space_offset =
k_block_space_size_aligned.value * sizeof(GemmDataType) /
D0Operator::template TypeTransform<D0DataType>::Size;
// LDS allocation for C shuffle in LDS
static constexpr auto c_shuffle_block_desc_mblock_mperblock_nblock_nperblock =
@@ -1497,7 +1548,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
sizeof(FloatGemmAcc);
const index_t d0_bytes_end =
(SharedMemTrait::d0_block_space_offset + SharedMemTrait::d0_block_space_size_aligned) *
D0Operator::template TypeTransform<D0DataType>::Size0;
const index_t c_block_bytes_end =
SharedMemTrait::c_block_space_size * sizeof(FloatCShuffle);
@@ -1526,6 +1577,7 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
const InputDataType* __restrict__ p_ygrad_grid,
OutputDataType* __restrict__ p_qgrad_grid,
OutputDataType* __restrict__ p_kgrad_grid,
D0DataType* __restrict__ p_d0grad_grid,
OutputDataType* __restrict__ p_vgrad_grid,
void* __restrict__ p_shared,
const AElementwiseOperation& a_element_op,
@@ -2080,17 +2132,31 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
index_t gemm0_m_block_outer_index = num_gemm0_m_block_outer_loop - 1;
// D0
auto d0_block_copy_global_to_lds = typename D0Operator::D0BlockwiseCopyGlobalToLds(
d0_grid_desc_m0_n0_m1_m2_n1_m3,
make_multi_index(gemm0_m_block_outer_index, block_work_idx_n, 0, 0, 0, 0),
tensor_operation::element_wise::PassThrough{},
D0Operator::d0_block_dst_desc_m0_n0_m1_m2_n1_m3,
make_multi_index(0, 0, 0, 0, 0, 0),
tensor_operation::element_wise::PassThrough{});
auto d0_thread_copy_lds_to_vgpr = typename D0Operator::D0ThreadwiseCopyLdsToVgpr(
make_tuple(wave_id[I1], wave_m_n_id[I1], 0, wave_m_n_id[I0], 0));
auto& d0grad_grid_desc_m0_n0_m1_m2_n1_m3 = d0_grid_desc_m0_n0_m1_m2_n1_m3;
auto d0grad_thread_copy_vgpr_to_lds = typename D0Operator::D0GradThreadwiseCopyVgprToLds(
D0Operator::d0grad_block_dst_desc_n0_n1_m0_m1_m2,
make_tuple(wave_id[I1], wave_m_n_id[I1], 0, wave_m_n_id[I0], 0),
tensor_operation::element_wise::Scale{rp_dropout});
auto d0grad_block_copy_lds_to_global = typename D0Operator::D0GradBlockwiseCopyLdsToGlobal(
D0Operator::d0grad_block_src_desc_m0_n0_m1_m2_n1_m3,
make_multi_index(0, 0, 0, 0, 0, 0),
tensor_operation::element_wise::PassThrough{},
d0grad_grid_desc_m0_n0_m1_m2_n1_m3,
make_multi_index(gemm0_m_block_outer_index, block_work_idx_n, 0, 0, 0, 0),
tensor_operation::element_wise::PassThrough{});
if constexpr(Deterministic)
{
block_sync_lds();
@@ -2295,50 +2361,53 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
// add bias
if constexpr(!is_same<D0DataType, void>::value)
{
if(p_d0_grid != nullptr)
{
static constexpr auto& c_thread_desc = s_blockwise_gemm.GetCThreadDesc();

const auto d0_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_d0_grid, d0_grid_desc_m0_n0_m1_m2_n1_m3.GetElementSpaceSize());

auto d0_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
static_cast<D0DataType*>(p_shared) + SharedMemTrait::d0_block_space_offset,
D0Operator::d0_block_dst_desc_m0_n0_m1_m2_n1_m3.GetElementSpaceSize());

auto d0_thread_buf = make_static_buffer<AddressSpaceEnum::Vgpr, D0DataType>(
D0Operator::d0_thread_desc_.GetElementSpaceSize());
ignore = d0_thread_buf;

static_for<0, D0M0, 1>{}([&](auto mr) {
// load data to lds
d0_block_copy_global_to_lds.RunRead(d0_grid_desc_m0_n0_m1_m2_n1_m3,
d0_grid_buf);

d0_block_copy_global_to_lds.MoveSrcSliceWindow(
d0_grid_desc_m0_n0_m1_m2_n1_m3, make_multi_index(0, 0, 1, 0, 0, 0));

d0_block_copy_global_to_lds.RunWrite(
D0Operator::d0_block_dst_desc_m0_n0_m1_m2_n1_m3, d0_block_buf);
block_sync_lds();
// read data from lds
d0_thread_copy_lds_to_vgpr.Run(D0Operator::d0_block_src_desc_n0_n1_m0_m1_m2,
make_tuple(I0, I0, I0, I0, I0),
d0_block_buf,
D0Operator::d0_thread_desc_,
make_tuple(I0, I0, I0, I0, I0),
d0_thread_buf);

// bias add
static_for<0, d0_thread_buf.Size(), 1>{}([&](auto i) {
constexpr index_t c_offset =
c_thread_desc.CalculateOffset(make_tuple(mr, I0, i));

s_slash_p_thread_buf(Number<c_offset>{}) +=
ck::type_convert<FloatGemmAcc>(d0_thread_buf[i]);
});
});

d0_block_copy_global_to_lds.MoveSrcSliceWindow(
d0_grid_desc_m0_n0_m1_m2_n1_m3,
make_multi_index(-1, 0, -D0M0.value, 0, 0, 0));
}
}
// P_i: = softmax(scalar * S_i:)
@@ -2545,6 +2614,46 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
: y_dot_ygrad_thread_buf[Number<m>{}]);
});
// output bias grad
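// Same dD0 write-out sequence as in the V1 kernel: stage sgrad into LDS per M sub-tile, flush to p_d0grad_grid, then rewind the destination window.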
if constexpr(!is_same<D0DataType, void>::value)
{
if(p_d0grad_grid != nullptr)
{
auto d0grad_grid_buf = make_dynamic_buffer<AddressSpaceEnum::Global>(
p_d0grad_grid, d0grad_grid_desc_m0_n0_m1_m2_n1_m3.GetElementSpaceSize());
auto d0grad_block_buf = make_dynamic_buffer<AddressSpaceEnum::Lds>(
static_cast<D0DataType*>(p_shared) + SharedMemTrait::d0_block_space_offset,
D0Operator::d0grad_block_src_desc_m0_n0_m1_m2_n1_m3.GetElementSpaceSize());
static_for<0, D0M0, 1>{}([&](auto mr) {
d0grad_thread_copy_vgpr_to_lds.Run(
D0Operator::d0_thread_desc_,
make_tuple(mr, I0, I0, I0, I0),
sgrad_thread_buf,
D0Operator::d0grad_block_dst_desc_n0_n1_m0_m1_m2,
d0grad_block_buf);
block_sync_lds();
// write data from lds to global
d0grad_block_copy_lds_to_global.Run(
D0Operator::d0grad_block_src_desc_m0_n0_m1_m2_n1_m3,
d0grad_block_buf,
d0grad_grid_desc_m0_n0_m1_m2_n1_m3,
d0grad_grid_buf,
I0);
d0grad_block_copy_lds_to_global.MoveDstSliceWindow(
d0grad_grid_desc_m0_n0_m1_m2_n1_m3, make_multi_index(0, 0, 1, 0, 0, 0));
});
d0grad_block_copy_lds_to_global.MoveDstSliceWindow(
d0grad_grid_desc_m0_n0_m1_m2_n1_m3,
make_multi_index(-1, 0, -D0M0.value, 0, 0, 0));
}
}
SubThreadBlock<BlockSize> gemm2_a_copy_subgroup(s_blockwise_gemm.GetWaveIdx()[I0],
s_blockwise_gemm.GetWaveIdx()[I1]);
...
@@ -282,6 +282,29 @@ struct TransformBatchedContractionContractionToBatchedGemmGemm
return matrix_padder.PadCDescriptor_M_N(
MakeCGridDescriptorPair(c_gs_ms_os_lengths_vec, c_gs_ms_os_strides_vec).second);
}
//
// C0
//
static auto MakeC0GridDescriptorPair(const std::vector<index_t>& c_gs_ms_ns_lengths_vec,
const std::vector<index_t>& c_gs_ms_ns_strides_vec)
{
return MakeGridDescriptorPair<NumDimG, NumDimM, NumDimN, CSpec>(c_gs_ms_ns_lengths_vec,
c_gs_ms_ns_strides_vec);
}
// TODO: rename to G_MRaw_NRaw
static auto MakeC0GridDescriptor_G_M_N(const std::vector<index_t>& c_gs_ms_ns_lengths_vec,
const std::vector<index_t>& c_gs_ms_ns_strides_vec)
{
return MakeC0GridDescriptorPair(c_gs_ms_ns_lengths_vec, c_gs_ms_ns_strides_vec).first;
}
static auto MakeC0GridDescriptor_M_N(const std::vector<index_t>& c_gs_ms_ns_lengths_vec,
const std::vector<index_t>& c_gs_ms_ns_strides_vec)
{
return matrix_padder.PadC0Descriptor_M_N(
MakeC0GridDescriptorPair(c_gs_ms_ns_lengths_vec, c_gs_ms_ns_strides_vec).second);
}
};
} // namespace tensor_operation
...