Commit 042761af authored by letaoqin

move bias before mask

parent 958f028f
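The diff below moves the padding / upper-triangular masking of the attention scores so that it runs after the scale and the optional D0 bias, immediately before the softmax; the example call sites are updated to the new bias-pointer arguments, and the kernel gains a currently unused D0ThreadCopyVgprToBlock transfer type. A minimal sketch of the reordering, assuming a simplified per-element model (the names below are illustrative, not CK identifiers):

// Standalone sketch of the reordering (illustrative names, not CK code).
#include <limits>

constexpr float neg_inf = -std::numeric_limits<float>::infinity();

// Order before this commit: mask fused with the scale, bias applied afterwards.
float score_old(float s, float bias, float scale, bool masked)
{
    s = masked ? neg_inf : scale * s; // scale + mask fused (s_element_op)
    return s + bias;                  // D0 bias lands on already-masked scores
}

// Order after this commit: scale, then bias, then mask, then softmax.
float score_new(float s, float bias, float scale, bool masked)
{
    s = scale * s + bias;        // scale and optional D0 bias first
    return masked ? neg_inf : s; // mask last: masked lanes are exactly -inf
}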
@@ -513,8 +513,10 @@ int run(int argc, char* argv[])
         static_cast<OutputDataType*>(qgrad_device_buf.GetDeviceBuffer()),
         static_cast<OutputDataType*>(kgrad_device_buf.GetDeviceBuffer()),
         static_cast<OutputDataType*>(vgrad_device_buf.GetDeviceBuffer()),
-        {}, // std::array<void*, 1> p_acc0_biases;
-        {}, // std::array<void*, 1> p_acc1_biases;
+        nullptr, // p_acc0_bias;
+        nullptr, // p_acc1_bias;
+        nullptr,
+        nullptr,
         q_gs_ms_ks_lengths,
         q_gs_ms_ks_strides,
         k_gs_ns_ks_lengths,
@@ -558,8 +560,10 @@ int run(int argc, char* argv[])
         static_cast<OutputDataType*>(qgrad_device_buf.GetDeviceBuffer()),
         static_cast<OutputDataType*>(kgrad_device_buf.GetDeviceBuffer()),
         static_cast<OutputDataType*>(vgrad_device_buf.GetDeviceBuffer()),
-        {}, // std::array<void*, 1> p_acc0_biases;
-        {}, // std::array<void*, 1> p_acc1_biases;
+        nullptr, // p_acc0_bias;
+        nullptr, // p_acc1_bias;
+        nullptr,
+        nullptr,
         q_gs_ms_ks_lengths,
         q_gs_ms_ks_strides,
         k_gs_ns_ks_lengths,
@@ -1434,7 +1434,17 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
                                              Sequence<1, 1, 4, 1, 4>, // SliceLengths
                                              Sequence<0, 1, 2, 3, 4>, // DimAccessOrder
                                              4,                       // SrcVectorDim
-                                             2,                       // SrcScalarPerVector
+                                             4,                       // SrcScalarPerVector
                                              2>;

+        using D0ThreadCopyVgprToBlock =
+            ThreadwiseTensorSliceTransfer_v4<typename TypeTransform<D0DataType>::Type, // SrcData
+                                             typename TypeTransform<D0DataType>::Type, // DstData
+                                             decltype(d0_thread_desc_),                // SrcDesc
+                                             decltype(d0_block_vgpr_desc_n0_n1_m0_m1_m2), // DstDesc
+                                             Sequence<1, 1, 4, 1, 4>, // SliceLengths
+                                             Sequence<0, 1, 2, 3, 4>, // DimAccessOrder
+                                             4,                       // SrcVectorDim
+                                             4,                       // SrcScalarPerVector
+                                             2>;
     };
@@ -2099,6 +2109,10 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
         auto d0_thread_copy_lds_to_vgpr = typename D0Operator::D0ThreadCopy(
             make_tuple(wave_id[I1], wave_m_n_id[I1], 0, wave_m_n_id[I0], 0));
+        auto d0_thread_copy_vgpr_to_lds = typename D0Operator::D0ThreadCopyVgprToBlock(
+            make_tuple(wave_id[I1], wave_m_n_id[I1], 0, wave_m_n_id[I0], 0));
+        ignore = d0_thread_copy_vgpr_to_lds;
+
         if constexpr(Deterministic)
         {
             block_sync_lds();
@@ -2273,32 +2287,9 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
                 make_tuple(Sequence<0>{}, Sequence<1>{}),
                 make_tuple(Sequence<0, 2, 4, 5, 6>{}, Sequence<1, 3, 7>{}));

-            // do MNK padding or upper triangular masking
-            if constexpr(MaskOutUpperTriangle || PadN)
-            {
-                static_for<0, Acc0TileIterator::GetNumOfAccess(), 1>{}([&](auto i) {
-                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(i) + acc0_thread_origin;
-                    auto m_local =
-                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
-                    auto n_local =
-                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
-                    auto m_global = m_local + m_block_data_idx_on_grid;
-                    auto n_global = n_local + n_block_data_idx_on_grid;
-                    bool masked_flag = c0_matrix_mask.IsMaskedElement(m_global, n_global);
-                    s_element_op(s_slash_p_thread_buf(i),
-                                 masked_flag ? -ck::NumericLimits<float>::Infinity()
-                                             : s_slash_p_thread_buf[i]);
-                });
-            }
-            else
-            {
-                static_for<0, s_slash_p_thread_buf.Size(), 1>{}([&](auto i) {
-                    s_element_op(s_slash_p_thread_buf(i), s_slash_p_thread_buf[i]);
-                });
-            }
-
-            block_sync_lds(); // wait for lds read in gemm0 blockwise gemm
+            // scale
+            static_for<0, s_slash_p_thread_buf.Size(), 1>{}(
+                [&](auto i) { s_element_op(s_slash_p_thread_buf(i), s_slash_p_thread_buf[i]); });

             // add bias
             if constexpr(!is_same<D0DataType, void>::value)
@@ -2349,6 +2340,26 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
                     d0_grid_desc_m0_n0_m1_m2_n1_m3, make_multi_index(-1, 0, -D0M0.value, 0, 0, 0));
             }

+            // do MNK padding or upper triangular masking
+            if constexpr(MaskOutUpperTriangle || PadN)
+            {
+                static_for<0, Acc0TileIterator::GetNumOfAccess(), 1>{}([&](auto i) {
+                    auto acc0_thread_idx = Acc0TileIterator::GetIndex(i) + acc0_thread_origin;
+                    auto m_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I0];
+                    auto n_local =
+                        block_idx_to_m_n_adaptor.CalculateBottomIndex(acc0_thread_idx)[I1];
+                    auto m_global = m_local + m_block_data_idx_on_grid;
+                    auto n_global = n_local + n_block_data_idx_on_grid;
+                    bool masked_flag = c0_matrix_mask.IsMaskedElement(m_global, n_global);
+                    s_slash_p_thread_buf(i) = masked_flag ? -ck::NumericLimits<float>::Infinity()
+                                                          : s_slash_p_thread_buf[i];
+                });
+            }
+
+            block_sync_lds(); // wait for lds read in gemm0 blockwise gemm
+
             // P_i: = softmax(scalar * S_i:)
+            // scaling is already performed in the preceding statements with s_element_op
             blockwise_softmax.RunWithPreCalcStats(s_slash_p_thread_buf, lse_thread_buf);
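For reference, a self-contained sketch of the pipeline as it now stands (scale, then bias, then mask, then softmax), assuming a simplified scalar model of one attention row; every name here is illustrative rather than CK code. Because exp(-inf) == 0, lanes masked after the bias add contribute exactly zero probability:

// Self-contained model of the new per-row pipeline (illustrative, not CK code).
#include <cmath>
#include <cstdio>
#include <limits>
#include <vector>

int main()
{
    const float neg_inf = -std::numeric_limits<float>::infinity();
    const float scale   = 0.125f;                       // e.g. 1/sqrt(head_dim)
    std::vector<float> s{1.0f, 2.0f, 3.0f, 4.0f};       // raw Q*K^T scores
    std::vector<float> bias{0.5f, -0.5f, 0.5f, -0.5f};  // optional D0 bias
    std::vector<bool> masked{false, false, true, true}; // upper-triangular mask

    // New order from this commit: scale -> add bias -> mask.
    for(std::size_t i = 0; i < s.size(); ++i)
    {
        s[i] = scale * s[i] + bias[i];
        if(masked[i])
            s[i] = neg_inf;
    }

    // Row softmax: exp(-inf) == 0, so masked lanes get zero probability
    // regardless of the bias value added before the mask.
    float mx = neg_inf;
    for(float v : s)
        mx = std::fmax(mx, v);
    float sum = 0.0f;
    for(float& v : s)
        sum += (v = std::exp(v - mx));
    for(float v : s)
        std::printf("%f ", v / sum); // masked lanes print 0.000000
    std::printf("\n");
    return 0;
}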