Merge pull request #977 from ROCmSoftwarePlatform/mha-train-tiny-update

Some tiny updates

Merge pull request #977 from ROCmSoftwarePlatform/mha-train-tiny-update
Some tiny updates
3f4eae1d · Dan Yao · GitHub · bf6b491e · 9ac015bd · 3f4eae1d
Unverified Commit 3f4eae1d authored Oct 10, 2023 by Dan Yao Committed by GitHub Oct 10, 2023
9 changed files
--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_light_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_light_v1.hpp
@@ -1414,8 +1414,8 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V1
        void* p_qgrad_grid,
        void* p_kgrad_grid,
        void* p_vgrad_grid,
-        const D0DataType* p_acc0_bias,
-        const D1DataType* p_acc1_bias,
+        const void* p_acc0_bias,
+        const void* p_acc1_bias,
        void* p_d0grad_grid,
        void* p_d1grad_grid,
        const std::vector<index_t>& a_gs_ms_ks_lengths,

--- a/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_batched_mha_bwd_xdl_cshuffle_qloop_v1.hpp
@@ -1270,10 +1270,10 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
        void* p_qgrad_grid,
        void* p_kgrad_grid,
        void* p_vgrad_grid,
-        const D0DataType* p_acc0_bias,
-        const D1DataType* p_acc1_bias,
-        D0DataType* p_d0grad_grid,
-        D1DataType* p_d1grad_grid,
+        const void* p_acc0_bias,
+        const void* p_acc1_bias,
+        void* p_d0grad_grid,
+        void* p_d1grad_grid,
        const std::vector<index_t>& a_gs_ms_ks_lengths,
        const std::vector<index_t>& a_gs_ms_ks_strides,
        const std::vector<index_t>& b_gs_ns_ks_lengths,
@@ -1312,8 +1312,8 @@ struct DeviceBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
            static_cast<OutputDataType*>(p_vgrad_grid),
            static_cast<const D0DataType*>(p_acc0_bias), // cast in struct Argument
            static_cast<const D1DataType*>(p_acc1_bias), // cast in struct Argument
-            static_cast<const D0DataType*>(p_d0grad_grid),
-            static_cast<const D1DataType*>(p_d1grad_grid),
+            static_cast<D0DataType*>(p_d0grad_grid),
+            static_cast<D1DataType*>(p_d1grad_grid),
            a_gs_ms_ks_lengths,
            a_gs_ms_ks_strides,
            b_gs_ns_ks_lengths,

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_light_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_light_v1.hpp
@@ -1027,7 +1027,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V1
                    z_grid_desc_g_m_n,
                    b1_grid_desc_g_n_k,
                    c_grid_desc_g_m_n,
-                    type_convert<index_t>(lse_grid_desc_m.GetElementSpaceSize()));
+                    type_convert<index_t>(problem_desc.lse_gs_ms_strides[NumDimG - 1]));

                // C0 mask
                const auto c0_matrix_mask =

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_light_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_light_v2.hpp
@@ -1098,7 +1098,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_Light_V2
                    z_grid_desc_g_m_n,
                    b1_grid_desc_g_n_k,
                    c_grid_desc_g_m_n,
-                    type_convert<index_t>(lse_grid_desc_m.GetElementSpaceSize()));
+                    type_convert<index_t>(problem_desc.lse_gs_ms_strides[NumDimG - 1]));

                // C0 mask
                const auto c0_matrix_mask =

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v1.hpp
@@ -918,7 +918,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V1
                    z_grid_desc_g_m_n,
                    b1_grid_desc_g_n_k,
                    c_grid_desc_g_m_n,
-                    type_convert<index_t>(lse_grid_desc_m.GetElementSpaceSize()));
+                    type_convert<index_t>(problem_desc.lse_gs_ms_strides[NumDimG - 1]));

                // C0 mask
                const auto c0_matrix_mask =

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_bwd_xdl_cshuffle_qloop_v2.hpp
@@ -448,19 +448,6 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
            make_tuple(Sequence<0, 2>{}, Sequence<1>{}));
    }

-    //
-    // dP = dY * V^T
-    //
-
-    // YGrad in Gemm A position
-    static auto MakeYGradGridDescriptor_O0_M_O1(const std::vector<index_t>& y_gs_ms_os_lengths,
-                                                const std::vector<index_t>& y_gs_ms_os_strides)
-    {
-        return Transform::MakeAGridDescriptor_AK0_M_AK1(
-            Transform::MakeAGridDescriptor_M_K(y_gs_ms_os_lengths, y_gs_ms_os_strides),
-            Number<Y_O1>{});
-    }
-
    // V in Gemm B position
    static auto MakeVGridDescriptor_O0_N_O1(const std::vector<index_t>& v_gs_os_ns_lengths,
                                            const std::vector<index_t>& v_gs_os_ns_strides)
@@ -988,7 +975,7 @@ struct DeviceGroupedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
                    z_grid_desc_g_m_n,
                    b1_grid_desc_g_n_k,
                    c_grid_desc_g_m_n,
-                    type_convert<index_t>(lse_grid_desc_m.GetElementSpaceSize()));
+                    type_convert<index_t>(problem_desc.lse_gs_ms_strides[NumDimG - 1]));

                // C0 mask
                const auto c0_matrix_mask =

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_fwd_xdl_cshuffle_v1.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_fwd_xdl_cshuffle_v1.hpp
@@ -694,7 +694,7 @@ struct DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V1
                    b1_grid_desc_g_n_k,
                    c_grid_desc_g_m_n,
                    z_grid_desc_g_m_n,
-                    type_convert<index_t>(lse_grid_desc_m.GetElementSpaceSize()));
+                    type_convert<index_t>(lse_gs_ms_strides[NumDimG - 1]));

                // C0 mask
                const auto c0_matrix_mask =

--- a/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_fwd_xdl_cshuffle_v2.hpp
+++ b/include/ck/tensor_operation/gpu/device/impl/device_grouped_mha_fwd_xdl_cshuffle_v2.hpp
@@ -745,7 +745,7 @@ struct DeviceGroupedMultiheadAttentionForward_Xdl_CShuffle_V2
                    b1_grid_desc_g_n_k,
                    c_grid_desc_g_m_n,
                    z_grid_desc_g_m_n,
-                    type_convert<index_t>(lse_grid_desc_m.GetElementSpaceSize()));
+                    type_convert<index_t>(problem_desc.lse_gs_ms_strides[NumDimG - 1]));

                // C0 mask
                const auto c0_matrix_mask =

--- a/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
+++ b/include/ck/tensor_operation/gpu/grid/gridwise_batched_mha_bwd_xdl_cshuffle_qloop_b2t_v2.hpp
@@ -320,18 +320,27 @@ struct GridwiseBatchedMultiheadAttentionBackward_Qloop_Xdl_CShuffle_V2
        // dV_NOM / dK_NKM Gemm (Gemm2 crr)
        if(O != K)
        {
+            std::cerr << "O = " << O << " K = " << K << std::endl;
            std::cerr << "SizeK must be equal to SizeO (equal attention head size)" << '\n';
            return false;
        }

        if(!(M == y_grid_desc_m_o.GetLength(I0) && O == y_grid_desc_m_o.GetLength(I1)))
        {
+            std::cerr << "M = " << M << " O = " << O
+                      << " y_grid_desc_m_o = " << y_grid_desc_m_o.GetLength(I0) << " , "
+                      << y_grid_desc_m_o.GetLength(I1) << std::endl;
+            std::cerr << "Un-matched sizes!" << std::endl;
            return false;
        }

        if(!(M % MPerBlock == 0 && N % NPerBlock == 0 && K % KPerBlock == 0 &&
             O % Gemm1NPerBlock == 0))
        {
+            std::cerr << "M = " << M << " N = " << N << " O = " << O << std::endl;
+            std::cerr << "MPerBlock = " << MPerBlock << " NPerBlock = " << NPerBlock
+                      << " KPerBlock = " << KPerBlock << std::endl;
+            std::cerr << "Un-aligned sizes!" << std::endl;
            return false;
        }