set c matrix zero

b7ec2078 · ltqin · d1998945 · b7ec2078 · b7ec2078 · b7ec2078
Commit b7ec2078 authored Nov 23, 2021 by ltqin
3 changed files
--- a/device_operation/device_gemm_xdl_instance_f32_f32_f32_mk_kn_mn.cpp
+++ b/device_operation/device_gemm_xdl_instance_f32_f32_f32_mk_kn_mn.cpp
@@ -30,7 +30,9 @@ using device_gemm_xdl_instance_f32_f32_f32_mk_kn_mn = std::tuple<
        //#################|  Type|  Type|  Type|    Type|        |        |        |  Size| Block| Block| Block|   |  XDL|  XDL|  Per|  Per|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|   SrcVectorDim|      SrcScalar|      DstScalar|     ThreadSlice|   ThreadCluster|  ThreadCluster| SrcAccessOrder|  SrcVectorDim|      SrcScalar|      DstScalar| SrcDstVectorDim|       DstScalar| AddExtraM| AddExtraN|
        //#################|      |      |      |        |        |        |        |      |      |      |      |   |     |     | Wave| Wave| Lengths_K0_N_K1| Lengths_K0_M_K1|   ArrangeOrder|               |               |      PerVector|   PerVector_K1| Lengths_K0_N_K1| Lengths_K0_N_K1|   ArrangeOrder|               |              |      PerVector|   PerVector_K1|                |       PerVector|          |          |
        //#################|      |      |      |        |        |        |        |      |      |      |      |   |     |     |     |     |                |                |               |               |               |               |               |                |                |               |               |              |               |               |                |                |          |          |
-        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row,   256,   128,   128,     4,  4,   32,   32,    2,    2,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,               7,               1,      true,      true, 1>
+        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row,   256,   128,   128,     4,  4,   32,   32,    2,    2,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,               7,               1,      true,      true, false, 1>,
+        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row,   256,   128,   128,     4,  4,   32,   32,    2,    2,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,               7,               1,      true,      true, true, 360>,
+        DeviceGemmSplitKXdl<  F32,   F32,   F32,     F32,     Row,     Row,     Row,   256,   128,   128,     4,  4,   32,   32,    2,    2,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 2, 1, 3>,     S<0, 2, 1, 3>,              3,              4,              4,      S<1, 1, 2, 4>,     S<1, 4, 64, 1>,     S<0, 1, 3, 2>,     S<0, 1, 3, 2>,             2,              2,              4,               7,               1,      true,      true, true, 480>
    // clang-format on
    >;
 #else

--- a/device_operation/include/device_gemm_splitk_xdl.hpp
+++ b/device_operation/include/device_gemm_splitk_xdl.hpp
@@ -49,6 +49,7 @@ template <typename ADataType,
          ck::index_t CThreadTransferDstScalarPerVector,
          bool ABlockLdsAddExtraM,
          bool BBlockLdsAddExtraN,
+          bool IsSplitK,
          ck::index_t DesiredGridSize>
 struct DeviceGemmSplitKXdl : public DeviceGemm
 {
@@ -140,6 +141,8 @@ struct DeviceGemmSplitKXdl : public DeviceGemm
    static auto GetKBatchAndKPad(index_t M, index_t N, index_t K)
    {
+        if(!IsSplitK)
+            return std::make_tuple(1, K);
        const auto GridMN    = M * N / (MPerBlock * NPerBlock);
        const index_t KBatch = std::max(DesiredGridSize / GridMN, 1);
        const index_t K0     = math::integer_divide_ceil(K, K1 * K0PerBlock * KBatch) * K0PerBlock;
@@ -244,6 +247,54 @@ struct DeviceGemmSplitKXdl : public DeviceGemm
        ABlockLdsAddExtraM,
        BBlockLdsAddExtraN>;
+    // GridwiseGemm variant whose C memory operation is AtomicAdd (NOTE(review): presumably used when K is split across workgroups, which would require C to be zeroed beforehand - confirm)
+    using GridwiseGemmAtomicAdd = GridwiseGemm_bk0mk1_bk0nk1_mn_xdlops_v2r4<
+        BlockSize,
+        ADataType, // TODO: distinguish A/B datatype (ADataType is the only A/B element type passed here)
+        AccDataType,
+        CDataType,
+        InMemoryDataOperationEnum_t::AtomicAdd,
+        AGridDesc_K0_M_K1,
+        BGridDesc_K0_N_K1,
+        CGridDesc_M_N,
+        MPerBlock,
+        NPerBlock,
+        K0PerBlock,
+        MPerXDL,
+        NPerXDL,
+        K1,
+        MXdlPerWave,
+        NXdlPerWave,
+        ABlockTransferThreadSliceLengths_K0_M_K1,
+        ABlockTransferThreadClusterLengths_K0_M_K1,
+        ABlockTransferThreadClusterArrangeOrder,
+        ABlockTransferSrcAccessOrder,
+        ABlockTransferSrcVectorDim,
+        ABlockTransferSrcScalarPerVector,
+        ABlockTransferDstScalarPerVector_K1,
+        false, // AThreadTransferSrcResetCoordinateAfterRun,
+        BBlockTransferThreadSliceLengths_K0_N_K1,
+        BBlockTransferThreadClusterLengths_K0_N_K1,
+        BBlockTransferThreadClusterArrangeOrder,
+        BBlockTransferSrcAccessOrder,
+        BBlockTransferSrcVectorDim,
+        BBlockTransferSrcScalarPerVector,
+        BBlockTransferDstScalarPerVector_K1,
+        false,                            // BThreadTransferSrcResetCoordinateAfterRun,
+        Sequence<0, 2, 4, 5, 6, 1, 3, 7>, // CThreadTransferSrcDstAccessOrder,
+        CThreadTransferSrcDstVectorDim,
+        CThreadTransferDstScalarPerVector,
+        decltype(a_kbatch_k0_m_k1_grid_step_hacks),          //  AGridStepHacks,
+        decltype(b_kbatch_k0_n_k1_grid_step_hacks),          //  BGridStepHacks,
+        decltype(c_m0_n0_m1_n1_m2_m3_m4_n2_grid_step_hacks), //  CGridStepHacks,
+        decltype(
+            a_kbatch_k0_m_k1_grid_move_slice_window_step_hacks), //  AGridMoveSliceWindowStepHacks,
+        decltype(
+            b_kbatch_k0_n_k1_grid_move_slice_window_step_hacks), //  BGridMoveSliceWindowStepHacks,
+        false,                                                   // CAccessOrderMRepeatNRepeat,
+        ABlockLdsAddExtraM,
+        BBlockLdsAddExtraN>;
    using CGridDesc_M0_N0_M1_N1_M2_M3_M4_N2 =
        decltype(GridwiseGemm::MakeCM0N0M1N1M2M3M4N2GridDescriptor(CGridDesc_M_N{}));

--- a/profiler/include/profile_gemm.hpp
+++ b/profiler/include/profile_gemm.hpp
@@ -118,17 +118,20 @@ void profile_gemm(int do_verification,
    std::cout << "b_k_n: " << b_k_n.mDesc << std::endl;
    std::cout << "c_m_n: " << c_m_n_host_result.mDesc << std::endl;
+    std::size_t num_thread = std::thread::hardware_concurrency();
    switch(init_method)
    {
    case 0: break;
    case 1:
-        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5});
+        a_m_k.GenerateTensorValue(GeneratorTensor_2<ADataType>{-5, 5}, num_thread);
-        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_2<BDataType>{-5, 5}, num_thread);
        break;
    default:
-        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0});
+        a_m_k.GenerateTensorValue(GeneratorTensor_3<ADataType>{0.0, 1.0}, num_thread);
-        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5});
+        b_k_n.GenerateTensorValue(GeneratorTensor_3<BDataType>{-0.5, 0.5}, num_thread);
    }
+    // zero-fill c_m_n_device_result (NOTE(review): presumably this tensor is later copied into c_device_buf - confirm)
+    c_m_n_device_result.GenerateTensorValue(GeneratorTensor_0<CDataType>{}, num_thread);
    if(do_verification)
    {