Commit b58b98ff authored by Chao Liu

add ckProfiler

parent 3d005816
@@ -136,7 +136,11 @@ struct TensorAdaptor
using ElementSize = remove_cv_t<decltype(InitializeElementSize(Transforms{}))>;
public:
#if 0 // workaround compiler complaint about constexpr
__host__ __device__ constexpr TensorAdaptor() = default;
#else
__host__ __device__ constexpr TensorAdaptor() : transforms_{}, element_size_{} {}
#endif
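// The defaulted constructor is kept under "#if 0" in case the compiler issue is
// resolved later; explicitly value-initializing transforms_ and element_size_
// yields an equivalent constexpr default constructor that the current compiler
// accepts.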
__host__ __device__ constexpr TensorAdaptor(const Transforms& transforms)
: transforms_{transforms}, element_size_{InitializeElementSize(transforms)}
......
@@ -111,7 +111,14 @@ struct TensorDescriptor
using ElementSize = remove_cv_t<decltype(InitializeElementSize(Transforms{}))>;
public:
#if 0 // workaround compiler complaint about constexpr
__host__ __device__ constexpr TensorDescriptor() = default;
#else
__host__ __device__ constexpr TensorDescriptor()
: transforms_{}, element_size_{}, element_space_size_{}
{
}
#endif
__host__ __device__ constexpr TensorDescriptor(const Transforms& transforms,
ElementSpaceSize element_space_size)
......
@@ -18,7 +18,7 @@ struct TupleElementKey
template <typename Key, typename Data>
struct TupleElementKeyData
{
#if 0
#if 0 // workaround compiler complaint about implicitly-deleted default constructor
__host__ __device__ constexpr TupleElementKeyData() = default;
#else
__host__ __device__ constexpr TupleElementKeyData() : mData{} {}
......
#pragma once
#include <stdlib.h>
#include <vector>
namespace ck {
namespace tensor_operation {
......
@@ -12,9 +12,9 @@ namespace device_gemm_instance {
using F16 = ck::half_t;
using F32 = float;
using F16_F16 = ck::Tuple<F16, F16>
using F16_F16 = ck::Tuple<F16, F16>;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
template <ck::index_t... Is>
@@ -28,7 +28,7 @@ static constexpr auto GemmDefault = ck::tensor_operation::device::GemmSpecializa
// e = elementwise((a * b), d0, d1)
// output: e[m, n]
// input: a[k, m], b[k, n], d0[m, n], d1[m, n]
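// For reference, a naive sketch of what these instances compute (hypothetical
// helper in plain C++, not part of this commit; CK's FastGeLU may use a
// different approximation than the tanh form below):
//
//     #include <cmath>
//
//     inline float fast_gelu_ref(float x)
//     {
//         return 0.5f * x * (1.f + std::tanh(0.7978845608f * (x + 0.044715f * x * x * x)));
//     }
//
//     // e[m, n] = FastGeLU(sum_k a[k, m] * b[k, n] + d0[m, n] + d1[m, n])
//     void ref_gemm_add_add_fastgelu(const float* a, const float* b, const float* d0,
//                                    const float* d1, float* e, int M, int N, int K)
//     {
//         for(int m = 0; m < M; ++m)
//             for(int n = 0; n < N; ++n)
//             {
//                 float acc = 0.f;
//                 for(int k = 0; k < K; ++k)
//                     acc += a[k * M + m] * b[k * N + n]; // a stored [K, M], b stored [K, N]
//                 e[m * N + n] = fast_gelu_ref(acc + d0[m * N + n] + d1[m * N + n]);
//             }
//     }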
using device_gemm_add_add_gelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::tuple<
using device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances = std::tuple<
// clang-format off
//##############################| ALayout| BLayout| ELayout| AData| BData| AccData| CShuffle| DsData| EData| A| B| CDE| GEMM| NumGemmK| Block| MPer| NPer| KPer| AK1| BK1| MPer| NPer| MXdl| NXdl| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockTransfer| ABlockLds| BBlockTransfer| BBlockTransfer| BBlockTransfer| BlockTransfer| BBlockTransfer| BBlockTransfer| BBlockLds| CShuffle| CShuffle| CBlockTransferClusterLengths| CBlockTransfer|
//##############################| | | | Type| Type| Type| DataType| Type| Type| Elementwise| Elementwise| Elementwise| Specialization| Prefetch| Size| Block| Block| Block| | | XDL| XDL| Per| Per| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraM| ThreadCluster| ThreadCluster| SrcAccessOrder| SrcVectorDim| SrcScalar| DstScalar| AddExtraN| MXdlPerWave| NXdlPerWave| _MBlock_MWaveMPerXdl| ScalarPerVector|
......
#include <stdlib.h>
#include "config.hpp"
#include "device_gemm_xdl_cshuffle.hpp"
#include "element_wise_operation.hpp"
#include "device_operation_instance.hpp"
#include "device_gemm_multiple_d_xdl_cshuffle.hpp"
namespace ck {
namespace tensor_operation {
@@ -12,9 +12,9 @@ namespace device_gemm_instance {
using F16 = ck::half_t;
using F32 = float;
using F16_F16 = ck::Tuple<F16, F16>
using F16_F16 = ck::Tuple<F16, F16>;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
template <ck::index_t... Is>
......
@@ -12,9 +12,9 @@ namespace device_gemm_instance {
using F16 = ck::half_t;
using F32 = float;
using F16_F16 = ck::Tuple<F16, F16>
using F16_F16 = ck::Tuple<F16, F16>;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
template <ck::index_t... Is>
......
@@ -24,7 +24,7 @@ include_directories(BEFORE
# ck_profiler
set(PROFILER_SOURCE
src/profiler.cpp
src/profile_gemm.cpp
# src/profile_gemm.cpp
# src/profile_gemm_bias_2d.cpp
# src/profile_gemm_bias_relu.cpp
# src/profile_gemm_bias_relu_add.cpp
@@ -47,7 +47,7 @@ add_executable(ckProfiler ${PROFILER_SOURCE})
target_link_libraries(ckProfiler PRIVATE host_tensor)
target_link_libraries(ckProfiler PRIVATE conv_util)
#target_link_libraries(ckProfiler PRIVATE device_gemm_reduce_instance)
target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
#target_link_libraries(ckProfiler PRIVATE device_gemm_instance)
#target_link_libraries(ckProfiler PRIVATE device_gemm_bias2d_instance)
#target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_instance)
#target_link_libraries(ckProfiler PRIVATE device_gemm_bias_relu_add_instance)
......
@@ -11,8 +11,8 @@
#include "tensor_layout.hpp"
#include "device_tensor.hpp"
#include "element_wise_operation.hpp"
#include "device_gemm.hpp"
#include "reference_gemm.hpp"
#include "device_gemm_multiple_d.hpp"
namespace ck {
namespace tensor_operation {
@@ -23,7 +23,7 @@ using DeviceGemmAddAddFastGeluPtr = ck::tensor_operation::device::DeviceGemmMult
2,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::PassThrough,
ck::tensor_operation::element_wise::FastGelu>;
ck::tensor_operation::element_wise::AddAddFastGelu>;
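// Reading of the alias above (annotation, assumptions flagged): "2" is the number
// of auxiliary D tensors, the two PassThrough ops leave A and B unmodified, and
// AddAddFastGelu is the fused epilogue applied to the accumulated GEMM result.
// A minimal sketch of what such a functor plausibly looks like (inferred from the
// name, not copied from element_wise_operation.hpp):
//
//     struct AddAddFastGeluSketch
//     {
//         template <typename E, typename C, typename D0, typename D1>
//         __host__ __device__ void operator()(E& e, const C& c, const D0& d0, const D1& d1) const
//         {
//             const float x = static_cast<float>(c) + static_cast<float>(d0) + static_cast<float>(d1);
//             e = static_cast<E>(fast_gelu_ref(x)); // fast_gelu_ref: any FastGeLU approximation
//         }
//     };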
void add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(
std::vector<DeviceGemmAddAddFastGeluPtr>&);
@@ -44,6 +44,7 @@ namespace profiler {
template <typename ADataType,
typename BDataType,
typename AccDataType,
typename D0DataType,
typename D1DataType,
typename EDataType,
@@ -54,7 +55,7 @@ template <typename ADataType,
typename ELayout>
int profile_gemm_add_add_fastgelu_impl(int do_verification,
int init_method,
bool do_log,
bool /*do_log*/,
bool time_kernel,
int M,
int N,
@@ -131,28 +132,32 @@ int profile_gemm_add_add_fastgelu_impl(int do_verification,
is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_gelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(device_op_ptrs);
add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_kn_mn_instances(
device_op_ptrs);
}
else if constexpr(is_same_v<ALayout, tensor_layout::gemm::RowMajor> &&
is_same_v<BLayout, tensor_layout::gemm::ColumnMajor> &&
is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_gelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(device_op_ptrs);
add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_mk_nk_mn_instances(
device_op_ptrs);
}
else if constexpr(is_same_v<ALayout, tensor_layout::gemm::ColumnMajor> &&
is_same_v<BLayout, tensor_layout::gemm::RowMajor> &&
is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_gelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(device_op_ptrs);
add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_kn_mn_instances(
device_op_ptrs);
}
else if constexpr(is_same_v<ALayout, tensor_layout::gemm::ColumnMajor> &&
is_same_v<BLayout, tensor_layout::gemm::ColumnMajor> &&
is_same_v<ELayout, tensor_layout::gemm::RowMajor>)
{
ck::tensor_operation::device::device_gemm_instance::
add_device_gemm_gelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(device_op_ptrs);
add_device_gemm_add_add_fastgelu_xdl_c_shuffle_f16_f16_f16_km_nk_mn_instances(
device_op_ptrs);
}
}
......
@@ -23,7 +23,7 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
enum struct MatrixDataType
{
F32_F32_F32_F32_F32, // 0
F16_F16_F16_F16_F16_F16_F16, // 1
F16_F16_F16_F16_F16, // 1
BF16_BF16_BF16_BF16_BF16, // 2
INT8_INT8_INT8_INT8_INT8, // 3
};
@@ -31,7 +31,7 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
if(argc != 16)
{
// clang-format off
printf("arg1: tensor operation (gemm_gelu: GEMM+Add+Add+GeLU)\n");
printf("arg1: tensor operation (gemm_add_add_fastgelu: GEMM+Add+Add+GeLU)\n");
printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
printf("arg3: matrix layout (0: E[m, n] = FastGeLU(A[m, k] * B[k, n] + D0[m, n] + D1[m, n]);\n");
printf(" 1: E[m, n] = FastGeLU(A[m, k] * B[n, k] + D0[m, n] + D1[m, n]);\n");
@@ -40,7 +40,7 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
printf("arg4: verification (0: no; 1: yes)\n");
printf("arg5: initialization (0: no init; 1: integer value; 2: decimal value)\n");
printf("arg6: print tensor value (0: no; 1: yes)\n");
printf("arg7: time kernel (0=n0, 1=yes)\n");
printf("arg7: time kernel (0=no, 1=yes)\n");
printf("arg8 to 13: M, N, K, StrideA, StrideB, StrideD0, StrideD1, StrideE\n");
// clang-format on
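// A hypothetical invocation matching the usage text above (values illustrative):
//   ckProfiler gemm_add_add_fastgelu 1 0 1 1 0 1 3840 4096 4096 4096 4096 4096 4096 4096
// i.e. fp16 data, layout 0 (all row-major), verification on, integer init, no
// tensor print, time the kernel, then M N K StrideA StrideB StrideD0 StrideD1 StrideE.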
exit(1);
@@ -64,12 +64,14 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
const int StrideE = std::stoi(argv[15]);
using F16 = ck::half_t;
using F32 = float;
using Row = ck::tensor_layout::gemm::RowMajor;
using Col = ck::tensor_layout::gemm::ColumnMajor;
auto profile = [&](auto a_type,
auto b_type,
auto acc_type,
auto d0_type,
auto d1_type,
auto e_type,
@@ -80,6 +82,7 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
auto e_layout) {
using ADataType = decltype(a_type);
using BDataType = decltype(b_type);
using AccDataType = decltype(acc_type);
using D0DataType = decltype(d0_type);
using D1DataType = decltype(d1_type);
using EDataType = decltype(e_type);
@@ -96,8 +99,9 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
const int DefaultStrideD1 = ck::is_same_v<D1Layout, Row> ? N : M;
const int DefaultStrideE = ck::is_same_v<ELayout, Row> ? N : M;
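// Annotation: the default stride is the leading dimension of a packed matrix,
// i.e. the row length N for a row-major [M, N] tensor and M for column-major.
// For example (hypothetical values), with M = 3, N = 4 and a row-major E,
// element (m, n) lives at offset m * StrideE + n with StrideE = 4.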
return ck::profiler::profile_gemm_add_add_gelu_impl<ADataType,
return ck::profiler::profile_gemm_add_add_fastgelu_impl<ADataType,
BDataType,
AccDataType,
D0DataType,
D1DataType,
EDataType,
@@ -122,22 +126,22 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
if(data_type == MatrixDataType::F16_F16_F16_F16_F16 && layout == MatrixLayout::MK_KN_MN_MN_MN)
{
return profile(F16{}, F16{}, F16{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{}, Row{});
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Row{}, Row{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 &&
layout == MatrixLayout::MK_NK_MN_MN_MN)
{
return profile(F16{}, F16{}, F16{}, F16{}, F16{}, Row{}, Col{}, Row{}, Row{}, Row{});
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Row{}, Col{}, Row{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 &&
layout == MatrixLayout::KM_KN_MN_MN_MN)
{
return profile(F16{}, F16{}, F16{}, F16{}, F16{}, Col{}, Row{}, Row{}, Row{}, Row{});
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Col{}, Row{}, Row{}, Row{}, Row{});
}
else if(data_type == MatrixDataType::F16_F16_F16_F16_F16 &&
layout == MatrixLayout::KM_NK_MN_MN_MN)
{
return profile(F16{}, F16{}, F16{}, F16{}, F16{}, Col{}, Col{}, Row{}, Row{}, Row{});
return profile(F16{}, F16{}, F32{}, F16{}, F16{}, F16{}, Col{}, Col{}, Row{}, Row{}, Row{});
}
else
{
......
@@ -54,11 +54,11 @@ int main(int argc, char* argv[])
return 0;
}
#if 0
if(strcmp(argv[1], "gemm") == 0)
{
return profile_gemm(argc, argv);
}
#if 0
else if(strcmp(argv[1], "gemm_bias_2d") == 0)
{
return profile_gemm_bias_2d(argc, argv);
@@ -124,7 +124,7 @@ int main(int argc, char* argv[])
return profile_conv_bwd_weight(argc, argv);
}
#endif
else if(strcmp(argv[1], "gemm_add_add_fastgelu") == 0)
if(strcmp(argv[1], "gemm_add_add_fastgelu") == 0)
{
return profile_gemm_add_add_fastgelu(argc, argv);
}
......