Commit 6257e347 authored by Chao Liu

clean

parent ac876c6f
@@ -18,7 +18,7 @@ enum struct ConvolutionForwardSpecialization
     OddC,
 };
 
-inline std::string getConvFwdSpecializationStr(const ConvolutionForwardSpecialization& s)
+inline std::string getConvForwardSpecializationString(const ConvolutionForwardSpecialization& s)
 {
     switch(s)
     {
...
@@ -871,7 +871,7 @@ struct DeviceConv2dFwdXdl_C_Shuffle_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_W
             << MPerBlock << ", "
             << NPerBlock << ", "
             << K0PerBlock << ", "
-            << getConvFwdSpecializationStr(ConvForwardSpecialization)
+            << getConvForwardSpecializationString(ConvForwardSpecialization)
             << ">";
         // clang-format on
...
@@ -711,7 +711,7 @@ struct DeviceConv2dFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
             << MPerBlock << ", "
             << NPerBlock << ", "
             << K0PerBlock << ", "
-            << getConvFwdSpecializationStr(ConvForwardSpecialization)
+            << getConvForwardSpecializationString(ConvForwardSpecialization)
             << ">";
         // clang-format on
...
@@ -1033,7 +1033,7 @@ struct DeviceConvNDFwdXdl_Input_N_Hi_Wi_C_Weight_K_Y_X_C_Output_N_Ho_Wo_K
             << MPerBlock << ", "
             << NPerBlock << ", "
             << K0PerBlock << ", "
-            << getConvFwdSpecializationStr(ConvForwardSpecialization)
+            << getConvForwardSpecializationString(ConvForwardSpecialization)
             << ">";
         // clang-format on
...
@@ -746,7 +746,8 @@ struct DeviceGemmMultipleD_Xdl_CShuffle : public DeviceGemmMultipleD<ALayout,
             << NPerBlock << ", "
             << KPerBlock << ", "
             << AK1 << ", "
-            << BK1
+            << BK1 << ", "
+            << getGemmSpecializationString(GemmSpec)
             << ">";
         // clang-format on
...
@@ -19,6 +19,22 @@ enum struct GemmSpecialization
     MNKPadding,
 };
 
+inline std::string getGemmSpecializationString(const GemmSpecialization& s)
+{
+    switch(s)
+    {
+    case GemmSpecialization::Default: return "Default";
+    case GemmSpecialization::MPadding: return "MPadding";
+    case GemmSpecialization::NPadding: return "NPadding";
+    case GemmSpecialization::KPadding: return "KPadding";
+    case GemmSpecialization::MNPadding: return "MNPadding";
+    case GemmSpecialization::MKPadding: return "MKPadding";
+    case GemmSpecialization::NKPadding: return "NKPadding";
+    case GemmSpecialization::MNKPadding: return "MNKPadding";
+    default: return "Unrecognized specialization!";
+    }
+}
+
 } // namespace device
 } // namespace tensor_operation
 } // namespace ck
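
A quick host-side sanity check of the new helper might look like the sketch below; the include path is a guess, and only the enum values and namespaces shown in the hunk above are assumed to exist.

    #include <iostream>
    // Path is an assumption; use whatever header defines GemmSpecialization in your tree.
    #include "ck/tensor_operation/gpu/device/gemm_specialization.hpp"

    int main()
    {
        using ck::tensor_operation::device::GemmSpecialization;
        using ck::tensor_operation::device::getGemmSpecializationString;

        // Prints "MNKPadding", matching the tag now appended to device-op type strings.
        std::cout << getGemmSpecializationString(GemmSpecialization::MNKPadding) << "\n";
        return 0;
    }
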
@@ -35,7 +35,6 @@ struct Add
         y = type_convert<half_t>(x0) + x1;
     };
 
-    // Question: should half_t be supported ?
     template <>
     __host__ __device__ constexpr void
     operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
@@ -43,7 +42,6 @@ struct Add
         y = x0 + x1;
     };
 
-    // Question: should bhalf_t be supported ?
     template <>
     __host__ __device__ constexpr void
     operator()<bhalf_t>(bhalf_t& y, const bhalf_t& x0, const bhalf_t& x1) const
@@ -74,7 +72,6 @@ struct Subtract
         y = x0 - x1;
     };
 
-    // Question: should half_t be supported ?
     template <>
     __host__ __device__ constexpr void
     operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
@@ -82,7 +79,6 @@ struct Subtract
         y = x0 - x1;
     };
 
-    // Question: should bhalf_t be supported ?
     template <>
     __host__ __device__ constexpr void
     operator()<bhalf_t>(bhalf_t& y, const bhalf_t& x0, const bhalf_t& x1) const
@@ -98,32 +94,6 @@ struct Bilinear
 {
     Bilinear(float alpha, float beta) : alpha_(alpha), beta_(beta){};
 
-#if 0
-    template <typename T>
-    __host__ __device__ constexpr void operator()(T& y, const T& x0, const T& x1) const;
-
-    template <>
-    __host__ __device__ constexpr void
-    operator()<float>(float& y, const float& x0, const float& x1) const
-    {
-        y = alpha_ * x0 + beta_ * x1;
-    };
-
-    template <>
-    __host__ __device__ constexpr void
-    operator()<double>(double& y, const double& x0, const double& x1) const
-    {
-        y = type_convert<double>(alpha_) * x0 + type_convert<double>(beta_) * x1;
-    };
-
-    template <>
-    __host__ __device__ constexpr void
-    operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
-    {
-        y = type_convert<half_t>(alpha_ * type_convert<float>(x0) +
-                                 beta_ * type_convert<float>(x1));
-    };
-#else
     template <typename Y, typename X0, typename X1>
     __host__ __device__ constexpr void operator()(Y&, const X0&, const X1&) const;
@@ -140,7 +110,6 @@ struct Bilinear
     {
         y = type_convert<half_t>(alpha_ * x0 + beta_ * ck::type_convert<float>(x1));
     };
-#endif
 
     float alpha_;
     float beta_;
@@ -167,7 +136,6 @@ struct AddRelu
         y = a > 0.0 ? a : 0.0;
     };
 
-    // Question: should half_t be supported ?
     template <>
     __host__ __device__ constexpr void
     operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
@@ -202,7 +170,6 @@ struct AddHardswish
         y = c;
     };
 
-    // Question: should half_t be supported ?
     template <>
     __host__ __device__ constexpr void
     operator()<half_t>(half_t& y, const half_t& x0, const half_t& x1) const
...
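
The Bilinear functor retained above templates the output and both inputs separately, so mixed-precision epilogues can be expressed directly. A minimal standalone analog of the kept mixed-type form, written as a free function so it compiles with a plain host compiler; the parameter types are inferred from the visible body, not spelled out in the hunk, and half_t / type_convert are the CK library types.

    // Hypothetical analog of the kept Bilinear specialization: half_t output,
    // float x0 (GEMM accumulator), half_t x1 (D tensor value).
    inline ck::half_t bilinear_half(float alpha, float beta, float x0, ck::half_t x1)
    {
        return ck::type_convert<ck::half_t>(alpha * x0 + beta * ck::type_convert<float>(x1));
    }
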
@@ -30,18 +30,18 @@ template <typename ADataType,
           typename BLayout,
           typename DELayout> // assume Ds and E have same layout
 bool profile_gemm_bilinear_impl(int do_verification,
                                 int init_method,
                                 bool /*do_log*/,
                                 bool time_kernel,
                                 int M,
                                 int N,
                                 int K,
                                 int StrideA,
                                 int StrideB,
                                 int StrideD,
                                 int StrideE,
                                 float alpha,
                                 float beta)
 {
     auto f_host_tensor_descriptor =
         [](std::size_t row, std::size_t col, std::size_t stride, auto layout) {
...
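
For reference, a call matching the signature above; the data types, layout tag, and problem sizes are illustrative assumptions (the real wiring lives in the profile_gemm_bilinear driver shown further down).

    // Hypothetical invocation; types, layouts, and sizes chosen for illustration only.
    using Row = ck::tensor_layout::gemm::RowMajor;

    bool pass = ck::profiler::profile_gemm_bilinear_impl<ck::half_t, // ADataType
                                                         ck::half_t, // BDataType
                                                         float,      // AccDataType
                                                         ck::half_t, // DDataType
                                                         ck::half_t, // EDataType
                                                         Row,        // ALayout
                                                         Row,        // BLayout
                                                         Row>        // DELayout
        (1 /*do_verification*/, 1 /*init_method*/, false /*do_log*/, true /*time_kernel*/,
         3840, 4096, 4096,        // M, N, K
         4096, 4096, 4096, 4096,  // StrideA, StrideB, StrideD, StrideE
         1.0f, 0.5f);             // alpha, beta
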
@@ -29,7 +29,7 @@ int profile_gemm_add_add_fastgelu(int argc, char* argv[])
     if(argc != 16)
     {
         // clang-format off
-        printf("arg1: tensor operation (gemm_add_add_fastgelu: GEMM+Add+Add+GeLU)\n");
+        printf("arg1: tensor operation (gemm_add_add_fastgelu: GEMM+Add+Add+FastGeLU)\n");
         printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
         printf("arg3: matrix layout (0: E[m, n] = FastGeLU(A[m, k] * B[k, n] + D0[m, n] + D1[m, n]);\n");
         printf("                     1: E[m, n] = FastGeLU(A[m, k] * B[n, k] + D0[m, n] + D1[m, n]);\n");
...
@@ -29,7 +29,7 @@ int profile_gemm_bilinear(int argc, char* argv[])
     if(argc != 17)
     {
         // clang-format off
-        printf("arg1: tensor operation (gemm_add_add_fastgelu: GEMM+Add+Add+GeLU)\n");
+        printf("arg1: tensor operation (gemm_bilinear: GEMM+Bilinear)\n");
         printf("arg2: data type (0: fp32; 1: fp16; 2: bf16; 3: int8)\n");
         printf("arg3: matrix layout (0: E[m, n] = alpha * A[m, k] * B[k, n] + beta * D[m, n];\n");
         printf("                     1: E[m, n] = alpha * A[m, k] * B[n, k] + beta * D[m, n];\n");
@@ -94,13 +94,13 @@ int profile_gemm_bilinear(int argc, char* argv[])
     const int DefaultStrideE = ck::is_same_v<DELayout, Row> ? N : M;
 
     bool pass = ck::profiler::profile_gemm_bilinear_impl<ADataType,
                                                          BDataType,
                                                          AccDataType,
                                                          DDataType,
                                                          EDataType,
                                                          ALayout,
                                                          BLayout,
                                                          DELayout>(
         do_verification,
         init_method,
         do_log,
...