Merge remote-tracking branch 'origin/develop' into improve_pipeline

d55080e9 · Chao Liu · 7610e049 · 860e291c · d55080e9 · d55080e9
Commit d55080e9 authored Apr 21, 2022 by Chao Liu
9 changed files
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -3,7 +3,6 @@
 #include <initializer_list>
 #include <cstdlib>
 #include <getopt.h>
-#include <half.hpp>
 #include "check_err.hpp"
 #include "config.hpp"
@@ -27,10 +26,6 @@ using InDataType  = ck::half_t;
 using OutDataType = ck::half_t;
 using AccDataType = float;
-using HostInDataType  = half_float::half;
-using HostOutDataType = half_float::half;
-using HostAccDataType = float;
 constexpr int Rank         = 4;
 constexpr int NumReduceDim = 3;
@@ -306,9 +301,9 @@ int main(int argc, char* argv[])
    if(args.do_verification)
    {
-        ReductionHost<HostInDataType,
+        ReductionHost<InDataType,
-                      HostAccDataType,
+                      AccDataType,
-                      HostOutDataType,
+                      OutDataType,
                      ReduceOpId,
                      Rank,
                      NumReduceDim,
@@ -316,11 +311,8 @@ int main(int argc, char* argv[])
                      NeedIndices>
            hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-        hostReduce.Run(alpha,
+        hostReduce.Run(
-                       reinterpret_cast<const HostInDataType*>(in.mData.data()),
+            alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
-                       beta,
-                       reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
-                       out_indices_ref.mData.data());
    };
    const auto i_inLengths  = to_int_vector(args.inLengths);

--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -37,6 +37,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    static constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack>{};
+    static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;
    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);
@@ -69,7 +71,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
        const auto xdlops_a_idx = xdlops_gemm.CalculateAThreadOriginDataIndex();
-        return make_tuple(0, waveId_m, xdlops_a_idx[I1], Number<KPack>{} * xdlops_a_idx[I0]);
+        return make_tuple(0, waveId_m, xdlops_a_idx[I1], KPerThread * xdlops_a_idx[I0]);
    }
    __device__ static auto CalculateBThreadOriginDataIndex()
@@ -80,7 +82,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
        const auto xdlops_b_idx = xdlops_gemm.CalculateBThreadOriginDataIndex();
-        return make_tuple(0, waveId_n, xdlops_b_idx[I1], Number<KPack>{} * xdlops_b_idx[I0]);
+        return make_tuple(0, waveId_n, xdlops_b_idx[I1], KPerThread * xdlops_b_idx[I0]);
    }
    template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
@@ -271,7 +273,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                   make_tuple(I0, I0, I0, I0),
                                   b_thread_buf);
-                static_for<0, KPerBlock, KPack * xdlops_gemm.K0PerXdlops>{}([&](auto k) {
+                static_for<0, KPerThread, KPack>{}([&](auto k) {
                    vector_type<FloatAB, KPack> a_thread_vec;
                    vector_type<FloatAB, KPack> b_thread_vec;
@@ -298,13 +300,13 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    }
    private:
-    // A[M0, M1, M2, KPerBlock]
+    // A[M0, M1, M2, KPerThread]
    static constexpr auto a_thread_desc_ =
-        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerBlock>{}));
+        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
-    // B[N0, N1, N2, KPerBlock]
+    // B[N0, N1, N2, KPerThread]
    static constexpr auto b_thread_desc_ =
-        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerBlock>{}));
+        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));
    // C[M, N, NumRegXdlops]
    static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
@@ -314,7 +316,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                         FloatAB,
                                                         decltype(a_block_desc_m0_m1_m2_k),
                                                         decltype(a_thread_desc_),
-                                                         Sequence<1, 1, 1, KPerBlock>,
+                                                         Sequence<1, 1, 1, KPerThread>,
                                                         Sequence<0, 1, 2, 3>,
                                                         3,
                                                         A_K1,
@@ -324,7 +326,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                         FloatAB,
                                                         decltype(b_block_desc_n0_n1_n2_k),
                                                         decltype(b_thread_desc_),
-                                                         Sequence<1, 1, 1, KPerBlock>,
+                                                         Sequence<1, 1, 1, KPerThread>,
                                                         Sequence<0, 1, 2, 3>,
                                                         3,
                                                         B_K1,

--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
 #ifndef CK_MATH_V2_HPP
 #define CK_MATH_V2_HPP
+#include <cmath>
 #include "data_type.hpp"
+#include "half.hpp"
 namespace ck {
 namespace math {
-static inline __device__ half_t abs(half_t x) { return __habs(x); };
+static inline __host__ float abs(float x) { return std::abs(x); };
-static inline __device__ half_t sqrtf(half_t x) { return hsqrt(x); };
-static inline __device__ bool isnan(half_t x) { return __hisnan(x); };
+static inline __host__ double abs(double x) { return std::abs(x); };
+static inline __host__ int8_t abs(int8_t x)
+{
+    int8_t sgn = x >> (8 - 1);
+    return (x ^ sgn) - sgn;
+};
+static inline __host__ int32_t abs(int32_t x)
+{
+    int32_t sgn = x >> (32 - 1);
+    return (x ^ sgn) - sgn;
+};
+static inline __host__ half_t abs(half_t x)
+{
+    half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
+    half_float::half abs_xx = half_float::abs(xx);
+    half_t abs_x = *reinterpret_cast<half_t*>(&abs_xx);
+    return abs_x;
+};
+static inline __host__ float isnan(float x) { return std::isnan(x); };
+static inline __host__ double isnan(double x) { return std::isnan(x); };
+static inline __host__ int8_t isnan(int8_t x)
+{
+    (void)x;
+    return false;
+};
+static inline __host__ int32_t isnan(int32_t x)
+{
+    (void)x;
+    return false;
+};
+static inline __host__ bool isnan(half_t x)
+{
+    half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
+    return half_float::isnan(xx);
+};
 } // namespace math
 } // namespace ck

--- a/include/ck/utility/reduction_common.hpp
+++ b/include/ck/utility/reduction_common.hpp
@@ -33,7 +33,7 @@ namespace ck {
 struct float_equal_one
 {
    template <class T>
-    __device__ inline bool operator()(T x)
+    __host__ __device__ inline bool operator()(T x)
    {
        return x <= static_cast<T>(1.0f) and x >= static_cast<T>(1.0f);
    };
@@ -42,7 +42,7 @@ struct float_equal_one
 struct float_equal_zero
 {
    template <class T>
-    __device__ inline bool operator()(T x)
+    __host__ __device__ inline bool operator()(T x)
    {
        return x <= static_cast<T>(0.0f) and x >= static_cast<T>(0.0f);
    };

--- a/library/include/ck/library/host_tensor/host_reduce_util.hpp
+++ b/library/include/ck/library/host_tensor/host_reduce_util.hpp
@@ -26,7 +26,6 @@
 #ifndef GUARD_HOST_REDUCE_UTIL_HPP
 #define GUARD_HOST_REDUCE_UTIL_HPP
-#include <half.hpp>
 #include <limits>
 #include <cmath>
 #include <cassert>
@@ -34,6 +33,8 @@
 #include <string>
 #include "reduction_enums.hpp"
+#include "data_type.hpp"
+#include "math_v2.hpp"
 namespace ck {
@@ -42,34 +43,10 @@ namespace host_reduce {
 using ck::NanPropagation;
 using ck::ReduceTensorOp;
-template <typename T>
-static inline bool float_equal_one(T);
-static inline bool float_equal_one(float x) { return x == 1.0f; };
-static inline bool float_equal_one(double x) { return x == 1.0; };
-static inline bool float_equal_one(half_float::half x)
-{
-    return x == static_cast<half_float::half>(1.0f);
-};
-template <typename T>
-static inline bool float_equal_zero(T x);
-static inline bool float_equal_zero(float x) { return x == 0.0f; };
-static inline bool float_equal_zero(double x) { return x == 0.0; };
-static inline bool float_equal_zero(half_float::half x)
-{
-    return x == static_cast<half_float::half>(0.0f);
-};
 template <typename AccDataType, ReduceTensorOp ReduceOpId>
 __host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
 {
-    using std::abs;
+    using ck::math::abs;
    if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
    {
@@ -196,11 +173,11 @@ __host__ static inline AccDataType ReduceOpZeroVal()
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
-        return (std::numeric_limits<AccDataType>::max());
+        return (ck::NumericLimits<AccDataType>::Max());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
    {
-        return (std::numeric_limits<AccDataType>::lowest());
+        return (ck::NumericLimits<AccDataType>::Lowest());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
    {
@@ -222,7 +199,7 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
                     AccDataType& accuVal,
                     AccDataType currVal)
 {
-    using std::isnan;
+    using ck::math::isnan;
    if constexpr(!PropagateNan)
    {
@@ -245,7 +222,7 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe
                      int& accuIndex,
                      int currIndex)
 {
-    using std::isnan;
+    using ck::math::isnan;
    if constexpr(!PropagateNan)
    {

--- a/library/include/ck/library/host_tensor/host_reduction.hpp
+++ b/library/include/ck/library/host_tensor/host_reduction.hpp
@@ -32,6 +32,7 @@
 #include <functional>
 #include "reduction_enums.hpp"
+#include "reduction_common.hpp"
 #include "host_reduce_util.hpp"
 #include "host_tensor.hpp"
 #include "data_type.hpp"
@@ -196,10 +197,10 @@ struct ReductionHost
                            OutDataType* out_data,
                            IndexDataType* out_indices)
    {
+        using ck::float_equal_one;
+        using ck::float_equal_zero;
        using ck::type_convert;
        using ck::host_reduce::binop_with_nan_check2;
-        using ck::host_reduce::float_equal_one;
-        using ck::host_reduce::float_equal_zero;
        using ck::host_reduce::ReduceOpFn2;
        using ck::host_reduce::ReduceOpZeroVal;
@@ -227,10 +228,10 @@ struct ReductionHost
            posUnaryOp(accuVal);
-            if(!float_equal_one(alpha))
+            if(!float_equal_one{}(alpha))
                accuVal *= type_convert<AccDataType>(alpha);
-            if(!float_equal_zero(beta))
+            if(!float_equal_zero{}(beta))
                accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
            out_data[0]    = type_convert<OutDataType>(accuVal);
@@ -263,13 +264,13 @@ struct ReductionHost
                posUnaryOp(accuVal);
-                if(!float_equal_one(alpha))
+                if(!float_equal_one{}(alpha))
                    accuVal *= type_convert<AccDataType>(alpha);
                auto dst_offset =
                    get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
-                if(!float_equal_zero(beta))
+                if(!float_equal_zero{}(beta))
                    accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
                               type_convert<AccDataType>(beta);
@@ -303,10 +304,10 @@ struct ReductionHost
    void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
    {
+        using ck::float_equal_one;
+        using ck::float_equal_zero;
        using ck::type_convert;
        using ck::host_reduce::binop_with_nan_check;
-        using ck::host_reduce::float_equal_one;
-        using ck::host_reduce::float_equal_zero;
        using ck::host_reduce::ReduceOpFn;
        using ck::host_reduce::ReduceOpZeroVal;
@@ -330,10 +331,10 @@ struct ReductionHost
            posUnaryOp(accuVal);
-            if(!float_equal_one(alpha))
+            if(!float_equal_one{}(alpha))
                accuVal *= type_convert<AccDataType>(alpha);
-            if(!float_equal_zero(beta))
+            if(!float_equal_zero{}(beta))
                accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
            out_data[0] = type_convert<OutDataType>(accuVal);
@@ -361,13 +362,13 @@ struct ReductionHost
                posUnaryOp(accuVal);
-                if(!float_equal_one(alpha))
+                if(!float_equal_one{}(alpha))
                    accuVal *= type_convert<AccDataType>(alpha);
                auto dst_offset =
                    get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
-                if(!float_equal_zero(beta))
+                if(!float_equal_zero{}(beta))
                    accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
                               type_convert<AccDataType>(beta);

--- a/profiler/include/profile_reduce_impl.hpp
+++ b/profiler/include/profile_reduce_impl.hpp
@@ -380,13 +380,9 @@ void profile_reduce_impl_impl(bool do_verification,
        if(do_verification)
        {
-            using HostInDataType  = typename type_mapping<InDataType>::OutType;
+            ReductionHost<InDataType,
-            using HostOutDataType = typename type_mapping<OutDataType>::OutType;
+                          AccDataType,
-            using HostAccDataType = typename type_mapping<AccDataType>::OutType;
+                          OutDataType,
-            ReductionHost<HostInDataType,
-                          HostAccDataType,
-                          HostOutDataType,
                          ReduceOpId,
                          Rank,
                          NumReduceDim,
@@ -394,11 +390,8 @@ void profile_reduce_impl_impl(bool do_verification,
                          NeedIndices>
                hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-            hostReduce.Run(alpha,
+            hostReduce.Run(
-                           reinterpret_cast<const HostInDataType*>(in.mData.data()),
+                alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
-                           beta,
-                           reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
-                           out_indices_ref.mData.data());
        };
        const auto i_inLengths  = to_int_vector(inLengths);

--- a/test/reduce/reduce_no_index.cpp
+++ b/test/reduce/reduce_no_index.cpp
@@ -37,19 +37,6 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
    return invariantDims;
 };
-// map the data type used by the GPU kernels to the corresponding type used by the host codes
-template <typename InType>
-struct type_mapping
-{
-    using OutType = InType;
-};
-template <>
-struct type_mapping<ck::half_t>
-{
-    using OutType = half_float::half;
-};
 constexpr int Rank = 4;
 constexpr ReduceTensorOp ReduceOpId      = ReduceTensorOp::AVG;
@@ -226,13 +213,9 @@ bool test_reduce_no_index_impl(int init_method,
    bool result = true;
-    using HostInDataType  = typename type_mapping<InDataType>::OutType;
+    ReductionHost<InDataType,
-    using HostOutDataType = typename type_mapping<OutDataType>::OutType;
+                  AccDataType,
-    using HostAccDataType = typename type_mapping<AccDataType>::OutType;
+                  OutDataType,
-    ReductionHost<HostInDataType,
-                  HostAccDataType,
-                  HostOutDataType,
                  ReduceOpId,
                  Rank,
                  NumReduceDim,
@@ -240,11 +223,7 @@ bool test_reduce_no_index_impl(int init_method,
                  NeedIndices>
        hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-    hostReduce.Run(alpha,
+    hostReduce.Run(alpha, in.mData.data(), beta, out_ref.mData.data(), nullptr);
-                   reinterpret_cast<const HostInDataType*>(in.mData.data()),
-                   beta,
-                   reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
-                   nullptr);
    const auto i_inLengths  = to_int_vector(inLengths);
    const auto i_inStrides  = to_int_vector(inStrides);

--- a/test/reduce/reduce_with_index.cpp
+++ b/test/reduce/reduce_with_index.cpp
@@ -36,19 +36,6 @@ static inline std::vector<int> get_invariant_dims(const std::vector<int>& reduce
    return invariantDims;
 };
-// map the data type used by the GPU kernels to the corresponding type used by the host codes
-template <typename InType>
-struct type_mapping
-{
-    using OutType = InType;
-};
-template <>
-struct type_mapping<ck::half_t>
-{
-    using OutType = half_float::half;
-};
 constexpr int Rank = 4;
 constexpr ReduceTensorOp ReduceOpId      = ReduceTensorOp::AMAX;
@@ -209,13 +196,9 @@ bool test_reduce_with_index_impl(int init_method,
    bool result = true;
-    using HostInDataType  = typename type_mapping<InDataType>::OutType;
+    ReductionHost<InDataType,
-    using HostOutDataType = typename type_mapping<OutDataType>::OutType;
+                  AccDataType,
-    using HostAccDataType = typename type_mapping<AccDataType>::OutType;
+                  OutDataType,
-    ReductionHost<HostInDataType,
-                  HostAccDataType,
-                  HostOutDataType,
                  ReduceOpId,
                  Rank,
                  NumReduceDim,
@@ -223,11 +206,8 @@ bool test_reduce_with_index_impl(int init_method,
                  NeedIndices>
        hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);
-    hostReduce.Run(alpha,
+    hostReduce.Run(
-                   reinterpret_cast<const HostInDataType*>(in.mData.data()),
+        alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
-                   beta,
-                   reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
-                   out_indices_ref.mData.data());
    const auto i_inLengths  = to_int_vector(inLengths);
    const auto i_inStrides  = to_int_vector(inStrides);