Merge branch 'develop' into jd/dev_pkg

db775824 · Jehandad Khan · 74397984 · 7c0b1498 · db775824 · db775824
Commit db775824 authored Apr 24, 2022 by Jehandad Khan
20 changed files
--- a/Dockerfile
+++ b/Dockerfile
@@ -48,6 +48,7 @@ RUN apt-get update && DEBIAN_FRONTEND=noninteractive apt-get install -y --allow-
    vim \
    zlib1g-dev \
    openssh-server \
+    clang-format-10 \
    kmod && \
    apt-get clean && \
    rm -rf /var/lib/apt/lists/*

--- a/Jenkinsfile
+++ b/Jenkinsfile
@@ -204,7 +204,7 @@ pipeline {
                stage('Clang Format') {
                    agent{ label rocmnode("nogpu") }
                    environment{
-                        execute_cmd = "find . -iname \'*.h\' \
+                        execute_cmd = "find .. -iname \'*.h\' \
                                -o -iname \'*.hpp\' \
                                -o -iname \'*.cpp\' \
                                -o -iname \'*.h.in\' \

--- a/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
+++ b/example/06_conv2d_fwd_bias_relu/CMakeLists.txt
 add_example_executable(example_conv2d_fwd_xdl_bias_relu conv2d_fwd_xdl_bias_relu.cpp)
+target_link_libraries(example_conv2d_fwd_xdl_bias_relu PRIVATE conv_fwd_util)
--- a/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
+++ b/example/07_conv2d_fwd_bias_relu_add/CMakeLists.txt
 add_example_executable(example_conv2d_fwd_xdl_bias_relu_add conv2d_fwd_xdl_bias_relu_add.cpp)
+target_link_libraries(example_conv2d_fwd_xdl_bias_relu_add PRIVATE conv_fwd_util)
--- a/example/09_convnd_fwd/CMakeLists.txt
+++ b/example/09_convnd_fwd/CMakeLists.txt
 add_example_executable(example_convnd_fwd_xdl convnd_fwd_xdl.cpp)
+target_link_libraries(example_convnd_fwd_xdl PRIVATE conv_fwd_util)
 add_example_executable(example_convnd_fwd_xdl_int8 convnd_fwd_xdl_int8.cpp)
+target_link_libraries(example_convnd_fwd_xdl_int8 PRIVATE conv_fwd_util)
 add_example_executable(example_convnd_fwd_xdl_fp16 convnd_fwd_xdl_fp16.cpp)
+target_link_libraries(example_convnd_fwd_xdl_fp16 PRIVATE conv_fwd_util)
--- a/example/10_conv2d_bwd_data/CMakeLists.txt
+++ b/example/10_conv2d_bwd_data/CMakeLists.txt
 add_example_executable(example_conv2d_bwd_data_xdl conv2d_bwd_data_xdl.cpp)
+target_link_libraries(example_conv2d_bwd_data_xdl PRIVATE conv_fwd_util)
--- a/example/11_conv2d_bwd_weight/CMakeLists.txt
+++ b/example/11_conv2d_bwd_weight/CMakeLists.txt
 add_example_executable(example_conv2d_bwd_weight_xdl conv2d_bwd_weight_xdl.cpp)
+target_link_libraries(example_conv2d_bwd_weight_xdl PRIVATE conv_fwd_util)
--- a/example/12_reduce/reduce_blockwise.cpp
+++ b/example/12_reduce/reduce_blockwise.cpp
@@ -3,7 +3,6 @@
 #include <initializer_list>
 #include <cstdlib>
 #include <getopt.h>
-#include <half.hpp>

 #include "check_err.hpp"
 #include "config.hpp"
@@ -27,10 +26,6 @@ using InDataType  = ck::half_t;
 using OutDataType = ck::half_t;
 using AccDataType = float;

-using HostInDataType  = half_float::half;
-using HostOutDataType = half_float::half;
-using HostAccDataType = float;
-
 constexpr int Rank         = 4;
 constexpr int NumReduceDim = 3;

@@ -306,9 +301,9 @@ int main(int argc, char* argv[])

    if(args.do_verification)
    {
-        ReductionHost<HostInDataType,
-                      HostAccDataType,
-                      HostOutDataType,
+        ReductionHost<InDataType,
+                      AccDataType,
+                      OutDataType,
                      ReduceOpId,
                      Rank,
                      NumReduceDim,
@@ -316,11 +311,8 @@ int main(int argc, char* argv[])
                      NeedIndices>
            hostReduce(in.mDesc, out_ref.mDesc, invariantDims, reduceDims);

-        hostReduce.Run(alpha,
-                       reinterpret_cast<const HostInDataType*>(in.mData.data()),
-                       beta,
-                       reinterpret_cast<HostOutDataType*>(out_ref.mData.data()),
-                       out_indices_ref.mData.data());
+        hostReduce.Run(
+            alpha, in.mData.data(), beta, out_ref.mData.data(), out_indices_ref.mData.data());
    };

    const auto i_inLengths  = to_int_vector(args.inLengths);

--- a/example/17_convnd_bwd_data_xdl/CMakeLists.txt
+++ b/example/17_convnd_bwd_data_xdl/CMakeLists.txt
 add_example_executable(example_convnd_bwd_data_xdl convnd_bwd_data_xdl.cpp)
+target_link_libraries(example_convnd_bwd_data_xdl PRIVATE conv_fwd_util)
--- a/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
+++ b/include/ck/tensor_operation/gpu/block/blockwise_gemm_xdlops.hpp
@@ -39,6 +39,8 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1

    static constexpr auto xdlops_gemm = XdlopsGemm<FloatAB, MPerXDL, NPerXDL, KPack>{};

+    static constexpr index_t KPerThread = KPerBlock / xdlops_gemm.K0PerXdlops;
+
    static constexpr index_t MWaves = MPerBlock / (MRepeat * MPerXDL);
    static constexpr index_t NWaves = NPerBlock / (NRepeat * NPerXDL);

@@ -71,7 +73,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1

        const auto xdlops_a_idx = xdlops_gemm.CalculateAThreadOriginDataIndex();

-        return make_tuple(0, waveId_m, xdlops_a_idx[I1], Number<KPack>{} * xdlops_a_idx[I0]);
+        return make_tuple(0, waveId_m, xdlops_a_idx[I1], KPerThread * xdlops_a_idx[I0]);
    }

    __device__ static auto CalculateBThreadOriginDataIndex()
@@ -82,7 +84,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1

        const auto xdlops_b_idx = xdlops_gemm.CalculateBThreadOriginDataIndex();

-        return make_tuple(0, waveId_n, xdlops_b_idx[I1], Number<KPack>{} * xdlops_b_idx[I0]);
+        return make_tuple(0, waveId_n, xdlops_b_idx[I1], KPerThread * xdlops_b_idx[I0]);
    }

    template <index_t m0, index_t n0, index_t xdlops_i, index_t blk_i>
@@ -273,7 +275,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                   make_tuple(I0, I0, I0, I0),
                                   b_thread_buf);

-                static_for<0, KPerBlock, KPack * xdlops_gemm.K0PerXdlops>{}([&](auto k) {
+                static_for<0, KPerThread, KPack>{}([&](auto k) {
                    vector_type<FloatAB, KPack> a_thread_vec;
                    vector_type<FloatAB, KPack> b_thread_vec;

@@ -300,13 +302,13 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
    }

    private:
-    // A[M0, M1, M2, KPerBlock]
+    // A[M0, M1, M2, KPerThread]
    static constexpr auto a_thread_desc_ =
-        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerBlock>{}));
+        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));

-    // B[N0, N1, N2, KPerBlock]
+    // B[N0, N1, N2, KPerThread]
    static constexpr auto b_thread_desc_ =
-        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerBlock>{}));
+        make_naive_tensor_descriptor_packed(make_tuple(I1, I1, I1, Number<KPerThread>{}));

    // C[M, N, NumRegXdlops]
    static constexpr auto c_thread_desc_ = make_naive_tensor_descriptor_packed(
@@ -316,7 +318,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                         FloatAB,
                                                         decltype(a_block_desc_m0_m1_m2_k),
                                                         decltype(a_thread_desc_),
-                                                         Sequence<1, 1, 1, KPerBlock>,
+                                                         Sequence<1, 1, 1, KPerThread>,
                                                         Sequence<0, 1, 2, 3>,
                                                         3,
                                                         A_K1,
@@ -326,7 +328,7 @@ struct BlockwiseGemmXdlops_k0mk1_k0nk1_m0n0m1n1m2m3m4n2_v1
                                                         FloatAB,
                                                         decltype(b_block_desc_n0_n1_n2_k),
                                                         decltype(b_thread_desc_),
-                                                         Sequence<1, 1, 1, KPerBlock>,
+                                                         Sequence<1, 1, 1, KPerThread>,
                                                         Sequence<0, 1, 2, 3>,
                                                         3,
                                                         B_K1,

--- a/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
+++ b/include/ck/tensor_operation/gpu/thread/threadwise_tensor_slice_transfer_v3r1.hpp
@@ -277,9 +277,12 @@ struct ThreadwiseTensorSliceTransfer_v3r1
        // sub-dword transpose between src_thread_scratch_ and dst_thread_scratch_
        // TODO make this logic more generic for more sub-dword datatype
        if constexpr(SrcVectorDim != DstVectorDim &&
-                     is_same<half_t, remove_cvref_t<SrcData>>::value &&
+                     ((is_same<half_t, remove_cvref_t<SrcData>>::value &&
                       is_same<half_t, remove_cvref_t<DstData>>::value &&
-                     SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0)
+                       SrcScalarPerVector % 2 == 0 && DstScalarPerVector % 2 == 0) ||
+                      (is_same<int8_t, remove_cvref_t<SrcData>>::value &&
+                       is_same<int8_t, remove_cvref_t<DstData>>::value &&
+                       SrcScalarPerVector % 4 == 0 && DstScalarPerVector % 4 == 0)))
        {
            // each transpose does
            // DstScalarPerVector # of src vectors in src_thread_scratch_

--- a/include/ck/utility/math_v2.hpp
+++ b/include/ck/utility/math_v2.hpp
 #ifndef CK_MATH_V2_HPP
 #define CK_MATH_V2_HPP

+#include <cmath>
 #include "data_type.hpp"
+#include "half.hpp"

 namespace ck {
 namespace math {

-static inline __device__ half_t abs(half_t x) { return __habs(x); };
-static inline __device__ half_t sqrtf(half_t x) { return hsqrt(x); };
-static inline __device__ bool isnan(half_t x) { return __hisnan(x); };
+static inline __host__ float abs(float x) { return std::abs(x); };
+
+static inline __host__ double abs(double x) { return std::abs(x); };
+
+static inline __host__ int8_t abs(int8_t x)
+{
+    int8_t sgn = x >> (8 - 1);
+
+    return (x ^ sgn) - sgn;
+};
+
+static inline __host__ int32_t abs(int32_t x)
+{
+    int32_t sgn = x >> (32 - 1);
+
+    return (x ^ sgn) - sgn;
+};
+
+static inline __host__ half_t abs(half_t x)
+{
+    half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
+
+    half_float::half abs_xx = half_float::abs(xx);
+
+    half_t abs_x = *reinterpret_cast<half_t*>(&abs_xx);
+
+    return abs_x;
+};
+
+static inline __host__ float isnan(float x) { return std::isnan(x); };
+
+static inline __host__ double isnan(double x) { return std::isnan(x); };
+
+static inline __host__ int8_t isnan(int8_t x)
+{
+    (void)x;
+    return false;
+};
+
+static inline __host__ int32_t isnan(int32_t x)
+{
+    (void)x;
+    return false;
+};
+
+static inline __host__ bool isnan(half_t x)
+{
+    half_float::half xx = *reinterpret_cast<half_float::half*>(&x);
+
+    return half_float::isnan(xx);
+};

 } // namespace math
 } // namespace ck

--- a/include/ck/utility/reduction_common.hpp
+++ b/include/ck/utility/reduction_common.hpp
@@ -33,7 +33,7 @@ namespace ck {
 struct float_equal_one
 {
    template <class T>
-    __device__ inline bool operator()(T x)
+    __host__ __device__ inline bool operator()(T x)
    {
        return x <= static_cast<T>(1.0f) and x >= static_cast<T>(1.0f);
    };
@@ -42,7 +42,7 @@ struct float_equal_one
 struct float_equal_zero
 {
    template <class T>
-    __device__ inline bool operator()(T x)
+    __host__ __device__ inline bool operator()(T x)
    {
        return x <= static_cast<T>(0.0f) and x >= static_cast<T>(0.0f);
    };

--- a/include/ck/utility/transpose_vectors.hpp
+++ b/include/ck/utility/transpose_vectors.hpp
@@ -49,7 +49,7 @@ __device__ void transpose_fp16_2x2(const half2_t& x0, const half2_t& x1, half2_t
 template <index_t NX, index_t NY>
 struct transpose_vectors<half_t, NX, NY>
 {
-    // we got [NY * NX] ammount of S data to be transposed
+    // we got [NY * NX] amount of S data to be transposed
    static constexpr index_t s_per_x = NY;
    static constexpr index_t s_per_y = NX;

@@ -83,5 +83,86 @@ struct transpose_vectors<half_t, NX, NY>
    }
 };

+// transpose int8 4x4
+__device__ void transpose_int8_4x4(const int8x4_t& x0,
+                                   const int8x4_t& x1,
+                                   const int8x4_t& x2,
+                                   const int8x4_t& x3,
+                                   int8x4_t& y0,
+                                   int8x4_t& y1,
+                                   int8x4_t& y2,
+                                   int8x4_t& y3)
+{
+    int32_t t0, t1;
+    int32_t z0, z1, z2, z3;
+    constexpr int32_t m0 = 0x05010400;
+    constexpr int32_t m1 = 0x05040100;
+    constexpr int32_t m2 = 0x07060302;
+    constexpr int32_t m3 = 0x07030602;
+
+    // ex: v_perm_b32(0x 11 22 33 44, 0x 55 66 77 88, 0x 05 01 04 00) -> 0x33774488
+    //                   -- -- -- --     -- -- -- --      -  -  -  -
+    //             index  7  6  5  4      3  2  1  0     33 77 44 88
+    // index is reversed because of little endianness (least significant bits first)
+    // clang-format off
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t0) : "v"(bit_cast<int32_t>(x1)), "v"(bit_cast<int32_t>(x0)), "s"(m0));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t1) : "v"(bit_cast<int32_t>(x3)), "v"(bit_cast<int32_t>(x2)), "s"(m0));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z0) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m1));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z1) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m2));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t0) : "v"(bit_cast<int32_t>(x1)), "v"(bit_cast<int32_t>(x0)), "s"(m3));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(t1) : "v"(bit_cast<int32_t>(x3)), "v"(bit_cast<int32_t>(x2)), "s"(m3));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z2) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m1));
+    asm volatile("v_perm_b32 %0, %1, %2, %3" : "=v"(z3) : "v"(bit_cast<int32_t>(t1)), "v"(bit_cast<int32_t>(t0)), "s"(m2));
+    // clang-format on
+
+    y0 = bit_cast<int8x4_t>(z0);
+    y1 = bit_cast<int8x4_t>(z1);
+    y2 = bit_cast<int8x4_t>(z2);
+    y3 = bit_cast<int8x4_t>(z3);
+}
+
+template <index_t NX, index_t NY>
+struct transpose_vectors<int8_t, NX, NY>
+{
+    // we got [NY * NX] amount of S data to be transposed
+    static constexpr index_t s_per_x = NY;
+    static constexpr index_t s_per_y = NX;
+
+    using S  = int8_t;
+    using VX = vector_type<int8_t, s_per_x>;
+    using VY = vector_type<int8_t, s_per_y>;
+
+    __device__ void operator()(const StaticallyIndexedArray<const VX&, NX>& vx_tuple,
+                               StaticallyIndexedArray<VY&, NY>& vy_tuple)
+    {
+        static constexpr auto I1 = Number<1>{};
+        static constexpr auto I2 = Number<2>{};
+        static constexpr auto I3 = Number<3>{};
+        static constexpr auto I4 = Number<4>{};
+
+        static_assert((NX % 4 == 0 && NY % 4 == 0), "wrong!");
+
+        // loop over 4x4 tile and transpose data from vx_tuple into vy_tuple
+        static_for<0, NY, 4>{}([&](auto iy) {
+            static_for<0, NX, 4>{}([&](auto ix) {
+                // reference to 4 int8 data from vx_tuple
+                const auto& x_s4_0 = vx_tuple[ix].template AsType<int8x4_t>()[iy / I4];
+                const auto& x_s4_1 = vx_tuple[ix + I1].template AsType<int8x4_t>()[iy / I4];
+                const auto& x_s4_2 = vx_tuple[ix + I2].template AsType<int8x4_t>()[iy / I4];
+                const auto& x_s4_3 = vx_tuple[ix + I3].template AsType<int8x4_t>()[iy / I4];
+
+                // reference to 4 int8 data from vy_tuple
+                auto& y_s4_0 = vy_tuple(iy).template AsType<int8x4_t>()(ix / I4);
+                auto& y_s4_1 = vy_tuple(iy + I1).template AsType<int8x4_t>()(ix / I4);
+                auto& y_s4_2 = vy_tuple(iy + I2).template AsType<int8x4_t>()(ix / I4);
+                auto& y_s4_3 = vy_tuple(iy + I3).template AsType<int8x4_t>()(ix / I4);
+
+                // transpose
+                transpose_int8_4x4(x_s4_0, x_s4_1, x_s4_2, x_s4_3, y_s4_0, y_s4_1, y_s4_2, y_s4_3);
+            });
+        });
+    }
+};
+
 } // namespace ck
 #endif
--- a/library/CMakeLists.txt
+++ b/library/CMakeLists.txt
 add_subdirectory(src/host_tensor)
 add_subdirectory(src/tensor_operation_instance/gpu)
+add_subdirectory(src/utility)
--- a/library/include/ck/library/host_tensor/host_reduce_util.hpp
+++ b/library/include/ck/library/host_tensor/host_reduce_util.hpp
@@ -26,7 +26,6 @@
 #ifndef GUARD_HOST_REDUCE_UTIL_HPP
 #define GUARD_HOST_REDUCE_UTIL_HPP

-#include <half.hpp>
 #include <limits>
 #include <cmath>
 #include <cassert>
@@ -34,6 +33,8 @@
 #include <string>

 #include "reduction_enums.hpp"
+#include "data_type.hpp"
+#include "math_v2.hpp"

 namespace ck {

@@ -42,34 +43,10 @@ namespace host_reduce {
 using ck::NanPropagation;
 using ck::ReduceTensorOp;

-template <typename T>
-static inline bool float_equal_one(T);
-
-static inline bool float_equal_one(float x) { return x == 1.0f; };
-
-static inline bool float_equal_one(double x) { return x == 1.0; };
-
-static inline bool float_equal_one(half_float::half x)
-{
-    return x == static_cast<half_float::half>(1.0f);
-};
-
-template <typename T>
-static inline bool float_equal_zero(T x);
-
-static inline bool float_equal_zero(float x) { return x == 0.0f; };
-
-static inline bool float_equal_zero(double x) { return x == 0.0; };
-
-static inline bool float_equal_zero(half_float::half x)
-{
-    return x == static_cast<half_float::half>(0.0f);
-};
-
 template <typename AccDataType, ReduceTensorOp ReduceOpId>
 __host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
 {
-    using std::abs;
+    using ck::math::abs;

    if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
    {
@@ -196,11 +173,11 @@ __host__ static inline AccDataType ReduceOpZeroVal()
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
    {
-        return (std::numeric_limits<AccDataType>::max());
+        return (ck::NumericLimits<AccDataType>::Max());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
    {
-        return (std::numeric_limits<AccDataType>::lowest());
+        return (ck::NumericLimits<AccDataType>::Lowest());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
    {
@@ -222,7 +199,7 @@ binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
                     AccDataType& accuVal,
                     AccDataType currVal)
 {
-    using std::isnan;
+    using ck::math::isnan;

    if constexpr(!PropagateNan)
    {
@@ -245,7 +222,7 @@ binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opRe
                      int& accuIndex,
                      int currIndex)
 {
-    using std::isnan;
+    using ck::math::isnan;

    if constexpr(!PropagateNan)
    {

--- a/library/include/ck/library/host_tensor/host_reduction.hpp
+++ b/library/include/ck/library/host_tensor/host_reduction.hpp
@@ -32,6 +32,7 @@
 #include <functional>

 #include "reduction_enums.hpp"
+#include "reduction_common.hpp"
 #include "host_reduce_util.hpp"
 #include "host_tensor.hpp"
 #include "data_type.hpp"
@@ -196,10 +197,10 @@ struct ReductionHost
                            OutDataType* out_data,
                            IndexDataType* out_indices)
    {
+        using ck::float_equal_one;
+        using ck::float_equal_zero;
        using ck::type_convert;
        using ck::host_reduce::binop_with_nan_check2;
-        using ck::host_reduce::float_equal_one;
-        using ck::host_reduce::float_equal_zero;
        using ck::host_reduce::ReduceOpFn2;
        using ck::host_reduce::ReduceOpZeroVal;

@@ -227,10 +228,10 @@ struct ReductionHost

            posUnaryOp(accuVal);

-            if(!float_equal_one(alpha))
+            if(!float_equal_one{}(alpha))
                accuVal *= type_convert<AccDataType>(alpha);

-            if(!float_equal_zero(beta))
+            if(!float_equal_zero{}(beta))
                accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);

            out_data[0]    = type_convert<OutDataType>(accuVal);
@@ -263,13 +264,13 @@ struct ReductionHost

                posUnaryOp(accuVal);

-                if(!float_equal_one(alpha))
+                if(!float_equal_one{}(alpha))
                    accuVal *= type_convert<AccDataType>(alpha);

                auto dst_offset =
                    get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);

-                if(!float_equal_zero(beta))
+                if(!float_equal_zero{}(beta))
                    accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
                               type_convert<AccDataType>(beta);

@@ -303,10 +304,10 @@ struct ReductionHost

    void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
    {
+        using ck::float_equal_one;
+        using ck::float_equal_zero;
        using ck::type_convert;
        using ck::host_reduce::binop_with_nan_check;
-        using ck::host_reduce::float_equal_one;
-        using ck::host_reduce::float_equal_zero;
        using ck::host_reduce::ReduceOpFn;
        using ck::host_reduce::ReduceOpZeroVal;

@@ -330,10 +331,10 @@ struct ReductionHost

            posUnaryOp(accuVal);

-            if(!float_equal_one(alpha))
+            if(!float_equal_one{}(alpha))
                accuVal *= type_convert<AccDataType>(alpha);

-            if(!float_equal_zero(beta))
+            if(!float_equal_zero{}(beta))
                accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);

            out_data[0] = type_convert<OutDataType>(accuVal);
@@ -361,13 +362,13 @@ struct ReductionHost

                posUnaryOp(accuVal);

-                if(!float_equal_one(alpha))
+                if(!float_equal_one{}(alpha))
                    accuVal *= type_convert<AccDataType>(alpha);

                auto dst_offset =
                    get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);

-                if(!float_equal_zero(beta))
+                if(!float_equal_zero{}(beta))
                    accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
                               type_convert<AccDataType>(beta);


--- a/library/include/ck/library/utility/conv_fwd_util.hpp
+++ b/library/include/ck/library/utility/conv_fwd_util.hpp
--- a/library/include/ck/library/utility/fill.hpp
+++ b/library/include/ck/library/utility/fill.hpp
+#pragma once
+
+#include <algorithm>
+#include <random>
+
+#include "data_type.hpp"
+
+namespace ck {
+namespace utils {
+
+// template <typename T, class Enable = void>
+// struct FillUniform;
+
+// TODO: what's wrong with this specialization???
+// err: segmentation fault in mt19937 - infinite loop like.
+// template <typename T>
+// struct FillUniform<T, typename std::enable_if<std::is_integral<T>::value &&
+//                                               !std::is_same<T, bhalf_t>::value>::type>
+// {
+//     int a_{0};
+//     int b_{5};
+//     // T a_ = T{0};
+//     // T b_ = T{5};
+
+//     template <typename ForwardIter>
+//     void operator()(ForwardIter first, ForwardIter last) const
+//     {
+//         std::mt19937 gen{11939};
+//         std::uniform_int_distribution<int> dis(a_, b_);
+//         std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
+//     }
+// };
+
+// struct FillUniform<T, typename std::enable_if<std::is_floating_point<T>::value ||
+//                                               std::is_same<T, bhalf_t>::value>::type>
+template <typename T>
+struct FillUniform
+{
+    float a_{0};
+    float b_{5};
+
+    template <typename ForwardIter>
+    void operator()(ForwardIter first, ForwardIter last) const
+    {
+        std::mt19937 gen{11939};
+        std::uniform_real_distribution<> dis(a_, b_);
+        std::generate(first, last, [&dis, &gen]() { return ck::type_convert<T>(dis(gen)); });
+    }
+};
+
+template <typename T>
+struct FillMonotonicSeq
+{
+    T init_value_{0};
+    T step_{1};
+
+    template <typename ForwardIter>
+    void operator()(ForwardIter first, ForwardIter last) const
+    {
+        std::generate(first, last, [=, n = init_value_]() mutable {
+            auto tmp = n;
+            n += step_;
+            return tmp;
+        });
+    }
+};
+
+template <typename T>
+struct FillConstant
+{
+    T value_{0};
+
+    template <typename ForwardIter>
+    void operator()(ForwardIter first, ForwardIter last) const
+    {
+        std::fill(first, last, value_);
+    }
+};
+
+} // namespace utils
+} // namespace ck
--- a/library/include/ck/library/utility/op_instance_engine.hpp
+++ b/library/include/ck/library/utility/op_instance_engine.hpp
+#pragma once
+
+#include <cstdlib>
+#include <limits>
+#include <memory>
+#include <stdexcept>
+#include <tuple>
+#include <utility>
+#include <vector>
+
+#include "check_err.hpp"
+#include "device_base.hpp"
+#include "functional2.hpp"
+
+namespace ck {
+namespace utils {
+
+struct ProfileBestConfig
+{
+    std::string best_op_name;
+    float best_avg_time   = std::numeric_limits<float>::max();
+    float best_tflops     = std::numeric_limits<float>::max();
+    float best_gb_per_sec = std::numeric_limits<float>::max();
+};
+
+/**
+ * @brief      This class describes an operation instance(s).
+ *
+ *             Op instance defines a particular specializations of operator
+ *             template. Thanks to this specific input/output data types, data
+ *             layouts and modifying elementwise operations it is able to create
+ *             it's input/output tensors, provide pointers to instances which
+ *             can execute it and all operation specific parameters.
+ */
+template <typename OutDataType, typename... InArgTypes>
+class OpInstance
+{
+    public:
+    template <typename T>
+    using TensorPtr      = std::unique_ptr<Tensor<T>>;
+    using InTensorsTuple = std::tuple<TensorPtr<InArgTypes>...>;
+    using DeviceMemPtr   = std::unique_ptr<DeviceMem>;
+    using DeviceBuffers  = std::vector<DeviceMemPtr>;
+
+    OpInstance()                  = default;
+    OpInstance(const OpInstance&) = default;
+    OpInstance& operator=(const OpInstance&) = default;
+    virtual ~OpInstance(){};
+
+    virtual InTensorsTuple GetInputTensors() const         = 0;
+    virtual TensorPtr<OutDataType> GetOutputTensor() const = 0;
+    virtual std::unique_ptr<tensor_operation::device::BaseInvoker>
+    MakeInvokerPointer(tensor_operation::device::BaseOperator*) const = 0;
+    virtual std::unique_ptr<tensor_operation::device::BaseArgument>
+    MakeArgumentPointer(tensor_operation::device::BaseOperator*,
+                        const DeviceBuffers&,
+                        const DeviceMemPtr&) const = 0;
+    virtual std::size_t GetFlops() const           = 0;
+    virtual std::size_t GetBtype() const           = 0;
+};
+
+/**
+ * @brief      A generic operation instance run engine.
+ */
+template <typename OutDataType, typename... InArgTypes>
+class OpInstanceRunEngine
+{
+    public:
+    using OpInstanceT = OpInstance<InArgTypes..., OutDataType>;
+    template <typename T>
+    using TensorPtr        = std::unique_ptr<Tensor<T>>;
+    using DeviceMemPtr     = std::unique_ptr<DeviceMem>;
+    using InTensorsTuple   = std::tuple<TensorPtr<InArgTypes>...>;
+    using DeviceBuffers    = std::vector<DeviceMemPtr>;
+    using InArgsTypesTuple = std::tuple<InArgTypes...>;
+
+    OpInstanceRunEngine() = delete;
+
+    template <typename ReferenceOp = std::function<void()>>
+    OpInstanceRunEngine(const OpInstanceT& op_instance,
+                        const ReferenceOp& reference_op = ReferenceOp{})
+        : op_instance_{op_instance}
+    {
+        in_tensors_ = op_instance_.GetInputTensors();
+        out_tensor_ = op_instance_.GetOutputTensor();
+
+        if constexpr(std::is_invocable_v<ReferenceOp,
+                                         const Tensor<InArgTypes>&...,
+                                         Tensor<OutDataType>&>)
+        {
+            ref_output_ = op_instance_.GetOutputTensor();
+            CallRefOpUnpackArgs(reference_op, std::make_index_sequence<kNInArgs_>{});
+        }
+        AllocateDeviceInputTensors(std::make_index_sequence<kNInArgs_>{});
+        out_device_buffer_ =
+            std::make_unique<DeviceMem>(sizeof(OutDataType) * out_tensor_->mDesc.GetElementSpace());
+        out_device_buffer_->SetZero();
+    }
+
+    virtual ~OpInstanceRunEngine(){};
+
+    template <typename OpInstancePtr>
+    bool Test(const std::vector<OpInstancePtr>& op_ptrs)
+    {
+        bool res{true};
+        for(auto& op_ptr : op_ptrs)
+        {
+            auto invoker  = op_instance_.MakeInvokerPointer(op_ptr.get());
+            auto argument = op_instance_.MakeArgumentPointer(
+                op_ptr.get(), in_device_buffers_, out_device_buffer_);
+            if(op_ptr->IsSupportedArgument(argument.get()))
+            {
+                invoker->Run(argument.get());
+                out_device_buffer_->FromDevice(out_tensor_->mData.data());
+                if(!ref_output_)
+                {
+                    throw std::runtime_error(
+                        "OpInstanceRunEngine::Test: Reference value not availabe."
+                        " You have to provide reference function.");
+                }
+                // TODO: enable flexible use of custom check_error functions
+                res = res && check_err(out_tensor_->mData, ref_output_->mData);
+                out_device_buffer_->SetZero();
+            }
+        }
+        return res;
+    }
+
+    template <typename OpInstancePtr>
+    ProfileBestConfig Profile(const std::vector<OpInstancePtr>& op_ptrs,
+                              int nrepeat          = 100,
+                              bool do_verification = false,
+                              bool do_log          = false)
+    {
+        bool res{true};
+        ProfileBestConfig best_config;
+
+        for(auto& op_ptr : op_ptrs)
+        {
+            auto invoker  = op_instance_.MakeInvokerPointer(op_ptr.get());
+            auto argument = op_instance_.MakeArgumentPointer(
+                op_ptr.get(), in_device_buffers_, out_device_buffer_);
+            if(op_ptr->IsSupportedArgument(argument.get()))
+            {
+                std::string op_name = op_ptr->GetTypeString();
+                float avg_time      = invoker->Run(argument.get(), nrepeat);
+
+                std::size_t flops     = op_instance_.GetFlops();
+                std::size_t num_btype = op_instance_.GetBtype();
+                float tflops          = static_cast<float>(flops) / 1.E9 / avg_time;
+                float gb_per_sec      = num_btype / 1.E6 / avg_time;
+
+                std::cout << "Perf: " << avg_time << " ms, " << tflops << " TFlops, " << gb_per_sec
+                          << " GB/s, " << op_name << std::endl;
+
+                if(tflops < best_config.best_tflops)
+                {
+                    best_config.best_op_name    = op_name;
+                    best_config.best_tflops     = tflops;
+                    best_config.best_gb_per_sec = gb_per_sec;
+                    best_config.best_avg_time   = avg_time;
+                }
+
+                if(do_verification)
+                {
+                    out_device_buffer_->FromDevice(out_tensor_->mData.data());
+                    if(!ref_output_)
+                    {
+                        throw std::runtime_error(
+                            "OpInstanceRunEngine::Profile: Reference value not availabe."
+                            " You have to provide reference function.");
+                    }
+                    // TODO: enable flexible use of custom check_error functions
+                    res = res && CheckErr(out_tensor_->mData, ref_output_->mData);
+
+                    if(do_log) {}
+                }
+                out_device_buffer_->SetZero();
+            }
+        }
+        return best_config;
+    }
+
+    void SetAtol(double a) { atol_ = a; }
+    void SetRtol(double r) { rtol_ = r; }
+
+    private:
+    template <typename F, std::size_t... Is>
+    void CallRefOpUnpackArgs(const F& f, std::index_sequence<Is...>) const
+    {
+        f(*std::get<Is>(in_tensors_)..., *ref_output_);
+    }
+
+    template <std::size_t... Is>
+    void AllocateDeviceInputTensors(std::index_sequence<Is...>)
+    {
+        (AllocateDeviceInputTensorsImpl<Is>(), ...);
+    }
+
+    template <std::size_t Index>
+    void AllocateDeviceInputTensorsImpl()
+    {
+        const auto& ts = std::get<Index>(in_tensors_);
+        in_device_buffers_
+            .emplace_back(
+                std::make_unique<DeviceMem>(sizeof(std::tuple_element_t<Index, InArgsTypesTuple>) *
+                                            ts->mDesc.GetElementSpace()))
+            ->ToDevice(ts->mData.data());
+    }
+
+    static constexpr std::size_t kNInArgs_ = std::tuple_size_v<InTensorsTuple>;
+    const OpInstanceT& op_instance_;
+    double rtol_{1e-5};
+    double atol_{1e-8};
+
+    InTensorsTuple in_tensors_;
+    TensorPtr<OutDataType> out_tensor_;
+    TensorPtr<OutDataType> ref_output_;
+
+    DeviceBuffers in_device_buffers_;
+    DeviceMemPtr out_device_buffer_;
+
+    template <typename T>
+    bool CheckErr(const std::vector<T>& dev_out, const std::vector<T>& ref_out) const
+    {
+        return ck::utils::check_err(dev_out, ref_out, "Error: incorrect results!", atol_, rtol_);
+    }
+};
+
+} // namespace utils
+} // namespace ck