Commit dd6a8de4 authored by Jehandad Khan

Merge branch 'develop' into jd/dev_pkg

parents 0aa899aa abf4bdb9
#ifndef CK_COMMON_HEADER_HPP
#define CK_COMMON_HEADER_HPP
#pragma once
#include "config.hpp"
#include "array.hpp"
#include "container_helper.hpp"
......@@ -20,30 +18,29 @@
#include "number.hpp"
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "synchronization.hpp"
#include "tuple.hpp"
#include "tuple_helper.hpp"
#include "type.hpp"
#include "magic_division.hpp"
#include "utility.hpp"
#include "c_style_pointer_cast.hpp"
#include "amd_address_space.hpp"
#include "amd_buffer_addressing.hpp"
#include "static_buffer.hpp"
#include "dynamic_buffer.hpp"
#include "is_known_at_compile_time.hpp"
#include "transpose_vectors.hpp"
#include "inner_product.hpp"
#include "element_wise_operation.hpp"
#include "debug.hpp"
#include "amd_buffer_addressing.hpp"
#include "get_id.hpp"
#include "synchronization.hpp"
#include "amd_address_space.hpp"
#include "static_buffer.hpp"
#include "dynamic_buffer.hpp"
// TODO: remove this
#if CK_USE_AMD_INLINE_ASM
#include "amd_inline_asm.hpp"
#endif
#if CK_USE_AMD_XDLOPS
#ifdef CK_USE_AMD_MFMA
#include "amd_xdlops.hpp"
#endif
#endif
#ifndef CK_FLOAT_TYPE_AMD_HPP
#define CK_FLOAT_TYPE_AMD_HPP
#pragma once
#include "statically_indexed_array.hpp"
namespace ck {
......@@ -937,7 +935,7 @@ __host__ __device__ Y type_convert(X x)
// convert bfp16 to fp32
template <>
inline __host__ __device__ float type_convert(bhalf_t x)
inline __host__ __device__ float type_convert<float, bhalf_t>(bhalf_t x)
{
union
{
......@@ -950,7 +948,7 @@ inline __host__ __device__ float type_convert(bhalf_t x)
// convert fp32 to bfp16
template <>
inline __host__ __device__ bhalf_t type_convert(float x)
inline __host__ __device__ bhalf_t type_convert<bhalf_t, float>(float x)
{
union
{
......@@ -1090,4 +1088,3 @@ struct NumericLimits<half_t>
};
} // namespace ck
#endif
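For context, a minimal host-side sketch of how the type_convert specializations above round-trip between fp32 and bf16. This is a hypothetical usage example, not part of the diff; the header name is assumed and the code is meant to be compiled as HIP/C++ so the __host__ qualifier resolves.
// Hypothetical round-trip using ck::type_convert (header name assumed).
#include "data_type.hpp"
int main()
{
    float x = 3.14159f;
    ck::bhalf_t b = ck::type_convert<ck::bhalf_t>(x); // fp32 -> bf16: precision is reduced
    float y       = ck::type_convert<float>(b);       // bf16 -> fp32
    // y matches x only to bf16 precision (~3 significant decimal digits)
    return (y > 3.10f && y < 3.18f) ? 0 : 1;
}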
......@@ -3,7 +3,7 @@
namespace ck {
enum DataTypeEnum_t
enum struct DataTypeEnum
{
Half = 0,
Float = 1,
......
......@@ -6,35 +6,35 @@
namespace ck {
template <DataTypeEnum_t DataTypeEnum>
template <DataTypeEnum DataTypeEnum>
struct get_datatype_from_enum;
template <>
struct get_datatype_from_enum<DataTypeEnum_t::Int8>
struct get_datatype_from_enum<DataTypeEnum::Int8>
{
using type = int8_t;
};
template <>
struct get_datatype_from_enum<DataTypeEnum_t::Int32>
struct get_datatype_from_enum<DataTypeEnum::Int32>
{
using type = int32_t;
};
template <>
struct get_datatype_from_enum<DataTypeEnum_t::Half>
struct get_datatype_from_enum<DataTypeEnum::Half>
{
using type = half_t;
};
template <>
struct get_datatype_from_enum<DataTypeEnum_t::Float>
struct get_datatype_from_enum<DataTypeEnum::Float>
{
using type = float;
};
template <>
struct get_datatype_from_enum<DataTypeEnum_t::Double>
struct get_datatype_from_enum<DataTypeEnum::Double>
{
using type = double;
};
......@@ -45,31 +45,31 @@ struct get_datatype_enum_from_type;
template <>
struct get_datatype_enum_from_type<int8_t>
{
static constexpr DataTypeEnum_t value = DataTypeEnum_t::Int8;
static constexpr DataTypeEnum value = DataTypeEnum::Int8;
};
template <>
struct get_datatype_enum_from_type<int32_t>
{
static constexpr DataTypeEnum_t value = DataTypeEnum_t::Int32;
static constexpr DataTypeEnum value = DataTypeEnum::Int32;
};
template <>
struct get_datatype_enum_from_type<half_t>
{
static constexpr DataTypeEnum_t value = DataTypeEnum_t::Half;
static constexpr DataTypeEnum value = DataTypeEnum::Half;
};
template <>
struct get_datatype_enum_from_type<float>
{
static constexpr DataTypeEnum_t value = DataTypeEnum_t::Float;
static constexpr DataTypeEnum value = DataTypeEnum::Float;
};
template <>
struct get_datatype_enum_from_type<double>
{
static constexpr DataTypeEnum_t value = DataTypeEnum_t::Double;
static constexpr DataTypeEnum value = DataTypeEnum::Double;
};
} // namespace ck
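As an illustration of the renamed traits, the enum-to-type and type-to-enum maps above are inverses of each other, which can be checked at compile time. This is a sketch only; the header name below is assumed, since it is not shown in this diff.
// Compile-time sanity checks for the DataTypeEnum traits above (illustrative only).
#include <type_traits>
#include "data_type_enum_helper.hpp" // assumed header name
static_assert(ck::get_datatype_enum_from_type<
                  typename ck::get_datatype_from_enum<ck::DataTypeEnum::Float>::type>::value ==
                  ck::DataTypeEnum::Float,
              "enum -> type -> enum must round-trip");
static_assert(std::is_same<typename ck::get_datatype_from_enum<ck::DataTypeEnum::Half>::type,
                           ck::half_t>::value,
              "DataTypeEnum::Half maps to half_t");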
......
#ifndef CK_BUFFER_HPP
#define CK_BUFFER_HPP
#pragma once
#include "amd_buffer_addressing.hpp"
#include "c_style_pointer_cast.hpp"
#include "config.hpp"
......@@ -8,7 +6,7 @@
namespace ck {
template <AddressSpaceEnum_t BufferAddressSpace,
template <AddressSpaceEnum BufferAddressSpace,
typename T,
typename ElementSpaceSize,
bool InvalidElementUseNumericalZeroValue>
......@@ -34,7 +32,7 @@ struct DynamicBuffer
{
}
__host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace()
__host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace()
{
return BufferAddressSpace;
}
......@@ -55,7 +53,7 @@ struct DynamicBuffer
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X need to be multiple T");
"wrong! X should contain multiple T");
#if CK_USE_AMD_BUFFER_LOAD
bool constexpr use_amd_buffer_addressing = true;
......@@ -63,7 +61,7 @@ struct DynamicBuffer
bool constexpr use_amd_buffer_addressing = false;
#endif
if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global && use_amd_buffer_addressing)
if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
{
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
......@@ -81,50 +79,48 @@ struct DynamicBuffer
}
else
{
if constexpr(InvalidElementUseNumericalZeroValue)
if(is_valid_element)
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp;
__builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
return is_valid_element ? tmp : X{0};
return tmp;
#else
return is_valid_element ? *c_style_pointer_cast<const X*>(&p_data_[i]) : X{0};
return *c_style_pointer_cast<const X*>(&p_data_[i]);
#endif
}
else
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp;
__builtin_memcpy(&tmp, &(p_data_[i]), sizeof(X));
return is_valid_element ? tmp : X{invalid_element_value_};
#else
return is_valid_element ? *c_style_pointer_cast<const X*>(&p_data_[i])
: X{invalid_element_value_};
#endif
if constexpr(InvalidElementUseNumericalZeroValue)
{
return X{0};
}
else
{
return X{invalid_element_value_};
}
}
}
}
template <InMemoryDataOperationEnum_t Op,
template <InMemoryDataOperationEnum Op,
typename X,
typename enable_if<is_same<typename scalar_type<remove_cvref_t<X>>::type,
typename scalar_type<remove_cvref_t<T>>::type>::value,
bool>::type = false>
__host__ __device__ void Update(index_t i, bool is_valid_element, const X& x)
{
if constexpr(Op == InMemoryDataOperationEnum_t::Set)
if constexpr(Op == InMemoryDataOperationEnum::Set)
{
this->template Set<X>(i, is_valid_element, x);
}
else if constexpr(Op == InMemoryDataOperationEnum_t::AtomicAdd)
else if constexpr(Op == InMemoryDataOperationEnum::AtomicAdd)
{
this->template AtomicAdd<X>(i, is_valid_element, x);
}
else if constexpr(Op == InMemoryDataOperationEnum_t::Add)
else if constexpr(Op == InMemoryDataOperationEnum::Add)
{
auto tmp = this->template Get<X>(i, is_valid_element);
this->template Set<X>(i, is_valid_element, x + tmp);
......@@ -145,143 +141,120 @@ struct DynamicBuffer
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X need to be multiple T");
"wrong! X should contain multiple T");
if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Global)
{
#if CK_USE_AMD_BUFFER_STORE
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_buffer_store<remove_cvref_t<T>, t_per_x>(
x, p_data_, i, is_valid_element, element_space_size_);
bool constexpr use_amd_buffer_addressing = true;
#else
if(is_valid_element)
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp = x;
bool constexpr use_amd_buffer_addressing = false;
#endif
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
#if CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
bool constexpr workaround_int8_ds_write_issue = true;
#else
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
#endif
}
bool constexpr workaround_int8_ds_write_issue = false;
#endif
if constexpr(GetAddressSpace() == AddressSpaceEnum::Global && use_amd_buffer_addressing)
{
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_buffer_store<remove_cvref_t<T>, t_per_x>(
x, p_data_, i, is_valid_element, element_space_size_);
}
else if constexpr(GetAddressSpace() == AddressSpaceEnum_t::Lds)
else if constexpr(GetAddressSpace() == AddressSpaceEnum::Lds &&
is_same<typename scalar_type<remove_cvref_t<T>>::type, int8_t>::value &&
workaround_int8_ds_write_issue)
{
if(is_valid_element)
{
#if !CK_WORKAROUND_SWDEV_XXXXXX_INT8_DS_WRITE_ISSUE
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp = x;
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
#else
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
#endif
#else
// HACK: compiler would lower IR "store<i8, 16> address_space(3)" into
// inefficient
// HACK: the compiler would lower IR "store<i8, 16> address_space(3)" into inefficient
// ISA, so I try to let the compiler emit IR "store<i32, 4>", which would be lowered to
// ds_write_b128
// TODO: remove this after compiler fix
if constexpr(is_same<typename scalar_type<remove_cvref_t<T>>::type, int8_t>::value)
static_assert((is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x2_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value) ||
(is_same<remove_cvref_t<T>, int8x4_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value) ||
(is_same<remove_cvref_t<T>, int8x8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value) ||
(is_same<remove_cvref_t<T>, int8x16_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value),
"wrong! not implemented for this combination, please add "
"implementation");
if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8_t>::value)
{
static_assert((is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x2_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value) ||
(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value) ||
(is_same<remove_cvref_t<T>, int8x4_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value) ||
(is_same<remove_cvref_t<T>, int8x8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value) ||
(is_same<remove_cvref_t<T>, int8x16_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value),
"wrong! not implemented for this combination, please add "
"implementation");
if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int8_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int8_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x2_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int16_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int16_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x2_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x4_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x4_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x2_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x16_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x4_t*>(&x);
}
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int8_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int8_t*>(&x);
}
else
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x2_t>::value)
{
#if CK_EXPERIMENTAL_USE_MEMCPY_FOR_VECTOR_ACCESS
X tmp = x;
__builtin_memcpy(&(p_data_[i]), &tmp, sizeof(X));
#else
*c_style_pointer_cast<X*>(&p_data_[i]) = x;
#endif
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int16_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int16_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x2_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x4_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x4_t>::value &&
is_same<remove_cvref_t<X>, int8x4_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x8_t>::value &&
is_same<remove_cvref_t<X>, int8x8_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x2_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x2_t*>(&x);
}
else if constexpr(is_same<remove_cvref_t<T>, int8x16_t>::value &&
is_same<remove_cvref_t<X>, int8x16_t>::value)
{
// HACK: cast pointer of x is bad
// TODO: remove this after compiler fix
*c_style_pointer_cast<int32x4_t*>(&p_data_[i]) =
*c_style_pointer_cast<const int32x4_t*>(&x);
}
#endif
}
}
else
......@@ -305,27 +278,49 @@ struct DynamicBuffer
bool>::type = false>
__host__ __device__ void AtomicAdd(index_t i, bool is_valid_element, const X& x)
{
using scalar_t = typename scalar_type<remove_cvref_t<T>>::type;
// X contains multiple T
constexpr index_t scalar_per_t_vector = scalar_type<remove_cvref_t<T>>::vector_size;
constexpr index_t scalar_per_x_vector = scalar_type<remove_cvref_t<X>>::vector_size;
static_assert(scalar_per_x_vector % scalar_per_t_vector == 0,
"wrong! X need to be multiple T");
static_assert(GetAddressSpace() == AddressSpaceEnum_t::Global, "only support global mem");
"wrong! X should contain multiple T");
static_assert(GetAddressSpace() == AddressSpaceEnum::Global, "only support global mem");
#if CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
bool constexpr use_amd_buffer_addressing =
is_same_v<remove_cvref_t<scalar_t>, int32_t> ||
is_same_v<remove_cvref_t<scalar_t>, float> ||
(is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
#elif CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER && (!CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT)
bool constexpr use_amd_buffer_addressing = is_same_v<remove_cvref_t<scalar_t>, int32_t>;
#elif(!CK_USE_AMD_BUFFER_ATOMIC_ADD_INTEGER) && CK_USE_AMD_BUFFER_ATOMIC_ADD_FLOAT
bool constexpr use_amd_buffer_addressing =
is_same_v<remove_cvref_t<scalar_t>, float> ||
(is_same_v<remove_cvref_t<scalar_t>, half_t> && scalar_per_x_vector % 2 == 0);
#else
bool constexpr use_amd_buffer_addressing = false;
#endif
#if CK_USE_AMD_BUFFER_ATOMIC_ADD
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
if constexpr(use_amd_buffer_addressing)
{
constexpr index_t t_per_x = scalar_per_x_vector / scalar_per_t_vector;
amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
x, p_data_, i, is_valid_element, element_space_size_);
#else
if(is_valid_element)
amd_buffer_atomic_add<remove_cvref_t<T>, t_per_x>(
x, p_data_, i, is_valid_element, element_space_size_);
}
else
{
atomicAdd(&p_data_[i], x);
if(is_valid_element)
{
// FIXME: atomicAdd is defined by HIP, need to avoid implicit type casting when
// calling it
atomicAdd(c_style_pointer_cast<X*>(&p_data_[i]), x);
}
}
#endif
}
__host__ __device__ static constexpr bool IsStaticBuffer() { return false; }
......@@ -333,14 +328,14 @@ struct DynamicBuffer
__host__ __device__ static constexpr bool IsDynamicBuffer() { return true; }
};
template <AddressSpaceEnum_t BufferAddressSpace, typename T, typename ElementSpaceSize>
template <AddressSpaceEnum BufferAddressSpace, typename T, typename ElementSpaceSize>
__host__ __device__ constexpr auto make_dynamic_buffer(T* p, ElementSpaceSize element_space_size)
{
return DynamicBuffer<BufferAddressSpace, T, ElementSpaceSize, true>{p, element_space_size};
}
template <
AddressSpaceEnum_t BufferAddressSpace,
AddressSpaceEnum BufferAddressSpace,
typename T,
typename ElementSpaceSize,
typename X,
......@@ -353,4 +348,3 @@ make_dynamic_buffer(T* p, ElementSpaceSize element_space_size, X invalid_element
}
} // namespace ck
#endif
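To show how the renamed DynamicBuffer is meant to be used, here is a hedged device-side sketch. The kernel name is hypothetical, the header name is assumed, and the sketch relies only on the Get/Set members whose bodies appear above (taking the generic, non-buffer-intrinsic path).
// Illustrative HIP kernel using make_dynamic_buffer / Get / Set from above.
#include <hip/hip_runtime.h>
#include "dynamic_buffer.hpp" // file guarded as CK_BUFFER_HPP above (name assumed)
__global__ void scale_kernel(float* p, ck::index_t n) // hypothetical kernel
{
    // Wrap raw global memory; invalid elements read back as numerical zero.
    auto buf = ck::make_dynamic_buffer<ck::AddressSpaceEnum::Global>(p, n);
    const ck::index_t i = blockIdx.x * blockDim.x + threadIdx.x;
    const bool is_valid = i < n;
    // Get/Set take an element index plus a validity flag.
    const float v = buf.Get<float>(i, is_valid);
    buf.Set<float>(i, is_valid, 2.0f * v);
}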
#ifndef CK_UTILITY_HPP
#define CK_UTILITY_HPP
#pragma once
#include "config.hpp"
namespace ck {
......@@ -16,5 +14,3 @@ __device__ index_t get_block_1d_id() { return blockIdx.x; }
__device__ index_t get_grid_size() { return gridDim.x; }
} // namespace ck
#endif
......@@ -3,7 +3,7 @@
#include "common_header.hpp"
#if CK_USE_DYNAMICALLY_INDEXED_MULTI_INDEX
#if CK_EXPERIMENTAL_USE_DYNAMICALLY_INDEXED_MULTI_INDEX
#include "array_multi_index.hpp"
#else
#include "statically_indexed_array_multi_index.hpp"
......
......@@ -28,7 +28,7 @@
namespace ck {
enum class ReduceTensorOp_t
enum struct ReduceTensorOp
{
ADD = 0,
MUL = 1,
......@@ -41,19 +41,19 @@ enum class ReduceTensorOp_t
// MUL_NO_ZEROS = 8,
};
enum class NanPropagation_t
enum struct NanPropagation
{
NOT_PROPAGATE_NAN = 0,
PROPAGATE_NAN = 1,
};
enum class ReduceTensorIndices_t
enum struct ReduceTensorIndices
{
NO_INDICES = 0,
FLATTENED_INDICES = 1,
};
enum class IndicesType_t
enum struct IndicesType
{
INDICES_32BIT = 0,
INDICES_64BIT = 1,
......
......@@ -606,6 +606,12 @@ struct sequence_map_inverse
SeqMap::Size()>::type;
};
template <index_t... Xs, index_t... Ys>
__host__ __device__ constexpr bool operator==(Sequence<Xs...>, Sequence<Ys...>)
{
return ((Xs == Ys) && ...);
}
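The element-wise fold above makes Sequence equality usable at compile time, for example in static_asserts. A sketch, assuming sequence.hpp (from the include list at the top of this diff) pulls in its own dependencies:
// Illustrative compile-time use of the new Sequence operator==.
#include "sequence.hpp"
static_assert(ck::Sequence<1, 2, 3>{} == ck::Sequence<1, 2, 3>{},
              "identical sequences compare equal");
static_assert(!(ck::Sequence<1, 2, 3>{} == ck::Sequence<1, 2, 4>{}),
              "sequences differing in any element compare unequal");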
template <index_t... Xs, index_t... Ys>
__host__ __device__ constexpr auto operator+(Sequence<Xs...>, Sequence<Ys...>)
{
......
......@@ -6,7 +6,7 @@
namespace ck {
// static buffer for scalar
template <AddressSpaceEnum_t AddressSpace,
template <AddressSpaceEnum AddressSpace,
typename T,
index_t N,
bool InvalidElementUseNumericalZeroValue> // TODO remove this bool, no longer needed
......@@ -17,10 +17,7 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
__host__ __device__ constexpr StaticBuffer() : base{} {}
__host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace()
{
return AddressSpace;
}
__host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpace; }
__host__ __device__ static constexpr bool IsStaticBuffer() { return true; }
......@@ -42,7 +39,7 @@ struct StaticBuffer : public StaticallyIndexedArray<T, N>
};
// static buffer for vector
template <AddressSpaceEnum_t AddressSpace,
template <AddressSpaceEnum AddressSpace,
typename S,
index_t NumOfVector,
index_t ScalarPerVector,
......@@ -59,10 +56,7 @@ struct StaticBufferTupleOfVector
__host__ __device__ constexpr StaticBufferTupleOfVector() : base{} {}
__host__ __device__ static constexpr AddressSpaceEnum_t GetAddressSpace()
{
return AddressSpace;
}
__host__ __device__ static constexpr AddressSpaceEnum GetAddressSpace() { return AddressSpace; }
__host__ __device__ static constexpr bool IsStaticBuffer() { return true; }
......@@ -158,7 +152,7 @@ struct StaticBufferTupleOfVector
}
};
template <AddressSpaceEnum_t AddressSpace, typename T, index_t N>
template <AddressSpaceEnum AddressSpace, typename T, index_t N>
__host__ __device__ constexpr auto make_static_buffer(Number<N>)
{
return StaticBuffer<AddressSpace, T, N, true>{};
......
......@@ -7,7 +7,7 @@ namespace ck {
__device__ void block_sync_lds()
{
#if CK_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
#if CK_EXPERIMENTAL_BLOCK_SYNC_LDS_WITHOUT_SYNC_VMEM
asm volatile("\
s_waitcnt lgkmcnt(0) \n \
s_barrier \
......
......@@ -37,6 +37,10 @@ struct SpaceFillingCurve
__host__ __device__ static constexpr index_t GetNumOfAccess()
{
static_assert(TensorLengths::Size() == ScalarsPerAccess::Size());
static_assert(TensorLengths{} % ScalarsPerAccess{} ==
typename uniform_sequence_gen<TensorLengths::Size(), 0>::type{});
return reduce_on_sequence(TensorLengths{}, math::multiplies{}, Number<1>{}) /
ScalarPerVector;
}
......@@ -140,6 +144,15 @@ struct SpaceFillingCurve
}();
return idx_md;
}
// FIXME: rename this function
template <index_t AccessIdx1d>
static __device__ __host__ constexpr auto GetIndexTupleOfNumber(Number<AccessIdx1d>)
{
constexpr auto idx = GetIndex(Number<AccessIdx1d>{});
return generate_tuple([&](auto i) { return Number<idx[i]>{}; }, Number<nDim>{});
}
};
} // namespace ck
......
......@@ -75,14 +75,14 @@ calculate_convolution_flops(const InDesc&, const WeiDesc& wei_desc, const OutDes
}
template <typename T>
inline auto activ(T v, const ck::ActivTypeEnum_t activ_type)
inline auto activ(T v, const ck::ActivTypeEnum activ_type)
{
const T alpha = 0.3;
switch(activ_type)
{
case ck::ActivTypeEnum_t::None: return v;
case ck::ActivTypeEnum_t::LeakyRelu: return (v >= 0 ? v : alpha * v);
case ck::ActivTypeEnum_t::Sigmoid: return (1 / (1 + exp(-v)));
case ck::ActivTypeEnum::None: return v;
case ck::ActivTypeEnum::LeakyRelu: return (v >= 0 ? v : alpha * v);
case ck::ActivTypeEnum::Sigmoid: return (1 / (1 + exp(-v)));
default: throw std::runtime_error("unsupported activ type"); break;
}
}
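For reference, a small host-side sketch exercising the branches of activ above. The wrapper function name is hypothetical; the 0.3 leak factor is the alpha hard-coded in activ.
// Hypothetical host-side usage of activ() defined above.
inline void activ_example()
{
    const double y_pos = activ(2.0, ck::ActivTypeEnum::LeakyRelu);  // v >= 0: returns v = 2.0
    const double y_neg = activ(-2.0, ck::ActivTypeEnum::LeakyRelu); // v < 0: returns 0.3 * v = -0.6
    const double y_sig = activ(0.0, ck::ActivTypeEnum::Sigmoid);    // 1 / (1 + exp(0)) = 0.5
    (void)y_pos;
    (void)y_neg;
    (void)y_sig;
}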
......
......@@ -48,8 +48,10 @@ struct DeviceMem
DeviceMem() = delete;
DeviceMem(std::size_t mem_size);
void* GetDeviceBuffer();
std::size_t GetBufferSize();
void ToDevice(const void* p);
void FromDevice(void* p);
void SetZero();
~DeviceMem();
void* mpDeviceBuf;
......@@ -109,8 +111,6 @@ float launch_and_time_kernel(
timer.End();
// std::this_thread::sleep_for (std::chrono::microseconds(10));
return timer.GetElapsedTime() / nrepeat;
#else
std::ignore = nrepeat;
......
#pragma once
#include "host_tensor.hpp"
#include "common_header.hpp"
template <typename TensorDesc>
void ostream_tensor_descriptor(TensorDesc, std::ostream& os = std::cout)
......
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef HOST_GENERIC_REDUCTION_HPP_
#define HOST_GENERIC_REDUCTION_HPP_
#include <vector>
#include <functional>
#include <limits>
#include <type_traits>
#include <cassert>
#include <cmath>
#include "reduction_enums.hpp"
#include "host_reduce_util.hpp"
using float16 = half_float::half;
namespace ck {
namespace host_reduce {
template <typename T>
static void
get_all_indexes(const std::vector<T>& dimLengths, int dim, std::vector<std::vector<T>>& indexes)
{
if(dim < dimLengths.size())
{
std::vector<std::vector<T>> updated_indexes;
if(dim == 0)
{
assert(indexes.size() == 0);
assert(dimLengths[dim] > 0);
for(T i = 0; i < dimLengths[dim]; i++)
{
std::vector<T> index = {i};
updated_indexes.push_back(index);
};
}
else
{
// go through all the current indexes
for(const auto& index : indexes)
for(T i = 0; i < dimLengths[dim]; i++)
{
auto index_new = index;
index_new.push_back(i);
updated_indexes.push_back(index_new);
};
};
// update to the indexes (output)
indexes = updated_indexes;
// further to construct the indexes from the updated status
get_all_indexes(dimLengths, dim + 1, indexes);
};
};
template <typename T>
static T get_offset_from_index(const std::vector<T>& strides, const std::vector<T>& index)
{
T offset = 0;
assert(strides.size() == index.size());
for(int i = 0; i < index.size(); i++)
offset += strides[i] * static_cast<T>(index[i]);
return (offset);
};
template <typename T>
static inline T get_flatten_offset(const std::vector<T>& lengths, const std::vector<T>& index)
{
T offset = 0;
assert(lengths.size() == index.size() && lengths.size() > 0);
int len = lengths.size();
T stride = 1;
// for len==1, the loop is not executed
for(int i = len - 1; i > 0; i--)
{
offset += stride * static_cast<T>(index[i]);
stride *= lengths[i];
};
offset += stride * static_cast<T>(index[0]);
return (offset);
};
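A worked example of the row-major flattening above, written as a small check placed in the same scope as the helper (the function name is hypothetical; <cassert> and <vector> are already included in this header):
// Worked example of get_flatten_offset (row-major, innermost dimension fastest):
//   lengths = {2, 3, 4}, index = {1, 2, 3}
//   loop: offset = 3*1, stride = 4; then offset += 2*4 -> 11, stride = 12
//   tail: offset += 1*12 -> 23
inline void flatten_offset_example()
{
    const std::vector<int> lengths = {2, 3, 4};
    const std::vector<int> index   = {1, 2, 3};
    assert(get_flatten_offset(lengths, index) == 23);
}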
template <typename InDataType,
typename AccDataType,
typename OutDataType,
ck::ReduceTensorOp_t ReduceOpId,
bool PropagateNan,
bool NeedIndices>
class ReductionHost
{
public:
ReductionHost() = default;
ReductionHost(HostTensorDescriptor& inDesc,
HostTensorDescriptor& outDesc,
const std::vector<int>& invariantDims_,
const std::vector<int>& toReduceDims_)
{
this->inLengths = to_int_vector(inDesc.GetLengths());
this->outLengths = to_int_vector(outDesc.GetLengths());
this->inStrides = to_int_vector(inDesc.GetStrides());
this->outStrides = to_int_vector(outDesc.GetStrides());
this->invariantDims = invariantDims_;
this->toReduceDims = toReduceDims_;
assert(this->inLengths.size() == this->outLengths.size());
assert(!this->toReduceDims.empty());
for(const auto dim : this->invariantDims)
this->invariantLengths.push_back(this->inLengths[dim]);
for(const auto dim : this->toReduceDims)
toReduceLengths.push_back(this->inLengths[dim]);
this->reduceAllDims = this->invariantDims.empty();
};
~ReductionHost(){};
void
Run(float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices)
{
if constexpr(NeedIndices)
RunImpl_with_indices(alpha, in_data, beta, out_data, indices);
else
RunImpl_no_indices(alpha, in_data, beta, out_data);
};
private:
std::vector<int> inLengths;
std::vector<int> outLengths;
std::vector<int> inStrides;
std::vector<int> outStrides;
std::vector<int> invariantLengths;
std::vector<int> toReduceLengths;
std::vector<int> invariantDims;
std::vector<int> toReduceDims;
bool reduceAllDims;
void RunImpl_with_indices(
float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices)
{
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::PreUnaryOpFn;
using ck::host_reduce::ReduceOpFn2;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce = ReduceOpFn2<AccDataType, ReduceOpId>();
int divider = 1;
for(int i = 0; i < toReduceLengths.size(); i++)
divider *= toReduceLengths[i];
auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
if(reduceAllDims)
{
std::vector<std::vector<int>> indexes_1;
get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
int accuIndex = 0;
// go through all indexes of the input tensor (all dimensions are reduced)
for(const auto& src_index : indexes_1)
{
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
// unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is actually
// done
PreUnaryOp(currVal);
auto currIndex = get_flatten_offset(inLengths, src_index);
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce, accuVal, currVal, accuIndex, currIndex);
};
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[0]) * static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[0] = static_cast<OutDataType>(accuVal);
indices[0] = accuIndex;
}
else
{
std::vector<std::vector<int>> indexes_1, indexes_2;
get_all_indexes(
this->invariantLengths, 0, indexes_1); // generate the invariant indexes space
get_all_indexes(
this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space
// go through indexes of the invariant dimensions
for(const auto& index_1 : indexes_1)
{
std::vector<int> src_index;
std::vector<int> dst_index;
src_index.resize(this->inLengths.size());
// generate the part of src index belonging to invariant dims
for(int k = 0; k < invariantDims.size(); k++)
src_index[invariantDims[k]] = index_1[k];
for(int k = 0; k < invariantDims.size(); k++)
dst_index.push_back(index_1[k]);
int dst_offset = get_offset_from_index(this->outStrides, dst_index);
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
int accuIndex = 0;
// go through indexes of the toReduce dimensions
for(const auto& index_2 : indexes_2)
{
// generate the part of src index belonging to toReduce dims
for(int k = 0; k < toReduceDims.size(); k++)
src_index[toReduceDims[k]] = index_2[k];
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
// unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is
// actually done
PreUnaryOp(currVal);
auto currIndex = get_flatten_offset(toReduceLengths, index_2);
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce, accuVal, currVal, accuIndex, currIndex);
};
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[dst_offset]) *
static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[dst_offset] = static_cast<OutDataType>(accuVal);
indices[dst_offset] = accuIndex;
};
};
}; // end of RunImpl_with_indices()
void
RunImpl_no_indices(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
{
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::PreUnaryOpFn;
using ck::host_reduce::ReduceOpFn;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
int divider = 1;
for(int i = 0; i < toReduceLengths.size(); i++)
divider *= toReduceLengths[i];
auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
if(reduceAllDims)
{
std::vector<std::vector<int>> indexes_1;
get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
// go through all indexes of the input tensor (all dimensions are reduced)
for(const auto& src_index : indexes_1)
{
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
PreUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
PosUnaryOp(accuVal);
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[0]) * static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[0] = static_cast<OutDataType>(accuVal);
}
else
{
std::vector<std::vector<int>> indexes_1, indexes_2;
get_all_indexes(
this->invariantLengths, 0, indexes_1); // generate the invariant indexes space
get_all_indexes(
this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space
// go through indexes of the invariant dimensions
for(const auto& index_1 : indexes_1)
{
std::vector<int> src_index;
std::vector<int> dst_index;
src_index.resize(this->inLengths.size());
for(int k = 0; k < invariantDims.size(); k++)
dst_index.push_back(index_1[k]);
int dst_offset = get_offset_from_index(this->outStrides, dst_index);
// generate the part of src index belonging to invariant dims
for(int k = 0; k < invariantDims.size(); k++)
src_index[invariantDims[k]] = index_1[k];
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
// go through indexes of the toReduce dimensions
for(const auto& index_2 : indexes_2)
{
// generate the part of src index belonging to toReduce dims
for(int k = 0; k < toReduceDims.size(); k++)
src_index[toReduceDims[k]] = index_2[k];
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
PreUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
PosUnaryOp(accuVal);
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[dst_offset]) *
static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[dst_offset] = static_cast<OutDataType>(accuVal);
};
};
}; // end of RunImpl_no_indices()
};
}; // end of namespace host_reduce
}; // end of namespace ck
#endif
......@@ -39,8 +39,8 @@ namespace ck {
namespace host_reduce {
using ck::NanPropagation_t;
using ck::ReduceTensorOp_t;
using ck::NanPropagation;
using ck::ReduceTensorOp;
template <typename T>
static inline bool float_equal_one(T);
......@@ -66,95 +66,95 @@ static inline bool float_equal_zero(half_float::half x)
return x == static_cast<half_float::half>(0.0f);
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline std::function<void(compType&)> PreUnaryOpFn(int)
template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
{
using std::abs;
if constexpr(ReduceOpId == ReduceTensorOp_t::NORM1)
if constexpr(ReduceOpId == ReduceTensorOp::NORM1)
{
return ([&](compType& a_) { a_ = abs(a_); });
return ([&](AccDataType& a_) { a_ = abs(a_); });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
else if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
{
return ([&](compType& a_) { a_ = a_ * a_; });
return ([&](AccDataType& a_) { a_ = a_ * a_; });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
{
return ([&](compType& a_) { a_ = abs(a_); });
return ([&](AccDataType& a_) { a_ = abs(a_); });
}
else
{
// ReduceTensorOp_t::AVG:
// ReduceTensorOp_t::ADD:
// ReduceTensorOp_t::MUL:
// ReduceTensorOp_t::MIN:
// ReduceTensorOp_t::MAX:
return ([&](compType&) {});
// ReduceTensorOp::AVG:
// ReduceTensorOp::ADD:
// ReduceTensorOp::MUL:
// ReduceTensorOp::MIN:
// ReduceTensorOp::MAX:
return ([&](AccDataType&) {});
};
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline std::function<void(compType&)> PosUnaryOpFn(int divider)
template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&)> PosUnaryOpFn(int32_t divider)
{
using std::sqrt;
if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
if constexpr(ReduceOpId == ReduceTensorOp::NORM2)
{
return ([&](compType& a_) { a_ = sqrt(a_); });
return ([&](AccDataType& a_) { a_ = sqrt(a_); });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::AVG)
else if constexpr(ReduceOpId == ReduceTensorOp::AVG)
{
return ([&, divider](compType& a_) {
a_ = a_ / static_cast<compType>(static_cast<float>(divider));
return ([&, divider](AccDataType& a_) {
a_ = a_ / static_cast<AccDataType>(static_cast<float>(divider));
});
}
else
{
// ReduceTensorOp_t::ADD:
// ReduceTensorOp_t::NORM1:
// ReduceTensorOp_t::MUL:
// ReduceTensorOp_t::MIN:
// ReduceTensorOp_t::MAX:
// ReduceTensorOp_t::AMAX:
return ([&](compType&) {});
// ReduceTensorOp::ADD:
// ReduceTensorOp::NORM1:
// ReduceTensorOp::MUL:
// ReduceTensorOp::MIN:
// ReduceTensorOp::MAX:
// ReduceTensorOp::AMAX:
return ([&](AccDataType&) {});
}
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline std::function<void(compType&, compType)> ReduceOpFn()
template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&, AccDataType)> ReduceOpFn()
{
if constexpr(ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::AVG ||
ReduceOpId == ReduceTensorOp_t::NORM1 || ReduceOpId == ReduceTensorOp_t::NORM2)
if constexpr(ReduceOpId == ReduceTensorOp::ADD || ReduceOpId == ReduceTensorOp::AVG ||
ReduceOpId == ReduceTensorOp::NORM1 || ReduceOpId == ReduceTensorOp::NORM2)
{
return ([&](compType& a_, compType b_) { a_ = a_ + b_; });
return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
else if constexpr(ReduceOpId == ReduceTensorOp::MUL)
{
return ([&](compType& a_, compType b_) { a_ = a_ * b_; });
return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
{
return ([&](compType& a_, compType b_) {
return ([&](AccDataType& a_, AccDataType b_) {
if(a_ > b_)
a_ = b_;
});
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
{
return ([&](compType& a_, compType b_) {
return ([&](AccDataType& a_, AccDataType b_) {
if(a_ < b_)
a_ = b_;
});
}
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline std::function<void(compType&, compType, bool& changed)> ReduceOpFn2()
template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline std::function<void(AccDataType&, AccDataType, bool& changed)> ReduceOpFn2()
{
if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
if constexpr(ReduceOpId == ReduceTensorOp::MIN)
{
return ([&](compType& a_, compType b_, bool& changed) {
return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
if(a_ > b_)
{
a_ = b_;
......@@ -164,9 +164,9 @@ __host__ static inline std::function<void(compType&, compType, bool& changed)> R
changed = false;
});
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
else if constexpr(ReduceOpId == ReduceTensorOp::MAX || ReduceOpId == ReduceTensorOp::AMAX)
{
return ([&](compType& a_, compType b_, bool& changed) {
return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
if(a_ < b_)
{
a_ = b_;
......@@ -178,48 +178,49 @@ __host__ static inline std::function<void(compType&, compType, bool& changed)> R
}
else
{
// ReduceTensorOp_t::ADD:
// ReduceTensorOp_t::MUL:
// ReduceTensorOp_t::AVG:
// ReduceTensorOp_t::NORM1:
// ReduceTensorOp_t::NORM2:
return (std::function<void(compType&, compType, bool&)>{});
// ReduceTensorOp::ADD:
// ReduceTensorOp::MUL:
// ReduceTensorOp::AVG:
// ReduceTensorOp::NORM1:
// ReduceTensorOp::NORM2:
return (std::function<void(AccDataType&, AccDataType, bool&)>{});
};
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline compType ReduceOpZeroVal()
template <typename AccDataType, ReduceTensorOp ReduceOpId>
__host__ static inline AccDataType ReduceOpZeroVal()
{
if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
if constexpr(ReduceOpId == ReduceTensorOp::MUL)
{
return (static_cast<compType>(1.0f));
return (static_cast<AccDataType>(1.0f));
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
else if constexpr(ReduceOpId == ReduceTensorOp::MIN)
{
return (std::numeric_limits<compType>::max());
return (std::numeric_limits<AccDataType>::max());
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX)
else if constexpr(ReduceOpId == ReduceTensorOp::MAX)
{
return (std::numeric_limits<compType>::lowest());
return (std::numeric_limits<AccDataType>::lowest());
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
else if constexpr(ReduceOpId == ReduceTensorOp::AMAX)
{
return (static_cast<compType>(0.0f));
return (static_cast<AccDataType>(0.0f));
}
else
{
// ReduceTensorOp_t::ADD
// ReduceTensorOp_t::AVG
// ReduceTensorOp_t::NORM1
// ReduceTensorOp_t::NORM2
return (static_cast<compType>(0.0f));
// ReduceTensorOp::ADD
// ReduceTensorOp::AVG
// ReduceTensorOp::NORM1
// ReduceTensorOp::NORM2
return (static_cast<AccDataType>(0.0f));
};
};
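To see how the pre-/post-unary ops, reduce op, and zero value above compose, consider ReduceTensorOp::NORM2: the pre-op squares each value, the additive reduce op accumulates, and the post-op takes the square root, so reducing {3, 4} gives 5. A hedged host-side sketch (the function name is hypothetical; the divider argument is unused by NORM2):
// Illustrative composition of the reduction helpers above for NORM2.
inline void norm2_example()
{
    auto pre  = ck::host_reduce::PreUnaryOpFn<float, ck::ReduceTensorOp::NORM2>(2);
    auto post = ck::host_reduce::PosUnaryOpFn<float, ck::ReduceTensorOp::NORM2>(2);
    auto op   = ck::host_reduce::ReduceOpFn<float, ck::ReduceTensorOp::NORM2>();
    float acc = ck::host_reduce::ReduceOpZeroVal<float, ck::ReduceTensorOp::NORM2>(); // 0
    for(float v : {3.0f, 4.0f})
    {
        pre(v);     // v -> v * v
        op(acc, v); // acc += v
    }
    post(acc);      // acc -> sqrt(acc) == 5
}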
template <typename compType, bool PropagateNan>
__host__ static inline void binop_with_nan_check(std::function<void(compType&, compType)> opReduce,
compType& accuVal,
compType currVal)
template <typename AccDataType, bool PropagateNan>
__host__ static inline void
binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
AccDataType& accuVal,
AccDataType currVal)
{
using std::isnan;
......@@ -236,11 +237,11 @@ __host__ static inline void binop_with_nan_check(std::function<void(compType&, c
};
};
template <typename compType, bool PropagateNan>
template <typename AccDataType, bool PropagateNan>
__host__ static inline void
binop_with_nan_check2(std::function<void(compType&, compType, bool&)> opReduce,
compType& accuVal,
compType currVal,
binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opReduce,
AccDataType& accuVal,
AccDataType currVal,
int& accuIndex,
int currIndex)
{
......
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef HOST_REDUCTION_HPP_
#define HOST_REDUCTION_HPP_
#include <vector>
#include <array>
#include <functional>
#include "reduction_enums.hpp"
#include "host_reduce_util.hpp"
#include "host_tensor.hpp"
#include "data_type.hpp"
template <int NDim>
static void get_all_indexes(const std::array<size_t, NDim>& dimLengths,
std::vector<std::array<size_t, NDim>>& indexes)
{
static_assert(NDim >= 1, "NDim >= 1 is required to use this function!");
if constexpr(NDim == 1)
{
for(size_t i = 0; i < dimLengths[0]; i++)
{
std::array<size_t, 1> index{i};
indexes.push_back(index);
};
}
else
{
std::array<size_t, NDim - 1> partial_dim_lengths;
for(int i = 0; i < NDim - 1; i++)
partial_dim_lengths[i] = dimLengths[i + 1];
std::vector<std::array<size_t, NDim - 1>> partial_indexes;
get_all_indexes<NDim - 1>(partial_dim_lengths, partial_indexes);
for(size_t i = 0; i < dimLengths[0]; i++)
for(const auto& index : partial_indexes)
{
std::array<size_t, NDim> extIndex;
extIndex[0] = i;
for(int k = 0; k < NDim - 1; k++)
extIndex[k + 1] = index[k];
indexes.push_back(extIndex);
};
};
};
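For example, for dimLengths {2, 3} the recursion above yields the six indexes (0,0), (0,1), (0,2), (1,0), (1,1), (1,2) in lexicographic order; a quick host-side sketch (function name hypothetical, illustrative only):
// Illustrative expansion of get_all_indexes<NDim> defined above.
inline void all_indexes_example()
{
    std::array<size_t, 2> lengths = {2, 3};
    std::vector<std::array<size_t, 2>> indexes;
    get_all_indexes<2>(lengths, indexes);
    // indexes.size() == 6; indexes.front() == {0, 0}; indexes.back() == {1, 2}
}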
template <int NDim>
static size_t get_offset_from_index(const std::array<size_t, NDim>& strides,
const std::array<size_t, NDim>& index)
{
size_t offset = 0;
for(int i = 0; i < NDim; i++)
offset += strides[i] * index[i];
return (offset);
};
template <int NDim>
static size_t get_offset_from_index(const std::vector<size_t>& strides,
const std::array<size_t, NDim>& index)
{
size_t offset = 0;
for(int i = 0; i < NDim; i++)
offset += strides[i] * index[i];
return (offset);
};
template <typename InDataType,
typename AccDataType,
typename OutDataType,
ck::ReduceTensorOp ReduceOpId,
int Rank,
int NumReduceDim,
bool PropagateNan,
bool NeedIndices>
struct ReductionHost
{
using IndexDataType = int32_t;
static constexpr int NumInvariantDim = Rank - NumReduceDim;
std::vector<size_t> outStrides;
std::vector<int> invariantDims;
std::vector<int> reduceDims;
IndexDataType divider;
std::function<void(AccDataType&)> preUnaryOp;
std::function<void(AccDataType&)> posUnaryOp;
std::array<size_t, NumReduceDim> reduceLengths;
std::array<size_t, NumReduceDim> reduceStrides;
std::array<size_t, NumInvariantDim> invariantLengths;
std::array<size_t, NumInvariantDim> invariantStrides;
std::vector<std::array<size_t, NumReduceDim>> reduce_dim_indexes;
std::vector<std::array<size_t, NumInvariantDim>> invariant_dim_indexes;
ReductionHost(HostTensorDescriptor& inDesc,
HostTensorDescriptor& outDesc,
const std::vector<int>& invariantDims_,
const std::vector<int>& reduceDims_)
{
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::PreUnaryOpFn;
// this->outLengths = to_int_vector(outDesc.GetLengths());
this->outStrides = outDesc.GetStrides();
this->invariantDims = invariantDims_;
this->reduceDims = reduceDims_;
int product = 1;
for(int i = 0; i < NumReduceDim; i++)
{
reduceLengths[i] = inDesc.GetLengths()[reduceDims[i]];
reduceStrides[i] = inDesc.GetStrides()[reduceDims[i]];
product *= inDesc.GetLengths()[reduceDims[i]];
};
divider = product;
for(int i = 0; i < NumInvariantDim; i++)
{
invariantLengths[i] = inDesc.GetLengths()[invariantDims[i]];
invariantStrides[i] = inDesc.GetStrides()[invariantDims[i]];
};
reduce_dim_indexes.clear();
get_all_indexes<NumReduceDim>(reduceLengths, reduce_dim_indexes);
if constexpr(NumInvariantDim > 0)
{
invariant_dim_indexes.clear();
get_all_indexes<NumInvariantDim>(invariantLengths, invariant_dim_indexes);
};
preUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
posUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
};
void Run(float alpha,
const InDataType* in_data,
float beta,
OutDataType* out_data,
IndexDataType* out_indices)
{
if constexpr(NeedIndices)
{
RunImpl_with_index(alpha, in_data, beta, out_data, out_indices);
}
else
{
RunImpl_no_index(alpha, in_data, beta, out_data);
};
};
void RunImpl_with_index(float alpha,
const InDataType* in_data,
float beta,
OutDataType* out_data,
IndexDataType* out_indices)
{
using ck::type_convert;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::ReduceOpFn2;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce2 = ReduceOpFn2<AccDataType, ReduceOpId>();
if constexpr(NumInvariantDim == 0)
{
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
IndexDataType accuIndex = 0;
for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++)
{
auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);
auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);
preUnaryOp(currVal);
auto currIndex = i;
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce2, accuVal, currVal, accuIndex, currIndex);
};
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
accuVal *= type_convert<AccDataType>(alpha);
if(!float_equal_zero(beta))
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
out_data[0] = type_convert<OutDataType>(accuVal);
out_indices[0] = accuIndex;
}
else
{
auto thread_reduce_func = [&](auto invariant_index) {
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
IndexDataType accuIndex = 0;
auto offset_invariant =
get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);
for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++)
{
auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);
auto currVal =
type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);
preUnaryOp(currVal);
auto currIndex = i;
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce2, accuVal, currVal, accuIndex, currIndex);
};
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
accuVal *= type_convert<AccDataType>(alpha);
auto dst_offset =
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
if(!float_equal_zero(beta))
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
type_convert<AccDataType>(beta);
out_data[dst_offset] = type_convert<OutDataType>(accuVal);
out_indices[dst_offset] = accuIndex;
};
std::size_t num_thread = 1;
std::size_t work_per_thread =
(invariant_dim_indexes.size() + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end =
std::min((it + 1) * work_per_thread, invariant_dim_indexes.size());
auto f = [=] {
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
{
thread_reduce_func(invariant_dim_indexes[iw]);
}
};
threads[it] = joinable_thread(f);
}
};
};
void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
{
using ck::type_convert;
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::ReduceOpFn;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
if constexpr(NumInvariantDim == 0)
{
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
for(const auto& reduce_index : reduce_dim_indexes)
{
auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_index);
auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);
preUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
accuVal *= type_convert<AccDataType>(alpha);
if(!float_equal_zero(beta))
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
out_data[0] = type_convert<OutDataType>(accuVal);
}
else
{
auto thread_reduce_func = [&](auto invariant_index) {
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
auto offset_invariant =
get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);
for(const auto& reduce_index : reduce_dim_indexes)
{
auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_index);
auto currVal =
type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);
preUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
accuVal *= type_convert<AccDataType>(alpha);
auto dst_offset =
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
if(!float_equal_zero(beta))
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
type_convert<AccDataType>(beta);
out_data[dst_offset] = type_convert<OutDataType>(accuVal);
};
std::size_t num_thread = 1;
std::size_t work_per_thread =
(invariant_dim_indexes.size() + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end =
std::min((it + 1) * work_per_thread, invariant_dim_indexes.size());
auto f = [=] {
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
{
thread_reduce_func(invariant_dim_indexes[iw]);
}
};
threads[it] = joinable_thread(f);
}
};
};
};
#endif
......@@ -40,20 +40,6 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
return os;
}
typedef enum
{
Half = 0,
Float = 1,
} DataType_t;
template <typename T>
struct DataType;
template <>
struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
{
};
template <typename F, typename T, std::size_t... Is>
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{
......@@ -87,10 +73,10 @@ struct HostTensorDescriptor
HostTensorDescriptor() = delete;
template <typename X>
HostTensorDescriptor(std::vector<X> lens);
HostTensorDescriptor(const std::vector<X>& lens);
template <typename X, typename Y>
HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides);
HostTensorDescriptor(const std::vector<X>& lens, const std::vector<Y>& strides);
void CalculateStrides();
......@@ -177,7 +163,7 @@ struct ParallelTensorFunctor
return indices;
}
void operator()(std::size_t num_thread = std::thread::hardware_concurrency()) const
void operator()(std::size_t num_thread = 1) const
{
std::size_t work_per_thread = (mN1d + num_thread - 1) / num_thread;
......@@ -227,7 +213,7 @@ struct Tensor
Tensor(const HostTensorDescriptor& desc) : mDesc(desc), mData(mDesc.GetElementSpace()) {}
template <typename G>
void GenerateTensorValue(G g, std::size_t num_thread = std::thread::hardware_concurrency())
void GenerateTensorValue(G g, std::size_t num_thread = 1)
{
switch(mDesc.GetNumOfDimension())
{
......@@ -299,85 +285,69 @@ struct Tensor
};
template <typename X>
HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens) : mLens(lens)
HostTensorDescriptor::HostTensorDescriptor(const std::vector<X>& lens) : mLens(lens)
{
this->CalculateStrides();
}
template <typename X, typename Y>
HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> strides)
HostTensorDescriptor::HostTensorDescriptor(const std::vector<X>& lens,
const std::vector<Y>& strides)
: mLens(lens), mStrides(strides)
{
}
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);
float bf16_to_f32_(ck::bhalf_t src_val);
#if 1
// FIXME: remove
void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst);
#endif
template <typename T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
float check_error(const Tensor<T>& ref, const Tensor<T>& result)
{
float error = 0;
float max_diff = -1;
float ref_value = 0, result_value = 0;
float l1_error = 0;
float linf_error = -1;
float linf_rel_error = -1;
float linf_ref_value = 0, linf_result_value = 0;
float linf_rel_ref_value = 0, linf_rel_result_value = 0;
constexpr float eps = 1e-10;
if constexpr(std::is_same<ck::bhalf_t, T>::value)
for(int i = 0; i < ref.mData.size(); ++i)
{
for(int i = 0; i < ref.mData.size(); ++i)
float ref_v = ck::type_convert<float>(ref.mData[i]);
float result_v = ck::type_convert<float>(result.mData[i]);
float diff = std::abs(ref_v - result_v);
float rel_diff = diff / std::max(std::abs(ref_v), eps);
l1_error += diff;
if(linf_error < diff)
{
error += std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
float diff = std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
if(max_diff < diff)
{
max_diff = diff;
ref_value = bf16_to_f32_(ref.mData[i]);
result_value = bf16_to_f32_(result.mData[i]);
}
linf_error = diff;
linf_ref_value = ref_v;
linf_result_value = result_v;
}
}
else
{
for(int i = 0; i < ref.mData.size(); ++i)
if(linf_rel_error < rel_diff)
{
error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
if(max_diff < diff)
{
max_diff = diff;
ref_value = ref.mData[i];
result_value = result.mData[i];
}
linf_rel_error = rel_diff;
linf_rel_ref_value = ref_v;
linf_rel_result_value = result_v;
}
}
std::cout << "error: " << error << std::endl;
std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
}
template <typename T>
void check_indices(const Tensor<T>& ref, const Tensor<T>& result)
{
bool has_error = false;
int error_count = 0;
for(int i = 0; i < ref.mData.size(); ++i)
{
if(ref.mData[i] != result.mData[i])
{
std::cerr << std::endl
<< "Indices different at position " << i << " (ref: " << ref.mData[i]
<< ", result: " << result.mData[i] << ")" << std::endl;
has_error = true;
error_count++;
if(error_count == 20)
break;
};
}
std::cout << "Absolute Error L1 Norm (sum of abs diff): " << l1_error << std::endl;
std::cout << "Absolute Error L-inf Norm (max abs diff): " << linf_error << ", ref "
<< linf_ref_value << ", result " << linf_result_value << std::endl;
std::cout << "Relative Error L-inf Norm (max relative abs diff): " << linf_rel_error << ", ref "
<< linf_rel_ref_value << ", result " << linf_rel_result_value << std::endl;
if(!has_error)
std::cout << std::endl << "Indices result is completely accurate!" << std::endl;
return linf_error;
}
#endif
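A hedged sketch of how the reworked check_error above can gate a host-side test: it now returns the L-inf absolute error in addition to printing the L1, L-inf, and relative L-inf summaries. The wrapper name and tolerance below are illustrative only.
// Hypothetical wrapper around check_error() defined above.
template <typename T>
bool tensors_match(const Tensor<T>& ref, const Tensor<T>& dev, float tol = 1e-5f)
{
    const float linf_abs_error = check_error(ref, dev); // also prints the error summaries
    return linf_abs_error < tol;
}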
#ifndef HOST_TENSOR_GENERATOR_HPP
#define HOST_TENSOR_GENERATOR_HPP
#pragma once
#include <cmath>
#include <numeric>
#include "config.hpp"
template <typename T>
......@@ -93,8 +94,8 @@ struct GeneratorTensor_2<int8_t>
template <typename T>
struct GeneratorTensor_3
{
T min_value = 0;
T max_value = 1;
float min_value = 0;
float max_value = 1;
template <typename... Is>
T operator()(Is...)
......@@ -122,22 +123,6 @@ struct GeneratorTensor_3<ck::bhalf_t>
}
};
template <>
struct GeneratorTensor_3<int8_t>
{
float min_value = 0;
float max_value = 1;
template <typename... Is>
int8_t operator()(Is...)
{
int8_t min_tmp = static_cast<int8_t>(min_value);
int8_t max_tmp = static_cast<int8_t>(max_value);
return (std::rand() % (max_tmp - min_tmp)) + min_tmp;
}
};
struct GeneratorTensor_Checkboard
{
template <typename... Ts>
......@@ -163,5 +148,3 @@ struct GeneratorTensor_Sequential
return dims[Dim];
}
};
#endif