Commit 9dce6851 authored by Jing Zhang

merge develop

parents 3cc57101 5d37d7bf
#ifndef TENSOR_SPACE_FILLING_CURVE_HPP
#define TENSOR_SPACE_FILLING_CURVE_HPP
#include "math.hpp"
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "tensor_adaptor.hpp"
#include "statically_indexed_array_multi_index.hpp"
#include "tuple_helper.hpp"
@@ -37,13 +41,25 @@ struct SpaceFillingCurve
ScalarPerVector;
}
template <index_t AccessIdx1dBegin, index_t AccessIdx1dEnd>
static __device__ __host__ constexpr auto GetStepBetween(Number<AccessIdx1dBegin>,
Number<AccessIdx1dEnd>)
{
static_assert(AccessIdx1dBegin >= 0, "1D index should be non-negative");
static_assert(AccessIdx1dBegin < GetNumOfAccess(), "1D index should be less than the total number of accesses");
static_assert(AccessIdx1dEnd >= 0, "1D index should be non-negative");
static_assert(AccessIdx1dEnd < GetNumOfAccess(), "1D index should be less than the total number of accesses");
constexpr auto idx_begin = GetIndex(Number<AccessIdx1dBegin>{});
constexpr auto idx_end = GetIndex(Number<AccessIdx1dEnd>{});
return idx_end - idx_begin;
}
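// Note: GetStepBetween(Number<I>{}, Number<J>{}) is the multi-dimensional index step
// from access I to access J along the curve; GetForwardStep and GetBackwardStep below
// are just the special cases J = I + 1 and J = I - 1.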
template <index_t AccessIdx1d>
static __device__ __host__ constexpr auto GetForwardStep(Number<AccessIdx1d>)
{
static_assert(AccessIdx1d < GetNumOfAccess(), "1D index should be less than the total number of accesses");
return GetStepBetween(Number<AccessIdx1d>{}, Number<AccessIdx1d + 1>{});
}
template <index_t AccessIdx1d>
@@ -51,9 +67,7 @@ struct SpaceFillingCurve
{
static_assert(AccessIdx1d > 0, "1D index should be larger than 0");
return GetStepBetween(Number<AccessIdx1d>{}, Number<AccessIdx1d - 1>{});
}
template <index_t AccessIdx1d>
@@ -129,3 +143,4 @@ struct SpaceFillingCurve
};
} // namespace ck
#endif
#ifndef CK_TYPE_HPP
#define CK_TYPE_HPP
#include "config.hpp"
#include "integral_constant.hpp"
#include "enable_if.hpp"
......
add_subdirectory(src/host_tensor)
add_subdirectory(src/tensor_operation_instance/gpu)
@@ -48,6 +48,7 @@ template <typename... Args, typename F>
float launch_and_time_kernel(
F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
{
#if 1
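// Timed path; changing the condition above to 0 falls through to a single untimed launch.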
KernelTimer timer;
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
@@ -80,5 +81,10 @@ float launch_and_time_kernel(
// std::this_thread::sleep_for (std::chrono::microseconds(10));
return timer.GetElapsedTime() / nrepeat;
#else
launch_kernel(kernel, grid_dim, block_dim, lds_byte, args...);
return 0;
#endif
}
#endif
@@ -77,12 +77,12 @@ void host_conv3d_ndhwc_kzyxc_ndhwk(const Tensor<TIn>& in,
const auto X = wei.mDesc.GetLengths()[3];
const auto C = wei.mDesc.GetLengths()[4];
auto f_ndhwc = [&](auto n, auto do_tmp, auto ho_tmp, auto wo_tmp, auto k) {
// do_tmp must be converted to a signed integer, otherwise zmin might be wrong in case of
// negative intermediate values.
const int do_ = static_cast<int>(do_tmp);
const int ho = static_cast<int>(ho_tmp);
const int wo = static_cast<int>(wo_tmp);
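// Illustration (hypothetical numbers): if the unsigned index were used directly, a
// sub-expression such as in_left_pads[I0] - do_ * conv_strides[I0] (e.g. 1 - 2*2) would
// wrap around to a huge positive value instead of -3, so std::max(0, ...) below could
// not clamp zmin correctly.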
const int zmin =
std::max(0,
(in_left_pads[I0] - do_ * conv_strides[I0] + conv_dilations[I0] - 1) /
......
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef HOST_GENERIC_REDUCTION_HPP_
#define HOST_GENERIC_REDUCTION_HPP_
#include <vector>
#include <functional>
#include <limits>
#include <type_traits>
#include <cassert>
#include <cmath>
#include "reduction_enums.hpp"
#include "host_reduce_util.hpp"
using float16 = half_float::half;
namespace ck {
namespace host_reduce {
template <typename T>
static void
get_all_indexes(const std::vector<T>& dimLengths, int dim, std::vector<std::vector<T>>& indexes)
{
if(dim < dimLengths.size())
{
std::vector<std::vector<T>> updated_indexes;
if(dim == 0)
{
assert(indexes.size() == 0);
assert(dimLengths[dim] > 0);
for(T i = 0; i < dimLengths[dim]; i++)
{
std::vector<T> index = {i};
updated_indexes.push_back(index);
};
}
else
{
// go through all the current indexes
for(const auto& index : indexes)
for(T i = 0; i < dimLengths[dim]; i++)
{
auto index_new = index;
index_new.push_back(i);
updated_indexes.push_back(index_new);
};
};
// update to the indexes (output)
indexes = updated_indexes;
// further to construct the indexes from the updated status
get_all_indexes(dimLengths, dim + 1, indexes);
};
};
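// Worked example (values for illustration): dimLengths = {2, 3} fills `indexes` with
// {0,0}, {0,1}, {0,2}, {1,0}, {1,1}, {1,2}, i.e. every coordinate in row-major order.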
template <typename T>
static T get_offset_from_index(const std::vector<T>& strides, const std::vector<T>& index)
{
T offset = 0;
assert(strides.size() == index.size());
for(int i = 0; i < index.size(); i++)
offset += strides[i] * static_cast<T>(index[i]);
return (offset);
};
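// Worked example (values for illustration): strides = {12, 4, 1}, index = {1, 2, 3}
// gives offset = 1*12 + 2*4 + 3*1 = 23.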
template <typename T>
static inline T get_flatten_offset(const std::vector<T>& lengths, const std::vector<T>& index)
{
T offset = 0;
assert(lengths.size() == index.size() && lengths.size() > 0);
int len = lengths.size();
T stride = 1;
// for len==1, the loop is not executed
for(int i = len - 1; i > 0; i--)
{
offset += stride * static_cast<T>(index[i]);
stride *= lengths[i];
};
offset += stride * static_cast<T>(index[0]);
return (offset);
};
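// Worked example (values for illustration): lengths = {2, 3, 4}, index = {1, 2, 3}
// gives offset = 3 + 2*4 + 1*(3*4) = 23, i.e. the row-major linear position.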
template <typename InDataType,
typename AccDataType,
typename OutDataType,
ck::ReduceTensorOp_t ReduceOpId,
bool PropagateNan,
bool NeedIndices>
class ReductionHost
{
public:
ReductionHost() = default;
ReductionHost(HostTensorDescriptor& inDesc,
HostTensorDescriptor& outDesc,
const std::vector<int>& invariantDims_,
const std::vector<int>& toReduceDims_)
{
this->inLengths = to_int_vector(inDesc.GetLengths());
this->outLengths = to_int_vector(outDesc.GetLengths());
this->inStrides = to_int_vector(inDesc.GetStrides());
this->outStrides = to_int_vector(outDesc.GetStrides());
this->invariantDims = invariantDims_;
this->toReduceDims = toReduceDims_;
assert(this->inLengths.size() == this->outLengths.size());
assert(!this->toReduceDims.empty());
for(const auto dim : this->invariantDims)
this->invariantLengths.push_back(this->inLengths[dim]);
for(const auto dim : this->toReduceDims)
toReduceLengths.push_back(this->inLengths[dim]);
this->reduceAllDims = this->invariantDims.empty();
};
~ReductionHost(){};
void
Run(float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices)
{
if constexpr(NeedIndices)
RunImpl_with_indices(alpha, in_data, beta, out_data, indices);
else
RunImpl_no_indices(alpha, in_data, beta, out_data);
};
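// Usage sketch (hypothetical tensor setup), kept as a comment since this is a header:
//   ReductionHost<float, float, float, ReduceTensorOp_t::ADD, false, false>
//       hostReduce(inDesc, outDesc, invariantDims, toReduceDims);
//   hostReduce.Run(1.0f, in.mData.data(), 0.0f, out.mData.data(), nullptr);
// computes the sum over `toReduceDims` for every invariant-dim coordinate.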
private:
std::vector<int> inLengths;
std::vector<int> outLengths;
std::vector<int> inStrides;
std::vector<int> outStrides;
std::vector<int> invariantLengths;
std::vector<int> toReduceLengths;
std::vector<int> invariantDims;
std::vector<int> toReduceDims;
bool reduceAllDims;
void RunImpl_with_indices(
float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices)
{
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::PreUnaryOpFn;
using ck::host_reduce::ReduceOpFn2;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce = ReduceOpFn2<AccDataType, ReduceOpId>();
int divider = 1;
for(int i = 0; i < toReduceLengths.size(); i++)
divider *= toReduceLengths[i];
auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
if(reduceAllDims)
{
std::vector<std::vector<int>> indexes_1;
get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
int accuIndex = 0;
// go through indexes of the invariant dimensions
for(const auto& src_index : indexes_1)
{
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
// unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is actually
// done
PreUnaryOp(currVal);
auto currIndex = get_flatten_offset(inLengths, src_index);
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce, accuVal, currVal, accuIndex, currIndex);
};
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[0]) * static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[0] = static_cast<OutDataType>(accuVal);
indices[0] = accuIndex;
}
else
{
std::vector<std::vector<int>> indexes_1, indexes_2;
get_all_indexes(
this->invariantLengths, 0, indexes_1); // generate the invariant indexes space
get_all_indexes(
this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space
// go through indexes of the invariant dimensions
for(const auto& index_1 : indexes_1)
{
std::vector<int> src_index;
std::vector<int> dst_index;
src_index.resize(this->inLengths.size());
// generate the part of src index belonging to invariant dims
for(int k = 0; k < invariantDims.size(); k++)
src_index[invariantDims[k]] = index_1[k];
for(int k = 0; k < invariantDims.size(); k++)
dst_index.push_back(index_1[k]);
int dst_offset = get_offset_from_index(this->outStrides, dst_index);
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
int accuIndex = 0;
// go through indexes of the toReduce dimensions
for(const auto& index_2 : indexes_2)
{
// generate the part of src index belonging to toReduce dims
for(int k = 0; k < toReduceDims.size(); k++)
src_index[toReduceDims[k]] = index_2[k];
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
// unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is
// actually done
PreUnaryOp(currVal);
auto currIndex = get_flatten_offset(toReduceLengths, index_2);
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce, accuVal, currVal, accuIndex, currIndex);
};
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[dst_offset]) *
static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[dst_offset] = static_cast<OutDataType>(accuVal);
indices[dst_offset] = accuIndex;
};
};
}; // end of RunImpl_with_indices()
void
RunImpl_no_indices(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
{
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::PreUnaryOpFn;
using ck::host_reduce::ReduceOpFn;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
int divider = 1;
for(int i = 0; i < toReduceLengths.size(); i++)
divider *= toReduceLengths[i];
auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
if(reduceAllDims)
{
std::vector<std::vector<int>> indexes_1;
get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
// go through indexes of the invariant dimensions
for(const auto& src_index : indexes_1)
{
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
PreUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
PosUnaryOp(accuVal);
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[0]) * static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[0] = static_cast<OutDataType>(accuVal);
}
else
{
std::vector<std::vector<int>> indexes_1, indexes_2;
get_all_indexes(
this->invariantLengths, 0, indexes_1); // generate the invariant indexes space
get_all_indexes(
this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space
// go through indexes of the invariant dimensions
for(const auto& index_1 : indexes_1)
{
std::vector<int> src_index;
std::vector<int> dst_index;
src_index.resize(this->inLengths.size());
for(int k = 0; k < invariantDims.size(); k++)
dst_index.push_back(index_1[k]);
int dst_offset = get_offset_from_index(this->outStrides, dst_index);
// generate the part of src index belonging to invariant dims
for(int k = 0; k < invariantDims.size(); k++)
src_index[invariantDims[k]] = index_1[k];
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
// go through indexes of the toReduce dimensions
for(const auto& index_2 : indexes_2)
{
// generate the part of src index belonging to toReduce dims
for(int k = 0; k < toReduceDims.size(); k++)
src_index[toReduceDims[k]] = index_2[k];
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
PreUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
PosUnaryOp(accuVal);
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[dst_offset]) *
static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[dst_offset] = static_cast<OutDataType>(accuVal);
};
};
}; // end of RunImpl_no_indices()
};
}; // end of namespace host_reduce
}; // end of namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_HOST_REDUCE_UTIL_HPP
#define GUARD_HOST_REDUCE_UTIL_HPP
#include <half.hpp>
#include <limits>
#include <cmath>
#include <cassert>
#include <stdexcept>
#include <string>
#include "reduction_enums.hpp"
namespace ck {
namespace host_reduce {
using ck::NanPropagation_t;
using ck::ReduceTensorOp_t;
template <typename T>
static inline bool float_equal_one(T);
static inline bool float_equal_one(float x) { return x == 1.0f; };
static inline bool float_equal_one(double x) { return x == 1.0; };
static inline bool float_equal_one(half_float::half x)
{
return x == static_cast<half_float::half>(1.0f);
};
template <typename T>
static inline bool float_equal_zero(T x);
static inline bool float_equal_zero(float x) { return x == 0.0f; };
static inline bool float_equal_zero(double x) { return x == 0.0; };
static inline bool float_equal_zero(half_float::half x)
{
return x == static_cast<half_float::half>(0.0f);
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline std::function<void(compType&)> PreUnaryOpFn(int)
{
using std::abs;
if constexpr(ReduceOpId == ReduceTensorOp_t::NORM1)
{
return ([&](compType& a_) { a_ = abs(a_); });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
{
return ([&](compType& a_) { a_ = a_ * a_; });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
{
return ([&](compType& a_) { a_ = abs(a_); });
}
else
{
// ReduceTensorOp_t::AVG:
// ReduceTensorOp_t::ADD:
// ReduceTensorOp_t::MUL:
// ReduceTensorOp_t::MIN:
// ReduceTensorOp_t::MAX:
return ([&](compType&) {});
};
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline std::function<void(compType&)> PosUnaryOpFn(int divider)
{
using std::sqrt;
if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
{
return ([&](compType& a_) { a_ = sqrt(a_); });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::AVG)
{
return ([&, divider](compType& a_) {
a_ = a_ / static_cast<compType>(static_cast<float>(divider));
});
}
else
{
// ReduceTensorOp_t::ADD:
// ReduceTensorOp_t::NORM1:
// ReduceTensorOp_t::MUL:
// ReduceTensorOp_t::MIN:
// ReduceTensorOp_t::MAX:
// ReduceTensorOp_t::AMAX:
return ([&](compType&) {});
}
};
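// Example: for NORM2 the pre-op above squares each element and this post-op takes the
// square root of the accumulated sum; for AVG the post-op divides the sum by `divider`,
// the number of reduced elements.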
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline std::function<void(compType&, compType)> ReduceOpFn()
{
if constexpr(ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::AVG ||
ReduceOpId == ReduceTensorOp_t::NORM1 || ReduceOpId == ReduceTensorOp_t::NORM2)
{
return ([&](compType& a_, compType b_) { a_ = a_ + b_; });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
{
return ([&](compType& a_, compType b_) { a_ = a_ * b_; });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
{
return ([&](compType& a_, compType b_) {
if(a_ > b_)
a_ = b_;
});
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
{
return ([&](compType& a_, compType b_) {
if(a_ < b_)
a_ = b_;
});
}
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline std::function<void(compType&, compType, bool& changed)> ReduceOpFn2()
{
if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
{
return ([&](compType& a_, compType b_, bool& changed) {
if(a_ > b_)
{
a_ = b_;
changed = true;
}
else
changed = false;
});
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
{
return ([&](compType& a_, compType b_, bool& changed) {
if(a_ < b_)
{
a_ = b_;
changed = true;
}
else
changed = false;
});
}
else
{
// ReduceTensorOp_t::ADD:
// ReduceTensorOp_t::MUL:
// ReduceTensorOp_t::AVG:
// ReduceTensorOp_t::NORM1:
// ReduceTensorOp_t::NORM2:
return (std::function<void(compType&, compType, bool&)>{});
};
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline compType ReduceOpZeroVal()
{
if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
{
return (static_cast<compType>(1.0f));
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
{
return (std::numeric_limits<compType>::max());
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX)
{
return (std::numeric_limits<compType>::lowest());
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
{
return (static_cast<compType>(0.0f));
}
else
{
// ReduceTensorOp_t::ADD
// ReduceTensorOp_t::AVG
// ReduceTensorOp_t::NORM1
// ReduceTensorOp_t::NORM2
return (static_cast<compType>(0.0f));
};
};
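// These are the neutral starting values of each reduction: 1 for MUL, the largest
// representable value for MIN, the lowest for MAX, and 0 for the additive ops and AMAX.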
template <typename compType, bool PropagateNan>
__host__ static inline void binop_with_nan_check(std::function<void(compType&, compType)> opReduce,
compType& accuVal,
compType currVal)
{
using std::isnan;
if constexpr(!PropagateNan)
{
opReduce(accuVal, currVal);
}
else
{
if(isnan(currVal))
accuVal = currVal;
else
opReduce(accuVal, currVal);
};
};
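// Example: with PropagateNan == true, any NaN input overwrites accuVal, and since NaN
// never loses a later comparison or addition, the final reduced value stays NaN.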
template <typename compType, bool PropagateNan>
__host__ static inline void
binop_with_nan_check2(std::function<void(compType&, compType, bool&)> opReduce,
compType& accuVal,
compType currVal,
int& accuIndex,
int currIndex)
{
using std::isnan;
if constexpr(!PropagateNan)
{
bool changed;
opReduce(accuVal, currVal, changed);
if(changed)
accuIndex = currIndex;
}
else
{
if(isnan(currVal))
{
accuVal = currVal;
accuIndex = currIndex;
}
else
{
bool changed;
opReduce(accuVal, currVal, changed);
if(changed)
accuIndex = currIndex;
};
};
};
}; // namespace host_reduce
static inline std::vector<int> to_int_vector(const std::vector<size_t>& inData)
{
std::vector<int> outData;
for(auto elem : inData)
outData.push_back(static_cast<int>(elem));
return (outData);
};
}; // namespace ck
#endif
@@ -8,6 +8,7 @@
#include <utility>
#include <cassert>
#include <iostream>
#include "data_type.hpp"
template <typename Range>
std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
@@ -311,7 +312,9 @@ HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> s
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);
float bf16_to_f32_(ck::bhalf_t src_val);
void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst);
template <typename T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
@@ -320,7 +323,7 @@ void check_error(const Tensor<T>& ref, const Tensor<T>& result)
float max_diff = -1;
float ref_value = 0, result_value = 0;
if constexpr(std::is_same<ck::bhalf_t, T>::value)
{
for(int i = 0; i < ref.mData.size(); ++i)
{
@@ -353,4 +356,28 @@ void check_error(const Tensor<T>& ref, const Tensor<T>& result)
std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
}
template <typename T>
void check_indices(const Tensor<T>& ref, const Tensor<T>& result)
{
bool has_error = false;
int error_count = 0;
for(int i = 0; i < ref.mData.size(); ++i)
{
if(ref.mData[i] != result.mData[i])
{
std::cerr << std::endl
<< "Indices different at position " << i << " (ref: " << ref.mData[i]
<< ", result: " << result.mData[i] << ")" << std::endl;
has_error = true;
error_count++;
if(error_count == 20)
break;
};
}
if(!has_error)
std::cout << std::endl << "Indices result is completely acccurate!" << std::endl;
}
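// Usage sketch (hypothetical tensors): after an index-returning reduction,
//   check_indices(ref_indices, gpu_indices);
// reports up to 20 mismatching positions, or prints a success message when all match.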
#endif
@@ -3,7 +3,6 @@
#include <cmath>
#include "config.hpp"
#include "data_type.hpp"
template <typename T>
struct GeneratorTensor_0
@@ -28,14 +27,14 @@ struct GeneratorTensor_1
};
template <>
struct GeneratorTensor_1<ck::bhalf_t>
{
float value = 1.0;
template <typename... Is>
ck::bhalf_t operator()(Is...)
{
return ck::type_convert<ck::bhalf_t>(value);
}
};
@@ -60,21 +59,21 @@ struct GeneratorTensor_2
template <typename... Is>
T operator()(Is...)
{
return static_cast<T>((std::rand() % (max_value - min_value)) + min_value);
}
};
template <>
struct GeneratorTensor_2<ck::bhalf_t>
{
int min_value = 0;
int max_value = 1;
template <typename... Is>
ck::bhalf_t operator()(Is...)
{
float tmp = (std::rand() % (max_value - min_value)) + min_value;
return ck::type_convert<ck::bhalf_t>(tmp);
}
};
@@ -102,24 +101,24 @@ struct GeneratorTensor_3
{
float tmp = float(std::rand()) / float(RAND_MAX);
return static_cast<T>(min_value + tmp * (max_value - min_value));
}
};
template <>
struct GeneratorTensor_3<ck::bhalf_t>
{
float min_value = 0;
float max_value = 1;
template <typename... Is>
ck::bhalf_t operator()(Is...)
{
float tmp = float(std::rand()) / float(RAND_MAX);
float fp32_tmp = min_value + tmp * (max_value - min_value);
return ck::type_convert<ck::bhalf_t>(fp32_tmp);
}
};
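// Usage sketch (assuming the host Tensor's GenerateTensorValue helper from this repo):
//   in.GenerateTensorValue(GeneratorTensor_3<ck::bhalf_t>{-1.0f, 1.0f});
// would fill `in` with uniformly distributed bhalf_t values in [-1, 1).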
......