Commit 9dce6851 authored by Jing Zhang

merge develop

parents 3cc57101 5d37d7bf
#ifndef TENSOR_SPACE_FILLING_CURVE_HPP
#define TENSOR_SPACE_FILLING_CURVE_HPP
#include "math.hpp"
#include "sequence.hpp"
#include "sequence_helper.hpp"
#include "tensor_adaptor.hpp"
#include "statically_indexed_array_multi_index.hpp"
#include "tuple_helper.hpp"
@@ -37,13 +41,25 @@ struct SpaceFillingCurve
ScalarPerVector;
}
template <index_t AccessIdx1dBegin, index_t AccessIdx1dEnd>
static __device__ __host__ constexpr auto GetStepBetween(Number<AccessIdx1dBegin>,
Number<AccessIdx1dEnd>)
{
static_assert(AccessIdx1dBegin >= 0, "1D index should be non-negative");
static_assert(AccessIdx1dBegin < GetNumOfAccess(), "1D index should be less than the total number of accesses");
static_assert(AccessIdx1dEnd >= 0, "1D index should be non-negative");
static_assert(AccessIdx1dEnd < GetNumOfAccess(), "1D index should be less than the total number of accesses");
constexpr auto idx_begin = GetIndex(Number<AccessIdx1dBegin>{});
constexpr auto idx_end = GetIndex(Number<AccessIdx1dEnd>{});
return idx_end - idx_begin;
}
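// Note: GetStepBetween(Number<I>{}, Number<J>{}) is the multi-dimensional index step
// from access I to access J along the curve; GetForwardStep and GetBackwardStep below
// are just the special cases J = I + 1 and J = I - 1.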
template <index_t AccessIdx1d>
static __device__ __host__ constexpr auto GetForwardStep(Number<AccessIdx1d>)
{
static_assert(AccessIdx1d < GetNumOfAccess(), "1D index should be less than the total number of accesses");
return GetStepBetween(Number<AccessIdx1d>{}, Number<AccessIdx1d + 1>{});
}
template <index_t AccessIdx1d>
@@ -51,9 +67,7 @@ struct SpaceFillingCurve
{
static_assert(AccessIdx1d > 0, "1D index should be larger than 0");
return GetStepBetween(Number<AccessIdx1d>{}, Number<AccessIdx1d - 1>{});
}
template <index_t AccessIdx1d>
@@ -129,3 +143,4 @@ struct SpaceFillingCurve
};
} // namespace ck
#endif
#ifndef CK_TYPE_HPP
#define CK_TYPE_HPP
#include "config.hpp"
#include "integral_constant.hpp"
#include "enable_if.hpp"
......
add_subdirectory(src/host_tensor)
add_subdirectory(src/tensor_operation_instance/gpu)
@@ -48,6 +48,7 @@ template <typename... Args, typename F>
float launch_and_time_kernel(
F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
{
#if 1
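// Timed path; changing the condition above to 0 falls through to a single untimed launch.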
KernelTimer timer;
printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
@@ -80,5 +81,10 @@ float launch_and_time_kernel(
// std::this_thread::sleep_for (std::chrono::microseconds(10));
return timer.GetElapsedTime() / nrepeat;
#else
launch_kernel(kernel, grid_dim, block_dim, lds_byte, args...);
return 0;
#endif
}
#endif
@@ -77,12 +77,12 @@ void host_conv3d_ndhwc_kzyxc_ndhwk(const Tensor<TIn>& in,
const auto X = wei.mDesc.GetLengths()[3];
const auto C = wei.mDesc.GetLengths()[4];
auto f_ndhwc = [&](auto n, auto do_tmp, auto ho_tmp, auto wo_tmp, auto k) {
// do_tmp must be converted to a signed integer, otherwise zmin might be wrong in case of
// negative intermediate values.
const int do_ = static_cast<int>(do_tmp);
const int ho = static_cast<int>(ho_tmp);
const int wo = static_cast<int>(wo_tmp);
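// Illustration (hypothetical numbers): if the unsigned index were used directly, a
// sub-expression such as in_left_pads[I0] - do_ * conv_strides[I0] (e.g. 1 - 2*2) would
// wrap around to a huge positive value instead of -3, so std::max(0, ...) below could
// not clamp zmin correctly.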
const int zmin =
std::max(0,
(in_left_pads[I0] - do_ * conv_strides[I0] + conv_dilations[I0] - 1) /
......
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef HOST_GENERIC_REDUCTION_HPP_
#define HOST_GENERIC_REDUCTION_HPP_
#include <vector>
#include <functional>
#include <limits>
#include <type_traits>
#include <cassert>
#include <cmath>
#include "reduction_enums.hpp"
#include "host_reduce_util.hpp"
using float16 = half_float::half;
namespace ck {
namespace host_reduce {
template <typename T>
static void
get_all_indexes(const std::vector<T>& dimLengths, int dim, std::vector<std::vector<T>>& indexes)
{
if(dim < dimLengths.size())
{
std::vector<std::vector<T>> updated_indexes;
if(dim == 0)
{
assert(indexes.size() == 0);
assert(dimLengths[dim] > 0);
for(T i = 0; i < dimLengths[dim]; i++)
{
std::vector<T> index = {i};
updated_indexes.push_back(index);
};
}
else
{
// go through all the current indexes
for(const auto& index : indexes)
for(T i = 0; i < dimLengths[dim]; i++)
{
auto index_new = index;
index_new.push_back(i);
updated_indexes.push_back(index_new);
};
};
// update to the indexes (output)
indexes = updated_indexes;
// further to construct the indexes from the updated status
get_all_indexes(dimLengths, dim + 1, indexes);
};
};
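// Worked example (values for illustration): dimLengths = {2, 3} fills `indexes` with
// {0,0}, {0,1}, {0,2}, {1,0}, {1,1}, {1,2}, i.e. every coordinate in row-major order.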
template <typename T>
static T get_offset_from_index(const std::vector<T>& strides, const std::vector<T>& index)
{
T offset = 0;
assert(strides.size() == index.size());
for(int i = 0; i < index.size(); i++)
offset += strides[i] * static_cast<T>(index[i]);
return (offset);
};
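// Worked example (values for illustration): strides = {12, 4, 1}, index = {1, 2, 3}
// gives offset = 1*12 + 2*4 + 3*1 = 23.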
template <typename T>
static inline T get_flatten_offset(const std::vector<T>& lengths, const std::vector<T>& index)
{
T offset = 0;
assert(lengths.size() == index.size() && lengths.size() > 0);
int len = lengths.size();
T stride = 1;
// for len==1, the loop is not executed
for(int i = len - 1; i > 0; i--)
{
offset += stride * static_cast<T>(index[i]);
stride *= lengths[i];
};
offset += stride * static_cast<T>(index[0]);
return (offset);
};
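// Worked example (values for illustration): lengths = {2, 3, 4}, index = {1, 2, 3}
// gives offset = 3 + 2*4 + 1*(3*4) = 23, i.e. the row-major linear position.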
template <typename InDataType,
typename AccDataType,
typename OutDataType,
ck::ReduceTensorOp_t ReduceOpId,
bool PropagateNan,
bool NeedIndices>
class ReductionHost
{
public:
ReductionHost() = default;
ReductionHost(HostTensorDescriptor& inDesc,
HostTensorDescriptor& outDesc,
const std::vector<int>& invariantDims_,
const std::vector<int>& toReduceDims_)
{
this->inLengths = to_int_vector(inDesc.GetLengths());
this->outLengths = to_int_vector(outDesc.GetLengths());
this->inStrides = to_int_vector(inDesc.GetStrides());
this->outStrides = to_int_vector(outDesc.GetStrides());
this->invariantDims = invariantDims_;
this->toReduceDims = toReduceDims_;
assert(this->inLengths.size() == this->outLengths.size());
assert(!this->toReduceDims.empty());
for(const auto dim : this->invariantDims)
this->invariantLengths.push_back(this->inLengths[dim]);
for(const auto dim : this->toReduceDims)
toReduceLengths.push_back(this->inLengths[dim]);
this->reduceAllDims = this->invariantDims.empty();
};
~ReductionHost(){};
void
Run(float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices)
{
if constexpr(NeedIndices)
RunImpl_with_indices(alpha, in_data, beta, out_data, indices);
else
RunImpl_no_indices(alpha, in_data, beta, out_data);
};
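// Usage sketch (hypothetical tensor setup), kept as a comment since this is a header:
//   ReductionHost<float, float, float, ReduceTensorOp_t::ADD, false, false>
//       hostReduce(inDesc, outDesc, invariantDims, toReduceDims);
//   hostReduce.Run(1.0f, in.mData.data(), 0.0f, out.mData.data(), nullptr);
// computes the sum over `toReduceDims` for every invariant-dim coordinate.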
private:
std::vector<int> inLengths;
std::vector<int> outLengths;
std::vector<int> inStrides;
std::vector<int> outStrides;
std::vector<int> invariantLengths;
std::vector<int> toReduceLengths;
std::vector<int> invariantDims;
std::vector<int> toReduceDims;
bool reduceAllDims;
void RunImpl_with_indices(
float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices)
{
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::PreUnaryOpFn;
using ck::host_reduce::ReduceOpFn2;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce = ReduceOpFn2<AccDataType, ReduceOpId>();
int divider = 1;
for(int i = 0; i < toReduceLengths.size(); i++)
divider *= toReduceLengths[i];
auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
if(reduceAllDims)
{
std::vector<std::vector<int>> indexes_1;
get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
int accuIndex = 0;
// go through indexes of the invariant dimensions
for(const auto& src_index : indexes_1)
{
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
// unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is actually
// done
PreUnaryOp(currVal);
auto currIndex = get_flatten_offset(inLengths, src_index);
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce, accuVal, currVal, accuIndex, currIndex);
};
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[0]) * static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[0] = static_cast<OutDataType>(accuVal);
indices[0] = accuIndex;
}
else
{
std::vector<std::vector<int>> indexes_1, indexes_2;
get_all_indexes(
this->invariantLengths, 0, indexes_1); // generate the invariant indexes space
get_all_indexes(
this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space
// go through indexes of the invariant dimensions
for(const auto& index_1 : indexes_1)
{
std::vector<int> src_index;
std::vector<int> dst_index;
src_index.resize(this->inLengths.size());
// generate the part of src index belonging to invariant dims
for(int k = 0; k < invariantDims.size(); k++)
src_index[invariantDims[k]] = index_1[k];
for(int k = 0; k < invariantDims.size(); k++)
dst_index.push_back(index_1[k]);
int dst_offset = get_offset_from_index(this->outStrides, dst_index);
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
int accuIndex = 0;
// go through indexes of the toReduce dimensions
for(const auto& index_2 : indexes_2)
{
// generate the part of src index belonging to toReduce dims
for(int k = 0; k < toReduceDims.size(); k++)
src_index[toReduceDims[k]] = index_2[k];
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
// unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is
// actually done
PreUnaryOp(currVal);
auto currIndex = get_flatten_offset(toReduceLengths, index_2);
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce, accuVal, currVal, accuIndex, currIndex);
};
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[dst_offset]) *
static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[dst_offset] = static_cast<OutDataType>(accuVal);
indices[dst_offset] = accuIndex;
};
};
}; // end of RunImpl_with_indices()
void
RunImpl_no_indices(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
{
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::PreUnaryOpFn;
using ck::host_reduce::ReduceOpFn;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
int divider = 1;
for(int i = 0; i < toReduceLengths.size(); i++)
divider *= toReduceLengths[i];
auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
if(reduceAllDims)
{
std::vector<std::vector<int>> indexes_1;
get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
// go through indexes of the invariant dimensions
for(const auto& src_index : indexes_1)
{
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
PreUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
PosUnaryOp(accuVal);
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[0]) * static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[0] = static_cast<OutDataType>(accuVal);
}
else
{
std::vector<std::vector<int>> indexes_1, indexes_2;
get_all_indexes(
this->invariantLengths, 0, indexes_1); // generate the invariant indexes space
get_all_indexes(
this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space
// go through indexes of the invariant dimensions
for(const auto& index_1 : indexes_1)
{
std::vector<int> src_index;
std::vector<int> dst_index;
src_index.resize(this->inLengths.size());
for(int k = 0; k < invariantDims.size(); k++)
dst_index.push_back(index_1[k]);
int dst_offset = get_offset_from_index(this->outStrides, dst_index);
// generate the part of src index belonging to invariant dims
for(int k = 0; k < invariantDims.size(); k++)
src_index[invariantDims[k]] = index_1[k];
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
// go through indexes of the toReduce dimensions
for(const auto& index_2 : indexes_2)
{
// generate the part of src index belonging to toReduce dims
for(int k = 0; k < toReduceDims.size(); k++)
src_index[toReduceDims[k]] = index_2[k];
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
PreUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
PosUnaryOp(accuVal);
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[dst_offset]) *
static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[dst_offset] = static_cast<OutDataType>(accuVal);
};
};
}; // end of RunImpl_no_indices()
};
}; // end of namespace host_reduce
}; // end of namespace ck
#endif
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef GUARD_HOST_REDUCE_UTIL_HPP
#define GUARD_HOST_REDUCE_UTIL_HPP
#include <half.hpp>
#include <limits>
#include <cmath>
#include <cassert>
#include <stdexcept>
#include <string>
#include "reduction_enums.hpp"
namespace ck {
namespace host_reduce {
using ck::NanPropagation_t;
using ck::ReduceTensorOp_t;
template <typename T>
static inline bool float_equal_one(T);
static inline bool float_equal_one(float x) { return x == 1.0f; };
static inline bool float_equal_one(double x) { return x == 1.0; };
static inline bool float_equal_one(half_float::half x)
{
return x == static_cast<half_float::half>(1.0f);
};
template <typename T>
static inline bool float_equal_zero(T x);
static inline bool float_equal_zero(float x) { return x == 0.0f; };
static inline bool float_equal_zero(double x) { return x == 0.0; };
static inline bool float_equal_zero(half_float::half x)
{
return x == static_cast<half_float::half>(0.0f);
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline std::function<void(compType&)> PreUnaryOpFn(int)
{
using std::abs;
if constexpr(ReduceOpId == ReduceTensorOp_t::NORM1)
{
return ([&](compType& a_) { a_ = abs(a_); });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
{
return ([&](compType& a_) { a_ = a_ * a_; });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
{
return ([&](compType& a_) { a_ = abs(a_); });
}
else
{
// ReduceTensorOp_t::AVG:
// ReduceTensorOp_t::ADD:
// ReduceTensorOp_t::MUL:
// ReduceTensorOp_t::MIN:
// ReduceTensorOp_t::MAX:
return ([&](compType&) {});
};
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline std::function<void(compType&)> PosUnaryOpFn(int divider)
{
using std::sqrt;
if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
{
return ([&](compType& a_) { a_ = sqrt(a_); });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::AVG)
{
return ([&, divider](compType& a_) {
a_ = a_ / static_cast<compType>(static_cast<float>(divider));
});
}
else
{
// ReduceTensorOp_t::ADD:
// ReduceTensorOp_t::NORM1:
// ReduceTensorOp_t::MUL:
// ReduceTensorOp_t::MIN:
// ReduceTensorOp_t::MAX:
// ReduceTensorOp_t::AMAX:
return ([&](compType&) {});
}
};
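// Example: for NORM2 the pre-op above squares each element and this post-op takes the
// square root of the accumulated sum; for AVG the post-op divides the sum by `divider`,
// the number of reduced elements.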
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline std::function<void(compType&, compType)> ReduceOpFn()
{
if constexpr(ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::AVG ||
ReduceOpId == ReduceTensorOp_t::NORM1 || ReduceOpId == ReduceTensorOp_t::NORM2)
{
return ([&](compType& a_, compType b_) { a_ = a_ + b_; });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
{
return ([&](compType& a_, compType b_) { a_ = a_ * b_; });
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
{
return ([&](compType& a_, compType b_) {
if(a_ > b_)
a_ = b_;
});
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
{
return ([&](compType& a_, compType b_) {
if(a_ < b_)
a_ = b_;
});
}
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline std::function<void(compType&, compType, bool& changed)> ReduceOpFn2()
{
if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
{
return ([&](compType& a_, compType b_, bool& changed) {
if(a_ > b_)
{
a_ = b_;
changed = true;
}
else
changed = false;
});
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
{
return ([&](compType& a_, compType b_, bool& changed) {
if(a_ < b_)
{
a_ = b_;
changed = true;
}
else
changed = false;
});
}
else
{
// ReduceTensorOp_t::ADD:
// ReduceTensorOp_t::MUL:
// ReduceTensorOp_t::AVG:
// ReduceTensorOp_t::NORM1:
// ReduceTensorOp_t::NORM2:
return (std::function<void(compType&, compType, bool&)>{});
};
};
template <typename compType, ReduceTensorOp_t ReduceOpId>
__host__ static inline compType ReduceOpZeroVal()
{
if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
{
return (static_cast<compType>(1.0f));
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
{
return (std::numeric_limits<compType>::max());
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX)
{
return (std::numeric_limits<compType>::lowest());
}
else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
{
return (static_cast<compType>(0.0f));
}
else
{
// ReduceTensorOp_t::ADD
// ReduceTensorOp_t::AVG
// ReduceTensorOp_t::NORM1
// ReduceTensorOp_t::NORM2
return (static_cast<compType>(0.0f));
};
};
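// These are the neutral starting values of each reduction: 1 for MUL, the largest
// representable value for MIN, the lowest for MAX, and 0 for the additive ops and AMAX.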
template <typename compType, bool PropagateNan>
__host__ static inline void binop_with_nan_check(std::function<void(compType&, compType)> opReduce,
compType& accuVal,
compType currVal)
{
using std::isnan;
if constexpr(!PropagateNan)
{
opReduce(accuVal, currVal);
}
else
{
if(isnan(currVal))
accuVal = currVal;
else
opReduce(accuVal, currVal);
};
};
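// Example: with PropagateNan == true, any NaN input overwrites accuVal, and since NaN
// never loses a later comparison or addition, the final reduced value stays NaN.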
template <typename compType, bool PropagateNan>
__host__ static inline void
binop_with_nan_check2(std::function<void(compType&, compType, bool&)> opReduce,
compType& accuVal,
compType currVal,
int& accuIndex,
int currIndex)
{
using std::isnan;
if constexpr(!PropagateNan)
{
bool changed;
opReduce(accuVal, currVal, changed);
if(changed)
accuIndex = currIndex;
}
else
{
if(isnan(currVal))
{
accuVal = currVal;
accuIndex = currIndex;
}
else
{
bool changed;
opReduce(accuVal, currVal, changed);
if(changed)
accuIndex = currIndex;
};
};
};
}; // namespace host_reduce
static inline std::vector<int> to_int_vector(const std::vector<size_t>& inData)
{
std::vector<int> outData;
for(auto elem : inData)
outData.push_back(static_cast<int>(elem));
return (outData);
};
}; // namespace ck
#endif
@@ -8,6 +8,7 @@
#include <utility>
#include <cassert>
#include <iostream>
#include "data_type.hpp"
template <typename Range>
std::ostream& LogRange(std::ostream& os, Range&& range, std::string delim)
@@ -311,7 +312,9 @@ HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> s
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);
float bf16_to_f32_(ck::bhalf_t src_val);
void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst);
template <typename T>
void check_error(const Tensor<T>& ref, const Tensor<T>& result)
@@ -320,7 +323,7 @@ void check_error(const Tensor<T>& ref, const Tensor<T>& result)
float max_diff = -1;
float ref_value = 0, result_value = 0;
if constexpr(std::is_same<ck::bhalf_t, T>::value)
{
for(int i = 0; i < ref.mData.size(); ++i)
{
@@ -353,4 +356,28 @@ void check_error(const Tensor<T>& ref, const Tensor<T>& result)
std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
}
template <typename T>
void check_indices(const Tensor<T>& ref, const Tensor<T>& result)
{
bool has_error = false;
int error_count = 0;
for(int i = 0; i < ref.mData.size(); ++i)
{
if(ref.mData[i] != result.mData[i])
{
std::cerr << std::endl
<< "Indices different at position " << i << " (ref: " << ref.mData[i]
<< ", result: " << result.mData[i] << ")" << std::endl;
has_error = true;
error_count++;
if(error_count == 20)
break;
};
}
if(!has_error)
std::cout << std::endl << "Indices result is completely acccurate!" << std::endl;
}
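// Usage sketch (hypothetical tensors): after an index-returning reduction,
//   check_indices(ref_indices, gpu_indices);
// reports up to 20 mismatching positions, or prints a success message when all match.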
#endif
@@ -3,7 +3,6 @@
#include <cmath>
#include "config.hpp"
#include "data_type.hpp"
template <typename T>
struct GeneratorTensor_0
@@ -28,14 +27,14 @@ struct GeneratorTensor_1
};
template <>
struct GeneratorTensor_1<ck::bhalf_t>
{
float value = 1.0;
template <typename... Is>
ck::bhalf_t operator()(Is...)
{
return ck::type_convert<ck::bhalf_t>(value);
}
};
@@ -60,21 +59,21 @@ struct GeneratorTensor_2
template <typename... Is>
T operator()(Is...)
{
return static_cast<T>((std::rand() % (max_value - min_value)) + min_value);
}
};
template <>
struct GeneratorTensor_2<ck::bhalf_t>
{
int min_value = 0;
int max_value = 1;
template <typename... Is>
ck::bhalf_t operator()(Is...)
{
float tmp = (std::rand() % (max_value - min_value)) + min_value;
return ck::type_convert<ck::bhalf_t>(tmp);
}
};
@@ -102,24 +101,24 @@ struct GeneratorTensor_3
{
float tmp = float(std::rand()) / float(RAND_MAX);
return static_cast<T>(min_value + tmp * (max_value - min_value));
}
};
template <>
struct GeneratorTensor_3<ck::bhalf_t>
{
float min_value = 0;
float max_value = 1;
template <typename... Is>
ck::bhalf_t operator()(Is...)
{
float tmp = float(std::rand()) / float(RAND_MAX);
float fp32_tmp = min_value + tmp * (max_value - min_value);
return ck::type_convert<ck::bhalf_t>(fp32_tmp);
}
};
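// Usage sketch (assuming the host Tensor's GenerateTensorValue helper from this repo):
//   in.GenerateTensorValue(GeneratorTensor_3<ck::bhalf_t>{-1.0f, 1.0f});
// would fill `in` with uniformly distributed bhalf_t values in [-1, 1).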
......