Commit e72c0c43 authored by carlushuang

Merge remote-tracking branch 'origin/develop' into cpu_avx2

parents d714fa15 313bbea5
...@@ -37,6 +37,10 @@ struct SpaceFillingCurve
     __host__ __device__ static constexpr index_t GetNumOfAccess()
     {
+        static_assert(TensorLengths::Size() == ScalarsPerAccess::Size());
+        static_assert(TensorLengths{} % ScalarsPerAccess{} ==
+                      typename uniform_sequence_gen<TensorLengths::Size(), 0>::type{});
+
         return reduce_on_sequence(TensorLengths{}, math::multiplies{}, Number<1>{}) /
                ScalarPerVector;
     }
...@@ -140,6 +144,15 @@ struct SpaceFillingCurve
         }();

         return idx_md;
     }

+    // FIXME: rename this function
+    template <index_t AccessIdx1d>
+    static __device__ __host__ constexpr auto GetIndexTupleOfNumber(Number<AccessIdx1d>)
+    {
+        constexpr auto idx = GetIndex(Number<AccessIdx1d>{});
+
+        return generate_tuple([&](auto i) { return Number<idx[i]>{}; }, Number<nDim>{});
+    }
 };
 } // namespace ck
......
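The new static_asserts only tighten the existing contract of GetNumOfAccess(): every tensor length must be divisible by its per-dimension access size, and the number of 1-D accesses is the element count divided by ScalarPerVector (assumed here to be the product of the per-dimension ScalarsPerAccess, which is how the divisibility check reads). GetIndexTupleOfNumber then just wraps the result of GetIndex() into compile-time Number<> values. A minimal stand-alone sketch of that arithmetic in plain constexpr C++, not the ck::Sequence machinery, using a hypothetical 2x8 tile accessed as 1x4 vectors:

// Stand-alone sketch of the GetNumOfAccess() contract (illustrative only).
#include <array>
#include <cstddef>

template <std::size_t N>
constexpr int num_of_access(const std::array<int, N>& lengths,
                            const std::array<int, N>& scalars_per_access)
{
    int scalar_per_vector = 1;
    int num_elements      = 1;

    for(std::size_t i = 0; i < N; ++i)
    {
        if(lengths[i] % scalars_per_access[i] != 0)
            return -1; // the new static_asserts reject this case at compile time

        scalar_per_vector *= scalars_per_access[i];
        num_elements *= lengths[i];
    }

    return num_elements / scalar_per_vector;
}

// 2x8 tile accessed as 1x4 vectors -> 16 / 4 = 4 one-dimensional accesses
static_assert(num_of_access<2>({{2, 8}}, {{1, 4}}) == 4, "divisible tile splits evenly");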
...@@ -13,8 +13,10 @@ struct DeviceMem
     DeviceMem() = delete;
     DeviceMem(std::size_t mem_size);
     void* GetDeviceBuffer();
+    std::size_t GetBufferSize();
     void ToDevice(const void* p);
     void FromDevice(void* p);
+    void SetZero();
     ~DeviceMem();

     void* mpDeviceBuf;
...@@ -48,7 +50,6 @@ template <typename... Args, typename F>
float launch_and_time_kernel(
    F kernel, int nrepeat, dim3 grid_dim, dim3 block_dim, std::size_t lds_byte, Args... args)
{
-#if 1
    KernelTimer timer;

    printf("%s: grid_dim {%d, %d, %d}, block_dim {%d, %d, %d} \n",
...@@ -78,13 +79,6 @@ float launch_and_time_kernel(
    timer.End();

-    // std::this_thread::sleep_for (std::chrono::microseconds(10));

    return timer.GetElapsedTime() / nrepeat;
-#else
-    launch_kernel(kernel, grid_dim, block_dim, lds_byte, args...);
-
-    return 0;
-#endif
}
#endif
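For reference, a typical host-side flow using the two new DeviceMem helpers together with the (now unconditional) timing path might look like the sketch below. The kernel, its arguments and the element count are hypothetical, and GetBufferSize is assumed to return the byte size passed to the constructor.

// Hypothetical usage sketch: allocate, clear, run a timed kernel, read back.
// `some_kernel` and `n` are placeholders, not part of this commit.
std::size_t n = 1024;
DeviceMem in_dev(sizeof(float) * n);
DeviceMem out_dev(sizeof(float) * n);

std::vector<float> in_host(n, 1.0f), out_host(n);

in_dev.ToDevice(in_host.data());
out_dev.SetZero(); // new helper: zero-initialize the buffer on the device

float ave_time_ms = launch_and_time_kernel(some_kernel,
                                           /*nrepeat=*/10,
                                           dim3(n / 256),
                                           dim3(256),
                                           /*lds_byte=*/0,
                                           static_cast<float*>(in_dev.GetDeviceBuffer()),
                                           static_cast<float*>(out_dev.GetDeviceBuffer()),
                                           static_cast<int>(n));

out_dev.FromDevice(out_host.data());
std::cout << "buffer bytes: " << out_dev.GetBufferSize() // new helper
          << ", average kernel time: " << ave_time_ms << " ms" << std::endl;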
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef HOST_GENERIC_REDUCTION_HPP_
#define HOST_GENERIC_REDUCTION_HPP_
#include <vector>
#include <functional>
#include <limits>
#include <type_traits>
#include <cassert>
#include <cmath>
#include "reduction_enums.hpp"
#include "host_reduce_util.hpp"
using float16 = half_float::half;
namespace ck {
namespace host_reduce {
template <typename T>
static void
get_all_indexes(const std::vector<T>& dimLengths, int dim, std::vector<std::vector<T>>& indexes)
{
if(dim < dimLengths.size())
{
std::vector<std::vector<T>> updated_indexes;
if(dim == 0)
{
assert(indexes.size() == 0);
assert(dimLengths[dim] > 0);
for(T i = 0; i < dimLengths[dim]; i++)
{
std::vector<T> index = {i};
updated_indexes.push_back(index);
};
}
else
{
// go through all the current indexes
for(const auto& index : indexes)
for(T i = 0; i < dimLengths[dim]; i++)
{
auto index_new = index;
index_new.push_back(i);
updated_indexes.push_back(index_new);
};
};
// update the indexes (output)
indexes = updated_indexes;
// recurse to extend the indexes with the remaining dimensions
get_all_indexes(dimLengths, dim + 1, indexes);
};
};
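As a concrete illustration of what get_all_indexes produces (a sketch, not part of the header): for dimLengths = {2, 3} it enumerates the full index space in row-major order.

// Sketch: enumerate all indexes of a 2x3 tensor.
std::vector<int> dim_lengths = {2, 3};
std::vector<std::vector<int>> indexes;

get_all_indexes(dim_lengths, 0, indexes);

// indexes now holds, in this order:
// {0,0} {0,1} {0,2} {1,0} {1,1} {1,2}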
template <typename T>
static T get_offset_from_index(const std::vector<T>& strides, const std::vector<T>& index)
{
T offset = 0;
assert(strides.size() == index.size());
for(int i = 0; i < index.size(); i++)
offset += strides[i] * static_cast<T>(index[i]);
return (offset);
};
template <typename T>
static inline T get_flatten_offset(const std::vector<T>& lengths, const std::vector<T>& index)
{
T offset = 0;
assert(lengths.size() == index.size() && lengths.size() > 0);
int len = lengths.size();
T stride = 1;
// for len==1, the loop is not executed
for(int i = len - 1; i > 0; i--)
{
offset += stride * static_cast<T>(index[i]);
stride *= lengths[i];
};
offset += stride * static_cast<T>(index[0]);
return (offset);
};
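A worked example of the two offset helpers (values chosen for illustration only): with lengths = {2, 3, 4}, the flattened offset of index {1, 2, 3} is 1*12 + 2*4 + 3 = 23, while get_offset_from_index works from explicit strides and therefore also handles non-packed layouts.

// Sketch: offsets within a 2x3x4 tensor.
std::vector<int> lengths = {2, 3, 4};
std::vector<int> strides = {12, 4, 1}; // packed, row-major
std::vector<int> index   = {1, 2, 3};

int flat = get_flatten_offset(lengths, index);    // 1*12 + 2*4 + 3 = 23
int off  = get_offset_from_index(strides, index); // same value for a packed tensor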
template <typename InDataType,
typename AccDataType,
typename OutDataType,
ck::ReduceTensorOp_t ReduceOpId,
bool PropagateNan,
bool NeedIndices>
class ReductionHost
{
public:
ReductionHost() = default;
ReductionHost(HostTensorDescriptor& inDesc,
HostTensorDescriptor& outDesc,
const std::vector<int>& invariantDims_,
const std::vector<int>& toReduceDims_)
{
this->inLengths = to_int_vector(inDesc.GetLengths());
this->outLengths = to_int_vector(outDesc.GetLengths());
this->inStrides = to_int_vector(inDesc.GetStrides());
this->outStrides = to_int_vector(outDesc.GetStrides());
this->invariantDims = invariantDims_;
this->toReduceDims = toReduceDims_;
assert(this->inLengths.size() == this->outLengths.size());
assert(!this->toReduceDims.empty());
for(const auto dim : this->invariantDims)
this->invariantLengths.push_back(this->inLengths[dim]);
for(const auto dim : this->toReduceDims)
toReduceLengths.push_back(this->inLengths[dim]);
this->reduceAllDims = this->invariantDims.empty();
};
~ReductionHost(){};
void
Run(float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices)
{
if constexpr(NeedIndices)
RunImpl_with_indices(alpha, in_data, beta, out_data, indices);
else
RunImpl_no_indices(alpha, in_data, beta, out_data);
};
private:
std::vector<int> inLengths;
std::vector<int> outLengths;
std::vector<int> inStrides;
std::vector<int> outStrides;
std::vector<int> invariantLengths;
std::vector<int> toReduceLengths;
std::vector<int> invariantDims;
std::vector<int> toReduceDims;
bool reduceAllDims;
void RunImpl_with_indices(
float alpha, const InDataType* in_data, float beta, OutDataType* out_data, int* indices)
{
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::PreUnaryOpFn;
using ck::host_reduce::ReduceOpFn2;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce = ReduceOpFn2<AccDataType, ReduceOpId>();
int divider = 1;
for(int i = 0; i < toReduceLengths.size(); i++)
divider *= toReduceLengths[i];
auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
if(reduceAllDims)
{
std::vector<std::vector<int>> indexes_1;
get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
int accuIndex = 0;
// go through all indexes of the input tensor (every dimension is reduced)
for(const auto& src_index : indexes_1)
{
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
// unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is actually
// done
PreUnaryOp(currVal);
auto currIndex = get_flatten_offset(inLengths, src_index);
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce, accuVal, currVal, accuIndex, currIndex);
};
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[0]) * static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[0] = static_cast<OutDataType>(accuVal);
indices[0] = accuIndex;
}
else
{
std::vector<std::vector<int>> indexes_1, indexes_2;
get_all_indexes(
this->invariantLengths, 0, indexes_1); // generate the invariant indexes space
get_all_indexes(
this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space
// go through indexes of the invariant dimensions
for(const auto& index_1 : indexes_1)
{
std::vector<int> src_index;
std::vector<int> dst_index;
src_index.resize(this->inLengths.size());
// generate the part of src index belonging to invariant dims
for(int k = 0; k < invariantDims.size(); k++)
src_index[invariantDims[k]] = index_1[k];
for(int k = 0; k < invariantDims.size(); k++)
dst_index.push_back(index_1[k]);
int dst_offset = get_offset_from_index(this->outStrides, dst_index);
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
int accuIndex = 0;
// go through indexes of the toReduce dimensions
for(const auto& index_2 : indexes_2)
{
// generate the part of src index belonging to toReduce dims
for(int k = 0; k < toReduceDims.size(); k++)
src_index[toReduceDims[k]] = index_2[k];
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
// unary operation before reducing, needed by AMAX. For MIN/MAX, nothing is
// actually done
PreUnaryOp(currVal);
auto currIndex = get_flatten_offset(toReduceLengths, index_2);
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce, accuVal, currVal, accuIndex, currIndex);
};
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[dst_offset]) *
static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[dst_offset] = static_cast<OutDataType>(accuVal);
indices[dst_offset] = accuIndex;
};
};
}; // end of RunImpl_with_indices()
void
RunImpl_no_indices(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
{
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::PreUnaryOpFn;
using ck::host_reduce::ReduceOpFn;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
int divider = 1;
for(int i = 0; i < toReduceLengths.size(); i++)
divider *= toReduceLengths[i];
auto PreUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
auto PosUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
if(reduceAllDims)
{
std::vector<std::vector<int>> indexes_1;
get_all_indexes(inLengths, 0, indexes_1); // generate the input indexes space
auto accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
// go through all indexes of the input tensor (every dimension is reduced)
for(const auto& src_index : indexes_1)
{
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
PreUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
PosUnaryOp(accuVal);
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[0]) * static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[0] = static_cast<OutDataType>(accuVal);
}
else
{
std::vector<std::vector<int>> indexes_1, indexes_2;
get_all_indexes(
this->invariantLengths, 0, indexes_1); // generate the invariant indexes space
get_all_indexes(
this->toReduceLengths, 0, indexes_2); // generate the toReduce indexes space
// go through indexes of the invariant dimensions
for(const auto& index_1 : indexes_1)
{
std::vector<int> src_index;
std::vector<int> dst_index;
src_index.resize(this->inLengths.size());
for(int k = 0; k < invariantDims.size(); k++)
dst_index.push_back(index_1[k]);
int dst_offset = get_offset_from_index(this->outStrides, dst_index);
// generate the part of src index belonging to invariant dims
for(int k = 0; k < invariantDims.size(); k++)
src_index[invariantDims[k]] = index_1[k];
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
// go through indexes of the toReduce dimensions
for(const auto& index_2 : indexes_2)
{
// generate the part of src index belonging to toReduce dims
for(int k = 0; k < toReduceDims.size(); k++)
src_index[toReduceDims[k]] = index_2[k];
auto src_offset = get_offset_from_index(this->inStrides, src_index);
auto currVal = static_cast<AccDataType>(in_data[src_offset]);
PreUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
PosUnaryOp(accuVal);
// scale the accumulated value
if(!float_equal_one(alpha))
accuVal *= static_cast<AccDataType>(alpha);
// scale the prior dst value and add it to the accumulated value
if(!float_equal_zero(beta))
accuVal += static_cast<AccDataType>(out_data[dst_offset]) *
static_cast<AccDataType>(beta);
// store the reduced value to dst location
out_data[dst_offset] = static_cast<OutDataType>(accuVal);
};
};
}; // end of RunImpl_no_indices()
};
}; // end of namespace host_reduce
}; // end of namespace ck
#endif
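A usage sketch for this host-side reference (the tensor shape, the ADD operation and the lengths-only HostTensorDescriptor constructor are assumptions made for illustration; the output descriptor keeps the input rank with all lengths set to 1, which is what the reduce-all path writes into):

// Sketch: reduce all dimensions of a 2x3x4x5 tensor with ADD on the host.
HostTensorDescriptor inDesc(std::vector<std::size_t>{2, 3, 4, 5});
HostTensorDescriptor outDesc(std::vector<std::size_t>{1, 1, 1, 1});

std::vector<int> invariantDims = {};            // empty => reduceAllDims
std::vector<int> toReduceDims  = {0, 1, 2, 3};

ck::host_reduce::ReductionHost<float, // InDataType
                               float, // AccDataType
                               float, // OutDataType
                               ck::ReduceTensorOp_t::ADD,
                               false, // PropagateNan
                               false> // NeedIndices
    hostReduce(inDesc, outDesc, invariantDims, toReduceDims);

std::vector<float> in(2 * 3 * 4 * 5, 1.0f);
std::vector<float> out(1, 0.0f);

hostReduce.Run(1.0f, in.data(), 0.0f, out.data(), nullptr);
// out[0] == 120 (sum of 2*3*4*5 ones; alpha = 1, beta = 0)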
...@@ -66,22 +66,22 @@ static inline bool float_equal_zero(half_float::half x)
    return x == static_cast<half_float::half>(0.0f);
};

-template <typename compType, ReduceTensorOp_t ReduceOpId>
-__host__ static inline std::function<void(compType&)> PreUnaryOpFn(int)
+template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+__host__ static inline std::function<void(AccDataType&)> PreUnaryOpFn(int)
{
    using std::abs;

    if constexpr(ReduceOpId == ReduceTensorOp_t::NORM1)
    {
-        return ([&](compType& a_) { a_ = abs(a_); });
+        return ([&](AccDataType& a_) { a_ = abs(a_); });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
    {
-        return ([&](compType& a_) { a_ = a_ * a_; });
+        return ([&](AccDataType& a_) { a_ = a_ * a_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
    {
-        return ([&](compType& a_) { a_ = abs(a_); });
+        return ([&](AccDataType& a_) { a_ = abs(a_); });
    }
    else
    {
...@@ -90,23 +90,23 @@ __host__ static inline std::function<void(compType&)> PreUnaryOpFn(int)
        // ReduceTensorOp_t::MUL:
        // ReduceTensorOp_t::MIN:
        // ReduceTensorOp_t::MAX:
-        return ([&](compType&) {});
+        return ([&](AccDataType&) {});
    };
};

-template <typename compType, ReduceTensorOp_t ReduceOpId>
-__host__ static inline std::function<void(compType&)> PosUnaryOpFn(int divider)
+template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+__host__ static inline std::function<void(AccDataType&)> PosUnaryOpFn(int32_t divider)
{
    using std::sqrt;

    if constexpr(ReduceOpId == ReduceTensorOp_t::NORM2)
    {
-        return ([&](compType& a_) { a_ = sqrt(a_); });
+        return ([&](AccDataType& a_) { a_ = sqrt(a_); });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::AVG)
    {
-        return ([&, divider](compType& a_) {
-            a_ = a_ / static_cast<compType>(static_cast<float>(divider));
+        return ([&, divider](AccDataType& a_) {
+            a_ = a_ / static_cast<AccDataType>(static_cast<float>(divider));
        });
    }
    else
...@@ -117,44 +117,44 @@ __host__ static inline std::function<void(compType&)> PosUnaryOpFn(int divider)
        // ReduceTensorOp_t::MIN:
        // ReduceTensorOp_t::MAX:
        // ReduceTensorOp_t::AMAX:
-        return ([&](compType&) {});
+        return ([&](AccDataType&) {});
    }
};

-template <typename compType, ReduceTensorOp_t ReduceOpId>
-__host__ static inline std::function<void(compType&, compType)> ReduceOpFn()
+template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+__host__ static inline std::function<void(AccDataType&, AccDataType)> ReduceOpFn()
{
    if constexpr(ReduceOpId == ReduceTensorOp_t::ADD || ReduceOpId == ReduceTensorOp_t::AVG ||
                 ReduceOpId == ReduceTensorOp_t::NORM1 || ReduceOpId == ReduceTensorOp_t::NORM2)
    {
-        return ([&](compType& a_, compType b_) { a_ = a_ + b_; });
+        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ + b_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
    {
-        return ([&](compType& a_, compType b_) { a_ = a_ * b_; });
+        return ([&](AccDataType& a_, AccDataType b_) { a_ = a_ * b_; });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
    {
-        return ([&](compType& a_, compType b_) {
+        return ([&](AccDataType& a_, AccDataType b_) {
            if(a_ > b_)
                a_ = b_;
        });
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
    {
-        return ([&](compType& a_, compType b_) {
+        return ([&](AccDataType& a_, AccDataType b_) {
            if(a_ < b_)
                a_ = b_;
        });
    }
};

-template <typename compType, ReduceTensorOp_t ReduceOpId>
-__host__ static inline std::function<void(compType&, compType, bool& changed)> ReduceOpFn2()
+template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+__host__ static inline std::function<void(AccDataType&, AccDataType, bool& changed)> ReduceOpFn2()
{
    if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
    {
-        return ([&](compType& a_, compType b_, bool& changed) {
+        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
            if(a_ > b_)
            {
                a_ = b_;
...@@ -166,7 +166,7 @@ __host__ static inline std::function<void(compType&, compType, bool& changed)> R
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX || ReduceOpId == ReduceTensorOp_t::AMAX)
    {
-        return ([&](compType& a_, compType b_, bool& changed) {
+        return ([&](AccDataType& a_, AccDataType b_, bool& changed) {
            if(a_ < b_)
            {
                a_ = b_;
...@@ -183,28 +183,28 @@ __host__ static inline std::function<void(compType&, compType, bool& changed)> R
        // ReduceTensorOp_t::AVG:
        // ReduceTensorOp_t::NORM1:
        // ReduceTensorOp_t::NORM2:
-        return (std::function<void(compType&, compType, bool&)>{});
+        return (std::function<void(AccDataType&, AccDataType, bool&)>{});
    };
};

-template <typename compType, ReduceTensorOp_t ReduceOpId>
-__host__ static inline compType ReduceOpZeroVal()
+template <typename AccDataType, ReduceTensorOp_t ReduceOpId>
+__host__ static inline AccDataType ReduceOpZeroVal()
{
    if constexpr(ReduceOpId == ReduceTensorOp_t::MUL)
    {
-        return (static_cast<compType>(1.0f));
+        return (static_cast<AccDataType>(1.0f));
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::MIN)
    {
-        return (std::numeric_limits<compType>::max());
+        return (std::numeric_limits<AccDataType>::max());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::MAX)
    {
-        return (std::numeric_limits<compType>::lowest());
+        return (std::numeric_limits<AccDataType>::lowest());
    }
    else if constexpr(ReduceOpId == ReduceTensorOp_t::AMAX)
    {
-        return (static_cast<compType>(0.0f));
+        return (static_cast<AccDataType>(0.0f));
    }
    else
    {
...@@ -212,14 +212,15 @@ __host__ static inline compType ReduceOpZeroVal()
        // ReduceTensorOp_t::AVG
        // ReduceTensorOp_t::NORM1
        // ReduceTensorOp_t::NORM2
-        return (static_cast<compType>(0.0f));
+        return (static_cast<AccDataType>(0.0f));
    };
};

-template <typename compType, bool PropagateNan>
-__host__ static inline void binop_with_nan_check(std::function<void(compType&, compType)> opReduce,
-                                                 compType& accuVal,
-                                                 compType currVal)
+template <typename AccDataType, bool PropagateNan>
+__host__ static inline void
+binop_with_nan_check(std::function<void(AccDataType&, AccDataType)> opReduce,
+                     AccDataType& accuVal,
+                     AccDataType currVal)
{
    using std::isnan;
...@@ -236,11 +237,11 @@ __host__ static inline void binop_with_nan_check(std::function<void(compType&, c
    };
};

-template <typename compType, bool PropagateNan>
+template <typename AccDataType, bool PropagateNan>
__host__ static inline void
-binop_with_nan_check2(std::function<void(compType&, compType, bool&)> opReduce,
-                      compType& accuVal,
-                      compType currVal,
+binop_with_nan_check2(std::function<void(AccDataType&, AccDataType, bool&)> opReduce,
+                      AccDataType& accuVal,
+                      AccDataType currVal,
                      int& accuIndex,
                      int currIndex)
{
......
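Taken together, the three functor factories above implement every reduction as preUnaryOp -> binary reduce -> posUnaryOp. A host-side sketch of how they compose for NORM2 (the element type and data values are illustrative; reduction_enums.hpp and host_reduce_util.hpp are assumed to be included):

// Sketch: NORM2 over four floats = sqrt(sum of squares).
using ck::host_reduce::PreUnaryOpFn;
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::ReduceOpFn;
using ck::host_reduce::ReduceOpZeroVal;

constexpr auto OpId = ck::ReduceTensorOp_t::NORM2;

auto preOp  = PreUnaryOpFn<float, OpId>(4); // squares each value
auto binOp  = ReduceOpFn<float, OpId>();    // accumulates with +
auto postOp = PosUnaryOpFn<float, OpId>(4); // takes sqrt of the accumulator

float acc = ReduceOpZeroVal<float, OpId>(); // 0.0f for NORM2

for(float v : {1.0f, 2.0f, 2.0f, 4.0f})
{
    preOp(v);
    binOp(acc, v);
}

postOp(acc); // acc == 5.0f  (sqrt(1 + 4 + 4 + 16))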
/*******************************************************************************
*
* MIT License
*
* Copyright (c) 2020 Advanced Micro Devices, Inc.
*
* Permission is hereby granted, free of charge, to any person obtaining a copy
* of this software and associated documentation files (the "Software"), to deal
* in the Software without restriction, including without limitation the rights
* to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
* copies of the Software, and to permit persons to whom the Software is
* furnished to do so, subject to the following conditions:
*
* The above copyright notice and this permission notice shall be included in all
* copies or substantial portions of the Software.
*
* THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
* IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
* FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
* AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
* LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*
*******************************************************************************/
#ifndef HOST_REDUCTION_HPP_
#define HOST_REDUCTION_HPP_
#include <vector>
#include <array>
#include <functional>
#include "reduction_enums.hpp"
#include "host_reduce_util.hpp"
#include "host_tensor.hpp"
#include "data_type.hpp"
template <int NDim>
static void get_all_indexes(const std::array<size_t, NDim>& dimLengths,
std::vector<std::array<size_t, NDim>>& indexes)
{
static_assert(NDim >= 1, "NDim >= 1 is required to use this function!");
if constexpr(NDim == 1)
{
for(size_t i = 0; i < dimLengths[0]; i++)
{
std::array<size_t, 1> index{i};
indexes.push_back(index);
};
}
else
{
std::array<size_t, NDim - 1> partial_dim_lengths;
for(int i = 0; i < NDim - 1; i++)
partial_dim_lengths[i] = dimLengths[i + 1];
std::vector<std::array<size_t, NDim - 1>> partial_indexes;
get_all_indexes<NDim - 1>(partial_dim_lengths, partial_indexes);
for(size_t i = 0; i < dimLengths[0]; i++)
for(const auto& index : partial_indexes)
{
std::array<size_t, NDim> extIndex;
extIndex[0] = i;
for(int k = 0; k < NDim - 1; k++)
extIndex[k + 1] = index[k];
indexes.push_back(extIndex);
};
};
};
template <int NDim>
static size_t get_offset_from_index(const std::array<size_t, NDim>& strides,
const std::array<size_t, NDim>& index)
{
size_t offset = 0;
for(int i = 0; i < NDim; i++)
offset += strides[i] * index[i];
return (offset);
};
template <int NDim>
static size_t get_offset_from_index(const std::vector<size_t>& strides,
const std::array<size_t, NDim>& index)
{
size_t offset = 0;
for(int i = 0; i < NDim; i++)
offset += strides[i] * index[i];
return (offset);
};
template <typename InDataType,
typename AccDataType,
typename OutDataType,
ck::ReduceTensorOp_t ReduceOpId,
int Rank,
int NumReduceDim,
bool PropagateNan,
bool NeedIndices>
struct ReductionHost
{
using IndexDataType = int32_t;
static constexpr int NumInvariantDim = Rank - NumReduceDim;
std::vector<size_t> outStrides;
std::vector<int> invariantDims;
std::vector<int> reduceDims;
IndexDataType divider;
std::function<void(AccDataType&)> preUnaryOp;
std::function<void(AccDataType&)> posUnaryOp;
std::array<size_t, NumReduceDim> reduceLengths;
std::array<size_t, NumReduceDim> reduceStrides;
std::array<size_t, NumInvariantDim> invariantLengths;
std::array<size_t, NumInvariantDim> invariantStrides;
std::vector<std::array<size_t, NumReduceDim>> reduce_dim_indexes;
std::vector<std::array<size_t, NumInvariantDim>> invariant_dim_indexes;
ReductionHost(HostTensorDescriptor& inDesc,
HostTensorDescriptor& outDesc,
const std::vector<int>& invariantDims_,
const std::vector<int>& reduceDims_)
{
using ck::host_reduce::PosUnaryOpFn;
using ck::host_reduce::PreUnaryOpFn;
// this->outLengths = to_int_vector(outDesc.GetLengths());
this->outStrides = outDesc.GetStrides();
this->invariantDims = invariantDims_;
this->reduceDims = reduceDims_;
int product = 1;
for(int i = 0; i < NumReduceDim; i++)
{
reduceLengths[i] = inDesc.GetLengths()[reduceDims[i]];
reduceStrides[i] = inDesc.GetStrides()[reduceDims[i]];
product *= inDesc.GetLengths()[reduceDims[i]];
};
divider = product;
for(int i = 0; i < NumInvariantDim; i++)
{
invariantLengths[i] = inDesc.GetLengths()[invariantDims[i]];
invariantStrides[i] = inDesc.GetStrides()[invariantDims[i]];
};
reduce_dim_indexes.clear();
get_all_indexes<NumReduceDim>(reduceLengths, reduce_dim_indexes);
if constexpr(NumInvariantDim > 0)
{
invariant_dim_indexes.clear();
get_all_indexes<NumInvariantDim>(invariantLengths, invariant_dim_indexes);
};
preUnaryOp = PreUnaryOpFn<AccDataType, ReduceOpId>(divider);
posUnaryOp = PosUnaryOpFn<AccDataType, ReduceOpId>(divider);
};
void Run(float alpha,
const InDataType* in_data,
float beta,
OutDataType* out_data,
IndexDataType* out_indices)
{
if constexpr(NeedIndices)
{
RunImpl_with_index(alpha, in_data, beta, out_data, out_indices);
}
else
{
RunImpl_no_index(alpha, in_data, beta, out_data);
};
};
void RunImpl_with_index(float alpha,
const InDataType* in_data,
float beta,
OutDataType* out_data,
IndexDataType* out_indices)
{
using ck::type_convert;
using ck::host_reduce::binop_with_nan_check2;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::ReduceOpFn2;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce2 = ReduceOpFn2<AccDataType, ReduceOpId>();
if constexpr(NumInvariantDim == 0)
{
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
IndexDataType accuIndex = 0;
for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++)
{
auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);
auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);
preUnaryOp(currVal);
auto currIndex = i;
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce2, accuVal, currVal, accuIndex, currIndex);
};
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
accuVal *= type_convert<AccDataType>(alpha);
if(!float_equal_zero(beta))
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
out_data[0] = type_convert<OutDataType>(accuVal);
out_indices[0] = accuIndex;
}
else
{
auto thread_reduce_func = [&](auto invariant_index) {
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
IndexDataType accuIndex = 0;
auto offset_invariant =
get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);
for(IndexDataType i = 0; i < reduce_dim_indexes.size(); i++)
{
auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_dim_indexes[i]);
auto currVal =
type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);
preUnaryOp(currVal);
auto currIndex = i;
binop_with_nan_check2<AccDataType, PropagateNan>(
opReduce2, accuVal, currVal, accuIndex, currIndex);
};
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
accuVal *= type_convert<AccDataType>(alpha);
auto dst_offset =
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
if(!float_equal_zero(beta))
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
type_convert<AccDataType>(beta);
out_data[dst_offset] = type_convert<OutDataType>(accuVal);
out_indices[dst_offset] = accuIndex;
};
std::size_t num_thread = std::thread::hardware_concurrency();
std::size_t work_per_thread =
(invariant_dim_indexes.size() + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end =
std::min((it + 1) * work_per_thread, invariant_dim_indexes.size());
auto f = [=] {
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
{
thread_reduce_func(invariant_dim_indexes[iw]);
}
};
threads[it] = joinable_thread(f);
}
};
};
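The same partitioning arithmetic is reused in RunImpl_no_index below: ceil-divide the invariant index space across the hardware threads and hand each thread a contiguous [iw_begin, iw_end) slice (joinable_thread is assumed to be CK's RAII wrapper that joins on destruction). A stripped-down, self-contained sketch of the pattern with plain std::thread:

// Sketch: split `total` work items over the available hardware threads.
#include <algorithm>
#include <cstddef>
#include <functional>
#include <thread>
#include <vector>

void parallel_for_chunks(std::size_t total, const std::function<void(std::size_t)>& work)
{
    std::size_t num_thread = std::thread::hardware_concurrency();
    if(num_thread == 0)
        num_thread = 1; // hardware_concurrency() may return 0

    std::size_t work_per_thread = (total + num_thread - 1) / num_thread; // ceil-divide

    std::vector<std::thread> threads;

    for(std::size_t it = 0; it < num_thread; ++it)
    {
        std::size_t begin = it * work_per_thread;
        std::size_t end   = std::min((it + 1) * work_per_thread, total);

        threads.emplace_back([=, &work] {
            for(std::size_t i = begin; i < end; ++i)
                work(i); // each thread walks its own contiguous slice
        });
    }

    for(auto& t : threads)
        t.join();
}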
void RunImpl_no_index(float alpha, const InDataType* in_data, float beta, OutDataType* out_data)
{
using ck::type_convert;
using ck::host_reduce::binop_with_nan_check;
using ck::host_reduce::float_equal_one;
using ck::host_reduce::float_equal_zero;
using ck::host_reduce::ReduceOpFn;
using ck::host_reduce::ReduceOpZeroVal;
auto opReduce = ReduceOpFn<AccDataType, ReduceOpId>();
if constexpr(NumInvariantDim == 0)
{
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
for(const auto& reduce_index : reduce_dim_indexes)
{
auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_index);
auto currVal = type_convert<AccDataType>(in_data[offset_reduce]);
preUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
accuVal *= type_convert<AccDataType>(alpha);
if(!float_equal_zero(beta))
accuVal += type_convert<AccDataType>(out_data[0]) * type_convert<AccDataType>(beta);
out_data[0] = type_convert<OutDataType>(accuVal);
}
else
{
auto thread_reduce_func = [&](auto invariant_index) {
AccDataType accuVal = ReduceOpZeroVal<AccDataType, ReduceOpId>();
auto offset_invariant =
get_offset_from_index<NumInvariantDim>(invariantStrides, invariant_index);
for(const auto& reduce_index : reduce_dim_indexes)
{
auto offset_reduce =
get_offset_from_index<NumReduceDim>(reduceStrides, reduce_index);
auto currVal =
type_convert<AccDataType>(in_data[offset_invariant + offset_reduce]);
preUnaryOp(currVal);
binop_with_nan_check<AccDataType, PropagateNan>(opReduce, accuVal, currVal);
};
posUnaryOp(accuVal);
if(!float_equal_one(alpha))
accuVal *= type_convert<AccDataType>(alpha);
auto dst_offset =
get_offset_from_index<NumInvariantDim>(outStrides, invariant_index);
if(!float_equal_zero(beta))
accuVal += type_convert<AccDataType>(out_data[dst_offset]) *
type_convert<AccDataType>(beta);
out_data[dst_offset] = type_convert<OutDataType>(accuVal);
};
std::size_t num_thread = std::thread::hardware_concurrency();
std::size_t work_per_thread =
(invariant_dim_indexes.size() + num_thread - 1) / num_thread;
std::vector<joinable_thread> threads(num_thread);
for(std::size_t it = 0; it < num_thread; ++it)
{
std::size_t iw_begin = it * work_per_thread;
std::size_t iw_end =
std::min((it + 1) * work_per_thread, invariant_dim_indexes.size());
auto f = [=] {
for(std::size_t iw = iw_begin; iw < iw_end; ++iw)
{
thread_reduce_func(invariant_dim_indexes[iw]);
}
};
threads[it] = joinable_thread(f);
}
};
};
};
#endif
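Compared with the older host_generic_reduction.hpp above, this version fixes Rank and NumReduceDim at compile time and precomputes the reduce/invariant index spaces in the constructor. A usage sketch (shapes, dims and data are illustrative; the output descriptor carries only the invariant dimensions, which is what the stride arithmetic in RunImpl_* suggests):

// Sketch: host reference reduction of dims {1, 2} of a 2x3x4x5 tensor with AVG.
HostTensorDescriptor inDesc(std::vector<std::size_t>{2, 3, 4, 5});
HostTensorDescriptor outDesc(std::vector<std::size_t>{2, 5});

std::vector<int> invariantDims = {0, 3};
std::vector<int> reduceDims    = {1, 2};

ReductionHost<float, float, float,
              ck::ReduceTensorOp_t::AVG,
              4 /*Rank*/, 2 /*NumReduceDim*/,
              false /*PropagateNan*/, false /*NeedIndices*/>
    hostReduce(inDesc, outDesc, invariantDims, reduceDims);

std::vector<float> in(2 * 3 * 4 * 5, 2.0f);
std::vector<float> out(2 * 5, 0.0f);

hostReduce.Run(1.0f, in.data(), 0.0f, out.data(), nullptr);
// every output element is 2.0f: the average of 12 identical values (alpha = 1, beta = 0)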
...@@ -40,20 +40,6 @@ std::ostream& LogRangeAsType(std::ostream& os, Range&& range, std::string delim)
    return os;
}

-typedef enum
-{
-    Half  = 0,
-    Float = 1,
-} DataType_t;
-
-template <typename T>
-struct DataType;
-
-template <>
-struct DataType<float> : std::integral_constant<DataType_t, DataType_t::Float>
-{
-};
-
template <typename F, typename T, std::size_t... Is>
auto call_f_unpack_args_impl(F f, T args, std::index_sequence<Is...>)
{
...@@ -312,48 +298,58 @@ HostTensorDescriptor::HostTensorDescriptor(std::vector<X> lens, std::vector<Y> s
void ostream_HostTensorDescriptor(const HostTensorDescriptor& desc, std::ostream& os = std::cout);

+#if 1
+// FIXME: remove
float bf16_to_f32_(ck::bhalf_t src_val);
+// FIXME: remove
void bf16_to_f32_(const Tensor<ck::bhalf_t>& src, Tensor<float>& dst);
+#endif

template <typename T>
-void check_error(const Tensor<T>& ref, const Tensor<T>& result)
-{
-    float error     = 0;
-    float max_diff  = -1;
-    float ref_value = 0, result_value = 0;
-
-    if constexpr(std::is_same<ck::bhalf_t, T>::value)
-    {
-        for(int i = 0; i < ref.mData.size(); ++i)
-        {
-            error += std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
-            float diff = std::abs(bf16_to_f32_(ref.mData[i]) - bf16_to_f32_(result.mData[i]));
-            if(max_diff < diff)
-            {
-                max_diff     = diff;
-                ref_value    = bf16_to_f32_(ref.mData[i]);
-                result_value = bf16_to_f32_(result.mData[i]);
-            }
-        }
-    }
-    else
-    {
-        for(int i = 0; i < ref.mData.size(); ++i)
-        {
-            error += std::abs(double(ref.mData[i]) - double(result.mData[i]));
-            float diff = std::abs(double(ref.mData[i]) - double(result.mData[i]));
-            if(max_diff < diff)
-            {
-                max_diff     = diff;
-                ref_value    = ref.mData[i];
-                result_value = result.mData[i];
-            }
-        }
-    }
-
-    std::cout << "error: " << error << std::endl;
-    std::cout << "max_diff: " << max_diff << ", " << ref_value << ", " << result_value << std::endl;
-}
+float check_error(const Tensor<T>& ref, const Tensor<T>& result)
+{
+    float l1_error       = 0;
+    float linf_error     = -1;
+    float linf_rel_error = -1;
+    float linf_ref_value = 0, linf_result_value = 0;
+    float linf_rel_ref_value = 0, linf_rel_result_value = 0;
+
+    constexpr float eps = 1e-10;
+
+    for(int i = 0; i < ref.mData.size(); ++i)
+    {
+        float ref_v    = ck::type_convert<float>(ref.mData[i]);
+        float result_v = ck::type_convert<float>(result.mData[i]);
+        float diff     = std::abs(ref_v - result_v);
+        float rel_diff = diff / std::max(std::abs(ref_v), eps);
+
+        l1_error += diff;
+
+        if(linf_error < diff)
+        {
+            linf_error        = diff;
+            linf_ref_value    = ref_v;
+            linf_result_value = result_v;
+        }
+
+        if(linf_rel_error < rel_diff)
+        {
+            linf_rel_error        = rel_diff;
+            linf_rel_ref_value    = ref_v;
+            linf_rel_result_value = result_v;
+        }
+    }
+
+    std::cout << "Absolute Error L1 Norm (sum of abs diff): " << l1_error << std::endl;
+    std::cout << "Absolute Error L-inf Norm (max abs diff): " << linf_error << ", ref "
+              << linf_ref_value << ", result " << linf_result_value << std::endl;
+    std::cout << "Relative Error L-inf Norm (max relative abs diff): " << linf_rel_error << ", ref "
+              << linf_rel_ref_value << ", result " << linf_rel_result_value << std::endl;
+
+    return linf_error;
+}

template <typename T>
......
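Since check_error now returns the max absolute difference instead of void, callers can turn the comparison into a pass/fail decision. A hedged usage sketch (ref_tensor and gpu_tensor are assumed, already-filled Tensor<float> objects; the tolerance value is arbitrary):

// Sketch: compare a device result against the host reference.
float linf_error = check_error(ref_tensor, gpu_tensor);

// check_error prints the L1 / L-inf / relative L-inf summaries itself; the return
// value is the max absolute difference, so callers can gate on a tolerance:
bool pass = linf_error < 1e-5f; // tolerance is problem- and data-type-dependent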
...@@ -93,8 +93,8 @@ struct GeneratorTensor_2<int8_t>
template <typename T>
struct GeneratorTensor_3
{
-    T min_value = 0;
-    T max_value = 1;
+    float min_value = 0;
+    float max_value = 1;

    template <typename... Is>
    T operator()(Is...)
...@@ -122,22 +122,6 @@ struct GeneratorTensor_3<ck::bhalf_t>
    }
};

-template <>
-struct GeneratorTensor_3<int8_t>
-{
-    float min_value = 0;
-    float max_value = 1;
-
-    template <typename... Is>
-    int8_t operator()(Is...)
-    {
-        int8_t min_tmp = static_cast<int8_t>(min_value);
-        int8_t max_tmp = static_cast<int8_t>(max_value);
-
-        return (std::rand() % (max_tmp - min_tmp)) + min_tmp;
-    }
-};
-
struct GeneratorTensor_Checkboard
{
    template <typename... Ts>
......
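With min_value/max_value now stored as float in the primary template, the generator can be parameterized uniformly regardless of the tensor element type, which is why the dedicated int8_t specialization could be dropped. A usage sketch (assuming Tensor<T>::GenerateTensorValue as declared in host_tensor.hpp and an already-constructed in_tensor):

// Sketch: fill an existing Tensor<float> with values drawn from [-0.5, 0.5).
GeneratorTensor_3<float> gen{-0.5f, 0.5f};

in_tensor.GenerateTensorValue(gen, std::thread::hardware_concurrency());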
...@@ -14,9 +14,9 @@ namespace host {
//
// @brief Reference implementation for forward convolution.
//
-// @paragraph Supported tensor layouts. Input tensor supports NCHiWi data layout.
-//            Weights tensor supports KCYX data layout. Output tensor supports
-//            NKHoWo data layout.
+// @paragraph Supports both NCHW and NHWC formats (and their respective
+//            counterparts for the weight and output tensors), as long as the
+//            tensor descriptor lengths are given in NCHW order.
//
// @tparam InDataType  Input tensor data type.
// @tparam WeiDataType Weights tensor data type.
...@@ -100,9 +100,9 @@ struct ReferenceConvFwd : public device::BaseOperator
                        float v_wei;

                        arg.in_element_op_(v_in,
-                                           static_cast<const float>(arg.input_(n, c, wi)));
+                                           ck::type_convert<float>(arg.input_(n, c, wi)));
                        arg.wei_element_op_(v_wei,
-                                            static_cast<const float>(arg.weight_(k, c, x)));
+                                            ck::type_convert<float>(arg.weight_(k, c, x)));

                        v_acc += v_in * v_wei;
                    }
...@@ -112,7 +112,7 @@ struct ReferenceConvFwd : public device::BaseOperator
                float v_out;

                arg.out_element_op_(v_out, v_acc);
-                arg.output_(n, k, wo) = v_out;
+                arg.output_(n, k, wo) = ck::type_convert<OutDataType>(v_out);
            };

            make_ParallelTensorFunctor(f_ncw,
...@@ -169,6 +169,61 @@ struct ReferenceConvFwd : public device::BaseOperator

                return 0;
            }
else if constexpr(NumDimSpatial == 3)
{
auto f_nchw = [&](auto n, auto k, auto d_o, auto ho, auto wo) {
float v_acc = 0;
for(int c = 0; c < arg.weight_.mDesc.GetLengths()[1]; ++c)
{
for(int z = 0; z < arg.weight_.mDesc.GetLengths()[2]; ++z)
{
int di = d_o * arg.conv_strides_[0] + z * arg.conv_dilations_[0] -
arg.in_left_pads_[0];
for(int y = 0; y < arg.weight_.mDesc.GetLengths()[3]; ++y)
{
int hi = ho * arg.conv_strides_[1] + y * arg.conv_dilations_[1] -
arg.in_left_pads_[1];
for(int x = 0; x < arg.weight_.mDesc.GetLengths()[4]; ++x)
{
int wi = wo * arg.conv_strides_[2] +
x * arg.conv_dilations_[2] - arg.in_left_pads_[2];
if(di >= 0 && di < arg.input_.mDesc.GetLengths()[2] &&
hi >= 0 && hi < arg.input_.mDesc.GetLengths()[3] &&
wi >= 0 && wi < arg.input_.mDesc.GetLengths()[4])
{
float v_in;
float v_wei;
arg.in_element_op_(
v_in,
ck::type_convert<float>(arg.input_(n, c, di, hi, wi)));
arg.wei_element_op_(
v_wei,
ck::type_convert<float>(arg.weight_(k, c, z, y, x)));
v_acc += v_in * v_wei;
}
}
}
}
}
float v_out;
arg.out_element_op_(v_out, v_acc);
arg.output_(n, k, d_o, ho, wo) = ck::type_convert<OutDataType>(v_out);
};
make_ParallelTensorFunctor(f_nchw,
arg.output_.mDesc.GetLengths()[0],
arg.output_.mDesc.GetLengths()[1],
arg.output_.mDesc.GetLengths()[2],
arg.output_.mDesc.GetLengths()[3],
arg.output_.mDesc.GetLengths()[4])(
std::thread::hardware_concurrency());
return 0;
}
    }

    float Run(const device::BaseArgument* p_arg, int) override
......
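The index arithmetic in the new 3-D branch implies the usual convolution geometry: an output coordinate maps back to di = d_o * stride + z * dilation - left_pad, so the sweep stays in bounds exactly when the output length follows the standard formula. A generic helper capturing that relation (a sketch, not part of this file):

// Sketch: output spatial length consistent with the di/hi/wi mapping above.
inline int conv_out_length(
    int in_len, int filter_len, int stride, int dilation, int left_pad, int right_pad)
{
    // effective filter span once dilation is applied
    int eff_filter = dilation * (filter_len - 1) + 1;

    return (in_len + left_pad + right_pad - eff_filter) / stride + 1;
}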
...@@ -6,23 +6,36 @@
#include "device_reduce_instance_blockwise_f32_f32_f32.hpp"
#include "device_reduce_instance_blockwise_f32_f64_f32.hpp"
#include "device_reduce_instance_blockwise_f64_f64_f64.hpp"
+#include "device_reduce_instance_blockwise_i8_i8_i8.hpp"
+#include "device_reduce_instance_blockwise_i8_i32_i8.hpp"
+#include "device_reduce_instance_blockwise_b16_f32_b16.hpp"
#include "device_reduce_instance_blockwise_second_call_f16_f16_f16.hpp"
#include "device_reduce_instance_blockwise_second_call_f32_f32_f16.hpp"
#include "device_reduce_instance_blockwise_second_call_f32_f32_f32.hpp"
#include "device_reduce_instance_blockwise_second_call_f64_f64_f32.hpp"
#include "device_reduce_instance_blockwise_second_call_f64_f64_f64.hpp"
+#include "device_reduce_instance_blockwise_second_call_i8_i8_i8.hpp"
+#include "device_reduce_instance_blockwise_second_call_i32_i32_i8.hpp"
+#include "device_reduce_instance_blockwise_second_call_f32_f32_b16.hpp"
#include "device_reduce_instance_multiblock_atomic_add_f16_f32_f32.hpp"
#include "device_reduce_instance_multiblock_atomic_add_f32_f32_f32.hpp"
#include "device_reduce_instance_multiblock_atomic_add_f32_f64_f32.hpp"
+#include "device_reduce_instance_multiblock_atomic_add_b16_f32_f32.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_f16_f16_f16.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_f16_f32_f16.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_f32_f32_f32.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_f32_f64_f32.hpp"
#include "device_reduce_instance_multiblock_partial_reduce_f64_f64_f64.hpp"
+#include "device_reduce_instance_multiblock_partial_reduce_i8_i8_i8.hpp"
+#include "device_reduce_instance_multiblock_partial_reduce_i8_i32_i8.hpp"
+#include "device_reduce_instance_multiblock_partial_reduce_b16_f32_b16.hpp"
#include "device_reduce_instance_threadwise_f16_f16_f16.hpp"
#include "device_reduce_instance_threadwise_f16_f32_f16.hpp"
#include "device_reduce_instance_threadwise_f32_f32_f32.hpp"
#include "device_reduce_instance_threadwise_f32_f64_f32.hpp"
#include "device_reduce_instance_threadwise_f64_f64_f64.hpp"
+#include "device_reduce_instance_threadwise_i8_i8_i8.hpp"
+#include "device_reduce_instance_threadwise_i8_i32_i8.hpp"
+#include "device_reduce_instance_threadwise_b16_f32_b16.hpp"
#endif
...@@ -17,7 +17,6 @@ using reduce_configuration_2_instances_blockwise = std::tuple<
    ReductionConfiguration_2<0, 2, 2, 2, 1>,
    ReductionConfiguration_2<0, 1, 1, 2, 1>,
    ReductionConfiguration_2<1, 2, 1, 1, 2>,
-    ReductionConfiguration_2<1, 2, 2, 1, 2>,
    ReductionConfiguration_2<0, 1, 1, 3, 1>,
    ReductionConfiguration_2<1, 1, 1, 1, 3>
    // clang-format on
......
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_B16_F32_B16_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 3); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 3); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 5, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 3); // for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 7, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 2, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 3, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(bhalf_t, float, bhalf_t, 4, 0, 1, 2, 1);
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
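For readers scanning these instantiation tables, the numeric ReduceOpId column passed to ADD_BLOCKWISE_INST_REF_BY_ID maps onto ReduceTensorOp_t as the inline comments indicate. A sketch of that correspondence, inferred from the comments in this commit rather than copied verbatim from reduction_enums.hpp:

// Sketch: ReduceOpId values as used in the tables above.
enum class ReduceOpId_forReference : int
{
    ADD   = 0,
    MUL   = 1, // assumed; not exercised in this file
    MIN   = 2,
    MAX   = 3,
    AMAX  = 4,
    AVG   = 5,
    NORM1 = 6, // assumed; not exercised in this file
    NORM2 = 7
};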
...@@ -13,21 +13,27 @@ namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
// clang-format on
......
...@@ -13,12 +13,15 @@ namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 5, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(half_t, float, half_t, 7, 0, 0, 2, 1);
// clang-format on
......
...@@ -13,30 +13,39 @@ namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 3); // for ADD
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 3); // for AVG
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 3); // for NORM2
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 7, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 3); // for MIN
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 2, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 3); // for MAX
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 3, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 3); // for AMAX
+ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 4, 0, 1, 2, 1);
// clang-format on
...
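The ReduceOpId column in these instance lists is a numeric tag for the reduction operation named in the trailing comments (ADD, AVG, NORM2, MIN, MAX, AMAX). Below is a minimal sketch of that mapping, written purely for illustration; the authoritative enumeration is defined in reduction_enums.hpp, which is included above but not shown in this diff, and its enumerator names may differ.

// Illustrative only: a mapping consistent with the ReduceOpId values and the
// "for ..." comments used throughout these lists. The name of this enum and
// of its enumerators are assumptions made for this sketch.
enum class IllustrativeReduceOpId : int
{
    ADD   = 0, // plain sum
    MIN   = 2, // minimum value
    MAX   = 3, // maximum value
    AMAX  = 4, // maximum absolute value
    AVG   = 5, // arithmetic mean
    NORM2 = 7  // square root of the sum of squares
};

Read together with the remaining columns, a line such as ADD_BLOCKWISE_INST_REF_BY_ID(float, float, float, 5, 0, 0, 4, 1) then refers to an AVG reduction over one of the four dimensions of a rank-4 float tensor, accumulating in float and producing float output.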
...@@ -13,12 +13,15 @@ namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 3); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 3); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 5, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 3); // for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(float, double, float, 7, 0, 0, 2, 1);
// clang-format on
...
...@@ -13,30 +13,39 @@ namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 3); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 3); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 5, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 3); // for NORM2
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 7, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 2, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 3, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(double, double, double, 4, 0, 1, 2, 1);
// clang-format on
...
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I32_I8_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 3); // for ADD
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 0, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 3); // for AVG
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int32_t, int8_t, 5, 0, 0, 2, 1);
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
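The int8 file above pairs int8_t inputs with an int32_t accumulator for ADD and AVG, while the MIN/MAX/AMAX file below keeps int8_t throughout; comparison-style reductions never leave the input range, whereas sums overflow eight bits almost immediately. The following is a small host-side sketch of the accumulation pattern implied by InDataType=int8_t, AccDataType=int32_t, OutDataType=int8_t; the clamping at the end is only an assumption about how the narrowing back to int8 could be handled, not what the device instances actually do.

#include <cstdint>
#include <vector>

// Sum int8 inputs in a 32-bit accumulator and narrow once at the end.
// Clamping to the int8 range is an assumption made for this sketch only.
inline std::int8_t reduce_add_int8(const std::vector<std::int8_t>& in)
{
    std::int32_t acc = 0;
    for(std::int8_t v : in)
        acc += v; // a direct int8 accumulator would overflow after a few elements
    if(acc > INT8_MAX) acc = INT8_MAX;
    if(acc < INT8_MIN) acc = INT8_MIN;
    return static_cast<std::int8_t>(acc);
}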
#ifndef DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
#define DEVICE_REDUCE_INSTANCE_BLOCKWISE_I8_I8_I8_HPP
#include "reduction_enums.hpp"
#include "reduction_operator_mapping.hpp"
#include "device_reduce_instance_blockwise.hpp"
namespace ck {
namespace tensor_operation {
namespace device {
namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 0, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 3); // for MIN
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 2, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 3); // for MAX
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 3, 0, 1, 2, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 3); // for AMAX
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 4);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_INST_REF_BY_ID(int8_t, int8_t, int8_t, 4, 0, 1, 2, 1);
// clang-format on
} // namespace device_reduce_instance
} // namespace device
} // namespace tensor_operation
} // namespace ck
#endif
...@@ -15,9 +15,7 @@ using reduce_configuration_2_instances_blockwise_second_call = std::tuple<
// clang-format off
// InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize
ReductionConfiguration_2<1, 2, 1, 1, 2>,
ReductionConfiguration_2<1, 2, 2, 1, 2>,
ReductionConfiguration_2<1, 1, 1, 1, 3>
ReductionConfiguration_2<1, 1, 1, 1, 3>,
ReductionConfiguration_2<1, 1, 2, 1, 3>
// clang-format on
>;
#else
...
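The reduce_configuration_2_instances_blockwise_second_call tuple in the hunk above packs per-instance tuning parameters whose order follows the column comment: InSrcVectorDim | InSrcVectorSize | OutDstVectorSize | MThreadSliceSize | KThreadSliceSize. Below is a hedged sketch of such a compile-time parameter bundle; the real ReductionConfiguration_2 is defined elsewhere in the repository, and the parameter and member names used here are assumptions.

// Illustrative stand-in with the same column order as the comment above.
// The struct, parameter, and member names are assumptions for this sketch.
template <int InSrcVectorDim_,
          int InSrcVectorSize_,
          int OutDstVectorSize_,
          int MThreadSliceSize_,
          int KThreadSliceSize_>
struct ReductionConfiguration_2_sketch
{
    static constexpr int InSrcVectorDim   = InSrcVectorDim_;
    static constexpr int InSrcVectorSize  = InSrcVectorSize_;
    static constexpr int OutDstVectorSize = OutDstVectorSize_;
    static constexpr int MThreadSliceSize = MThreadSliceSize_;
    static constexpr int KThreadSliceSize = KThreadSliceSize_;
};

Under this reading, the surviving entry ReductionConfiguration_2<1, 2, 1, 1, 2> would describe vector loads of 2 elements along dimension 1, scalar stores, and a 1x2 per-thread slice.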
...@@ -13,21 +13,27 @@ namespace device_reduce_instance {
// clang-format off
// InDataType | AccDataType | OutDataType | ReduceOpId | NanPropaOpt | IndicesOpt | Rank | NumReduceDim
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 3); // for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 4);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 4, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 0, 2, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 3); // for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 4);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 4, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 0, 2, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 3); // for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 4);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 4, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 0, 2, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 3); // for MIN
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 4);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 4, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 2, 0, 1, 2, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 3); // for MAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 4);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 4, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 3, 0, 1, 2, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 3); // for AMAX
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 4);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 4, 1);
ADD_BLOCKWISE_SECOND_CALL_INST_REF_BY_ID(half_t, half_t, half_t, 4, 0, 1, 2, 1);
// clang-format on
...
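In the SECOND_CALL list above, the input and accumulator types coincide (half_t, half_t, half_t), which is consistent with a second pass that re-reduces partial results produced by a first, blockwise pass. The sketch below is a plain host-side analogy of that two-pass idea, assuming a non-empty input and a non-zero block size; it only illustrates the control flow, not the device kernels referenced by these macros.

#include <algorithm>
#include <cstddef>
#include <vector>

// Two-pass MAX reduction: pass one reduces each block to a single partial
// result; the "second call" then reduces the partials with the same operation.
// Assumes data is non-empty and block > 0.
inline float reduce_max_two_pass(const std::vector<float>& data, std::size_t block)
{
    std::vector<float> partials; // outputs of the first pass
    for(std::size_t i = 0; i < data.size(); i += block)
    {
        const std::size_t last = std::min(i + block, data.size());
        partials.push_back(*std::max_element(data.begin() + i, data.begin() + last));
    }
    // second call: same reduction, now over the partial results
    return *std::max_element(partials.begin(), partials.end());
}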