Unverified commit a0b570b2 authored by Paul Fultz II, committed by GitHub

Add more supported operators and optimizations for the cpu backend (#746)



* Add eliminate_data_type pass

* Formatting

* Auto convert quant ops

* Formatting

* Flip the order of decompose

* Compute max size differently

* Formatting

* Clamp values in convert

* Formatting

* Fix loss of precision in reduce

* Formatting

* Fix bugs in reduction

* Fix accumulator type in reference softmax implementation

* Formatting

* Update convert test

* Remove unused variables

* Remove unnecessary quant_dot check

* Formatting

* Add tests

* Formatting

* Remove unused code

* Remove duplicate ops

* Remove blaze dependency

* Use set since shape::type_t is not hashable on gcc 5

* Formatting

* Add dnnl binary op

* Formatting

* Add binary and eltwise

* Formatting

* Add softmax

* Formatting

* Remove unused operators

* Add missing files

* Formatting

* Add lrn

* Formatting

* Add deconvolution

* Formatting

* Change allocate default

* Add reorder

* Formatting

* Add reductions

* Formatting

* Sort lines

* Change literals in another loop

* Add pow operator

* Formatting

* Add pow operator

* Formatting

* Make sure shapes are packed

* Allow broadcasted inputs

* Remove unused operators

* Simplify functions

* Remove softmax

* Add sub and erf functions

* Formatting

* Fix bug

* Formatting

* Improve parallelism

* Formatting

* Allow multiple batch dimensions

* Formatting

* Move literal transforms out of lowering

* Formatting

* Add gather operator

* Sort lines

* Add early exit for carry

* Formatting

* Add missing concat

* Rename macro

* Fix deep nesting

* Formatting

* Fix cppcheck issues

* Remove else

* Move attribute to typedef

* Formatting

* Disable maybe-uninitialized warning since it's broken on gcc

* Add constexpr default constructor

* Formatting

* Fix compiler warnings

* Fix adjust_allocation test
Co-authored-by: Shucai Xiao <shucai@gmail.com>
Co-authored-by: mvermeulen <5479696+mvermeulen@users.noreply.github.com>
parent 165d1a17
......@@ -177,8 +177,7 @@ rocm_enable_cppcheck(
passedByValue
unusedStructMember
functionStatic
functionConst:*program.*
functionConst:*module.*
functionConst
shadowFunction
shadowVar
shadowVariable
......
......@@ -107,9 +107,12 @@ else()
else()
list(APPEND CMAKE_COMPILER_WARNINGS
-Wno-missing-field-initializers
-Wno-maybe-uninitialized
# -Wno-deprecated-declarations
)
endif()
add_definitions(${CMAKE_COMPILER_WARNINGS})
foreach(COMPILER_WARNING ${CMAKE_COMPILER_WARNINGS})
add_compile_options($<$<COMPILE_LANGUAGE:${COMPILER}>:${COMPILER_WARNING}>)
endforeach()
endforeach()
endif ()
......@@ -77,9 +77,7 @@ void eliminate_contiguous::apply(module& p) const
auto args = ins->inputs();
for(auto arg : ins->inputs())
{
// TODO: Pass in names for the operator in the constructor instead
// of using ends_with
if(ends_with(arg->name(), "contiguous"))
if(arg->name() == op_name)
{
auto new_args = args;
auto prev = arg->inputs().front();
......
......@@ -151,6 +151,13 @@ struct check_shapes
return *this;
}
const check_shapes& packed_or_broadcasted() const
{
if(!this->all_of([](const shape& s) { return s.packed() or s.broadcasted(); }))
MIGRAPHX_THROW(prefix() + "Shapes are not packed nor broadcasted");
return *this;
}
const check_shapes& not_transposed() const
{
if(!this->all_of([](const shape& s) { return not s.transposed(); }))
......
......@@ -15,6 +15,7 @@ struct module;
*/
struct eliminate_contiguous
{
std::string op_name;
std::string name() const { return "eliminate_contiguous"; }
void apply(module& p) const;
};
......
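A minimal usage sketch (not from the commit): the pass is now constructed with the exact operator name to match, instead of matching any name ending in "contiguous". The op name and wrapper function below are illustrative assumptions.

#include <migraphx/eliminate_contiguous.hpp>

// Hypothetical: a backend removes its own contiguous op after lowering.
void run_pass(migraphx::module& m)
{
    migraphx::eliminate_contiguous pass{"dnnl::reorder"}; // op name assumed
    pass.apply(m);
}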
include(CheckCXXCompilerFlag)
add_library(migraphx_cpu
allocation_model.cpp
allocate.cpp
add.cpp
contiguous.cpp
allocation_model.cpp
binary.cpp
concat.cpp
convolution.cpp
copy.cpp
mul.cpp
pooling.cpp
relu.cpp
deconvolution.cpp
dnnl.cpp
eltwise.cpp
erf.cpp
gather.cpp
gemm.cpp
target.cpp
logsoftmax.cpp
lowering.cpp
lrn.cpp
pooling.cpp
reduction.cpp
reorder.cpp
softmax.cpp
sub.cpp
target.cpp
write_literals.cpp
)
set_target_properties(migraphx_cpu PROPERTIES EXPORT_NAME cpu)
rocm_set_soversion(migraphx_cpu ${MIGRAPHX_SO_VERSION})
......@@ -23,6 +34,7 @@ find_package(dnnl REQUIRED)
rocm_clang_tidy_check(migraphx_cpu)
target_link_libraries(migraphx_cpu PRIVATE migraphx Threads::Threads)
target_link_libraries(migraphx_cpu PRIVATE DNNL::dnnl)
find_package(OpenMP)
target_link_libraries(migraphx_cpu PUBLIC OpenMP::OpenMP_CXX)
# Add library path to rpath to workaround issues with our broken packages
......
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_binary : dnnl_op<dnnl_binary, dnnl::binary>
{
std::string algo;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.algo, "algo"));
}
std::string name() const { return "dnnl::binary"; }
shape compute_shape(std::vector<shape> inputs) const
{
// Compensate for allocation
inputs.pop_back();
check_shapes{inputs, *this}.has(2);
auto s0 = inputs.at(0);
auto s1 = inputs.at(1);
auto r = s0;
if(s0 != s1 or !s0.packed())
{
r = shape{s0.type(), s0.lens()};
}
// Call to get_primitive to make sure an algo is available
this->get_primitive(this->to_memory_desc(r, inputs));
return r;
}
dnnl::binary::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
return {to_dnnl_algo(algo), m.at(DNNL_ARG_SRC_0), m.at(DNNL_ARG_SRC_1), m.at(DNNL_ARG_DST)};
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
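For reference, a hedged sketch of building this operator through make_op, mirroring how extend_dnnl_algos injects the "algo" field in the lowering further down:

#include <migraphx/make_op.hpp>

// "binary_add" is one of the algos accepted by to_dnnl_algo below.
auto binary_add = migraphx::make_op("dnnl::binary", {{"algo", "binary_add"}});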
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/deconvolution.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_deconvolution
: dnnl_extend_op<dnnl_deconvolution, dnnl::deconvolution_forward, op::deconvolution>
{
std::vector<int> arg_map(int) const { return {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS}; }
shape adjust_shape(const shape& x, int i) const
{
auto s = base_adjust_shape(x);
if(i == 1)
{
// The input and output channels are flipped for dnnl
auto lens = s.lens();
std::swap(lens[0], lens[1]);
auto strides = s.strides();
std::swap(strides[0], strides[1]);
return {s.type(), lens, strides};
}
return s;
}
dnnl::deconvolution_forward::desc
get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
// In DNNL dilation is zero-based
auto dilation = op.dilation;
std::transform(
dilation.begin(), dilation.end(), dilation.begin(), [](auto x) { return x - 1; });
return {dnnl::prop_kind::forward_inference,
dnnl::algorithm::deconvolution_direct,
m.at(DNNL_ARG_SRC),
m.at(DNNL_ARG_WEIGHTS),
m.at(DNNL_ARG_DST),
to_dnnl_dims(op.stride),
to_dnnl_dims(dilation),
to_dnnl_dims(op.padding),
to_dnnl_dims(op.padding)};
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
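A worked note on the two adjustments above, as a sketch (the helper name is hypothetical): MIGraphX deconvolution weights are laid out {in_channels, out_channels, k...}, so i == 1 swaps the first two dims for DNNL, and a MIGraphX dilation of d maps to DNNL's zero-based d - 1.

#include <algorithm>
#include <cstddef>
#include <vector>

// Hypothetical helper restating the dilation shift done in get_desc.
std::vector<std::size_t> to_zero_based_dilation(std::vector<std::size_t> d)
{
    std::transform(d.begin(), d.end(), d.begin(), [](auto x) { return x - 1; });
    return d; // e.g. {1, 1} -> {0, 0}
}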
#include <migraphx/cpu/dnnl.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
dnnl_context& get_dnnl_context()
{
static dnnl_context ctx{}; // NOLINT
return ctx;
}
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wswitch-enum"
#endif
dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t)
{
using dt = dnnl::memory::data_type;
using st = shape::type_t;
switch(t)
{
case st::half_type: return dt::f16;
case st::float_type: return dt::f32;
case st::int32_type: return dt::s32;
case st::int8_type: return dt::s8;
case st::uint8_type: return dt::u8;
default: MIGRAPHX_THROW("Unsupported data type");
}
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif
dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n)
{
switch(n)
{
case 1: return dnnl::memory::format_tag::a;
case 2: return dnnl::memory::format_tag::ab;
case 3: return dnnl::memory::format_tag::abc;
case 4: return dnnl::memory::format_tag::abcd;
case 5: return dnnl::memory::format_tag::abcde;
case 6: return dnnl::memory::format_tag::abcdef;
default: MIGRAPHX_THROW("Unsupported tensor size: " + std::to_string(n));
}
}
dnnl::memory::desc to_dnnl_memory_desc(const shape& s)
{
return {to_dnnl_dims(s.lens()), to_dnnl_memory_data_type(s.type()), to_dnnl_dims(s.strides())};
}
dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a)
{
return dnnl::memory(desc, get_dnnl_context().engine, a.data());
}
dnnl::memory to_dnnl_memory(const argument& a)
{
return to_dnnl_memory(to_dnnl_memory_desc(a.get_shape()), a);
}
// clang-format off
#define MIGRAPHX_VISIT_DNNL_ALGO(m) \
m(undef) \
m(convolution_auto) \
m(convolution_direct) \
m(convolution_winograd) \
m(deconvolution_direct) \
m(deconvolution_winograd) \
m(eltwise_relu) \
m(eltwise_tanh) \
m(eltwise_elu) \
m(eltwise_square) \
m(eltwise_abs) \
m(eltwise_sqrt) \
m(eltwise_swish) \
m(eltwise_linear) \
m(eltwise_bounded_relu) \
m(eltwise_soft_relu) \
m(eltwise_logistic) \
m(eltwise_exp) \
m(eltwise_gelu) \
m(eltwise_gelu_tanh) \
m(eltwise_gelu_erf) \
m(eltwise_log) \
m(eltwise_clip) \
m(eltwise_pow) \
m(eltwise_round) \
m(eltwise_relu_use_dst_for_bwd) \
m(eltwise_tanh_use_dst_for_bwd) \
m(eltwise_elu_use_dst_for_bwd) \
m(eltwise_sqrt_use_dst_for_bwd) \
m(eltwise_logistic_use_dst_for_bwd) \
m(eltwise_exp_use_dst_for_bwd) \
m(lrn_across_channels) \
m(lrn_within_channel) \
m(pooling_max) \
m(pooling_avg) \
m(pooling_avg_include_padding) \
m(pooling_avg_exclude_padding) \
m(vanilla_rnn) \
m(vanilla_lstm) \
m(vanilla_gru) \
m(lbr_gru) \
m(binary_add) \
m(binary_mul) \
m(binary_max) \
m(binary_min) \
m(binary_div) \
m(resampling_nearest) \
m(resampling_linear) \
m(reduction_max) \
m(reduction_min) \
m(reduction_sum) \
m(reduction_mul) \
m(reduction_mean) \
m(reduction_norm_lp_max) \
m(reduction_norm_lp_sum) \
m(reduction_norm_lp_power_p_max) \
m(reduction_norm_lp_power_p_sum)
// clang-format on
const std::unordered_map<std::string, dnnl::algorithm>& dnnl_algo_map()
{
static const std::unordered_map<std::string, dnnl::algorithm> m = {
#define MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR(x) {#x, dnnl::algorithm::x},
MIGRAPHX_VISIT_DNNL_ALGO(MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR)
#undef MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR
};
return m;
}
dnnl::algorithm to_dnnl_algo(const std::string& name)
{
if(dnnl_algo_map().count(name) == 0)
MIGRAPHX_THROW("Missing dnnl algo: " + name);
return dnnl_algo_map().at(name);
}
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
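A small usage sketch of the generated algorithm table above:

#include <cassert>

void algo_round_trip()
{
    // Unknown names throw "Missing dnnl algo: ...".
    assert(migraphx::cpu::to_dnnl_algo("eltwise_relu") == dnnl::algorithm::eltwise_relu);
}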
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_eltwise : dnnl_op<dnnl_eltwise, dnnl::eltwise_forward>
{
std::string algo;
float alpha = 0;
float beta = 0;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.algo, "algo"), f(self.alpha, "alpha"), f(self.beta, "beta"));
}
std::string name() const { return "dnnl::eltwise"; }
shape compute_shape(std::vector<shape> inputs) const
{
// Compensate for allocation
inputs.pop_back();
check_shapes{inputs, *this}.has(1).packed();
auto s = inputs.at(0);
auto r = s;
if(not s.packed())
r = shape{s.type(), s.lens()};
// Call to get_primitive to make sure an algo is available
this->get_primitive(this->to_memory_desc(r, inputs));
return r;
}
dnnl::eltwise_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
return {dnnl::prop_kind::forward_inference,
to_dnnl_algo(algo),
m.at(DNNL_ARG_SRC_0),
alpha,
beta};
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/contiguous.hpp>
#include <migraphx/op/erf.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
template struct cpu_unary<op::contiguous>;
template struct cpu_unary<op::erf>;
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
......
#include <migraphx/config.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/op/gather.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct cpu_gather : auto_register_op<cpu_gather>
{
op::gather op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::" + op.name(); }
shape compute_shape(std::vector<shape> inputs) const
{
// Compensate for allocation
inputs.pop_back();
check_shapes(inputs, *this).standard();
return migraphx::compute_shape(op, inputs);
}
argument
// cppcheck-suppress constParameter
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{
std::size_t nelements = output_shape.elements();
auto lens = args[0].get_shape().lens();
auto axis_dim_size = lens[op.axis];
lens[op.axis] = args[1].get_shape().elements();
shape out_comp{output_shape.type(), lens};
visit_all(args.back(), args[0])([&](auto output, auto input) {
args[1].visit([&](auto indices) {
const auto* indices_ptr = indices.data();
auto* output_ptr = output.data();
ctx.bulk_execute(nelements, 1024, [=](auto start, auto end) {
for(auto i = start; i < end; i++)
{
auto idx = out_comp.multi(i);
auto in_index = indices_ptr[idx[op.axis]];
in_index = (in_index < 0) ? in_index + axis_dim_size : in_index;
idx[op.axis] = in_index;
output_ptr[i] = input(idx.begin(), idx.end());
}
});
});
});
return args.back();
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
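The negative-index wrap in compute above, restated as a standalone sketch (the helper name is hypothetical):

#include <cstdint>

// Negative gather indices count back from the end of the gather axis.
std::int64_t wrap_index(std::int64_t in_index, std::int64_t axis_dim_size)
{
    return (in_index < 0) ? in_index + axis_dim_size : in_index;
}
// wrap_index(-1, 4) == 3; wrap_index(2, 4) == 2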
......@@ -15,30 +15,7 @@ struct dnnl_gemm : dnnl_extend_op<dnnl_gemm, dnnl::matmul, op::dot>
{
std::vector<int> arg_map(int) const { return {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS}; }
// Batch must be a single dimension
shape adjust_shape(shape x, int) const
{
auto s = base_adjust_shape(x);
auto ndims = s.lens().size();
if(ndims > 3)
{
if(not std::is_sorted(
s.strides().begin(), s.strides().begin() + (ndims - 2), std::greater<>{}))
MIGRAPHX_THROW("Batch transposed");
std::size_t batch = std::accumulate(
s.lens().begin(), s.lens().begin() + (ndims - 2), 1, std::multiplies<>{});
shape s3d{s.type(),
{batch, s.lens()[ndims - 2], s.lens()[ndims - 1]},
{s.lens()[ndims - 2] * s.lens()[ndims - 1],
s.strides()[ndims - 2],
s.strides()[ndims - 1]}};
return s3d;
}
else
{
return s;
}
}
void required(const check_shapes& cs) const { cs.not_broadcasted(); }
dnnl::matmul::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
......
......@@ -21,46 +21,11 @@ struct dnnl_context
dnnl_context() : engine(dnnl::engine::kind::cpu, 0), stream(engine) {}
};
inline dnnl_context& get_dnnl_context()
{
static dnnl_context ctx{}; // NOLINT
return ctx;
}
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wswitch-enum"
#endif
inline dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t)
{
using dt = dnnl::memory::data_type;
using st = shape::type_t;
switch(t)
{
case st::half_type: return dt::f16;
case st::float_type: return dt::f32;
case st::int32_type: return dt::s32;
case st::int8_type: return dt::s8;
case st::uint8_type: return dt::u8;
default: MIGRAPHX_THROW("Unsupported data type");
}
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif
dnnl_context& get_dnnl_context();
inline dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n)
{
switch(n)
{
case 1: return dnnl::memory::format_tag::a;
case 2: return dnnl::memory::format_tag::ab;
case 3: return dnnl::memory::format_tag::abc;
case 4: return dnnl::memory::format_tag::abcd;
case 5: return dnnl::memory::format_tag::abcde;
case 6: return dnnl::memory::format_tag::abcdef;
default: MIGRAPHX_THROW("Unsupported tensor size: " + std::to_string(n));
}
}
dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t);
dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n);
template <class R>
inline dnnl::memory::dims to_dnnl_dims(R&& r)
......@@ -68,20 +33,13 @@ inline dnnl::memory::dims to_dnnl_dims(R&& r)
return {r.begin(), r.end()};
}
inline dnnl::memory::desc to_dnnl_memory_desc(const shape& s)
{
return {to_dnnl_dims(s.lens()), to_dnnl_memory_data_type(s.type()), to_dnnl_dims(s.strides())};
}
dnnl::memory::desc to_dnnl_memory_desc(const shape& s);
inline dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a)
{
return dnnl::memory(desc, get_dnnl_context().engine, a.data());
}
dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a);
inline dnnl::memory to_dnnl_memory(const argument& a)
{
return to_dnnl_memory(to_dnnl_memory_desc(a.get_shape()), a);
}
dnnl::memory to_dnnl_memory(const argument& a);
dnnl::algorithm to_dnnl_algo(const std::string& name);
template <class Derived, class Primitive>
struct dnnl_op : auto_register_op<Derived>
......@@ -208,12 +166,16 @@ struct dnnl_extend_op : dnnl_op<Derived, Primitive>
return migraphx::reflect(self.op, f);
}
// dnnl has some issues with non-packed inputs
void required(const check_shapes& cs) const { cs.packed_or_broadcasted(); }
std::string name() const { return "dnnl::" + op.name(); }
shape compute_shape(std::vector<shape> inputs) const
{
const auto& self = static_cast<const Derived&>(*this);
// Compensate for allocation
inputs.pop_back();
// check_shapes(inputs, *this).standard();
self.required(check_shapes(inputs, self));
auto r = migraphx::compute_shape(op, inputs);
// Call to get_primitive to make sure an algo is available
this->get_primitive(this->to_memory_desc(r, inputs));
......
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_PARALLEL_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_PARALLEL_HPP
// #define MIGRAPHX_DISABLE_OMP
#include <migraphx/config.hpp>
#ifdef MIGRAPHX_DISABLE_OMP
#include <migraphx/par_for.hpp>
#else
#include <omp.h>
#endif
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
#ifdef MIGRAPHX_DISABLE_OMP
inline std::size_t max_threads() { return std::thread::hardware_concurrency(); }
template <class F>
void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
{
if(threadsize <= 1)
{
f(std::size_t{0}, n);
}
else
{
std::vector<joinable_thread> threads(threadsize);
// Using const here causes gcc 5 to ICE
#if(!defined(__GNUC__) || __GNUC__ != 5)
const
#endif
std::size_t grainsize = std::ceil(static_cast<double>(n) / threads.size());
std::size_t work = 0;
std::generate(threads.begin(), threads.end(), [=, &work] {
auto result =
joinable_thread([=]() mutable { f(work, std::min(n, work + grainsize)); });
work += grainsize;
return result;
});
// cppcheck-suppress unsignedLessThanZero
assert(work >= n);
}
}
#else
inline std::size_t max_threads() { return omp_get_max_threads(); }
template <class F>
void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
{
......@@ -18,19 +59,19 @@ void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
else
{
std::size_t grainsize = std::ceil(static_cast<double>(n) / threadsize);
#pragma omp parallel num_threads(threadsize)
#pragma omp parallel for num_threads(threadsize) schedule(static, 1)
for(std::size_t tid = 0; tid < threadsize; tid++)
{
std::size_t tid = omp_get_thread_num();
std::size_t work = tid * grainsize;
f(work, std::min(n, work + grainsize));
}
}
}
#endif
template <class F>
void parallel_for(std::size_t n, std::size_t min_grain, F f)
{
const auto threadsize = std::min<std::size_t>(omp_get_num_threads(), n / min_grain);
const auto threadsize = std::min<std::size_t>(max_threads(), n / min_grain);
parallel_for_impl(n, threadsize, f);
}
......
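A hedged usage sketch of parallel_for: work is split into [start, end) ranges of at least min_grain elements, so each worker touches a disjoint slice:

#include <cstddef>
#include <vector>

void fill_iota(std::vector<float>& out)
{
    // 1024 is the min_grain; small n falls back to a single-threaded call.
    migraphx::cpu::parallel_for(out.size(), 1024, [&](std::size_t start, std::size_t end) {
        for(auto i = start; i < end; i++)
            out[i] = static_cast<float>(i);
    });
}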
......@@ -14,6 +14,8 @@ namespace cpu {
struct multi_index
{
constexpr multi_index() = default;
multi_index(const shape& s, std::size_t i) : n(s.lens().size())
{
assert(n < max_size);
......@@ -21,17 +23,17 @@ struct multi_index
s.multi_copy(i, index, index + max_size);
}
std::size_t size() const { return n; }
constexpr std::size_t size() const { return n; }
std::size_t* begin() { return index; }
const std::size_t* begin() const { return index; }
constexpr std::size_t* begin() { return index; }
constexpr const std::size_t* begin() const { return index; }
std::size_t* end() { return index + size(); }
const std::size_t* end() const { return index + size(); }
constexpr std::size_t* end() { return index + size(); }
constexpr const std::size_t* end() const { return index + size(); }
std::size_t offset(const shape& s) const { return s.index(begin(), end()); }
void carry()
constexpr void carry()
{
std::size_t overflow = 0;
for(std::ptrdiff_t i = size() - 1; i > 0; i--)
......@@ -40,29 +42,34 @@ struct multi_index
// Reset overflow
overflow = 0;
// Compute overflow using while loop instead of mod
// overflow = z / dims[i];
// z = z % dims[i];
while(z >= dims[i])
{
z -= dims[i];
overflow += 1;
}
index[i] = z;
// Exit if there is no overflow
if(overflow == 0)
return;
}
index[0] += overflow;
}
void increment(std::size_t i)
constexpr void increment(std::size_t i)
{
index[size() - 1] += i;
carry();
}
multi_index& operator+=(std::size_t i)
constexpr multi_index& operator+=(std::size_t i)
{
increment(i);
return *this;
}
multi_index& operator++()
constexpr multi_index& operator++()
{
increment(1);
return *this;
......@@ -76,9 +83,9 @@ struct multi_index
private:
static const std::size_t max_size = 5;
std::size_t index[max_size];
std::size_t dims[max_size];
std::size_t n;
std::size_t index[max_size] = {};
std::size_t dims[max_size] = {};
std::size_t n = 0;
};
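// Sketch (not from the commit): the mod-free overflow step in carry(),
// factored out for clarity. Requires <cstddef> and <utility>.
// divmod_by_sub(7, 3) == {2, 1}, i.e. 7 / 3 and 7 % 3 without division.
inline std::pair<std::size_t, std::size_t> divmod_by_sub(std::size_t z, std::size_t d)
{
    std::size_t overflow = 0;
    while(z >= d)
    {
        z -= d;
        overflow += 1;
    }
    return {overflow, z};
}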
struct reduce_dims_base
......@@ -104,6 +111,153 @@ struct reduce_dims_base
}
};
template <class T, std::size_t N>
struct vec
{
using array_type = std::array<T, N>;
using vector_type __attribute__((vector_size(N * sizeof(T)))) = T;
union
{
array_type array;
vector_type vector;
};
static_assert(sizeof(array_type) == sizeof(vector_type), "Not the same size");
};
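// Note: the union lets the same storage be read per-element (array) for
// scalar access and as one GCC/Clang vector_size SIMD value (vector).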
template <class T>
constexpr std::integral_constant<std::size_t, 0> vec_size(const T&)
{
return {};
}
template <class T, std::size_t N>
constexpr std::integral_constant<std::size_t, N> vec_size(const vec<T, N>&)
{
return {};
}
template <class T>
constexpr std::size_t vec_size()
{
return decltype(vec_size(std::declval<T>())){};
}
template <class F, class V, class... Vs, MIGRAPHX_REQUIRES((vec_size<V>() > 0))>
void vec_apply(F f, V& v, Vs... vs)
{
assert(all_of({vec_size<Vs>()...}, [&](auto n) { return n == vec_size<V>(); }));
assert(vec_size<V>() == v.array.size());
for(std::size_t i = 0; i < vec_size<V>(); i++)
f(v.array[i], vs.vector[i]...);
}
template <class F, class V, class... Vs, MIGRAPHX_REQUIRES((vec_size<V>() == 0))>
void vec_apply(F f, V& v, Vs&... vs)
{
f(v, vs...);
}
inline std::size_t find_packed_len(const shape& s)
{
for(std::size_t i = 0; i < s.lens().size(); i++)
{
if(s.lens()[i] > 1 and s.strides()[i] == 1)
{
return i;
}
}
return -1;
}
template <std::size_t N>
shape vectorize(const shape& s)
{
assert(s.standard() or s.broadcasted());
auto lens = s.lens();
if(s.broadcasted())
{
auto n = find_packed_len(s);
assert(n != -1);
assert((lens[n] % N) == 0);
lens[n] /= N;
return {s.type(), lens, s.strides()};
}
assert((lens.back() % N) == 0);
lens.back() /= N;
return {s.type(), lens};
}
template <std::size_t N, class T>
tensor_view<vec<T, N>> vectorize(tensor_view<T> tv)
{
return {vectorize<N>(tv.get_shape()), reinterpret_cast<vec<T, N>*>(tv.data())};
}
template <class T>
struct is_vector_type : std::false_type
{
};
template <>
struct is_vector_type<float> : std::true_type
{
};
template <class... Ts>
struct is_vector_tensor_view : and_<is_vector_type<typename Ts::value_type>{}...>
{
};
template <std::size_t N, class... Xs>
bool is_vectorizable(const Xs&... xs)
{
return all_of({xs...}, [](const auto& s) {
if(s.standard() and (s.lens().back() % N) == 0)
return true;
if(s.broadcasted())
{
auto n = std::inner_product(s.lens().begin(),
s.lens().end(),
s.strides().begin(),
0,
std::plus<>{},
[&](auto len, auto stride) -> std::size_t {
if(stride > 0 and len == 1)
return 0;
return stride;
});
if(n == 1)
{
auto i = find_packed_len(s);
assert(i != -1);
return (s.lens()[i] % N) == 0;
}
}
return false;
});
}
template <class... Ts, MIGRAPHX_REQUIRES(is_vector_tensor_view<Ts...>{})>
auto auto_vectorize(const shape& base_shape, Ts... xs)
{
return [=](auto f) {
if(is_vectorizable<32>(base_shape, xs.get_shape()...))
f(vectorize<32>(base_shape), vectorize<32>(xs)...);
else if(is_vectorizable<8>(base_shape, xs.get_shape()...))
f(vectorize<8>(base_shape), vectorize<8>(xs)...);
else
f(base_shape, xs...);
};
}
template <class... Ts, MIGRAPHX_REQUIRES(not is_vector_tensor_view<Ts...>{})>
auto auto_vectorize(const shape& base_shape, Ts... xs)
{
return [=](auto f) { f(base_shape, xs...); };
}
template <class X, class... Xs>
bool is_standard_offset(const X& x, const Xs&... xs)
{
......@@ -116,15 +270,15 @@ bool is_standard_offset(const X& x, const Xs&... xs)
}
template <class... Ts>
auto pointwise(Ts... xs)
auto pointwise_apply(Ts... ts)
{
return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable {
if(is_standard_offset(xs.get_shape()...))
if(is_standard_offset(ts.get_shape()...))
{
ctx.bulk_execute(base_shape.elements(), min_grain, [=](auto start, auto end) mutable {
for(auto i = start; i < end; i++)
{
f(xs.data()[i]...);
vec_apply(f, ts.data()[i]...);
}
});
}
......@@ -135,7 +289,7 @@ auto pointwise(Ts... xs)
multi_index mi(base_shape, start);
for(auto i = start; i < end; i++)
{
f(xs.data()[mi.offset(xs.get_shape())]...);
vec_apply(f, ts.data()[mi.offset(ts.get_shape())]...);
++mi;
}
});
......@@ -143,6 +297,15 @@ auto pointwise(Ts... xs)
};
}
template <class... Ts>
auto pointwise(Ts... ts)
{
return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable {
auto_vectorize(base_shape, ts...)(
[&](auto bs, auto... xs) { pointwise_apply(xs...)(ctx, bs, min_grain, f); });
};
}
template <class Op>
struct cpu_unary : reduce_dims_base, auto_register_op<cpu_unary<Op>>
{
......
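A hedged usage sketch of the composed helper above (the function name and min_grain value are illustrative; assumes all three views share a standard float shape):

void add_elementwise(migraphx::cpu::context& ctx,
                     migraphx::tensor_view<float> r,
                     migraphx::tensor_view<float> a,
                     migraphx::tensor_view<float> b)
{
    // auto_vectorize picks vec<float, 32>, vec<float, 8>, or scalar views.
    migraphx::cpu::pointwise(r, a, b)(
        ctx, r.get_shape(), 1024, [](auto& y, auto x0, auto x1) { y = x0 + x1; });
}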
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_WRITE_LITERALS_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_WRITE_LITERALS_HPP
#include <migraphx/config.hpp>
#include <string>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace cpu {
struct write_literals
{
std::string name() const { return "cpu::write_literals"; }
void apply(module& m) const;
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/logsoftmax.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_logsoftmax : dnnl_extend_op<dnnl_logsoftmax, dnnl::logsoftmax_forward, op::logsoftmax>
{
dnnl::logsoftmax_forward::desc
get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
int axis = this->op.axis;
return {dnnl::prop_kind::forward_inference, m.at(DNNL_ARG_SRC_0), axis};
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -51,145 +51,6 @@ typename std::conditional_t<std::is_integral<T>{}, std::make_signed<T>, std::ena
return x;
}
struct cpu_lrn
{
op::lrn op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::lrn"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0])([&](auto output, auto input) {
int n_batch = output_shape.lens()[0];
int channels = output_shape.lens()[1];
int height = output_shape.lens()[2];
int width = output_shape.lens()[3];
float alphaoverarea = op.alpha / float(op.size);
int radius_lower = (op.size - 1) / 2;
int radius_upper = op.size / 2 + 1;
par_dfor(n_batch, height, width)([&](int b, int h, int w) {
float scale = 0;
dfor(channels)([&](int c) {
auto start = (c - radius_lower) < 0 ? 0 : (c - radius_lower);
auto end = (c + radius_upper) > channels ? channels : (c + radius_upper);
for(auto k = start; k < end; ++k)
{
scale += std::pow(input(b, k, h, w), 2);
}
scale *= alphaoverarea;
scale += op.bias;
scale = std::pow(scale, -op.beta);
output(b, c, h, w) = input(b, c, h, w) * scale;
});
});
});
return result;
}
};
MIGRAPHX_REGISTER_OP(cpu_lrn)
template <class Op>
struct cpu_deconvolution : auto_register_op<cpu_deconvolution<Op>>
{
cpu_deconvolution() = default;
cpu_deconvolution(Op pop) : op(std::move(pop)) {}
Op op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::" + op.name(); }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
using type = typename decltype(output)::value_type;
std::fill(output.begin(), output.end(), type{0});
auto in_lens = input.get_shape().lens();
auto in_n = in_lens[0];
auto in_c = in_lens[1];
auto wei = weights.get_shape().lens();
auto wei_n = wei[0];
auto wei_c = wei[1];
auto out_lens = output_shape.lens();
auto kdims = op.kdims();
std::vector<std::size_t> win_size{in_c};
std::copy(in_lens.begin() + 2, in_lens.end(), std::back_inserter(win_size));
std::copy(wei.begin() + 2, wei.end(), std::back_inserter(win_size));
shape win_shape{output_shape.type(), win_size};
par_dfor(in_n, wei_c)([&](int o, int k) {
shape_for_each(win_shape, [&](auto idx_win) {
const int w = idx_win[0];
auto input_dims_start = idx_win.begin() + 1;
auto wei_dims_start = idx_win.begin() + kdims + 1;
std::vector<std::ptrdiff_t> win_start;
for(std::size_t n = 0; n < kdims; ++n)
{
win_start.push_back(std::ptrdiff_t(*(input_dims_start + n) * op.stride[n]) -
std::ptrdiff_t(op.padding[n]));
}
const int group_id = w / (wei_n / op.group);
const int in_ch = group_id * wei_c + k;
std::vector<std::ptrdiff_t> idx_out{o, in_ch};
for(size_t n = 0; n < kdims; n++)
{
idx_out.push_back(win_start[n] + *(wei_dims_start + n) * op.dilation[n]);
}
std::vector<std::ptrdiff_t> idx_wei{w, k};
std::copy(wei_dims_start, idx_win.end(), std::back_inserter(idx_wei));
std::vector<std::ptrdiff_t> idx_in{o, w};
std::copy(input_dims_start, wei_dims_start, std::back_inserter(idx_in));
if(std::all_of(
idx_out.begin() + 2, idx_out.end(), [&](auto ii) { return ii >= 0; }) and
std::equal(idx_out.begin() + 2,
idx_out.end(),
out_lens.begin() + 2,
out_lens.end(),
std::less<std::ptrdiff_t>{}))
{
output(idx_out.begin(), idx_out.end()) +=
input(idx_in.begin(), idx_in.end()) *
weights(idx_wei.begin(), idx_wei.end());
}
});
});
});
return result;
}
};
template struct cpu_deconvolution<op::deconvolution>;
struct cpu_im2col
{
op::im2col op;
......@@ -334,17 +195,6 @@ struct leaky_relu_op
}
};
struct elu_op
{
op::elu op;
std::string name() const { return "cpu::elu"; }
auto fcn() const
{
auto a = op.alpha;
return [a](auto x) { return x > 0 ? x : a * std::expm1(x); };
}
};
template <typename Op>
struct cpu_unary2 : auto_register_op<cpu_unary2<Op>>
{
......@@ -382,78 +232,6 @@ struct cpu_unary2 : auto_register_op<cpu_unary2<Op>>
}
};
template struct cpu_unary2<leaky_relu_op>;
template struct cpu_unary2<elu_op>;
template <class Op>
struct cpu_softmax : auto_register_op<cpu_softmax<Op>>
{
cpu_softmax() = default;
cpu_softmax(Op pop) : op(std::move(pop)) {}
Op op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::" + op.name(); }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(1).standard();
return op.normalize_compute_shape(inputs);
}
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
auto batch_lens = output_shape.lens();
int64_t tuned_axis = tune_axis(args[0].get_shape().lens().size(), op.axis, op.name());
std::size_t n_dims = batch_lens[tuned_axis];
batch_lens[tuned_axis] = 1;
shape batch_shape{shape::int32_type, batch_lens};
visit_all(result, args[0])([&](auto output, auto input) {
using value_type = typename decltype(input)::value_type;
std::vector<value_type> batch_max(batch_shape.elements(),
std::numeric_limits<value_type>::lowest());
std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
par_for(batch_shape.elements(), [&](auto i) {
auto idx = batch_shape.multi(i);
for(std::size_t j = 0; j < n_dims; ++j)
{
idx[tuned_axis] = j;
batch_max[i] = std::max(batch_max[i], input(idx.begin(), idx.end()));
}
for(std::size_t j = 0; j < n_dims; ++j)
{
idx[tuned_axis] = j;
std::size_t index = output_shape.index(idx);
output[index] = std::exp(input[index] - batch_max[i]);
}
for(std::size_t j = 0; j < n_dims; ++j)
{
idx[tuned_axis] = j;
batch_sum[i] += output(idx.begin(), idx.end());
}
for(std::size_t j = 0; j < n_dims; ++j)
{
idx[tuned_axis] = j;
output(idx.begin(), idx.end()) =
op.output()(output(idx.begin(), idx.end()), batch_sum[i]);
}
});
});
return result;
}
};
template struct cpu_softmax<op::softmax>;
template struct cpu_softmax<op::logsoftmax>;
struct cpu_rnn_var_sl_last_output
{
......@@ -502,29 +280,6 @@ struct cpu_rnn_var_sl_last_output
};
MIGRAPHX_REGISTER_OP(cpu_rnn_var_sl_last_output)
struct cpu_literal
{
argument data;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.data, "data"));
}
std::string name() const { return "cpu::literal"; }
shape compute_shape(const std::vector<shape>&) const { return data.get_shape(); }
argument compute(const shape&, const std::vector<argument>&) const { return data; }
friend std::ostream& operator<<(std::ostream& os, const cpu_literal& x)
{
os << x.name();
return os;
}
};
struct cpu_apply
{
module* modl;
......@@ -553,49 +308,97 @@ struct cpu_apply
}
}
void extend_op(const std::string& op_name, const std::string& cpu_name, bool allocate = false)
void extend_op(const std::string& op_name, const std::string& cpu_name, bool allocate = true)
{
apply_map.emplace(op_name, [=](instruction_ref ins) {
auto&& op = ins->get_operator();
if(allocate)
replace(ins, make_op(cpu_name, op.to_value()));
return replace(ins, make_op(cpu_name, op.to_value()));
return modl->replace_instruction(ins, make_op(cpu_name, op.to_value()), ins->inputs());
});
}
void extend_dnnl_algos(const std::string& dnnl_name,
const std::vector<std::pair<std::string, std::string>>& algos)
{
for(auto&& pp : algos)
{
std::string op_name = pp.first;
std::string algo = pp.second;
apply_map.emplace(op_name, [=](instruction_ref ins) {
auto v = ins->get_operator().to_value();
if(not v.is_object())
return ins;
v["algo"] = algo;
auto op = make_op(dnnl_name, v);
return replace(ins, op);
});
}
}
void init()
{
create_output_names();
extend_op("add", "dnnl::add", true);
extend_op("mul", "dnnl::mul", true);
extend_op("convolution", "dnnl::convolution", true);
extend_op("dot", "dnnl::dot", true);
extend_op("relu", "dnnl::relu", true);
extend_op("contiguous", "cpu::contiguous", true);
extend_op("deconvolution", "cpu::deconvolution");
extend_op("elu", "cpu::elu");
extend_op("im2col", "cpu::im2col");
extend_op("leaky_relu", "cpu::leaky_relu");
extend_op("logsoftmax", "cpu::logsoftmax");
extend_op("lrn", "cpu::lrn");
extend_op("pad", "cpu::pad");
extend_op("quant_convolution", "cpu::quant_convolution", true);
extend_op("quant_dot", "cpu::quant_dot", true);
extend_op("rnn_var_sl_last_output", "cpu::rnn_var_sl_last_output");
extend_op("softmax", "cpu::softmax");
extend_dnnl_algos("dnnl::binary",
{
{"add", "binary_add"},
{"div", "binary_div"},
{"max", "binary_max"},
{"min", "binary_min"},
{"mul", "binary_mul"},
});
extend_dnnl_algos("dnnl::eltwise",
{
{"abs", "eltwise_abs"},
{"elu", "eltwise_elu"},
{"exp", "eltwise_exp"},
{"log", "eltwise_log"},
{"relu", "eltwise_relu"},
{"sqrt", "eltwise_sqrt"},
{"tanh", "eltwise_tanh"},
});
extend_dnnl_algos("dnnl::reduction",
{
{"reduce_max", "reduction_max"},
{"reduce_mean", "reduction_mean"},
{"reduce_min", "reduction_min"},
{"reduce_sum", "reduction_sum"},
});
extend_op("concat", "dnnl::concat");
extend_op("contiguous", "dnnl::reorder");
extend_op("convolution", "dnnl::convolution");
extend_op("deconvolution", "dnnl::deconvolution");
extend_op("dot", "dnnl::dot");
extend_op("erf", "cpu::erf");
extend_op("gather", "cpu::gather");
extend_op("logsoftmax", "dnnl::logsoftmax");
extend_op("lrn", "dnnl::lrn");
extend_op("softmax", "dnnl::softmax");
extend_op("sub", "cpu::sub");
extend_op("im2col", "cpu::im2col", false);
extend_op("leaky_relu", "cpu::leaky_relu", false);
extend_op("pad", "cpu::pad", false);
extend_op("rnn_var_sl_last_output", "cpu::rnn_var_sl_last_output", false);
}
void apply()
{
init();
// Apply these operators first so the inputs can be const folded
for(auto it : iterator_for(*modl))
{
if(it->name() == "@literal")
if(it->name() == "pow")
{
apply_literal(it);
apply_pow(it);
}
else if(it->name() == "pooling")
}
for(auto it : iterator_for(*modl))
{
if(it->name() == "pooling")
{
apply_pooling(it);
}
......@@ -606,9 +409,15 @@ struct cpu_apply
}
}
instruction_ref apply_literal(instruction_ref ins) const
instruction_ref apply_pow(instruction_ref ins)
{
return modl->replace_instruction(ins, cpu_literal{ins->get_literal().get_argument()});
auto beta = read_scalar<float>(ins->inputs()[1]);
if(beta.empty())
return ins;
return replace(ins,
make_op("dnnl::eltwise",
{{"algo", "eltwise_pow"}, {"alpha", 1.0}, {"beta", beta.front()}}),
{ins->inputs().front()});
}
instruction_ref apply_pooling(instruction_ref ins)
......@@ -626,9 +435,27 @@ struct cpu_apply
return ins;
}
template <class T>
static std::vector<T> read_scalar(instruction_ref ins)
{
if(ins->name() == "contiguous")
return read_scalar<T>(ins->inputs().front());
if(ins->get_shape().elements() != 1 and not ins->get_shape().scalar())
return {};
auto r = ins->eval();
if(r.empty())
return {};
return {r.at<T>()};
}
instruction_ref replace(instruction_ref ins, const operation& op)
{
auto inputs = ins->inputs();
return replace(ins, op, ins->inputs());
}
instruction_ref
replace(instruction_ref ins, const operation& op, std::vector<instruction_ref> inputs)
{
inputs.push_back(insert_allocation(ins, ins->get_shape()));
return modl->replace_instruction(ins, op, inputs);
}
......
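For reference, a sketch of the operator apply_pow produces when the exponent input folds to a scalar (the 2.0 here is an illustrative value):

#include <migraphx/make_op.hpp>

// pow(x, 2.0) lowers to eltwise_pow with alpha = 1.0, beta = 2.0;
// non-constant exponents are left as plain pow.
auto pow2 = migraphx::make_op(
    "dnnl::eltwise", {{"algo", "eltwise_pow"}, {"alpha", 1.0}, {"beta", 2.0}});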
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/mul.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/lrn.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_mul : dnnl_extend_op<dnnl_mul, dnnl::binary, op::mul>
struct dnnl_lrn : dnnl_extend_op<dnnl_lrn, dnnl::lrn_forward, op::lrn>
{
dnnl::binary::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
dnnl::lrn_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
return {dnnl::algorithm::binary_mul,
return {dnnl::prop_kind::forward_inference,
dnnl::algorithm::lrn_across_channels,
m.at(DNNL_ARG_SRC_0),
m.at(DNNL_ARG_SRC_1),
m.at(DNNL_ARG_DST)};
this->op.size,
this->op.alpha,
this->op.beta,
this->op.bias};
}
};
......