Unverified commit a0b570b2, authored by Paul Fultz II, committed by GitHub

Add more supported operators and optimizations for the cpu backend (#746)



* Add eliminate_data_type pass

* Formatting

* Auto convert quant ops

* Formatting

* Flip the order of decompose

* Compute max size differently

* Formatting

* Clamp values in convert

* Formatting

* Fix loss of precision in reduce

* Formatting

* Fix bugs in reduction

* Fix accumulator type in reference softmax implementation

* Formatting

* Update convert test

* Remove unused variables

* Remove unnecessary quant_dot check

* Formatting

* Add tests

* Formatting

* Remove unused code

* Remove duplicate ops

* Remove blaze dependency

* Use set since shape::type_t is not hashable on gcc 5

* Formatting

* Add dnnl binary op

* Formatting

* Add binary and eltwise

* Formatting

* Add softmax

* Formatting

* Remove unused operators

* Add missing files

* Formatting

* Add lrn

* Formatting

* Add deconvolution

* Formatting

* Change allocate default

* Add reorder

* Formatting

* Add reductions

* Formatting

* Sort lines

* Change literals in another loop

* Add pow operator

* Formatting

* Add pow operator

* Formatting

* Make sure shapes are packed

* Allow broadcasted inputs

* Remove unused operators

* Simplify functions

* Remove softmax

* Add sub and erf functions

* Formatting

* Fix bug

* Formatting

* Improve parallelism

* Formatting

* Allow multiple batch dimensions

* Formatting

* Move literal transforms out of lowering

* Formatting

* Add gather operator

* Sort lines

* Add early exit for carry

* Formatting

* Add missing concat

* Rename macro

* Fix deep nesting

* Formatting

* Fix cppcheck issues

* Remove else

* Move attribute to typedef

* Formatting

* Disable maybe-uninitialized warning since it's broken on gcc

* Add constexpr default constructor

* Formatting

* Fix compiler warnings

* Fix adjust_allocation test
Co-authored-by: Shucai Xiao <shucai@gmail.com>
Co-authored-by: mvermeulen <5479696+mvermeulen@users.noreply.github.com>
parent 165d1a17
@@ -177,8 +177,7 @@ rocm_enable_cppcheck(
     passedByValue
     unusedStructMember
     functionStatic
-    functionConst:*program.*
-    functionConst:*module.*
+    functionConst
     shadowFunction
     shadowVar
     shadowVariable
@@ -107,9 +107,12 @@ else()
 else()
     list(APPEND CMAKE_COMPILER_WARNINGS
         -Wno-missing-field-initializers
+        -Wno-maybe-uninitialized
         # -Wno-deprecated-declarations
     )
 endif()
-add_definitions(${CMAKE_COMPILER_WARNINGS})
+foreach(COMPILER_WARNING ${CMAKE_COMPILER_WARNINGS})
+    add_compile_options($<$<COMPILE_LANGUAGE:${COMPILER}>:${COMPILER_WARNING}>)
+endforeach()
 endforeach()
 endif ()
eliminate_contiguous.cpp:
@@ -77,9 +77,7 @@ void eliminate_contiguous::apply(module& p) const
     auto args = ins->inputs();
     for(auto arg : ins->inputs())
     {
-        // TODO: Pass in names for the operator in the constructor instead
-        // of using ends_with
-        if(ends_with(arg->name(), "contiguous"))
+        if(arg->name() == op_name)
         {
             auto new_args = args;
             auto prev = arg->inputs().front();
check_shapes.hpp:
@@ -151,6 +151,13 @@ struct check_shapes
         return *this;
     }

+    const check_shapes& packed_or_broadcasted() const
+    {
+        if(!this->all_of([](const shape& s) { return s.packed() or s.broadcasted(); }))
+            MIGRAPHX_THROW(prefix() + "Shapes are not packed nor broadcasted");
+        return *this;
+    }
+
     const check_shapes& not_transposed() const
     {
         if(!this->all_of([](const shape& s) { return not s.transposed(); }))
eliminate_contiguous.hpp:
@@ -15,6 +15,7 @@ struct module;
  */
 struct eliminate_contiguous
 {
+    std::string op_name;
     std::string name() const { return "eliminate_contiguous"; }
     void apply(module& p) const;
 };
cpu target CMakeLists.txt:
+include(CheckCXXCompilerFlag)
 add_library(migraphx_cpu
-    allocation_model.cpp
     allocate.cpp
-    add.cpp
-    contiguous.cpp
+    allocation_model.cpp
+    binary.cpp
     concat.cpp
     convolution.cpp
     copy.cpp
-    mul.cpp
-    pooling.cpp
-    relu.cpp
+    deconvolution.cpp
+    dnnl.cpp
+    eltwise.cpp
+    erf.cpp
+    gather.cpp
     gemm.cpp
-    target.cpp
+    logsoftmax.cpp
     lowering.cpp
+    lrn.cpp
+    pooling.cpp
+    reduction.cpp
+    reorder.cpp
+    softmax.cpp
+    sub.cpp
+    target.cpp
+    write_literals.cpp
 )
 set_target_properties(migraphx_cpu PROPERTIES EXPORT_NAME cpu)
 rocm_set_soversion(migraphx_cpu ${MIGRAPHX_SO_VERSION})
@@ -23,6 +34,7 @@ find_package(dnnl REQUIRED)
 rocm_clang_tidy_check(migraphx_cpu)
 target_link_libraries(migraphx_cpu PRIVATE migraphx Threads::Threads)
 target_link_libraries(migraphx_cpu PRIVATE DNNL::dnnl)
 find_package(OpenMP)
 target_link_libraries(migraphx_cpu PUBLIC OpenMP::OpenMP_CXX)
 # Add library path to rpath to workaround issues with our broken packages
new file binary.cpp:
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_binary : dnnl_op<dnnl_binary, dnnl::binary>
{
std::string algo;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.algo, "algo"));
}
std::string name() const { return "dnnl::binary"; }
shape compute_shape(std::vector<shape> inputs) const
{
// Compensate for allocation
inputs.pop_back();
check_shapes{inputs, *this}.has(2);
auto s0 = inputs.at(0);
auto s1 = inputs.at(1);
auto r = s0;
if(s0 != s1 or !s0.packed())
{
r = shape{s0.type(), s0.lens()};
}
// Call to get_primitive to make sure an algo is available
this->get_primitive(this->to_memory_desc(r, inputs));
return r;
}
dnnl::binary::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
return {to_dnnl_algo(algo), m.at(DNNL_ARG_SRC_0), m.at(DNNL_ARG_SRC_1), m.at(DNNL_ARG_DST)};
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
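
A convention worth noting in compute_shape above: the backend appends the preallocated output buffer as a final input, which is why validation starts by popping it off. The following standalone sketch (toy names, not from the codebase) illustrates just that calling convention:

#include <cassert>
#include <vector>

// Stand-in for a shape; the real check validates types and lens as well.
struct toy_shape
{
    int elements;
};

// Ops that write into a caller-provided buffer take it as the last input;
// shape computation pops it off before validating the real operands.
toy_shape compute_shape_with_allocation(std::vector<toy_shape> inputs)
{
    assert(inputs.size() >= 2);
    inputs.pop_back();          // drop the trailing allocation argument
    assert(inputs.size() == 2); // a binary op sees exactly two operands
    return inputs.front();      // result shape follows the first operand here
}

int main()
{
    // two operands plus the output allocation
    toy_shape r = compute_shape_with_allocation({{16}, {16}, {16}});
    assert(r.elements == 16);
}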
new file deconvolution.cpp:
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/deconvolution.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_deconvolution
: dnnl_extend_op<dnnl_deconvolution, dnnl::deconvolution_forward, op::deconvolution>
{
std::vector<int> arg_map(int) const { return {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS}; }
shape adjust_shape(const shape& x, int i) const
{
auto s = base_adjust_shape(x);
if(i == 1)
{
// The input and output channels are flipped for dnnl
auto lens = s.lens();
std::swap(lens[0], lens[1]);
auto strides = s.strides();
std::swap(strides[0], strides[1]);
return {s.type(), lens, strides};
}
return s;
}
dnnl::deconvolution_forward::desc
get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
// In DNNL dilation is zero-based
auto dilation = op.dilation;
std::transform(
dilation.begin(), dilation.end(), dilation.begin(), [](auto x) { return x - 1; });
return {dnnl::prop_kind::forward_inference,
dnnl::algorithm::deconvolution_direct,
m.at(DNNL_ARG_SRC),
m.at(DNNL_ARG_WEIGHTS),
m.at(DNNL_ARG_DST),
to_dnnl_dims(op.stride),
to_dnnl_dims(dilation),
to_dnnl_dims(op.padding),
to_dnnl_dims(op.padding)};
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
new file dnnl.cpp:
#include <migraphx/cpu/dnnl.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
dnnl_context& get_dnnl_context()
{
static dnnl_context ctx{}; // NOLINT
return ctx;
}
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wswitch-enum"
#endif
dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t)
{
using dt = dnnl::memory::data_type;
using st = shape::type_t;
switch(t)
{
case st::half_type: return dt::f16;
case st::float_type: return dt::f32;
case st::int32_type: return dt::s32;
case st::int8_type: return dt::s8;
case st::uint8_type: return dt::u8;
default: MIGRAPHX_THROW("Unsupported data type");
}
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif
dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n)
{
switch(n)
{
case 1: return dnnl::memory::format_tag::a;
case 2: return dnnl::memory::format_tag::ab;
case 3: return dnnl::memory::format_tag::abc;
case 4: return dnnl::memory::format_tag::abcd;
case 5: return dnnl::memory::format_tag::abcde;
case 6: return dnnl::memory::format_tag::abcdef;
default: MIGRAPHX_THROW("Unsupported tensor size: " + std::to_string(n));
}
}
dnnl::memory::desc to_dnnl_memory_desc(const shape& s)
{
return {to_dnnl_dims(s.lens()), to_dnnl_memory_data_type(s.type()), to_dnnl_dims(s.strides())};
}
dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a)
{
return dnnl::memory(desc, get_dnnl_context().engine, a.data());
}
dnnl::memory to_dnnl_memory(const argument& a)
{
return to_dnnl_memory(to_dnnl_memory_desc(a.get_shape()), a);
}
// clang-format off
#define MIGRAPHX_VISIT_DNNL_ALGO(m) \
m(undef) \
m(convolution_auto) \
m(convolution_direct) \
m(convolution_winograd) \
m(deconvolution_direct) \
m(deconvolution_winograd) \
m(eltwise_relu) \
m(eltwise_tanh) \
m(eltwise_elu) \
m(eltwise_square) \
m(eltwise_abs) \
m(eltwise_sqrt) \
m(eltwise_swish) \
m(eltwise_linear) \
m(eltwise_bounded_relu) \
m(eltwise_soft_relu) \
m(eltwise_logistic) \
m(eltwise_exp) \
m(eltwise_gelu) \
m(eltwise_gelu_tanh) \
m(eltwise_gelu_erf) \
m(eltwise_log) \
m(eltwise_clip) \
m(eltwise_pow) \
m(eltwise_round) \
m(eltwise_relu_use_dst_for_bwd) \
m(eltwise_tanh_use_dst_for_bwd) \
m(eltwise_elu_use_dst_for_bwd) \
m(eltwise_sqrt_use_dst_for_bwd) \
m(eltwise_logistic_use_dst_for_bwd) \
m(eltwise_exp_use_dst_for_bwd) \
m(lrn_across_channels) \
m(lrn_within_channel) \
m(pooling_max) \
m(pooling_avg) \
m(pooling_avg_include_padding) \
m(pooling_avg_exclude_padding) \
m(vanilla_rnn) \
m(vanilla_lstm) \
m(vanilla_gru) \
m(lbr_gru) \
m(binary_add) \
m(binary_mul) \
m(binary_max) \
m(binary_min) \
m(binary_div) \
m(resampling_nearest) \
m(resampling_linear) \
m(reduction_max) \
m(reduction_min) \
m(reduction_sum) \
m(reduction_mul) \
m(reduction_mean) \
m(reduction_norm_lp_max) \
m(reduction_norm_lp_sum) \
m(reduction_norm_lp_power_p_max) \
m(reduction_norm_lp_power_p_sum)
// clang-format on
const std::unordered_map<std::string, dnnl::algorithm>& dnnl_algo_map()
{
static const std::unordered_map<std::string, dnnl::algorithm> m = {
#define MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR(x) {#x, dnnl::algorithm::x},
MIGRAPHX_VISIT_DNNL_ALGO(MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR)
#undef MIGRAPHX_DNNL_ALGO_GENERATE_VISITOR
};
return m;
}
dnnl::algorithm to_dnnl_algo(const std::string& name)
{
if(dnnl_algo_map().count(name) == 0)
MIGRAPHX_THROW("Missing dnnl algo: " + name);
return dnnl_algo_map().at(name);
}
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
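
The MIGRAPHX_VISIT_DNNL_ALGO list above is an X-macro: one list macro, expanded at each use site with a different visitor to generate code such as the string-to-enum table. A minimal self-contained sketch of the same pattern (toy names, not the real algo list):

#include <iostream>
#include <string>
#include <unordered_map>

enum class algo { binary_add, binary_mul, binary_max };

// The list is written once; each entry is passed through the visitor macro m.
#define VISIT_ALGO(m) m(binary_add) m(binary_mul) m(binary_max)

const std::unordered_map<std::string, algo>& algo_map()
{
    static const std::unordered_map<std::string, algo> m = {
#define GENERATE_ENTRY(x) {#x, algo::x},
        VISIT_ALGO(GENERATE_ENTRY)
#undef GENERATE_ENTRY
    };
    return m;
}

int main()
{
    std::cout << (algo_map().count("binary_mul") ? "found" : "missing") << "\n";
}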
new file eltwise.cpp:
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_eltwise : dnnl_op<dnnl_eltwise, dnnl::eltwise_forward>
{
std::string algo;
float alpha = 0;
float beta = 0;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.algo, "algo"), f(self.alpha, "alpha"), f(self.beta, "beta"));
}
std::string name() const { return "dnnl::eltwise"; }
shape compute_shape(std::vector<shape> inputs) const
{
// Compensate for allocation
inputs.pop_back();
check_shapes{inputs, *this}.has(1).packed();
auto s = inputs.at(0);
auto r = s;
if(not s.packed())
r = shape{s.type(), s.lens()};
// Call to get_primitive to make sure an algo is available
this->get_primitive(this->to_memory_desc(r, inputs));
return r;
}
dnnl::eltwise_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
return {dnnl::prop_kind::forward_inference,
to_dnnl_algo(algo),
m.at(DNNL_ARG_SRC_0),
alpha,
beta};
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
contiguous.cpp renamed to erf.cpp:
 #include <migraphx/config.hpp>
 #include <migraphx/cpu/pointwise.hpp>
-#include <migraphx/op/contiguous.hpp>
+#include <migraphx/op/erf.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace cpu {

-template struct cpu_unary<op::contiguous>;
+template struct cpu_unary<op::erf>;

 } // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS
new file gather.cpp:
#include <migraphx/config.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/op/gather.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct cpu_gather : auto_register_op<cpu_gather>
{
op::gather op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::" + op.name(); }
shape compute_shape(std::vector<shape> inputs) const
{
// Compensate for allocation
inputs.pop_back();
check_shapes(inputs, *this).standard();
return migraphx::compute_shape(op, inputs);
}
argument
// cppcheck-suppress constParameter
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{
std::size_t nelements = output_shape.elements();
auto lens = args[0].get_shape().lens();
auto axis_dim_size = lens[op.axis];
lens[op.axis] = args[1].get_shape().elements();
shape out_comp{output_shape.type(), lens};
visit_all(args.back(), args[0])([&](auto output, auto input) {
args[1].visit([&](auto indices) {
const auto* indices_ptr = indices.data();
auto* output_ptr = output.data();
ctx.bulk_execute(nelements, 1024, [=](auto start, auto end) {
for(auto i = start; i < end; i++)
{
auto idx = out_comp.multi(i);
auto in_index = indices_ptr[idx[op.axis]];
in_index = (in_index < 0) ? in_index + axis_dim_size : in_index;
idx[op.axis] = in_index;
output_ptr[i] = input(idx.begin(), idx.end());
}
});
});
});
return args.back();
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
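
For reference, gather picks elements along one axis, wrapping negative indices by the axis length exactly as the in_index adjustment above does. A simplified standalone sketch of those semantics for a 2-D row gather (hypothetical helper, not the operator itself):

#include <cstddef>
#include <iostream>
#include <vector>

std::vector<float> gather_axis0(const std::vector<float>& in,
                                std::size_t rows, std::size_t cols,
                                const std::vector<long>& indices)
{
    std::vector<float> out;
    for(long idx : indices)
    {
        long r = idx < 0 ? idx + static_cast<long>(rows) : idx; // wrap negatives
        for(std::size_t c = 0; c < cols; c++)
            out.push_back(in[r * cols + c]);
    }
    return out;
}

int main()
{
    std::vector<float> m = {1, 2, 3, 4, 5, 6}; // a 3x2 matrix, row-major
    auto g = gather_axis0(m, 3, 2, {-1, 0});   // rows 2 then 0
    for(float x : g)
        std::cout << x << " "; // prints: 5 6 1 2
    std::cout << "\n";
}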
gemm.cpp:
@@ -15,30 +15,7 @@ struct dnnl_gemm : dnnl_extend_op<dnnl_gemm, dnnl::matmul, op::dot>
 {
     std::vector<int> arg_map(int) const { return {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS}; }

-    // Batch must be a single dimension
-    shape adjust_shape(shape x, int) const
-    {
-        auto s = base_adjust_shape(x);
-        auto ndims = s.lens().size();
-        if(ndims > 3)
-        {
-            if(not std::is_sorted(
-                   s.strides().begin(), s.strides().begin() + (ndims - 2), std::greater<>{}))
-                MIGRAPHX_THROW("Batch transposed");
-            std::size_t batch = std::accumulate(
-                s.lens().begin(), s.lens().begin() + (ndims - 2), 1, std::multiplies<>{});
-            shape s3d{s.type(),
-                      {batch, s.lens()[ndims - 2], s.lens()[ndims - 1]},
-                      {s.lens()[ndims - 2] * s.lens()[ndims - 1],
-                       s.strides()[ndims - 2],
-                       s.strides()[ndims - 1]}};
-            return s3d;
-        }
-        else
-        {
-            return s;
-        }
-    }
+    void required(const check_shapes& cs) const { cs.not_broadcasted(); }

     dnnl::matmul::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
     {
cpu dnnl.hpp:
@@ -21,46 +21,11 @@ struct dnnl_context
     dnnl_context() : engine(dnnl::engine::kind::cpu, 0), stream(engine) {}
 };

-inline dnnl_context& get_dnnl_context()
-{
-    static dnnl_context ctx{}; // NOLINT
-    return ctx;
-}
+dnnl_context& get_dnnl_context();

-#ifdef __clang__
-#pragma clang diagnostic push
-#pragma clang diagnostic ignored "-Wswitch-enum"
-#endif
-inline dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t)
-{
-    using dt = dnnl::memory::data_type;
-    using st = shape::type_t;
-    switch(t)
-    {
-    case st::half_type: return dt::f16;
-    case st::float_type: return dt::f32;
-    case st::int32_type: return dt::s32;
-    case st::int8_type: return dt::s8;
-    case st::uint8_type: return dt::u8;
-    default: MIGRAPHX_THROW("Unsupported data type");
-    }
-}
-#ifdef __clang__
-#pragma clang diagnostic pop
-#endif
+dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t);

-inline dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n)
-{
-    switch(n)
-    {
-    case 1: return dnnl::memory::format_tag::a;
-    case 2: return dnnl::memory::format_tag::ab;
-    case 3: return dnnl::memory::format_tag::abc;
-    case 4: return dnnl::memory::format_tag::abcd;
-    case 5: return dnnl::memory::format_tag::abcde;
-    case 6: return dnnl::memory::format_tag::abcdef;
-    default: MIGRAPHX_THROW("Unsupported tensor size: " + std::to_string(n));
-    }
-}
+dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n);

 template <class R>
 inline dnnl::memory::dims to_dnnl_dims(R&& r)
@@ -68,20 +33,13 @@ inline dnnl::memory::dims to_dnnl_dims(R&& r)
     return {r.begin(), r.end()};
 }

-inline dnnl::memory::desc to_dnnl_memory_desc(const shape& s)
-{
-    return {to_dnnl_dims(s.lens()), to_dnnl_memory_data_type(s.type()), to_dnnl_dims(s.strides())};
-}
+dnnl::memory::desc to_dnnl_memory_desc(const shape& s);

-inline dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a)
-{
-    return dnnl::memory(desc, get_dnnl_context().engine, a.data());
-}
+dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a);

-inline dnnl::memory to_dnnl_memory(const argument& a)
-{
-    return to_dnnl_memory(to_dnnl_memory_desc(a.get_shape()), a);
-}
+dnnl::memory to_dnnl_memory(const argument& a);
+
+dnnl::algorithm to_dnnl_algo(const std::string& name);

 template <class Derived, class Primitive>
 struct dnnl_op : auto_register_op<Derived>
@@ -208,12 +166,16 @@ struct dnnl_extend_op : dnnl_op<Derived, Primitive>
         return migraphx::reflect(self.op, f);
     }

+    // dnnl has some issues with non-packed inputs
+    void required(const check_shapes& cs) const { cs.packed_or_broadcasted(); }
+
     std::string name() const { return "dnnl::" + op.name(); }
     shape compute_shape(std::vector<shape> inputs) const
     {
+        const auto& self = static_cast<const Derived&>(*this);
         // Compensate for allocation
         inputs.pop_back();
-        // check_shapes(inputs, *this).standard();
+        self.required(check_shapes(inputs, self));
         auto r = migraphx::compute_shape(op, inputs);
         // Call to get_primitive to make sure an algo is available
         this->get_primitive(this->to_memory_desc(r, inputs));
cpu parallel.hpp:
 #ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_PARALLEL_HPP
 #define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_PARALLEL_HPP

+// #define MIGRAPHX_DISABLE_OMP
 #include <migraphx/config.hpp>
+#ifdef MIGRAPHX_DISABLE_OMP
+#include <migraphx/par_for.hpp>
+#else
 #include <omp.h>
+#endif

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace cpu {

+#ifdef MIGRAPHX_DISABLE_OMP
+inline std::size_t max_threads() { return std::thread::hardware_concurrency(); }
+
+template <class F>
+void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
+{
+    if(threadsize <= 1)
+    {
+        f(std::size_t{0}, n);
+    }
+    else
+    {
+        std::vector<joinable_thread> threads(threadsize);
+// Using const here causes gcc 5 to ICE
+#if(!defined(__GNUC__) || __GNUC__ != 5)
+        const
+#endif
+            std::size_t grainsize = std::ceil(static_cast<double>(n) / threads.size());
+        std::size_t work = 0;
+        std::generate(threads.begin(), threads.end(), [=, &work] {
+            auto result =
+                joinable_thread([=]() mutable { f(work, std::min(n, work + grainsize)); });
+            work += grainsize;
+            return result;
+        });
+        // cppcheck-suppress unsignedLessThanZero
+        assert(work >= n);
+    }
+}
+#else
+inline std::size_t max_threads() { return omp_get_max_threads(); }
+
 template <class F>
 void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
 {
@@ -18,19 +59,19 @@ void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
     else
     {
         std::size_t grainsize = std::ceil(static_cast<double>(n) / threadsize);
-#pragma omp parallel num_threads(threadsize)
+#pragma omp parallel for num_threads(threadsize) schedule(static, 1) private(grainsize, n)
+        for(std::size_t tid = 0; tid < threadsize; tid++)
         {
-            std::size_t tid = omp_get_thread_num();
             std::size_t work = tid * grainsize;
             f(work, std::min(n, work + grainsize));
         }
     }
 }
+#endif

 template <class F>
 void parallel_for(std::size_t n, std::size_t min_grain, F f)
 {
-    const auto threadsize = std::min<std::size_t>(omp_get_num_threads(), n / min_grain);
+    const auto threadsize = std::min<std::size_t>(max_threads(), n / min_grain);
     parallel_for_impl(n, threadsize, f);
 }
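
The OMP path above hands each thread one contiguous chunk of ceil(n/threadsize) elements, clamping the last chunk to n. A standalone sketch of that partitioning (run sequentially here so the ranges are easy to print; names are illustrative):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>

template <class F>
void split_ranges(std::size_t n, std::size_t threadsize, F f)
{
    std::size_t grainsize = std::ceil(static_cast<double>(n) / threadsize);
    for(std::size_t tid = 0; tid < threadsize; tid++)
    {
        std::size_t work = tid * grainsize;
        f(work, std::min(n, work + grainsize)); // [start, end) for this "thread"
    }
}

int main()
{
    // 10 items over 4 workers: [0,3) [3,6) [6,9) [9,10)
    split_ranges(10, 4, [](std::size_t b, std::size_t e) {
        std::cout << "[" << b << ", " << e << ")\n";
    });
}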
cpu pointwise.hpp:
@@ -14,6 +14,8 @@ namespace cpu {
 struct multi_index
 {
+    constexpr multi_index() = default;
+
     multi_index(const shape& s, std::size_t i) : n(s.lens().size())
     {
         assert(n < max_size);
@@ -21,17 +23,17 @@ struct multi_index
         s.multi_copy(i, index, index + max_size);
     }

-    std::size_t size() const { return n; }
+    constexpr std::size_t size() const { return n; }

-    std::size_t* begin() { return index; }
-    const std::size_t* begin() const { return index; }
+    constexpr std::size_t* begin() { return index; }
+    constexpr const std::size_t* begin() const { return index; }

-    std::size_t* end() { return index + size(); }
-    const std::size_t* end() const { return index + size(); }
+    constexpr std::size_t* end() { return index + size(); }
+    constexpr const std::size_t* end() const { return index + size(); }

     std::size_t offset(const shape& s) const { return s.index(begin(), end()); }

-    void carry()
+    constexpr void carry()
     {
         std::size_t overflow = 0;
         for(std::ptrdiff_t i = size() - 1; i > 0; i--)
@@ -40,29 +42,34 @@ struct multi_index
             // Reset overflow
             overflow = 0;
             // Compute overflow using while loop instead of mod
+            // overflow = z / dims[i];
+            // z = z % dims[i];
             while(z >= dims[i])
             {
                 z -= dims[i];
                 overflow += 1;
             }
             index[i] = z;
+            // Exit if there is no overflow
+            if(overflow == 0)
+                return;
         }
         index[0] += overflow;
     }

-    void increment(std::size_t i)
+    constexpr void increment(std::size_t i)
     {
         index[size() - 1] += i;
         carry();
     }

-    multi_index& operator+=(std::size_t i)
+    constexpr multi_index& operator+=(std::size_t i)
     {
         increment(i);
         return *this;
     }

-    multi_index& operator++()
+    constexpr multi_index& operator++()
     {
         increment(1);
         return *this;
@@ -76,9 +83,9 @@ struct multi_index

     private:
     static const std::size_t max_size = 5;
-    std::size_t index[max_size];
-    std::size_t dims[max_size];
-    std::size_t n;
+    std::size_t index[max_size] = {};
+    std::size_t dims[max_size]  = {};
+    std::size_t n               = 0;
 };
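
The carry routine treats the index as a mixed-radix number: overflow propagates from the last digit toward the first, using repeated subtraction instead of division, and the new early exit stops as soon as a digit absorbs the overflow. A standalone sketch with a worked example:

#include <cstddef>
#include <iostream>

void carry(std::size_t* index, const std::size_t* dims, std::size_t n)
{
    std::size_t overflow = 0;
    for(std::ptrdiff_t i = n - 1; i > 0; i--)
    {
        std::size_t z = index[i] + overflow;
        overflow      = 0;
        while(z >= dims[i]) // cheaper than z / dims[i] when overflow is small
        {
            z -= dims[i];
            overflow += 1;
        }
        index[i] = z;
        if(overflow == 0) // early exit: nothing left to propagate
            return;
    }
    index[0] += overflow;
}

int main()
{
    std::size_t idx[3]  = {0, 1, 5};
    std::size_t dims[3] = {2, 2, 3};
    carry(idx, dims, 3); // {0,1,5} in radices {2,2,3} normalizes to {1,0,2}
    std::cout << idx[0] << idx[1] << idx[2] << "\n"; // prints 102
}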
struct reduce_dims_base struct reduce_dims_base
...@@ -104,6 +111,153 @@ struct reduce_dims_base ...@@ -104,6 +111,153 @@ struct reduce_dims_base
} }
}; };
template <class T, std::size_t N>
struct vec
{
using array_type = std::array<T, N>;
using vector_type __attribute__((vector_size(N * sizeof(T)))) = T;
union
{
array_type array;
vector_type vector;
};
static_assert(sizeof(array_type) == sizeof(vector_type), "Not the same size");
};
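
The vec union above overlays a std::array with a GCC/Clang vector_size type, so elements stay individually addressable while arithmetic can operate on the whole SIMD register. A standalone sketch assuming a compiler with these vector extensions (toy names):

#include <array>
#include <cstddef>
#include <iostream>

template <class T, std::size_t N>
struct simd_vec
{
    using array_type = std::array<T, N>;
    using vector_type __attribute__((vector_size(N * sizeof(T)))) = T;
    union
    {
        array_type array;   // element-wise, addressable access
        vector_type vector; // whole-register SIMD arithmetic
    };
};

int main()
{
    simd_vec<float, 4> a, b, c;
    for(std::size_t i = 0; i < 4; i++)
    {
        a.array[i] = i + 1;        // 1 2 3 4
        b.array[i] = 10 * (i + 1); // 10 20 30 40
    }
    c.vector = a.vector + b.vector; // one vector add across all four lanes
    for(float x : c.array)
        std::cout << x << " "; // 11 22 33 44
    std::cout << "\n";
}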
template <class T>
constexpr std::integral_constant<std::size_t, 0> vec_size(const T&)
{
return {};
}
template <class T, std::size_t N>
constexpr std::integral_constant<std::size_t, N> vec_size(const vec<T, N>&)
{
return {};
}
template <class T>
constexpr std::size_t vec_size()
{
return decltype(vec_size(std::declval<T>())){};
}
template <class F, class V, class... Vs, MIGRAPHX_REQUIRES((vec_size<V>() > 0))>
void vec_apply(F f, V& v, Vs... vs)
{
assert(all_of({vec_size<Vs>()...}, [&](auto n) { return n == vec_size<V>(); }));
assert(vec_size<V>() == v.array.size());
for(std::size_t i = 0; i < vec_size<V>(); i++)
f(v.array[i], vs.vector[i]...);
}
template <class F, class V, class... Vs, MIGRAPHX_REQUIRES((vec_size<V>() == 0))>
void vec_apply(F f, V& v, Vs&... vs)
{
f(v, vs...);
}
inline std::size_t find_packed_len(const shape& s)
{
for(std::size_t i = 0; i < s.lens().size(); i++)
{
if(s.lens()[i] > 1 and s.strides()[i] == 1)
{
return i;
}
}
return -1;
}
template <std::size_t N>
shape vectorize(const shape& s)
{
assert(s.standard() or s.broadcasted());
auto lens = s.lens();
if(s.broadcasted())
{
auto n = find_packed_len(s);
assert(n != -1);
assert((lens[n] % N) == 0);
lens[n] /= N;
return {s.type(), lens, s.strides()};
}
assert((lens.back() % N) == 0);
lens.back() /= N;
return {s.type(), lens};
}
template <std::size_t N, class T>
tensor_view<vec<T, N>> vectorize(tensor_view<T> tv)
{
return {vectorize<N>(tv.get_shape()), reinterpret_cast<vec<T, N>*>(tv.data())};
}
template <class T>
struct is_vector_type : std::false_type
{
};
template <>
struct is_vector_type<float> : std::true_type
{
};
template <class... Ts>
struct is_vector_tensor_view : and_<is_vector_type<typename Ts::value_type>{}...>
{
};
template <std::size_t N, class... Xs>
bool is_vectorizable(const Xs&... xs)
{
return all_of({xs...}, [](const auto& s) {
if(s.standard() and (s.lens().back() % N) == 0)
return true;
if(s.broadcasted())
{
auto n = std::inner_product(s.lens().begin(),
s.lens().end(),
s.strides().begin(),
0,
std::plus<>{},
[&](auto len, auto stride) -> std::size_t {
if(stride > 0 and len == 1)
return 0;
return stride;
});
if(n == 1)
{
auto i = find_packed_len(s);
assert(i != -1);
return (s.lens()[i] % N) == 0;
}
}
return false;
});
}
template <class... Ts, MIGRAPHX_REQUIRES(is_vector_tensor_view<Ts...>{})>
auto auto_vectorize(const shape& base_shape, Ts... xs)
{
return [=](auto f) {
if(is_vectorizable<32>(base_shape, xs.get_shape()...))
f(vectorize<32>(base_shape), vectorize<32>(xs)...);
else if(is_vectorizable<8>(base_shape, xs.get_shape()...))
f(vectorize<8>(base_shape), vectorize<8>(xs)...);
else
f(base_shape, xs...);
};
}
template <class... Ts, MIGRAPHX_REQUIRES(not is_vector_tensor_view<Ts...>{})>
auto auto_vectorize(const shape& base_shape, Ts... xs)
{
return [=](auto f) { f(base_shape, xs...); };
}
 template <class X, class... Xs>
 bool is_standard_offset(const X& x, const Xs&... xs)
 {
@@ -116,15 +270,15 @@ bool is_standard_offset(const X& x, const Xs&... xs)
 }

 template <class... Ts>
-auto pointwise(Ts... xs)
+auto pointwise_apply(Ts... ts)
 {
     return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable {
-        if(is_standard_offset(xs.get_shape()...))
+        if(is_standard_offset(ts.get_shape()...))
         {
             ctx.bulk_execute(base_shape.elements(), min_grain, [=](auto start, auto end) mutable {
                 for(auto i = start; i < end; i++)
                 {
-                    f(xs.data()[i]...);
+                    vec_apply(f, ts.data()[i]...);
                 }
             });
         }
@@ -135,7 +289,7 @@ auto pointwise(Ts... xs)
             multi_index mi(base_shape, start);
             for(auto i = start; i < end; i++)
             {
-                f(xs.data()[mi.offset(xs.get_shape())]...);
+                vec_apply(f, ts.data()[mi.offset(ts.get_shape())]...);
                 ++mi;
             }
         });
@@ -143,6 +297,15 @@ auto pointwise(Ts... xs)
     };
 }

+template <class... Ts>
+auto pointwise(Ts... ts)
+{
+    return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable {
+        auto_vectorize(base_shape, ts...)(
+            [&](auto bs, auto... xs) { pointwise_apply(xs...)(ctx, bs, min_grain, f); });
+    };
+}
+
 template <class Op>
 struct cpu_unary : reduce_dims_base, auto_register_op<cpu_unary<Op>>
 {
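
auto_vectorize and the pointwise wrapper above use continuation-passing dispatch: a runtime check picks the widest applicable vector width, then invokes a generic lambda so each width gets its own instantiation. A minimal standalone sketch of that dispatch shape (toy widths, not the real trait machinery):

#include <cstddef>
#include <iostream>
#include <type_traits>

// Pick a width at runtime, then call the continuation with it as a
// compile-time constant so each width compiles to its own kernel.
template <class F>
void dispatch_width(std::size_t n, F f)
{
    if(n % 8 == 0)
        f(std::integral_constant<std::size_t, 8>{}); // vectorized path
    else
        f(std::integral_constant<std::size_t, 1>{}); // scalar fallback
}

int main()
{
    dispatch_width(24, [](auto width) {
        std::cout << "processing in chunks of " << width() << "\n";
    });
}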
new file write_literals.hpp:
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_WRITE_LITERALS_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_WRITE_LITERALS_HPP
#include <migraphx/config.hpp>
#include <string>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct module;
namespace cpu {
struct write_literals
{
std::string name() const { return "cpu::write_literals"; }
void apply(module& m) const;
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
new file logsoftmax.cpp:
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/logsoftmax.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_logsoftmax : dnnl_extend_op<dnnl_logsoftmax, dnnl::logsoftmax_forward, op::logsoftmax>
{
dnnl::logsoftmax_forward::desc
get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
int axis = this->op.axis;
return {dnnl::prop_kind::forward_inference, m.at(DNNL_ARG_SRC_0), axis};
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
lowering.cpp:
@@ -51,145 +51,6 @@ typename std::conditional_t<std::is_integral<T>{}, std::make_signed<T>, std::ena
     return x;
 }

-struct cpu_lrn
-{
-    op::lrn op;
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-    std::string name() const { return "cpu::lrn"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, shape output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        visit_all(result, args[0])([&](auto output, auto input) {
-            int n_batch = output_shape.lens()[0];
-            int channels = output_shape.lens()[1];
-            int height = output_shape.lens()[2];
-            int width = output_shape.lens()[3];
-            float alphaoverarea = op.alpha / float(op.size);
-            int radius_lower = (op.size - 1) / 2;
-            int radius_upper = op.size / 2 + 1;
-
-            par_dfor(n_batch, height, width)([&](int b, int h, int w) {
-                float scale = 0;
-                dfor(channels)([&](int c) {
-                    auto start = (c - radius_lower) < 0 ? 0 : (c - radius_lower);
-                    auto end = (c + radius_upper) > channels ? channels : (c + radius_upper);
-                    for(auto k = start; k < end; ++k)
-                    {
-                        scale += std::pow(input(b, k, h, w), 2);
-                    }
-                    scale *= alphaoverarea;
-                    scale += op.bias;
-                    scale = std::pow(scale, -op.beta);
-                    output(b, c, h, w) = input(b, c, h, w) * scale;
-                });
-            });
-        });
-        return result;
-    }
-};
-MIGRAPHX_REGISTER_OP(cpu_lrn)
-
-template <class Op>
-struct cpu_deconvolution : auto_register_op<cpu_deconvolution<Op>>
-{
-    cpu_deconvolution() = default;
-    cpu_deconvolution(Op pop) : op(std::move(pop)) {}
-    Op op;
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-    std::string name() const { return "cpu::" + op.name(); }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, shape output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
-            using type = typename decltype(output)::value_type;
-            std::fill(output.begin(), output.end(), type{0});
-
-            auto in_lens = input.get_shape().lens();
-            auto in_n = in_lens[0];
-            auto in_c = in_lens[1];
-
-            auto wei = weights.get_shape().lens();
-            auto wei_n = wei[0];
-            auto wei_c = wei[1];
-
-            auto out_lens = output_shape.lens();
-            auto kdims = op.kdims();
-
-            std::vector<std::size_t> win_size{in_c};
-            std::copy(in_lens.begin() + 2, in_lens.end(), std::back_inserter(win_size));
-            std::copy(wei.begin() + 2, wei.end(), std::back_inserter(win_size));
-            shape win_shape{output_shape.type(), win_size};
-
-            par_dfor(in_n, wei_c)([&](int o, int k) {
-                shape_for_each(win_shape, [&](auto idx_win) {
-                    const int w = idx_win[0];
-                    auto input_dims_start = idx_win.begin() + 1;
-                    auto wei_dims_start = idx_win.begin() + kdims + 1;
-                    std::vector<std::ptrdiff_t> win_start;
-                    for(std::size_t n = 0; n < kdims; ++n)
-                    {
-                        win_start.push_back(std::ptrdiff_t(*(input_dims_start + n) * op.stride[n]) -
-                                            std::ptrdiff_t(op.padding[n]));
-                    }
-                    const int group_id = w / (wei_n / op.group);
-                    const int in_ch = group_id * wei_c + k;
-
-                    std::vector<std::ptrdiff_t> idx_out{o, in_ch};
-                    for(size_t n = 0; n < kdims; n++)
-                    {
-                        idx_out.push_back(win_start[n] + *(wei_dims_start + n) * op.dilation[n]);
-                    }
-                    std::vector<std::ptrdiff_t> idx_wei{w, k};
-                    std::copy(wei_dims_start, idx_win.end(), std::back_inserter(idx_wei));
-                    std::vector<std::ptrdiff_t> idx_in{o, w};
-                    std::copy(input_dims_start, wei_dims_start, std::back_inserter(idx_in));
-
-                    if(std::all_of(
-                           idx_out.begin() + 2, idx_out.end(), [&](auto ii) { return ii >= 0; }) and
-                       std::equal(idx_out.begin() + 2,
-                                  idx_out.end(),
-                                  out_lens.begin() + 2,
-                                  out_lens.end(),
-                                  std::less<std::ptrdiff_t>{}))
-                    {
-                        output(idx_out.begin(), idx_out.end()) +=
-                            input(idx_in.begin(), idx_in.end()) *
-                            weights(idx_wei.begin(), idx_wei.end());
-                    }
-                });
-            });
-        });
-        return result;
-    }
-};
-template struct cpu_deconvolution<op::deconvolution>;

 struct cpu_im2col
 {
     op::im2col op;
@@ -334,17 +195,6 @@ struct leaky_relu_op
     }
 };

-struct elu_op
-{
-    op::elu op;
-    std::string name() const { return "cpu::elu"; }
-    auto fcn() const
-    {
-        auto a = op.alpha;
-        return [a](auto x) { return x > 0 ? x : a * std::expm1(x); };
-    }
-};

 template <typename Op>
 struct cpu_unary2 : auto_register_op<cpu_unary2<Op>>
 {
@@ -382,78 +232,6 @@ struct cpu_unary2 : auto_register_op<cpu_unary2<Op>>
     }
 };
 template struct cpu_unary2<leaky_relu_op>;
-template struct cpu_unary2<elu_op>;
-
-template <class Op>
-struct cpu_softmax : auto_register_op<cpu_softmax<Op>>
-{
-    cpu_softmax() = default;
-    cpu_softmax(Op pop) : op(std::move(pop)) {}
-    Op op;
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return migraphx::reflect(self.op, f);
-    }
-    std::string name() const { return "cpu::" + op.name(); }
-    shape compute_shape(const std::vector<shape>& inputs) const
-    {
-        check_shapes{inputs, *this}.has(1).standard();
-        return op.normalize_compute_shape(inputs);
-    }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        auto batch_lens = output_shape.lens();
-        int64_t tuned_axis = tune_axis(args[0].get_shape().lens().size(), op.axis, op.name());
-        std::size_t n_dims = batch_lens[tuned_axis];
-        batch_lens[tuned_axis] = 1;
-        shape batch_shape{shape::int32_type, batch_lens};
-
-        visit_all(result, args[0])([&](auto output, auto input) {
-            using value_type = typename decltype(input)::value_type;
-            std::vector<value_type> batch_max(batch_shape.elements(),
-                                              std::numeric_limits<value_type>::lowest());
-            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
-            par_for(batch_shape.elements(), [&](auto i) {
-                auto idx = batch_shape.multi(i);
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    idx[tuned_axis] = j;
-                    batch_max[i] = std::max(batch_max[i], input(idx.begin(), idx.end()));
-                }
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    idx[tuned_axis] = j;
-                    std::size_t index = output_shape.index(idx);
-                    output[index] = std::exp(input[index] - batch_max[i]);
-                }
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    idx[tuned_axis] = j;
-                    batch_sum[i] += output(idx.begin(), idx.end());
-                }
-                for(std::size_t j = 0; j < n_dims; ++j)
-                {
-                    idx[tuned_axis] = j;
-                    output(idx.begin(), idx.end()) =
-                        op.output()(output(idx.begin(), idx.end()), batch_sum[i]);
-                }
-            });
-        });
-        return result;
-    }
-};
-template struct cpu_softmax<op::softmax>;
-template struct cpu_softmax<op::logsoftmax>;

 struct cpu_rnn_var_sl_last_output
 {
@@ -502,29 +280,6 @@ struct cpu_rnn_var_sl_last_output
 };
 MIGRAPHX_REGISTER_OP(cpu_rnn_var_sl_last_output)

-struct cpu_literal
-{
-    argument data;
-
-    template <class Self, class F>
-    static auto reflect(Self& self, F f)
-    {
-        return pack(f(self.data, "data"));
-    }
-
-    std::string name() const { return "cpu::literal"; }
-
-    shape compute_shape(const std::vector<shape>&) const { return data.get_shape(); }
-
-    argument compute(const shape&, const std::vector<argument>&) const { return data; }
-
-    friend std::ostream& operator<<(std::ostream& os, const cpu_literal& x)
-    {
-        os << x.name();
-        return os;
-    }
-};

 struct cpu_apply
 {
     module* modl;
@@ -553,49 +308,97 @@ struct cpu_apply
         }
     }

-    void extend_op(const std::string& op_name, const std::string& cpu_name, bool allocate = false)
+    void extend_op(const std::string& op_name, const std::string& cpu_name, bool allocate = true)
     {
         apply_map.emplace(op_name, [=](instruction_ref ins) {
             auto&& op = ins->get_operator();
             if(allocate)
-                replace(ins, make_op(cpu_name, op.to_value()));
+                return replace(ins, make_op(cpu_name, op.to_value()));
             return modl->replace_instruction(ins, make_op(cpu_name, op.to_value()), ins->inputs());
         });
     }

+    void extend_dnnl_algos(const std::string& dnnl_name,
+                           const std::vector<std::pair<std::string, std::string>>& algos)
+    {
+        for(auto&& pp : algos)
+        {
+            std::string op_name = pp.first;
+            std::string algo    = pp.second;
+            apply_map.emplace(op_name, [=](instruction_ref ins) {
+                auto v = ins->get_operator().to_value();
+                if(not v.is_object())
+                    return ins;
+                v["algo"] = algo;
+                auto op   = make_op(dnnl_name, v);
+                return replace(ins, op);
+            });
+        }
+    }
+
     void init()
     {
         create_output_names();

-        extend_op("add", "dnnl::add", true);
-        extend_op("mul", "dnnl::mul", true);
-        extend_op("convolution", "dnnl::convolution", true);
-        extend_op("dot", "dnnl::dot", true);
-        extend_op("relu", "dnnl::relu", true);
-
-        extend_op("contiguous", "cpu::contiguous", true);
-        extend_op("deconvolution", "cpu::deconvolution");
-        extend_op("elu", "cpu::elu");
-        extend_op("im2col", "cpu::im2col");
-        extend_op("leaky_relu", "cpu::leaky_relu");
-        extend_op("logsoftmax", "cpu::logsoftmax");
-        extend_op("lrn", "cpu::lrn");
-        extend_op("pad", "cpu::pad");
-        extend_op("quant_convolution", "cpu::quant_convolution", true);
-        extend_op("quant_dot", "cpu::quant_dot", true);
-        extend_op("rnn_var_sl_last_output", "cpu::rnn_var_sl_last_output");
-        extend_op("softmax", "cpu::softmax");
+        extend_dnnl_algos("dnnl::binary",
+                          {
+                              {"add", "binary_add"},
+                              {"div", "binary_div"},
+                              {"max", "binary_max"},
+                              {"min", "binary_min"},
+                              {"mul", "binary_mul"},
+                          });
+
+        extend_dnnl_algos("dnnl::eltwise",
+                          {
+                              {"abs", "eltwise_abs"},
+                              {"elu", "eltwise_elu"},
+                              {"exp", "eltwise_exp"},
+                              {"log", "eltwise_log"},
+                              {"relu", "eltwise_relu"},
+                              {"sqrt", "eltwise_sqrt"},
+                              {"tanh", "eltwise_tanh"},
+                          });
+
+        extend_dnnl_algos("dnnl::reduction",
+                          {
+                              {"reduce_max", "reduction_max"},
+                              {"reduce_mean", "reduction_mean"},
+                              {"reduce_min", "reduction_min"},
+                              {"reduce_sum", "reduction_sum"},
+                          });
+
+        extend_op("concat", "dnnl::concat");
+        extend_op("contiguous", "dnnl::reorder");
+        extend_op("convolution", "dnnl::convolution");
+        extend_op("deconvolution", "dnnl::deconvolution");
+        extend_op("dot", "dnnl::dot");
+        extend_op("erf", "cpu::erf");
+        extend_op("gather", "cpu::gather");
+        extend_op("logsoftmax", "dnnl::logsoftmax");
+        extend_op("lrn", "dnnl::lrn");
+        extend_op("softmax", "dnnl::softmax");
+        extend_op("sub", "cpu::sub");
+
+        extend_op("im2col", "cpu::im2col", false);
+        extend_op("leaky_relu", "cpu::leaky_relu", false);
+        extend_op("pad", "cpu::pad", false);
+        extend_op("rnn_var_sl_last_output", "cpu::rnn_var_sl_last_output", false);
     }

     void apply()
     {
         init();
+        // Apply these operators first so the inputs can be const folded
         for(auto it : iterator_for(*modl))
         {
-            if(it->name() == "@literal")
+            if(it->name() == "pow")
             {
-                apply_literal(it);
+                apply_pow(it);
             }
-            else if(it->name() == "pooling")
+        }
+        for(auto it : iterator_for(*modl))
+        {
+            if(it->name() == "pooling")
             {
                 apply_pooling(it);
             }
@@ -606,9 +409,15 @@ struct cpu_apply
         }
     }

-    instruction_ref apply_literal(instruction_ref ins) const
+    instruction_ref apply_pow(instruction_ref ins)
     {
-        return modl->replace_instruction(ins, cpu_literal{ins->get_literal().get_argument()});
+        auto beta = read_scalar<float>(ins->inputs()[1]);
+        if(beta.empty())
+            return ins;
+        return replace(ins,
+                       make_op("dnnl::eltwise",
+                               {{"algo", "eltwise_pow"}, {"alpha", 1.0}, {"beta", beta.front()}}),
+                       {ins->inputs().front()});
     }

     instruction_ref apply_pooling(instruction_ref ins)
@@ -626,9 +435,27 @@ struct cpu_apply
         return ins;
     }

+    template <class T>
+    static std::vector<T> read_scalar(instruction_ref ins)
+    {
+        if(ins->name() == "contiguous")
+            return read_scalar<T>(ins->inputs().front());
+        if(ins->get_shape().elements() != 1 and not ins->get_shape().scalar())
+            return {};
+        auto r = ins->eval();
+        if(r.empty())
+            return {};
+        return {r.at<T>()};
+    }
+
     instruction_ref replace(instruction_ref ins, const operation& op)
     {
-        auto inputs = ins->inputs();
+        return replace(ins, op, ins->inputs());
+    }
+
+    instruction_ref
+    replace(instruction_ref ins, const operation& op, std::vector<instruction_ref> inputs)
+    {
         inputs.push_back(insert_allocation(ins, ins->get_shape()));
         return modl->replace_instruction(ins, op, inputs);
     }
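
apply_pow above folds a constant exponent into dnnl's eltwise_pow: read_scalar succeeds only when the exponent operand evaluates to a single value, returning an empty vector otherwise. A standalone sketch of that fold (simplified, plain vectors instead of instructions):

#include <cmath>
#include <iostream>
#include <vector>

// Returns the value only when the operand is a single scalar, mirroring the
// "empty vector means not foldable" style of read_scalar above.
std::vector<float> read_scalar(const std::vector<float>& operand)
{
    if(operand.size() != 1)
        return {};
    return {operand.front()};
}

int main()
{
    std::vector<float> exponent = {2.0f};
    auto beta = read_scalar(exponent);
    if(not beta.empty())
    {
        // pow(x, 2) lowered to a unary eltwise_pow kernel with beta = 2
        std::cout << std::pow(3.0f, beta.front()) << "\n"; // prints 9
    }
}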
mul.cpp renamed to lrn.cpp:
 #include <migraphx/config.hpp>
-#include <migraphx/cpu/pointwise.hpp>
-#include <migraphx/op/mul.hpp>
+#include <migraphx/cpu/dnnl.hpp>
+#include <migraphx/op/lrn.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace cpu {

-struct dnnl_mul : dnnl_extend_op<dnnl_mul, dnnl::binary, op::mul>
+struct dnnl_lrn : dnnl_extend_op<dnnl_lrn, dnnl::lrn_forward, op::lrn>
 {
-    dnnl::binary::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
+    dnnl::lrn_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
     {
-        return {dnnl::algorithm::binary_mul,
+        return {dnnl::prop_kind::forward_inference,
+                dnnl::algorithm::lrn_across_channels,
                 m.at(DNNL_ARG_SRC_0),
-                m.at(DNNL_ARG_SRC_1),
-                m.at(DNNL_ARG_DST)};
+                this->op.size,
+                this->op.alpha,
+                this->op.beta,
+                this->op.bias};
     }
 };