Unverified commit 406afeb8, authored by Paul Fultz II and committed by GitHub

Use dnnl for cpu backend (#688)



* Add flag to enable cpu backend

* Make buffers shared

* Enable optimizations

* Add onednn

* Formatting

* Formatting

* Add dnnl header

* Formatting

* Rewrite rnn first

* Formatting

* Call reference implementation

* Formatting

* Make literal data shared

* Formatting

* Add convolution

* Formatting

* Compensate for dilation

* Formatting

* Use name/make_op instead

* Formatting

* Rename gemm header

* Formatting

* Add dnnl convolution/gemm operators

* Formatting

* Add eliminate_contiguous

* Add faster pointwise operators

* Formatting

* Formatting

* Formatting

* Add dnnl op class

* Formatting

* Add add op

* Formatting

* Add concat operator

* Formatting

* Add more ops

* Create descriptor during finalization

* Formatting

* Don't rewrite pooling

* Enable memory coloring

* Formatting

* Add output aliases

* Formatting

* Fix errors

* Formatting

* Convert literals

* Add missing file

* Remove batch_norm

* Formatting

* Use strides

* Formatting

* Add some debug checks

* Formatting

* Fix bug in adjusting shape for gemm

* Formatting

* Fix fallback dot operator

* Zero initialize buffers

* Add support for group convolutions

* Formatting

* Make adjust allocation target independent

* Formatting

* Enable adjust_allocation for gpu/cpu

* Formatting

* Add copy to allocation model

* Formatting

* Add copy operator

* Formatting

* Better handling of output parameters in adjust_allocation

* Formatting

* Build with dnnl

* Make dnnl required

* Fix compile error

* Tidy fixes

* Formatting

* Tidy fixes

* Formatting

* Fix more tidy issues

* Formatting

* Add mul op

* Add mul op

* Set C compiler to clang as well

* Compensate for normalized compute shape

* Formatting

* Fix cppcheck errors

* Formatting

* Add onednn library to hcc

* Guard clang pragmas

* Disable cpu mode for gcc for now

* Leave it enabled for gcc 7

* Fix cppcheck suppression

* Fix compile error on gcc 5

* Remove unused code
Co-authored-by: Shucai Xiao <shucai.xiao@amd.com>
Co-authored-by: mvermeulen <5479696+mvermeulen@users.noreply.github.com>
parent 8698cd2c
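The commits above wire a oneDNN (DNNL) code path into the cpu target: lowering rewrites supported operators to dnnl::* ops, a shared dnnl engine/stream executes them, and the reference implementations remain as a fallback. For orientation only, a hypothetical end-to-end use of the target might look like the sketch below; the program-building calls (add_parameter, add_instruction, compile) and the cpu target header path are assumptions, since this API has shifted across MIGraphX versions and is not shown in this PR.

#include <migraphx/program.hpp>
#include <migraphx/shape.hpp>
#include <migraphx/op/add.hpp>
#include <migraphx/cpu/target.hpp> // assumed header path for the cpu target

int main()
{
    migraphx::program p;
    migraphx::shape s{migraphx::shape::float_type, {2, 2}};
    auto x = p.add_parameter("x", s);
    auto y = p.add_parameter("y", s);
    p.add_instruction(migraphx::op::add{}, x, y);
    // Lowering can now replace the add with the DNNL-backed dnnl::add when USE_DNNL is set.
    p.compile(migraphx::cpu::target{});
    return 0;
}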
add_library(migraphx_cpu
    allocation_model.cpp
    allocate.cpp
    add.cpp
    contiguous.cpp
    concat.cpp
    convolution.cpp
    copy.cpp
    mul.cpp
    pooling.cpp
    relu.cpp
    gemm.cpp
    target.cpp
    lowering.cpp
    migemm.cpp
)
set_target_properties(migraphx_cpu PROPERTIES EXPORT_NAME cpu)
rocm_set_soversion(migraphx_cpu ${MIGRAPHX_SO_VERSION})
find_path(BLAZE_INCLUDE blaze/Blaze.h)
find_package(Threads)
find_package(dnnl REQUIRED)
rocm_clang_tidy_check(migraphx_cpu)
target_link_libraries(migraphx_cpu PRIVATE migraphx Threads::Threads)
target_include_directories(migraphx_cpu PRIVATE ${BLAZE_INCLUDE})
target_compile_definitions(migraphx_cpu PRIVATE -DBLAZE_USE_CPP_THREADS)
if(dnnl_FOUND)
    target_link_libraries(migraphx_cpu PRIVATE DNNL::dnnl)
    target_compile_definitions(migraphx_cpu PRIVATE -DUSE_DNNL)
    find_package(OpenMP)
    target_link_libraries(migraphx_cpu PUBLIC OpenMP::OpenMP_CXX)
    # Add library path to rpath to workaround issues with our broken packages
    foreach(LIBRARY ${OpenMP_CXX_LIBRARIES})
        if(LIBRARY MATCHES "libomp")
            get_filename_component(LIBRARY_PATH "${LIBRARY}" PATH)
            target_link_libraries(migraphx_cpu PUBLIC -Wl,-rpath=${LIBRARY_PATH} -Wl,-rpath-link=${LIBRARY_PATH})
        endif()
    endforeach()
endif()
target_link_libraries(migraphx_all_targets INTERFACE migraphx_cpu)
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/add.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
template struct cpu_binary<op::add>;
#if USE_DNNL
struct dnnl_add : dnnl_extend_op<dnnl_add, dnnl::binary, op::add>
{
dnnl::binary::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
return {dnnl::algorithm::binary_add,
m.at(DNNL_ARG_SRC_0),
m.at(DNNL_ARG_SRC_1),
m.at(DNNL_ARG_DST)};
}
};
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
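dnnl_add above is the whole cost of adding a DNNL-backed binary operator: derive from dnnl_extend_op and return a descriptor. As a hypothetical illustration (not part of this PR), an elementwise max could follow the same pattern, assuming MIGraphX's op::max header and an oneDNN release that exposes dnnl::algorithm::binary_max.

#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/max.hpp> // assumed: migraphx's elementwise max op
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
#if USE_DNNL
// Hypothetical operator, mirroring dnnl_add/dnnl_mul from this PR.
struct dnnl_max : dnnl_extend_op<dnnl_max, dnnl::binary, op::max>
{
    dnnl::binary::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        return {dnnl::algorithm::binary_max, // availability depends on the oneDNN version
                m.at(DNNL_ARG_SRC_0),
                m.at(DNNL_ARG_SRC_1),
                m.at(DNNL_ARG_DST)};
    }
};
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx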
#include <migraphx/config.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/register_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct cpu_allocate : auto_register_op<cpu_allocate>
{
shape s;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.s, "shape"));
}
std::string name() const { return "cpu::allocate"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(0);
return s;
}
argument compute(context&, const shape& output_shape, const std::vector<argument>&) const
{
argument result{output_shape};
return result;
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/cpu/allocation_model.hpp>
#include <migraphx/make_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
std::string cpu_allocation_model::name() const { return "cpu::allocate"; }
operation cpu_allocation_model::allocate(const shape& s) const
{
return make_op(name(), {{"shape", to_value(s)}});
}
std::string cpu_allocation_model::copy() const { return "cpu::copy"; }
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
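cpu_allocation_model lets target-independent passes (the adjust_allocation and memory-coloring work mentioned in the commit list) create buffers and copies without hard-coding cpu op names. A small illustrative fragment of how it could be queried follows; the surrounding pass is assumed, not shown in the PR.

#include <migraphx/cpu/allocation_model.hpp>
#include <migraphx/shape.hpp>
#include <iostream>

int main()
{
    migraphx::cpu::cpu_allocation_model am;
    migraphx::shape s{migraphx::shape::float_type, {4, 4}};
    migraphx::operation alloc = am.allocate(s); // a "cpu::allocate" op carrying the shape
    std::cout << alloc.name() << " / copy op: " << am.copy() << "\n";
    return 0;
}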
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/concat.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
#if USE_DNNL
struct dnnl_concat : dnnl_extend_op<dnnl_concat, dnnl::concat, op::concat>
{
std::vector<int> arg_map(int size) const
{
std::vector<int> result(size);
std::iota(result.begin(), result.end(), DNNL_ARG_MULTIPLE_SRC);
return result;
}
// Custom desc class since it's missing in dnnl
struct desc
{
dnnl::memory::desc dst;
std::size_t axis = 1;
std::vector<dnnl::memory::desc> srcs;
};
desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
std::vector<dnnl::memory::desc> srcs;
srcs.reserve(m.size() - 1);
for(auto i = 0; i < m.size() - 1; i++)
{
srcs.push_back(m.at(DNNL_ARG_MULTIPLE_SRC + i));
}
return {m.at(DNNL_ARG_DST), std::size_t(op.axis), srcs};
}
auto get_primitive_desc(const desc& d) const
{
return dnnl::concat::primitive_desc(d.dst, d.axis, d.srcs, get_dnnl_context().engine);
}
};
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/contiguous.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
template struct cpu_unary<op::contiguous>;
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/config.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/convolution.hpp>
#include <migraphx/op/quant_convolution.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
template <class V, class T, class... Ts>
void visit_quantize_impl(V&& v, T&& x, Ts&&... xs)
{
x.visit([&](auto y) { visit_all(xs...)([&](auto... ys) { v(y, ys...); }); });
}
template <class T, class... Ts>
auto visit_quantize(T&& x, Ts&&... xs)
{
return [&](auto v) {
// Workaround for https://gcc.gnu.org/bugzilla/show_bug.cgi?id=70100
visit_quantize_impl(v, x, xs...);
};
}
template <class Op>
struct cpu_convolution : auto_register_op<cpu_convolution<Op>>
{
Op op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::" + op.name(); }
shape compute_shape(std::vector<shape> inputs) const
{
inputs.pop_back();
return op.compute_shape(inputs);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
argument compute(context&, shape output_shape, std::vector<argument> args) const
{
visit_quantize(args.back(), args[0], args[1])([&](auto output, auto input, auto weights) {
auto in_lens = input.get_shape().lens();
auto wei_lens = weights.get_shape().lens();
auto wei_n = wei_lens[0];
auto wei_c = wei_lens[1];
std::vector<std::size_t> win_size(wei_lens.begin() + 1, wei_lens.end());
par_for(output_shape.elements(), [&](auto i) {
auto idx_o = output_shape.multi(i);
auto w = idx_o[1];
auto n_dim = idx_o.size();
std::vector<std::ptrdiff_t> win_start;
for(std::size_t dim = 2; dim < n_dim; ++dim)
{
auto d_2 = dim - 2;
win_start.push_back(std::ptrdiff_t(idx_o[dim] * op.stride[d_2]) -
std::ptrdiff_t(op.padding[d_2]));
}
const auto group_id = w / (wei_n / op.group);
shape win_shape{output_shape.type(), win_size};
double acc = 0.0;
shape_for_each(win_shape, [&](auto idx_win) {
auto k = idx_win[0];
const auto in_ch = group_id * wei_c + k;
std::vector<std::ptrdiff_t> idx(idx_o.begin(), idx_o.end());
idx[1] = in_ch;
std::transform(idx_win.begin() + 1,
idx_win.end(),
win_start.begin(),
idx.begin() + 2,
[](std::ptrdiff_t ii, std::ptrdiff_t jj) { return ii + jj; });
std::vector<std::ptrdiff_t> idx_wei(idx_o.size());
idx_wei[0] = w;
std::copy(idx_win.begin(), idx_win.end(), idx_wei.begin() + 1);
if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
std::equal(idx.begin(),
idx.end(),
in_lens.begin(),
in_lens.end(),
std::less<std::ptrdiff_t>{}))
{
acc +=
input(idx.begin(), idx.end()) * weights(idx_wei.begin(), idx_wei.end());
}
});
output[i] = acc;
});
});
return args.back();
}
};
template struct cpu_convolution<op::quant_convolution>;
template struct cpu_convolution<op::convolution>;
#if USE_DNNL
struct dnnl_convolution
: dnnl_extend_op<dnnl_convolution, dnnl::convolution_forward, op::convolution>
{
std::vector<int> arg_map(int) const { return {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS}; }
shape adjust_shape(const shape& x, int i) const
{
auto s = base_adjust_shape(x);
if(i == 1 and op.group > 1)
{
// TODO: Add support for transposed weights
if(not s.standard())
MIGRAPHX_THROW("Weights for grouped convolution must be standard");
auto lens = s.lens();
lens.insert(lens.begin(), op.group);
lens.at(1) /= op.group;
return shape{s.type(), lens};
}
return s;
}
dnnl::convolution_forward::desc
get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
// In DNNL dilation is zero-based
auto dilation = op.dilation;
std::transform(
dilation.begin(), dilation.end(), dilation.begin(), [](auto x) { return x - 1; });
return {dnnl::prop_kind::forward_inference,
dnnl::algorithm::convolution_auto,
m.at(DNNL_ARG_SRC),
m.at(DNNL_ARG_WEIGHTS),
m.at(DNNL_ARG_DST),
to_dnnl_dims(op.stride),
to_dnnl_dims(dilation),
to_dnnl_dims(op.padding),
to_dnnl_dims(op.padding)};
}
};
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
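dnnl_convolution::adjust_shape reshapes grouped weights because MIGraphX stores them as {output_channels, input_channels/group, k...} while oneDNN expects a leading groups dimension. A worked example of that reshape (shapes chosen here for illustration, not taken from the PR):

#include <migraphx/shape.hpp>
#include <cassert>

int main()
{
    // 8 output channels, 8 input channels, group = 2, 3x3 kernel:
    // MIGraphX weights {8, 4, 3, 3} become grouped weights {2, 4, 4, 3, 3},
    // i.e. insert the group count in front and divide the output channels by it.
    migraphx::shape w{migraphx::shape::float_type, {8, 4, 3, 3}};
    migraphx::shape grouped{migraphx::shape::float_type, {2, 4, 4, 3, 3}};
    assert(w.elements() == grouped.elements());
    (void)w;
    (void)grouped;
    return 0;
}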
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct cpu_copy : reduce_dims_base, auto_register_op<cpu_copy>
{
template <class Self, class F>
static auto reflect(Self&, F)
{
return pack();
}
std::string name() const { return "cpu::copy"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2);
return inputs.at(1);
}
argument
// cppcheck-suppress constParameter
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{
argument result = get_arg(args, args.size() - 1);
visit_all(result, get_arg(args, 0))([&](auto output, auto input) {
pointwise(output, input)(ctx, output.get_shape(), 1024, [](auto& y, auto x) { y = x; });
});
return result.reshape(output_shape);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/config.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/cpu/migemm.hpp>
#include <migraphx/op/dot.hpp>
#include <migraphx/op/quant_dot.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
#if USE_DNNL
struct dnnl_gemm : dnnl_extend_op<dnnl_gemm, dnnl::matmul, op::dot>
{
    std::vector<int> arg_map(int) const { return {DNNL_ARG_SRC, DNNL_ARG_WEIGHTS}; }
    // Batch must be a single dimension
    shape adjust_shape(shape x, int) const
    {
        auto s     = base_adjust_shape(x);
        auto ndims = s.lens().size();
        if(ndims > 3)
        {
            if(not std::is_sorted(
                   s.strides().begin(), s.strides().begin() + (ndims - 2), std::greater<>{}))
                MIGRAPHX_THROW("Batch transposed");
            std::size_t batch = std::accumulate(
                s.lens().begin(), s.lens().begin() + (ndims - 2), 1, std::multiplies<>{});
            shape s3d{s.type(),
                      {batch, s.lens()[ndims - 2], s.lens()[ndims - 1]},
                      {s.lens()[ndims - 2] * s.lens()[ndims - 1],
                       s.strides()[ndims - 2],
                       s.strides()[ndims - 1]}};
            return s3d;
        }
        else
        {
            return s;
        }
    }
    dnnl::matmul::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
    {
        return {m.at(DNNL_ARG_SRC), m.at(DNNL_ARG_WEIGHTS), m.at(DNNL_ARG_DST)};
    }
};
#endif
struct cpu_gemm : auto_register_op<cpu_gemm>
{
    op::dot op;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }
    std::string name() const { return "cpu::dot"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
        check_shapes{inputs, *this}.standard();
        inputs.pop_back();
        return op.compute_shape(inputs);
    }
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
    argument compute(context&, const shape&, std::vector<argument> args) const
    {
        // 3 inputs, it is alpha * A * B + beta * C, then
        // A and B are matrices, and C is of the same shape as A * B
        if(args.size() == 4)
        {
            // no need to consider the value of args[2]
            if(op.beta == 0.0f)
            {
                args.back().visit(
                    [&](auto output) { std::fill(output.begin(), output.end(), 0); });
            }
            else
            {
                visit_all(args.back(), args[2])([&](auto output, auto input) {
                    std::copy(input.begin(), input.end(), output.begin());
                });
            }
            migemm(args.back(), args[0], args[1], op.alpha, op.beta);
            return args.back();
        }
        // 2 input arguments
        migemm(args.back(), args[0], args[1], op.alpha, 0.0f);
        return args.back();
    }
};
struct cpu_quant_gemm : auto_register_op<cpu_quant_gemm>
{
    op::quant_dot op;
    template <class Self, class F>
    static auto reflect(Self& self, F f)
    {
        return migraphx::reflect(self.op, f);
    }
    std::string name() const { return "cpu::quant_dot"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
        check_shapes{inputs, *this}.standard();
        inputs.pop_back();
        return op.compute_shape(inputs);
    }
    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;
    }
    argument compute(context&, const shape&, std::vector<argument> args) const
    {
        // 3 inputs, it is alpha * A * B + beta * C, then
        // A and B are matrices, and C is of the same shape to A * B
        // first, convert the args[0] and args[1] from int8_t to int32_t
        argument arg_0{{shape::int32_type, {args.at(0).get_shape().lens()}}};
        argument arg_1{{shape::int32_type, {args.at(1).get_shape().lens()}}};
        arg_0.visit([&](auto output) {
            args.at(0).visit(
                [&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
        });
        arg_1.visit([&](auto output) {
            args.at(1).visit(
                [&](auto input) { std::copy(input.begin(), input.end(), output.begin()); });
        });
        if(args.size() == 4)
        {
            // no need to consider the value of args[2]
            if(op.beta == 0)
            {
                args.back().visit(
                    [&](auto output) { std::fill(output.begin(), output.end(), 0); });
            }
            else
            {
                visit_all(args.back(), args[2])([&](auto output, auto input) {
                    std::copy(input.begin(), input.end(), output.begin());
                });
            }
            migemm(args.back(), arg_0, arg_1, op.alpha, op.beta);
            return args.back();
        }
        // 2 input arguments
        migemm(args.back(), arg_0, arg_1, op.alpha, int32_t{0});
        return args.back();
    }
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
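dnnl_gemm::adjust_shape collapses any leading batch dimensions into one because dnnl::matmul accepts only a single batch dimension. A worked example of that flattening on a concrete 4-D shape (illustrative, not from the PR):

#include <migraphx/shape.hpp>
#include <cassert>

int main()
{
    // A standard 4-D dot input {2, 3, 4, 5} (strides {60, 20, 5, 1}) is presented to
    // dnnl::matmul as a single-batch 3-D shape {6, 4, 5} with strides {20, 5, 1}.
    migraphx::shape s{migraphx::shape::float_type, {2, 3, 4, 5}};
    migraphx::shape s3d{migraphx::shape::float_type, {6, 4, 5}, {20, 5, 1}};
    assert(s.elements() == s3d.elements());
    (void)s;
    (void)s3d;
    return 0;
}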
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_ALLOCATION_MODEL_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_ALLOCATION_MODEL_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
#include <string>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct cpu_allocation_model
{
std::string name() const;
std::string copy() const;
operation allocate(const shape& s) const;
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#define MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP
#include <migraphx/config.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/cpu/parallel.hpp>
#include <migraphx/par_for.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
#ifdef USE_DNNL
struct context
{
    void finish() const {}
    template <class F>
    void bulk_execute(std::size_t n, std::size_t min_grain, F f)
    {
        cpu::parallel_for(n, min_grain, f);
    }
    template <class F>
    void bulk_execute(std::size_t n, F f)
    {
        this->bulk_execute(n, 256, f);
    }
};
#else
struct context
{
    void finish() const {}
    template <class F>
    void bulk_execute(std::size_t n, std::size_t min_grain, F f)
    {
        const auto threadsize =
            std::min<std::size_t>(std::thread::hardware_concurrency(), n / min_grain);
        std::size_t grainsize = std::ceil(static_cast<double>(n) / threadsize);
        par_for(threadsize, 1, [&](auto tid) {
            std::size_t work = tid * grainsize;
            f(work, std::min(n, work + grainsize));
        });
    }
    template <class F>
    void bulk_execute(std::size_t n, F f)
    {
        this->bulk_execute(n, 256, f);
    }
};
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
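Both context variants expose the same bulk_execute interface: the caller passes the element count, a minimum grain size, and a callable that receives a half-open [start, end) range. A minimal usage sketch, assuming only the cpu context header above is on the include path:

#include <migraphx/cpu/context.hpp>
#include <cstddef>
#include <vector>

int main()
{
    migraphx::cpu::context ctx;
    std::vector<float> data(1 << 20, 1.0f);
    // Each worker receives a half-open [start, end) range of at least ~1024 elements.
    ctx.bulk_execute(data.size(), 1024, [&](std::size_t start, std::size_t end) {
        for(std::size_t i = start; i < end; i++)
            data[i] *= 2.0f;
    });
    ctx.finish();
    return 0;
}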
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_DNNL_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_DNNL_HPP
#include <migraphx/config.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/check_shapes.hpp>
#include <unordered_map>
#ifdef USE_DNNL
#include <dnnl.hpp>
#include <migraphx/errors.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct dnnl_context
{
dnnl::engine engine;
dnnl::stream stream;
dnnl_context() : engine(dnnl::engine::kind::cpu, 0), stream(engine) {}
};
inline dnnl_context& get_dnnl_context()
{
static dnnl_context ctx{}; // NOLINT
return ctx;
}
#ifdef __clang__
#pragma clang diagnostic push
#pragma clang diagnostic ignored "-Wswitch-enum"
#endif
inline dnnl::memory::data_type to_dnnl_memory_data_type(shape::type_t t)
{
using dt = dnnl::memory::data_type;
using st = shape::type_t;
switch(t)
{
case st::half_type: return dt::f16;
case st::float_type: return dt::f32;
case st::int32_type: return dt::s32;
case st::int8_type: return dt::s8;
case st::uint8_type: return dt::u8;
default: MIGRAPHX_THROW("Unsupported data type");
}
}
#ifdef __clang__
#pragma clang diagnostic pop
#endif
inline dnnl::memory::format_tag to_dnnl_memory_format_tag(std::size_t n)
{
switch(n)
{
case 1: return dnnl::memory::format_tag::a;
case 2: return dnnl::memory::format_tag::ab;
case 3: return dnnl::memory::format_tag::abc;
case 4: return dnnl::memory::format_tag::abcd;
case 5: return dnnl::memory::format_tag::abcde;
case 6: return dnnl::memory::format_tag::abcdef;
default: MIGRAPHX_THROW("Unsupported tensor size: " + std::to_string(n));
}
}
template <class R>
inline dnnl::memory::dims to_dnnl_dims(R&& r)
{
return {r.begin(), r.end()};
}
inline dnnl::memory::desc to_dnnl_memory_desc(const shape& s)
{
return {to_dnnl_dims(s.lens()), to_dnnl_memory_data_type(s.type()), to_dnnl_dims(s.strides())};
}
inline dnnl::memory to_dnnl_memory(const dnnl::memory::desc& desc, const argument& a)
{
return dnnl::memory(desc, get_dnnl_context().engine, a.data());
}
inline dnnl::memory to_dnnl_memory(const argument& a)
{
return to_dnnl_memory(to_dnnl_memory_desc(a.get_shape()), a);
}
template <class Derived, class Primitive>
struct dnnl_op : auto_register_op<Derived>
{
std::function<argument(context& ctx, const std::vector<argument>& args)> execute;
static std::vector<shape> to_shapes(const std::vector<argument>& args)
{
std::vector<shape> shapes(args.size());
std::transform(args.begin(), args.end(), shapes.begin(), [](const argument& a) {
return a.get_shape();
});
return shapes;
}
// Map arg index to arg in dnnl
std::vector<int> arg_map(int size) const
{
std::vector<int> result(size);
std::iota(result.begin(), result.end(), DNNL_ARG_SRC_0);
return result;
}
shape base_adjust_shape(const shape& s) const
{
if(s.broadcasted())
{
auto lens = s.lens();
auto strides = s.strides();
std::transform(strides.begin(),
strides.end(),
lens.begin(),
lens.begin(),
[](auto stride, auto len) -> std::size_t {
if(stride == 0)
return 1;
else
return len;
});
return shape{s.type(), lens};
}
return s;
}
shape adjust_shape(const shape& s, int) const { return base_adjust_shape(s); }
std::unordered_map<int, dnnl::memory::desc>
to_memory_desc(const shape& output_shape, const std::vector<shape>& inputs) const
{
const auto& self = static_cast<const Derived&>(*this);
std::unordered_map<int, dnnl::memory::desc> result;
result[DNNL_ARG_DST] = to_dnnl_memory_desc(self.adjust_shape(output_shape, inputs.size()));
auto m = self.arg_map(inputs.size());
for(int i = 0; i < inputs.size(); i++)
{
result[m[i]] = to_dnnl_memory_desc(self.adjust_shape(inputs[i], i));
}
return result;
}
template <class T>
auto get_primitive_desc(const T& desc) const
-> decltype(typename Primitive::primitive_desc(desc, get_dnnl_context().engine))
{
return typename Primitive::primitive_desc(desc, get_dnnl_context().engine);
}
Primitive get_primitive(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
const auto& self = static_cast<const Derived&>(*this);
auto desc = self.get_desc(m);
auto pd = self.get_primitive_desc(desc);
return Primitive(pd);
}
argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
return execute(ctx, args);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
void finalize(context&, const shape& output_shape, std::vector<shape> inputs)
{
// Compensate for allocation
inputs.pop_back();
const auto& self = static_cast<const Derived&>(*this);
auto md = to_memory_desc(output_shape, inputs);
auto prim = get_primitive(md);
auto arg_lookup = self.arg_map(inputs.size());
execute = [=](context&, const std::vector<argument>& args) {
#ifndef NDEBUG
// Check that the memory descriptors have not changed
auto debug_args = args;
debug_args.pop_back();
auto debug_md = to_memory_desc(output_shape, to_shapes(debug_args));
for(auto&& p : debug_md)
{
if(p.second == md.at(p.first))
continue;
MIGRAPHX_THROW("Memory descriptor has changed for: " + std::to_string(p.first));
}
#endif
std::unordered_map<int, dnnl::memory> m;
m[DNNL_ARG_DST] = to_dnnl_memory(md.at(DNNL_ARG_DST), args.back());
for(int i = 0; i < args.size() - 1; i++)
m[arg_lookup[i]] = to_dnnl_memory(md.at(arg_lookup[i]), args[i]);
prim.execute(get_dnnl_context().stream, m);
return args.back();
};
}
};
template <class Derived, class Primitive, class Op>
struct dnnl_extend_op : dnnl_op<Derived, Primitive>
{
Op op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "dnnl::" + op.name(); }
shape compute_shape(std::vector<shape> inputs) const
{
// Compensate for allocation
inputs.pop_back();
// check_shapes(inputs, *this).standard();
auto r = migraphx::compute_shape(op, inputs);
// Call to get_primitive to make sure an algo is available
this->get_primitive(this->to_memory_desc(r, inputs));
return r;
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#endif
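dnnl_op hides the usual oneDNN boilerplate: build memory descriptors from migraphx shapes, create the primitive descriptor on the shared engine at finalize time, then execute with a DNNL_ARG_* argument map on the shared stream. For reference, the raw oneDNN v1.x calls it automates look roughly like this standalone sketch (assumes oneDNN is installed and linked; not part of the PR):

#include <dnnl.hpp>
#include <vector>

int main()
{
    dnnl::engine eng(dnnl::engine::kind::cpu, 0);
    dnnl::stream strm(eng);
    // Descriptor -> primitive descriptor -> primitive, as in dnnl_op::finalize.
    dnnl::memory::desc md({2, 3}, dnnl::memory::data_type::f32, dnnl::memory::format_tag::ab);
    dnnl::binary::desc bd(dnnl::algorithm::binary_add, md, md, md);
    dnnl::binary::primitive_desc pd(bd, eng);
    dnnl::binary prim(pd);
    std::vector<float> a(6, 1.0f), b(6, 2.0f), c(6, 0.0f);
    dnnl::memory am(md, eng, a.data()), bm(md, eng, b.data()), cm(md, eng, c.data());
    // Execute with the same DNNL_ARG_* map that dnnl_op builds from arg_map().
    prim.execute(strm, {{DNNL_ARG_SRC_0, am}, {DNNL_ARG_SRC_1, bm}, {DNNL_ARG_DST, cm}});
    strm.wait(); // c now holds 3.0f in every element
    return 0;
}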
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_PARALLEL_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_PARALLEL_HPP
#include <migraphx/config.hpp>
#if USE_DNNL
#include <omp.h>
#endif
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
#if USE_DNNL
template <class F>
void parallel_for_impl(std::size_t n, std::size_t threadsize, F f)
{
if(threadsize <= 1)
{
f(std::size_t{0}, n);
}
else
{
std::size_t grainsize = std::ceil(static_cast<double>(n) / threadsize);
#pragma omp parallel num_threads(threadsize)
{
std::size_t tid = omp_get_thread_num();
std::size_t work = tid * grainsize;
f(work, std::min(n, work + grainsize));
}
}
}
template <class F>
void parallel_for(std::size_t n, std::size_t min_grain, F f)
{
const auto threadsize = std::min<std::size_t>(omp_get_num_threads(), n / min_grain);
parallel_for_impl(n, threadsize, f);
}
template <class F>
void parallel_for(std::size_t n, F f)
{
const int min_grain = 8;
parallel_for(n, min_grain, f);
}
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_POINTWISE_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_CPU_POINTWISE_HPP
#include <migraphx/config.hpp>
#include <migraphx/context.hpp>
#include <migraphx/check_shapes.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/register_op.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct multi_index
{
multi_index(const shape& s, std::size_t i) : n(s.lens().size())
{
assert(n < max_size);
std::copy(s.lens().begin(), s.lens().end(), dims);
s.multi_copy(i, index, index + max_size);
}
std::size_t size() const { return n; }
std::size_t* begin() { return index; }
const std::size_t* begin() const { return index; }
std::size_t* end() { return index + size(); }
const std::size_t* end() const { return index + size(); }
std::size_t offset(const shape& s) const { return s.index(begin(), end()); }
void carry()
{
std::size_t overflow = 0;
for(std::ptrdiff_t i = size() - 1; i > 0; i--)
{
auto z = index[i] + overflow;
// Reset overflow
overflow = 0;
// Compute overflow using while loop instead of mod
while(z >= dims[i])
{
z -= dims[i];
overflow += 1;
}
index[i] = z;
}
index[0] += overflow;
}
void increment(std::size_t i)
{
index[size() - 1] += i;
carry();
}
multi_index& operator+=(std::size_t i)
{
increment(i);
return *this;
}
multi_index& operator++()
{
increment(1);
return *this;
}
multi_index operator++(int) // NOLINT
{
multi_index result = *this;
increment(1);
return result;
}
private:
static const std::size_t max_size = 5;
std::size_t index[max_size];
std::size_t dims[max_size];
std::size_t n;
};
struct reduce_dims_base
{
std::vector<shape> reduce_shapes;
void finalize(context&, const shape&, const std::vector<shape>& inputs)
{
reduce_shapes = reduce_dims(inputs);
}
argument get_arg(const std::vector<argument>& args, std::size_t i) const
{
if(reduce_shapes.empty())
return args[i];
return args.at(i).reshape(reduce_shapes.at(i));
}
argument get_output() const
{
argument a{reduce_shapes[0]};
return a;
}
};
template <class X, class... Xs>
bool is_standard_offset(const X& x, const Xs&... xs)
{
if(all_of({x, xs...}, [](const auto& s) { return s.standard(); }))
return true;
if(all_of({x, xs...}, [](const auto& s) { return s.packed(); }) and
all_of({xs...}, [&](const auto& s) { return s == x; }))
return true;
return false;
}
template <class... Ts>
auto pointwise(Ts... xs)
{
return [=](context& ctx, const shape& base_shape, std::size_t min_grain, auto f) mutable {
if(is_standard_offset(xs.get_shape()...))
{
ctx.bulk_execute(base_shape.elements(), min_grain, [=](auto start, auto end) mutable {
for(auto i = start; i < end; i++)
{
f(xs.data()[i]...);
}
});
}
else
{
assert(base_shape.lens().size() <= 6);
ctx.bulk_execute(base_shape.elements(), min_grain, [=](auto start, auto end) mutable {
multi_index mi(base_shape, start);
for(auto i = start; i < end; i++)
{
f(xs.data()[mi.offset(xs.get_shape())]...);
++mi;
}
});
}
};
}
template <class Op>
struct cpu_unary : reduce_dims_base, auto_register_op<cpu_unary<Op>>
{
Op op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::" + op.name(); }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2);
auto s = inputs.at(0);
return {s.type(), s.lens()};
}
argument
// cppcheck-suppress constParameter
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{
argument result = get_arg(args, args.size() - 1);
visit_all(result, get_arg(args, 0))([&](auto output, auto input) {
auto op2 = op;
pointwise(output, input)(
ctx, output.get_shape(), 1024, [op2](auto& y, auto x) { y = op2.apply()(x); });
});
return result.reshape(output_shape);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
template <class Op>
struct cpu_binary : reduce_dims_base, auto_register_op<cpu_binary<Op>>
{
Op op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::" + op.name(); }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(3);
auto s = inputs.at(0);
return {s.type(), s.lens()};
}
argument
// cppcheck-suppress constParameter
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const
{
argument result = get_arg(args, args.size() - 1);
visit_all(result, get_arg(args, 0), get_arg(args, 1))(
[&](auto output, auto input1, auto input2) {
auto op2 = op;
pointwise(output, input1, input2)(
ctx, output.get_shape(), 1024, [op2](auto& z, auto x, auto y) {
z = op2.apply()(x, y);
});
});
return result.reshape(output_shape);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
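pointwise() takes the fast path when every tensor has a standard layout or all share the same packed layout, and otherwise walks the iteration space with multi_index, converting each multi-dimensional position into a per-tensor offset. A small illustrative walk over a transposed view (values chosen here, not from the PR):

#include <migraphx/cpu/pointwise.hpp>
#include <iostream>

int main()
{
    migraphx::shape base{migraphx::shape::float_type, {2, 3}};
    // A transposed view over the same 2x3 data: lens {2, 3}, strides {1, 2}.
    migraphx::shape view{migraphx::shape::float_type, {2, 3}, {1, 2}};
    migraphx::cpu::multi_index mi(base, 0);
    for(std::size_t i = 0; i < base.elements(); i++, ++mi)
        std::cout << mi.offset(view) << " "; // prints: 0 2 4 1 3 5
    std::cout << "\n";
    return 0;
}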
#include <migraphx/cpu/migemm.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/requires.hpp>
#include <migraphx/shape_for_each.hpp>
#include <blaze/math/CustomMatrix.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
template <class T, class F>
void migemm_impl(
tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, F alpha, F beta, std::false_type)
{
std::size_t n_dims = cmat.get_shape().lens().size();
std::size_t dim_0 = n_dims - 2;
std::size_t dim_1 = n_dims - 1;
auto k = amat.get_shape().lens()[dim_1];
assert(amat.get_shape().lens()[dim_1] == bmat.get_shape().lens()[dim_0]);
assert(cmat.get_shape().lens()[dim_0] == amat.get_shape().lens()[dim_0]);
assert(cmat.get_shape().lens()[dim_1] == bmat.get_shape().lens()[dim_1]);
shape_for_each(cmat.get_shape(), [&](const auto& c_idx) {
auto a_idx = c_idx;
auto b_idx = c_idx;
double s = 0.0;
dfor(k)([&](auto kk) {
a_idx[dim_1] = b_idx[dim_0] = kk;
s += amat(a_idx.begin(), a_idx.end()) * bmat(b_idx.begin(), b_idx.end());
});
cmat(c_idx.begin(), c_idx.end()) = alpha * s + cmat(c_idx.begin(), c_idx.end()) * beta;
});
}
template <class T, class F>
void migemm_impl(tensor_view<T> cmat, tensor_view<T> amat, tensor_view<T> bmat, F alpha, F beta)
{
migemm_impl(cmat, amat, bmat, alpha, beta, std::false_type{});
}
template <class F>
void migemm_tpl(
const argument& c_arg, const argument& a_arg, const argument& b_arg, F alpha, F beta)
{
visit_all(c_arg, a_arg, b_arg)(
[&](auto cmat, auto amat, auto bmat) { migemm_impl(cmat, amat, bmat, alpha, beta); });
}
void migemm(
const argument& c_arg, const argument& a_arg, const argument& b_arg, float alpha, float beta)
{
migemm_tpl(c_arg, a_arg, b_arg, alpha, beta);
}
void migemm(const argument& c_arg,
const argument& a_arg,
const argument& b_arg,
int32_t alpha,
int32_t beta)
{
migemm_tpl(c_arg, a_arg, b_arg, alpha, beta);
}
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
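migemm is the reference fallback used by cpu_gemm and cpu_quant_gemm: it computes C = alpha * A * B + beta * C element by element with shape_for_each and dfor. A hypothetical driver for it (assuming the migemm.hpp declarations used above; C is zeroed first so that beta = 0 cannot pick up uninitialized values):

#include <migraphx/cpu/migemm.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/shape.hpp>
#include <algorithm>
#include <numeric>

int main()
{
    migraphx::shape s{migraphx::shape::float_type, {2, 2}};
    migraphx::argument a{s}, b{s}, c{s};
    a.visit([](auto v) { std::iota(v.begin(), v.end(), 1.0f); }); // A = [[1, 2], [3, 4]]
    b.visit([](auto v) { std::fill(v.begin(), v.end(), 1.0f); }); // B = [[1, 1], [1, 1]]
    c.visit([](auto v) { std::fill(v.begin(), v.end(), 0.0f); }); // zero C so beta = 0 is safe
    migraphx::cpu::migemm(c, a, b, 1.0f, 0.0f);                   // C = [[3, 3], [7, 7]]
    return 0;
}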
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/mul.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
template struct cpu_binary<op::mul>;
#if USE_DNNL
struct dnnl_mul : dnnl_extend_op<dnnl_mul, dnnl::binary, op::mul>
{
dnnl::binary::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
return {dnnl::algorithm::binary_mul,
m.at(DNNL_ARG_SRC_0),
m.at(DNNL_ARG_SRC_1),
m.at(DNNL_ARG_DST)};
}
};
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/config.hpp>
#include <migraphx/register_op.hpp>
#include <migraphx/reflect.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/context.hpp>
#include <migraphx/cpu/context.hpp>
#include <migraphx/cpu/dnnl.hpp>
#include <migraphx/op/pooling.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
struct max_pool
{
static std::string name() { return "max"; }
template <class T>
static T start()
{
return std::numeric_limits<T>::lowest();
}
static double apply(double x, double y)
{
double m = std::max(x, y);
return (m);
}
static double final(double x, std::size_t) { return (x); }
};
struct avg_pool
{
static std::string name() { return "average"; }
template <class T>
static double start()
{
return 0.0;
}
static double apply(double x, double y) { return x + y; }
static double final(double x, std::size_t y) { return (y == 0) ? 0.0 : (x / y); }
};
template <class Op>
struct cpu_pooling : auto_register_op<cpu_pooling<Op>>
{
cpu_pooling() = default;
cpu_pooling(op::pooling pop) : op(std::move(pop)) {}
op::pooling op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::pooling_" + Op::name(); }
shape compute_shape(std::vector<shape> inputs) const
{
inputs.pop_back();
return op.compute_shape(inputs);
}
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
visit_all(args.back(), args[0])([&](auto output, auto input) {
using type = typename decltype(output)::value_type;
auto in_s = input.get_shape();
auto in_lens = in_s.lens();
std::vector<std::size_t> vec_len(in_lens.begin() + 2, in_lens.end());
par_for(output_shape.elements(), [&](auto i) {
auto idx_o = output_shape.multi(i);
auto n_dim = idx_o.size();
std::vector<std::size_t> win_start;
std::vector<std::size_t> win_size;
for(std::size_t dim = 2; dim < n_dim; ++dim)
{
auto d_2 = dim - 2;
int start = static_cast<int>(idx_o[dim] * op.stride[d_2]) -
static_cast<int>(op.padding[d_2]);
int end = std::min(start + op.lengths[d_2], in_lens[dim]);
start = std::max(start, 0);
win_start.push_back(start);
win_size.push_back(end - start);
}
shape win_shape{output_shape.type(), win_size};
auto pool_size = win_shape.elements();
double acc = Op::template start<type>();
shape_for_each(win_shape, [&](auto idx_w) {
auto idx = idx_o;
std::transform(idx_w.begin(),
idx_w.end(),
win_start.begin(),
idx.begin() + 2,
[](auto ii, auto jj) { return ii + jj; });
if(std::all_of(idx.begin() + 2, idx.end(), [&](auto ii) { return ii >= 0; }) and
idx < in_lens)
{
acc = Op::apply(acc, input[in_s.index(idx)]);
}
});
output[i] = type(Op::final(acc, pool_size));
});
});
return args.back();
}
};
template struct cpu_pooling<avg_pool>;
template struct cpu_pooling<max_pool>;
#if USE_DNNL
struct dnnl_pooling : dnnl_extend_op<dnnl_pooling, dnnl::pooling_forward, op::pooling>
{
std::vector<int> arg_map(int) const { return {DNNL_ARG_SRC}; }
dnnl::pooling_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
auto algo = op.mode == "max" ? dnnl::algorithm::pooling_max : dnnl::algorithm::pooling_avg;
return {dnnl::prop_kind::forward_inference,
algo,
m.at(DNNL_ARG_SRC),
m.at(DNNL_ARG_DST),
to_dnnl_dims(op.stride),
to_dnnl_dims(op.lengths),
to_dnnl_dims(op.padding),
to_dnnl_dims(op.padding)};
}
};
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/config.hpp>
#include <migraphx/cpu/pointwise.hpp>
#include <migraphx/op/relu.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace cpu {
template struct cpu_unary<op::relu>;
#if USE_DNNL
struct dnnl_relu : dnnl_extend_op<dnnl_relu, dnnl::eltwise_forward, op::relu>
{
dnnl::eltwise_forward::desc get_desc(const std::unordered_map<int, dnnl::memory::desc>& m) const
{
return {dnnl::prop_kind::forward_inference,
dnnl::algorithm::eltwise_relu,
m.at(DNNL_ARG_SRC_0)};
}
};
#endif
} // namespace cpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx