Unverified commit 912c8d22, authored by Shucai Xiao and committed by GitHub

Roialign gpu impl (#972)

GPU implementation of the roialign operator, using the JIT approach to reduce the library size.
parent 6df1e02b
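For orientation, here is a minimal sketch of the JIT pattern this change follows, modeled on compile_roialign in the diff below: the kernel source lives in the library as a string, attribute values are baked in through -D macros, and the string is compiled at runtime via compile_hip_code_object, so no precompiled device code for the operator is linked into the library. The scale_kernel name and the SCALE macro are hypothetical illustrations, not part of this change.
// Hypothetical sketch of the JIT flow (illustration only, not part of this commit)
static const char* const scale_kernel = R"__migraphx__(
#include <migraphx/kernels/index.hpp>
#include <args.hpp>
using namespace migraphx;
extern "C" {
__global__ void scale_kernel(void* in_x, void* y)
{
    make_tensors()(in_x, y)([](auto x, auto out) {
        auto idx = make_index();
        for(index_int i = idx.global; i < out.get_shape().elements(); i += idx.nglobal())
            out[i] = x[i] * SCALE; // SCALE is injected with -DSCALE=... at compile time
    });
}
}
int main() {}
)__migraphx__";
operation compile_scale(context&, const std::vector<shape>& io_shapes, float scale)
{
    hip_compile_options options;
    options.inputs         = io_shapes;
    options.output         = io_shapes.back();
    options.local          = 128;
    options.global         = compute_global(options.output.elements(), options.local);
    options.kernel_name    = "scale_kernel";
    options.reduced_inputs = io_shapes;
    options.params += " -DSCALE=" + std::to_string(scale);
    return compile_hip_code_object(scale_kernel, options);
}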
......@@ -125,6 +125,7 @@ add_library(migraphx_gpu
compile_hip.cpp
compile_hip_code_object.cpp
compile_pointwise.cpp
compile_roialign.cpp
concat.cpp
convert.cpp
convolution.cpp
......
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/errors.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/env.hpp>
#include <cassert>
#include <iostream>
......@@ -230,6 +231,20 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
return {compiler.compile(srcs)};
}
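// Build a comma-separated list of numbered parameter names: "param0,param1,...,param<count-1>"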
std::string enum_params(std::size_t count, std::string param)
{
std::vector<std::string> items(count);
transform(range(count), items.begin(), [&](auto i) { return param + std::to_string(i); });
return join_strings(items, ",");
}
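// Global workitem count: enough workgroups of size `local` to cover n elements, capped at 256 workgroups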
std::size_t compute_global(std::size_t n, std::size_t local)
{
std::size_t groups = (n + local - 1) / local;
std::size_t nglobal = std::min<std::size_t>(256, groups) * local;
return nglobal;
}
#endif // MIGRAPHX_USE_HIPRTC
} // namespace gpu
......
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
......@@ -28,20 +29,6 @@ int main() {}
)__migraphx__";
std::string enum_params(std::size_t count, std::string param)
{
std::vector<std::string> items(count);
transform(range(count), items.begin(), [&](auto i) { return param + std::to_string(i); });
return join_strings(items, ",");
}
std::size_t compute_global(std::size_t n, std::size_t local = 1024)
{
std::size_t groups = (n + local - 1) / local;
std::size_t nglobal = std::min<std::size_t>(256, groups) * local;
return nglobal;
}
operation compile_pointwise(context&, const std::vector<shape>& inputs, const std::string& lambda)
{
hip_compile_options options;
......
#include <migraphx/gpu/compile_roialign.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/stringutils.hpp>
#include <cassert>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// NOLINTNEXTLINE
static const char* const roialign_kernel = R"__migraphx__(
#include <migraphx/kernels/roialign.hpp>
#include <migraphx/kernels/basic_ops.hpp>
#include <args.hpp>
using namespace migraphx;
extern "C" {
__global__ void roialign_kernel(void* in_x, void* in_rois, void* in_ind, void* y)
{
make_tensors()(in_x, in_rois, in_ind, y)([](auto&&... xs) { roialign(xs...); });
}
}
int main() {}
)__migraphx__";
operation compile_roialign(context&, const std::vector<shape>& io_shapes, const value& val)
{
hip_compile_options options;
auto out_s = io_shapes.back();
options.local = 128;
options.global = compute_global(out_s.elements(), options.local);
options.inputs = io_shapes;
options.output = out_s;
options.kernel_name = "roialign_kernel";
options.reduced_inputs = io_shapes;
// sampling_ratio
assert(val.contains("sampling_ratio"));
auto sampling_ratio = val.at("sampling_ratio").to<int64_t>();
options.params += " -DSAMPLING_RATIO=" + std::to_string(sampling_ratio);
// pooling_mode
assert(val.contains("mode"));
auto mode = val.at("mode").to<std::string>();
bool is_avg_pooling = (mode == "avg");
options.params += " -DIS_AVG_POOLING=" + std::to_string(static_cast<int>(is_avg_pooling));
// coord_trans_mode
assert(val.contains("coordinate_transformation_mode"));
auto ctm = val.at("coordinate_transformation_mode").to<std::string>();
float rois_offset = (ctm == "output_half_pixel") ? -0.5f : 0.0f;
options.params += " -DROIS_OFFSET=" + std::to_string(rois_offset);
// spatial_scale
assert(val.contains("spatial_scale"));
float spatial_scale = val.at("spatial_scale").to<float>();
options.params += " -DSPATIAL_SCALE=" + std::to_string(spatial_scale);
return compile_hip_code_object(roialign_kernel, options);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -15,6 +15,10 @@ namespace gpu {
std::vector<std::vector<char>>
compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch);
std::string enum_params(std::size_t count, std::string param);
std::size_t compute_global(std::size_t n, std::size_t local = 1024);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
operation compile_roialign(context& ctx, const std::vector<shape>& io_shapes, const value& val);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
......@@ -2,40 +2,51 @@
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_ARRAY_HPP
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/type_traits.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/debug.hpp>
namespace migraphx {
// NOLINTNEXTLINE
#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op) \
constexpr array& operator op(const array& x) \
{ \
for(index_int i = 0; i < N; i++) \
d[i] op x[i]; \
return *this; \
} \
constexpr array& operator op(const T& x) \
{ \
for(index_int i = 0; i < N; i++) \
d[i] op x; \
return *this; \
} \
friend constexpr array operator binary_op(const array& x, const array& y) \
{ \
auto z = x; \
return z op y; \
} \
friend constexpr array operator binary_op(const array& x, const T& y) \
{ \
auto z = x; \
return z op y; \
} \
friend constexpr array operator binary_op(const T& x, const array& y) \
{ \
for(index_int i = 0; i < N; i++) \
y[i] = x op y[i]; \
return y; \
#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op) \
template <class U> \
constexpr array& operator op(const array<U, N>& x) \
{ \
for(index_int i = 0; i < N; i++) \
d[i] op x[i]; \
return *this; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
constexpr array& operator op(const U& x) \
{ \
for(index_int i = 0; i < N; i++) \
d[i] op x; \
return *this; \
} \
template <class U> \
friend constexpr auto operator binary_op(const array& x, const array<U, N>& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
for(index_int i = 0; i < N; i++) \
z[i] = x[i] binary_op y[i]; \
return z; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
friend constexpr auto operator binary_op(const array& x, const U& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
for(index_int i = 0; i < N; i++) \
z[i] = x[i] binary_op y; \
return z; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
friend constexpr auto operator binary_op(const U& x, const array& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
for(index_int i = 0; i < N; i++) \
z[i] = x binary_op y[i]; \
return z; \
}
template <class T, index_int N>
......
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_BASIC_OPS_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_BASIC_OPS_HPP
#include <migraphx/kernels/types.hpp>
namespace migraphx {
struct sum
{
template <class T, class U>
constexpr auto operator()(T x, U y) const
{
return x + y;
}
};
struct product
{
template <class T, class U>
constexpr auto operator()(T x, U y) const
{
return x * y;
}
};
struct id
{
template <class T>
constexpr auto operator()(T x) const
{
return x;
}
};
struct mean
{
size_t item_num = 1;
template <class T>
constexpr auto operator()(T x) const
{
return x / static_cast<T>(item_num);
}
};
struct max_f
{
template <class T, class U>
constexpr auto operator()(T x, U y) const
{
return (x > y) ? x : y;
}
};
inline constexpr auto max = max_f{};
struct min_f
{
template <class T, class U>
constexpr auto operator()(T x, U y) const
{
return (x < y) ? x : y;
}
};
inline constexpr auto min = min_f{};
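// lowest/highest convert implicitly to the smallest/largest value of the target numeric type
// (used e.g. to initialize a max reduction)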
struct lowest
{
template <class T>
constexpr operator T() const
{
return std::numeric_limits<T>::lowest();
}
};
struct highest
{
template <class T>
constexpr operator T() const
{
return std::numeric_limits<T>::max();
}
};
} // namespace migraphx
#endif // MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_BASIC_OPS_HPP
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_DFOR_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_DFOR_HPP
namespace migraphx {
// Multidimensional for loop
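// Usage: dfor(n, m)([&](auto i, auto j) { ... }); iterates i = 0..n-1 (outer) and j = 0..m-1 (inner)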
inline constexpr auto dfor()
{
return [](auto f) { f(); };
}
template <class T, class... Ts>
constexpr auto dfor(T x, Ts... xs)
{
return [=](auto f) {
for(T i = 0; i < x; i++)
{
dfor(xs...)([&](Ts... is) { f(i, is...); });
}
};
}
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
#define MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/dfor.hpp>
#include <migraphx/kernels/basic_ops.hpp>
#include <args.hpp>
namespace migraphx {
struct max_pool
{
MIGRAPHX_DEVICE_CONSTEXPR auto init() { return lowest(); }
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y)
{
return max(x, y);
}
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T final(T x, std::size_t)
{
return (x);
}
};
struct avg_pool
{
MIGRAPHX_DEVICE_CONSTEXPR auto init() { return 0.0; }
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y)
{
return x + y;
}
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T final(T x, std::size_t y)
{
return (y == 0) ? 0.0 : (x / y);
}
};
template <class T, class Op>
MIGRAPHX_DEVICE_CONSTEXPR T bilinear_interpolate(const T* data,
const array<std::size_t, 2>& dims,
array<float, 2> xy,
Op pooling)
{
array<int, 2> low{};
array<int, 2> high{};
for(std::size_t ii = 0; ii < xy.size(); ++ii)
{
if(xy[ii] < -1.0f or xy[ii] > dims[ii])
{
return 0;
}
xy[ii] = max(xy[ii], 0.0f);
low[ii] = xy[ii];
high[ii] = low[ii] + 1;
if(low[ii] >= dims[ii] - 1)
{
xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
}
}
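// Flattened offsets of the four neighboring pixels: (low,low), (low,high), (high,low), (high,high)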
array<std::size_t, 4> locs = {low[0] * dims[1] + low[1],
low[0] * dims[1] + high[1],
high[0] * dims[1] + low[1],
high[0] * dims[1] + high[1]};
float ly = xy[0] - low[0];
float lx = xy[1] - low[1];
float hy = 1.0f - ly;
float hx = 1.0f - lx;
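// Bilinear interpolation weights matching the four offsets above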
array<T, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
return pooling(v01, v23);
}
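// Pool the bilinearly interpolated samples of one output bin over its sampling grid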
template <class T, class Op>
MIGRAPHX_DEVICE_CONSTEXPR T calc_pooling(const T*& data,
const array<float, 2>& roi_starts,
const array<float, 2>& bin_size,
const array<int, 2>& idx,
const array<std::size_t, 2>& bin_grid_size,
const array<std::size_t, 2>& dims,
float roi_offset,
Op op)
{
T output_val = op.init();
const int64_t count = bin_grid_size[0] * bin_grid_size[1];
dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
array<std::size_t, 2> id = {iy, ix};
array<float, 2> locs =
roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size + roi_offset;
auto val = bilinear_interpolate(data, dims, locs, op);
output_val = op(output_val, val);
});
return op.final(output_val, count);
}
template <class T, class U, class V, class W>
__device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W& y_t)
{
const float roi_offset = ROIS_OFFSET;
const bool is_avg_pooling = IS_AVG_POOLING;
const int64_t sampling_ratio = SAMPLING_RATIO;
const float spatial_scale = SPATIAL_SCALE;
auto index = make_index();
const auto* x = x_t.data();
const auto* rois = rois_t.data();
const auto* ind = ind_t.data();
auto* out_ptr = y_t.data();
// input shape
auto x_lens = x_t.get_shape().lens;
auto channel_num = x_lens[1];
// input dims of height and width, in all 2-dim arrays, the first dim
// is for height and second dim is for width
array<std::size_t, 2> in_dims = {x_lens[2], x_lens[3]};
const auto stride = index.nglobal();
auto out_s = y_t.get_shape();
auto roi_column_num = rois_t.get_shape().lens[1];
// output dims of height and width, in all 2-dim arrays, the first dim
// is for height and second dim is for width
const auto& out_lens = out_s.lens;
array<std::size_t, 2> out_dims = {out_lens[2], out_lens[3]};
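// Grid-stride loop: each workitem handles output elements global, global + nglobal, ...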
for(index_int i = index.global; i < out_s.elements(); i += stride)
{
auto idx = out_s.multi(i);
int n = idx[0];
int c = idx[1];
int ph = idx[2];
int pw = idx[3];
const auto* offset_rois = rois + (n * roi_column_num);
const int batch_ind = ind[n];
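// ROIs are stored as (x1, y1, x2, y2); reorder to (y, x) to match the height-first convention noted above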
array<float, 2> roi_starts = {offset_rois[1] * spatial_scale,
offset_rois[0] * spatial_scale};
array<float, 2> roi_ends = {offset_rois[3] * spatial_scale, offset_rois[2] * spatial_scale};
array<float, 2> roi_size{};
array<float, 2> bin_size{};
array<std::size_t, 2> bin_grid_size{};
for(std::size_t ii = 0; ii < roi_size.size(); ++ii)
{
roi_size[ii] = roi_ends[ii] - roi_starts[ii];
roi_size[ii] = max(roi_size[ii], 1.0f);
bin_size[ii] = roi_size[ii] / out_dims[ii];
bin_grid_size[ii] =
(sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_size[ii] / out_dims[ii]);
}
const auto* offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
if constexpr(is_avg_pooling)
{
out_ptr[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
roi_offset,
avg_pool{});
}
else
{
out_ptr[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
roi_offset,
max_pool{});
}
}
}
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPE_TRAITS_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPE_TRAITS_HPP
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/integral_constant.hpp>
namespace migraphx {
template <bool B, class T = void>
struct enable_if
{
};
template <class T>
struct enable_if<true, T>
{
using type = T;
};
template <bool B, class T = void>
using enable_if_t = typename enable_if<B, T>::type;
template <class From, class To>
struct is_convertible : bool_constant<__is_convertible(From, To)>
{
};
#define MIGRAPHX_REQUIRES(...) class = enable_if_t<__VA_ARGS__>
} // namespace migraphx
#endif
......@@ -20,6 +20,7 @@
#include <migraphx/gpu/abs.hpp>
#include <migraphx/gpu/batch_norm_inference.hpp>
#include <migraphx/gpu/compile_roialign.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/gpu/deconvolution.hpp>
......@@ -473,20 +474,16 @@ struct miopen_apply
void add_roialign()
{
apply_map.emplace("roialign", [=](instruction_ref ins) {
auto s = ins->get_shape();
auto op_val = ins->get_operator().to_value();
auto output = insert_allocation(ins, s);
std::vector<instruction_ref> cpu_inputs;
auto inputs = ins->inputs();
std::transform(
inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
});
cpu_inputs.front() =
mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs);
auto gpu_out =
mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output);
return mod->replace_instruction(ins, gpu_out);
auto args = ins->inputs();
args.push_back(output);
auto io_shapes = to_shapes(args);
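// JIT-compile a roialign kernel specialized to these shapes and attribute values,
// then call it in place of the reference op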
auto co = compile_roialign(get_context(), io_shapes, op_val);
return mod->replace_instruction(ins, co, args);
});
}
......
#include "verify_program.hpp"
#include <migraphx/program.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/make_op.hpp>
struct test_roialign_nondefault : verify_program<test_roialign_nondefault>
{
migraphx::program create_program() const
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape x_s{migraphx::shape::float_type, {5, 4, 10, 10}};
migraphx::shape roi_s{migraphx::shape::float_type, {5, 4}};
migraphx::shape ind_s{migraphx::shape::int64_type, {5}};
std::vector<int64_t> ind_vec = {0, 2, 3, 4, 1};
auto x = mm->add_parameter("x", x_s);
auto roi = mm->add_parameter("roi", roi_s);
auto ind = mm->add_literal(migraphx::literal(ind_s, ind_vec));
auto r = mm->add_instruction(
migraphx::make_op("roialign",
{{"coordinate_transformation_mode", "output_half_pixel"},
{"mode", "max"},
{"spatial_scale", 1.0},
{"output_height", 5},
{"output_width", 5},
{"sampling_ratio", 2}}),
x,
roi,
ind);
mm->add_return({r});
return p;
}
};