Unverified commit 912c8d22, authored by Shucai Xiao and committed by GitHub

Roialign gpu impl (#972)

GPU implementation of the roialign operator, using the JIT approach to reduce the library size.
parent 6df1e02b
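For orientation, here is a minimal sketch of the JIT pattern this change follows, modeled on compile_roialign in the diff below: the kernel source lives in the library as a string, attribute values are baked in through -D macros, and the string is compiled at runtime via compile_hip_code_object, so no precompiled device code for the operator is linked into the library. The scale_kernel name and the SCALE macro are hypothetical illustrations, not part of this change.
// Hypothetical sketch of the JIT flow (illustration only, not part of this commit)
static const char* const scale_kernel = R"__migraphx__(
#include <migraphx/kernels/index.hpp>
#include <args.hpp>
using namespace migraphx;
extern "C" {
__global__ void scale_kernel(void* in_x, void* y)
{
    make_tensors()(in_x, y)([](auto x, auto out) {
        auto idx = make_index();
        for(index_int i = idx.global; i < out.get_shape().elements(); i += idx.nglobal())
            out[i] = x[i] * SCALE; // SCALE is injected with -DSCALE=... at compile time
    });
}
}
int main() {}
)__migraphx__";
operation compile_scale(context&, const std::vector<shape>& io_shapes, float scale)
{
    hip_compile_options options;
    options.inputs         = io_shapes;
    options.output         = io_shapes.back();
    options.local          = 128;
    options.global         = compute_global(options.output.elements(), options.local);
    options.kernel_name    = "scale_kernel";
    options.reduced_inputs = io_shapes;
    options.params += " -DSCALE=" + std::to_string(scale);
    return compile_hip_code_object(scale_kernel, options);
}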
......@@ -125,6 +125,7 @@ add_library(migraphx_gpu
compile_hip.cpp
compile_hip_code_object.cpp
compile_pointwise.cpp
compile_roialign.cpp
concat.cpp
convert.cpp
convolution.cpp
......
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/errors.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/env.hpp>
#include <cassert>
#include <iostream>
......@@ -230,6 +231,20 @@ compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std
return {compiler.compile(srcs)};
}
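// Build a comma-separated list of numbered parameter names: "param0,param1,...,param<count-1>"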
std::string enum_params(std::size_t count, std::string param)
{
std::vector<std::string> items(count);
transform(range(count), items.begin(), [&](auto i) { return param + std::to_string(i); });
return join_strings(items, ",");
}
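// Global workitem count: enough workgroups of size `local` to cover n elements, capped at 256 workgroups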
std::size_t compute_global(std::size_t n, std::size_t local)
{
std::size_t groups = (n + local - 1) / local;
std::size_t nglobal = std::min<std::size_t>(256, groups) * local;
return nglobal;
}
#endif // MIGRAPHX_USE_HIPRTC
} // namespace gpu
......
#include <migraphx/gpu/compile_pointwise.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
......@@ -28,20 +29,6 @@ int main() {}
)__migraphx__";
std::string enum_params(std::size_t count, std::string param)
{
std::vector<std::string> items(count);
transform(range(count), items.begin(), [&](auto i) { return param + std::to_string(i); });
return join_strings(items, ",");
}
std::size_t compute_global(std::size_t n, std::size_t local = 1024)
{
std::size_t groups = (n + local - 1) / local;
std::size_t nglobal = std::min<std::size_t>(256, groups) * local;
return nglobal;
}
operation compile_pointwise(context&, const std::vector<shape>& inputs, const std::string& lambda)
{
hip_compile_options options;
......
#include <migraphx/gpu/compile_roialign.hpp>
#include <migraphx/gpu/compile_hip_code_object.hpp>
#include <migraphx/gpu/compile_hip.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/ranges.hpp>
#include <migraphx/reduce_dims.hpp>
#include <migraphx/stringutils.hpp>
#include <cassert>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
// NOLINTNEXTLINE
static const char* const roialign_kernel = R"__migraphx__(
#include <migraphx/kernels/roialign.hpp>
#include <migraphx/kernels/basic_ops.hpp>
#include <args.hpp>
using namespace migraphx;
extern "C" {
__global__ void roialign_kernel(void* in_x, void* in_rois, void* in_ind, void* y)
{
make_tensors()(in_x, in_rois, in_ind, y)([](auto&&... xs) { roialign(xs...); });
}
}
int main() {}
)__migraphx__";
operation compile_roialign(context&, const std::vector<shape>& io_shapes, const value& val)
{
hip_compile_options options;
auto out_s = io_shapes.back();
options.local = 128;
options.global = compute_global(out_s.elements(), options.local);
options.inputs = io_shapes;
options.output = out_s;
options.kernel_name = "roialign_kernel";
options.reduced_inputs = io_shapes;
// sampling_ratio
assert(val.contains("sampling_ratio"));
auto sampling_ratio = val.at("sampling_ratio").to<int64_t>();
options.params += " -DSAMPLING_RATIO=" + std::to_string(sampling_ratio);
// pooling_mode
assert(val.contains("mode"));
auto mode = val.at("mode").to<std::string>();
bool is_avg_pooling = (mode == "avg");
options.params += " -DIS_AVG_POOLING=" + std::to_string(static_cast<int>(is_avg_pooling));
// coord_trans_mode
assert(val.contains("coordinate_transformation_mode"));
auto ctm = val.at("coordinate_transformation_mode").to<std::string>();
float rois_offset = (ctm == "output_half_pixel") ? -0.5f : 0.0f;
options.params += " -DROIS_OFFSET=" + std::to_string(rois_offset);
// spatial_scale
assert(val.contains("spatial_scale"));
float spatial_scale = val.at("spatial_scale").to<float>();
options.params += " -DSPATIAL_SCALE=" + std::to_string(spatial_scale);
return compile_hip_code_object(roialign_kernel, options);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -15,6 +15,10 @@ namespace gpu {
std::vector<std::vector<char>>
compile_hip_src(const std::vector<src_file>& srcs, std::string params, const std::string& arch);
std::string enum_params(std::size_t count, std::string param);
std::size_t compute_global(std::size_t n, std::size_t local = 1024);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
#ifndef MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#define MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
#include <migraphx/config.hpp>
#include <migraphx/operation.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
operation compile_roialign(context& ctx, const std::vector<shape>& io_shapes, const value& val);
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_GPU_COMPILE_ROIALIGN_HPP
......@@ -2,40 +2,51 @@
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_ARRAY_HPP
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/type_traits.hpp>
#include <migraphx/kernels/integral_constant.hpp>
#include <migraphx/kernels/debug.hpp>
namespace migraphx {
// NOLINTNEXTLINE
#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op) \
constexpr array& operator op(const array& x) \
{ \
for(index_int i = 0; i < N; i++) \
d[i] op x[i]; \
return *this; \
} \
constexpr array& operator op(const T& x) \
{ \
for(index_int i = 0; i < N; i++) \
d[i] op x; \
return *this; \
} \
friend constexpr array operator binary_op(const array& x, const array& y) \
{ \
auto z = x; \
return z op y; \
} \
friend constexpr array operator binary_op(const array& x, const T& y) \
{ \
auto z = x; \
return z op y; \
} \
friend constexpr array operator binary_op(const T& x, const array& y) \
{ \
for(index_int i = 0; i < N; i++) \
y[i] = x op y[i]; \
return y; \
#define MIGRAPHX_DEVICE_ARRAY_OP(op, binary_op) \
template <class U> \
constexpr array& operator op(const array<U, N>& x) \
{ \
for(index_int i = 0; i < N; i++) \
d[i] op x[i]; \
return *this; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
constexpr array& operator op(const U& x) \
{ \
for(index_int i = 0; i < N; i++) \
d[i] op x; \
return *this; \
} \
template <class U> \
friend constexpr auto operator binary_op(const array& x, const array<U, N>& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
for(index_int i = 0; i < N; i++) \
z[i] = x[i] binary_op y[i]; \
return z; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
friend constexpr auto operator binary_op(const array& x, const U& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
for(index_int i = 0; i < N; i++) \
z[i] = x[i] binary_op y; \
return z; \
} \
template <class U, MIGRAPHX_REQUIRES(is_convertible<U, T>{})> \
friend constexpr auto operator binary_op(const U& x, const array& y) \
{ \
array<decltype(T {} binary_op U{}), N> z{}; \
for(index_int i = 0; i < N; i++) \
z[i] = x binary_op y[i]; \
return z; \
}
template <class T, index_int N>
......
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_BASIC_OPS_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_BASIC_OPS_HPP
#include <migraphx/kernels/types.hpp>
namespace migraphx {
struct sum
{
template <class T, class U>
constexpr auto operator()(T x, U y) const
{
return x + y;
}
};
struct product
{
template <class T, class U>
constexpr auto operator()(T x, U y) const
{
return x * y;
}
};
struct id
{
template <class T>
constexpr auto operator()(T x) const
{
return x;
}
};
struct mean
{
size_t item_num = 1;
template <class T>
constexpr auto operator()(T x) const
{
return x / static_cast<T>(item_num);
}
};
struct max_f
{
template <class T, class U>
constexpr auto operator()(T x, U y) const
{
return (x > y) ? x : y;
}
};
inline constexpr auto max = max_f{};
struct min_f
{
template <class T, class U>
constexpr auto operator()(T x, U y) const
{
return (x < y) ? x : y;
}
};
inline constexpr auto min = min_f{};
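// lowest/highest convert implicitly to the smallest/largest value of the target numeric type
// (used e.g. to initialize a max reduction)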
struct lowest
{
template <class T>
constexpr operator T() const
{
return std::numeric_limits<T>::lowest();
}
};
struct highest
{
template <class T>
constexpr operator T() const
{
return std::numeric_limits<T>::max();
}
};
} // namespace migraphx
#endif // MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_BASIC_OPS_HPP
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_DFOR_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_DFOR_HPP
namespace migraphx {
// Multidimensional for loop
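// Usage: dfor(n, m)([&](auto i, auto j) { ... }); iterates i = 0..n-1 (outer) and j = 0..m-1 (inner)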
inline constexpr auto dfor()
{
return [](auto f) { f(); };
}
template <class T, class... Ts>
constexpr auto dfor(T x, Ts... xs)
{
return [=](auto f) {
for(T i = 0; i < x; i++)
{
dfor(xs...)([&](Ts... is) { f(i, is...); });
}
};
}
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
#define MIGRAPHX_GUARD_KERNELS_ROIALIGN_HPP
#include <migraphx/kernels/index.hpp>
#include <migraphx/kernels/dfor.hpp>
#include <migraphx/kernels/basic_ops.hpp>
#include <args.hpp>
namespace migraphx {
struct max_pool
{
MIGRAPHX_DEVICE_CONSTEXPR auto init() { return lowest(); }
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y)
{
return max(x, y);
}
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T final(T x, std::size_t)
{
return (x);
}
};
struct avg_pool
{
MIGRAPHX_DEVICE_CONSTEXPR auto init() { return 0.0; }
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T operator()(T x, T y)
{
return x + y;
}
template <class T>
MIGRAPHX_DEVICE_CONSTEXPR T final(T x, std::size_t y)
{
return (y == 0) ? 0.0 : (x / y);
}
};
template <class T, class Op>
MIGRAPHX_DEVICE_CONSTEXPR T bilinear_interpolate(const T* data,
const array<std::size_t, 2>& dims,
array<float, 2> xy,
Op pooling)
{
array<int, 2> low{};
array<int, 2> high{};
for(std::size_t ii = 0; ii < xy.size(); ++ii)
{
if(xy[ii] < -1.0f or xy[ii] > dims[ii])
{
return 0;
}
xy[ii] = max(xy[ii], 0.0f);
low[ii] = xy[ii];
high[ii] = low[ii] + 1;
if(low[ii] >= dims[ii] - 1)
{
xy[ii] = high[ii] = low[ii] = dims[ii] - 1;
}
}
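// Flattened offsets of the four neighboring pixels: (low,low), (low,high), (high,low), (high,high)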
array<std::size_t, 4> locs = {low[0] * dims[1] + low[1],
low[0] * dims[1] + high[1],
high[0] * dims[1] + low[1],
high[0] * dims[1] + high[1]};
float ly = xy[0] - low[0];
float lx = xy[1] - low[1];
float hy = 1.0f - ly;
float hx = 1.0f - lx;
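// Bilinear interpolation weights matching the four offsets above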
array<T, 4> ws = {hy * hx, hy * lx, ly * hx, ly * lx};
auto v01 = pooling(data[locs[0]] * ws[0], data[locs[1]] * ws[1]);
auto v23 = pooling(data[locs[2]] * ws[2], data[locs[3]] * ws[3]);
return pooling(v01, v23);
}
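// Pool the bilinearly interpolated samples of one output bin over its sampling grid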
template <class T, class Op>
MIGRAPHX_DEVICE_CONSTEXPR T calc_pooling(const T*& data,
const array<float, 2>& roi_starts,
const array<float, 2>& bin_size,
const array<int, 2>& idx,
const array<std::size_t, 2>& bin_grid_size,
const array<std::size_t, 2>& dims,
float roi_offset,
Op op)
{
T output_val = op.init();
const int64_t count = bin_grid_size[0] * bin_grid_size[1];
dfor(bin_grid_size[0], bin_grid_size[1])([&](auto iy, auto ix) {
array<std::size_t, 2> id = {iy, ix};
array<float, 2> locs =
roi_starts + idx * bin_size + bin_size * (id + 0.5f) / bin_grid_size + roi_offset;
auto val = bilinear_interpolate(data, dims, locs, op);
output_val = op(output_val, val);
});
return op.final(output_val, count);
}
template <class T, class U, class V, class W>
__device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W& y_t)
{
const float roi_offset = ROIS_OFFSET;
const bool is_avg_pooling = IS_AVG_POOLING;
const int64_t sampling_ratio = SAMPLING_RATIO;
const float spatial_scale = SPATIAL_SCALE;
auto index = make_index();
const auto* x = x_t.data();
const auto* rois = rois_t.data();
const auto* ind = ind_t.data();
auto* out_ptr = y_t.data();
// input shape
auto x_lens = x_t.get_shape().lens;
auto channel_num = x_lens[1];
// input dims of height and width, in all 2-dim arrays, the first dim
// is for height and second dim is for width
array<std::size_t, 2> in_dims = {x_lens[2], x_lens[3]};
const auto stride = index.nglobal();
auto out_s = y_t.get_shape();
auto roi_column_num = rois_t.get_shape().lens[1];
// output dims of height and width, in all 2-dim arrays, the first dim
// is for height and second dim is for width
const auto& out_lens = out_s.lens;
array<std::size_t, 2> out_dims = {out_lens[2], out_lens[3]};
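// Grid-stride loop: each workitem handles output elements global, global + nglobal, ...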
for(index_int i = index.global; i < out_s.elements(); i += stride)
{
auto idx = out_s.multi(i);
int n = idx[0];
int c = idx[1];
int ph = idx[2];
int pw = idx[3];
const auto* offset_rois = rois + (n * roi_column_num);
const int batch_ind = ind[n];
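// ROIs are stored as (x1, y1, x2, y2); reorder to (y, x) to match the height-first convention noted above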
array<float, 2> roi_starts = {offset_rois[1] * spatial_scale,
offset_rois[0] * spatial_scale};
array<float, 2> roi_ends = {offset_rois[3] * spatial_scale, offset_rois[2] * spatial_scale};
array<float, 2> roi_size{};
array<float, 2> bin_size{};
array<std::size_t, 2> bin_grid_size{};
for(std::size_t ii = 0; ii < roi_size.size(); ++ii)
{
roi_size[ii] = roi_ends[ii] - roi_starts[ii];
roi_size[ii] = max(roi_size[ii], 1.0f);
bin_size[ii] = roi_size[ii] / out_dims[ii];
bin_grid_size[ii] =
(sampling_ratio > 0) ? sampling_ratio : std::ceil(roi_size[ii] / out_dims[ii]);
}
const auto* offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
if constexpr(is_avg_pooling)
{
out_ptr[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
roi_offset,
avg_pool{});
}
else
{
out_ptr[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
roi_offset,
max_pool{});
}
}
}
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPE_TRAITS_HPP
#define MIGRAPHX_GUARD_AMDMIGRAPHX_KERNELS_TYPE_TRAITS_HPP
#include <migraphx/kernels/types.hpp>
#include <migraphx/kernels/integral_constant.hpp>
namespace migraphx {
template <bool B, class T = void>
struct enable_if
{
};
template <class T>
struct enable_if<true, T>
{
using type = T;
};
template <bool B, class T = void>
using enable_if_t = typename enable_if<B, T>::type;
template <class From, class To>
struct is_convertible : bool_constant<__is_convertible(From, To)>
{
};
#define MIGRAPHX_REQUIRES(...) class = enable_if_t<__VA_ARGS__>
} // namespace migraphx
#endif
......@@ -20,6 +20,7 @@
#include <migraphx/gpu/abs.hpp>
#include <migraphx/gpu/batch_norm_inference.hpp>
#include <migraphx/gpu/compile_roialign.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/convolution.hpp>
#include <migraphx/gpu/deconvolution.hpp>
......@@ -473,20 +474,16 @@ struct miopen_apply
void add_roialign()
{
apply_map.emplace("roialign", [=](instruction_ref ins) {
auto s = ins->get_shape();
auto op_val = ins->get_operator().to_value();
auto output = insert_allocation(ins, s);
std::vector<instruction_ref> cpu_inputs;
auto inputs = ins->inputs();
std::transform(
inputs.begin(), inputs.end(), std::back_inserter(cpu_inputs), [&](auto in) {
return mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), in);
});
cpu_inputs.front() =
mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_inputs);
auto cpu_out = mod->insert_instruction(ins, ins->get_operator(), cpu_inputs);
auto gpu_out =
mod->insert_instruction(ins, make_op("hip::copy_to_gpu"), cpu_out, output);
return mod->replace_instruction(ins, gpu_out);
auto args = ins->inputs();
args.push_back(output);
auto io_shapes = to_shapes(args);
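// JIT-compile a roialign kernel specialized to these shapes and attribute values,
// then call it in place of the reference op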
auto co = compile_roialign(get_context(), io_shapes, op_val);
return mod->replace_instruction(ins, co, args);
});
}
......
#include "verify_program.hpp"
#include <migraphx/program.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/make_op.hpp>
struct test_roialign_nondefault : verify_program<test_roialign_nondefault>
{
migraphx::program create_program() const
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape x_s{migraphx::shape::float_type, {5, 4, 10, 10}};
migraphx::shape roi_s{migraphx::shape::float_type, {5, 4}};
migraphx::shape ind_s{migraphx::shape::int64_type, {5}};
std::vector<int64_t> ind_vec = {0, 2, 3, 4, 1};
auto x = mm->add_parameter("x", x_s);
auto roi = mm->add_parameter("roi", roi_s);
auto ind = mm->add_literal(migraphx::literal(ind_s, ind_vec));
auto r = mm->add_instruction(
migraphx::make_op("roialign",
{{"coordinate_transformation_mode", "output_half_pixel"},
{"mode", "max"},
{"spatial_scale", 1.0},
{"output_height", 5},
{"output_width", 5},
{"sampling_ratio", 2}}),
x,
roi,
ind);
mm->add_return({r});
return p;
}
};