Commit 4957715b authored by turneram

Merge remote-tracking branch 'origin/develop' into dev2

parents f99a3036 4ec8209f
......@@ -124,8 +124,8 @@ __device__ auto block_reduce(index idx, Op op, T init, index_int n, F f)
}
#endif
template <class Input, class T, class Output>
constexpr auto reduce_slice(Input input, T i, Output)
template <class Output, class Input, class T>
constexpr auto reduce_slice(Input input, T i)
{
constexpr auto lens = transform(get_shape_c<Input>{}.lens,
get_shape_c<Output>{}.lens,
......@@ -136,23 +136,126 @@ constexpr auto reduce_slice(Input input, T i, Output)
});
constexpr auto s = make_shape(lens, get_shape_c<Input>{}.strides);
MIGRAPHX_ASSERT((input.get_shape().index(i) + s.element_space()) <=
input.get_shape().element_space());
return make_tensor_view(&input[i], s);
}
template <class Op, class T, class Input, class Output, class ReadInput, class WriteOutput>
namespace reduce {
template <class Slicer, class F>
constexpr auto sliced(Slicer slicer, F f)
{
return [=](auto x, auto... xs) {
// TODO: assert all elements are the same
return f(slicer(x), slicer(xs)...);
};
}
struct block
{
template <class Slicer>
struct reducer
{
index idx;
Slicer slicer;
template <class Op, class T, class Read>
__device__ auto reduce(Op op, T init, Read read) const
{
return sliced(slicer, [=](auto x, auto... xs) {
return block_reduce(idx, op, init, x.get_shape().elements(), [&](auto j) {
return read(x[j], xs[j]...);
});
});
}
template <class F>
__device__ void outer(F f) const
{
if(idx.local == 0)
f();
}
};
template <class Slicer>
static __device__ auto make(index idx, Slicer slicer)
{
return reducer<Slicer>{idx, slicer};
}
template <class Output, class F>
static __device__ void run(F f)
{
auto idx = make_index();
constexpr auto nelements = get_shape_c<Output>{}.elements();
idx.global_stride(nelements * idx.nlocal(), [&](auto i) {
const auto out_idx = get_shape_c<Output>{}.multi(i / idx.nlocal());
f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
});
}
};
struct lane
{
template <class Slicer>
struct reducer
{
index idx;
Slicer slicer;
template <class Op, class T, class Read>
__device__ auto reduce(Op op, T init, Read read) const
{
return sliced(slicer, [=](auto x, auto... xs) {
using type = typename decltype(x)::type;
type r = init;
for(index_int j = 0; j < x.get_shape().elements(); j++)
{
r = op(r, read(x[j], xs[j]...));
}
return r;
});
}
template <class F>
__device__ void outer(F f) const
{
f();
}
};
template <class Slicer>
static __device__ auto make(index idx, Slicer slicer)
{
return reducer<Slicer>{idx, slicer};
}
template <class Output, class F>
static __device__ void run(F f)
{
auto idx = make_index();
constexpr auto nelements = get_shape_c<Output>{}.elements();
idx.global_stride(nelements, [&](auto i) {
const auto out_idx = get_shape_c<Output>{}.multi(i);
f(out_idx, make(idx, [&](auto input) { return reduce_slice<Output>(input, out_idx); }));
});
}
};
} // namespace reduce
template <class Algo,
class Op,
class T,
class Input,
class Output,
class ReadInput,
class WriteOutput>
__device__ void
simple_reduce(Op op, T init, Input input, Output output, ReadInput read, WriteOutput write)
{
auto idx = make_index();
constexpr auto nelements = get_shape_c<Output>{}.elements();
constexpr auto relements = get_shape_c<Input>{}.elements() / get_shape_c<Output>{}.elements();
idx.global_stride(nelements * idx.nlocal(), [&](auto i) {
const auto out_idx = output.get_shape().multi(i / idx.nlocal());
auto rs = reduce_slice(input, out_idx, output);
MIGRAPHX_ASSERT(relements == rs.get_shape().elements());
auto r = block_reduce(idx, op, init, relements, [&](auto j) { return read(rs[j]); });
if(idx.local == 0)
output[out_idx] = write(r);
Algo::template run<Output>([&](auto out_idx, auto r) {
auto x = r.reduce(op, init, read)(input);
r.outer([&] { output[out_idx] = write(x); });
});
}
......
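The hunk above replaces simple_reduce's hard-coded block_reduce loop with an Algo policy: reduce::block and reduce::lane each expose run<Output>(f), which visits every output coordinate and hands f a reducer whose reduce() folds the sliced inputs (reduce_slice<Output>(input, out_idx) carves out the sub-tensor of input that contributes to output[out_idx]) and whose outer() gates the final write (once per block for block, unconditionally for lane). A minimal host-side sketch of the same policy pattern, using a hypothetical serial algo over std::vector in place of the real device code and contiguous slices in place of strided tensor views:

#include <cstddef>
#include <iostream>
#include <vector>

// Hypothetical host-side stand-in for reduce::block / reduce::lane.
struct serial
{
    struct reducer
    {
        std::size_t out_idx;
        std::size_t relements; // reduction elements per output

        template <class Op, class T, class Read>
        T reduce(Op op, T init, Read read, const std::vector<T>& input) const
        {
            T r = init;
            for(std::size_t j = 0; j < relements; j++)
                r = op(r, read(input[out_idx * relements + j]));
            return r;
        }

        // One "thread" per output here, so the final write is unconditional;
        // reduce::block would run f only when idx.local == 0.
        template <class F>
        void outer(F f) const
        {
            f();
        }
    };

    template <class F>
    static void run(std::size_t nelements, std::size_t relements, F f)
    {
        for(std::size_t i = 0; i < nelements; i++)
            f(i, reducer{i, relements});
    }
};

template <class Algo, class Op, class T, class Read, class Write>
void simple_reduce_sketch(
    Op op, T init, const std::vector<T>& input, std::vector<T>& output, Read read, Write write)
{
    Algo::run(output.size(), input.size() / output.size(), [&](auto out_idx, auto r) {
        auto x = r.reduce(op, init, read, input);
        r.outer([&] { output[out_idx] = write(x); });
    });
}

int main()
{
    std::vector<float> in{1, 2, 3, 4, 5, 6};
    std::vector<float> out(2);
    simple_reduce_sketch<serial>(
        [](float a, float b) { return a + b; },
        0.0f,
        in,
        out,
        [](float x) { return x; },
        [](float x) { return x; });
    std::cout << out[0] << " " << out[1] << "\n"; // prints: 6 15
}

Swapping the Algo template argument switches the iteration and write-synchronization strategy without touching the reduction body itself.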
......@@ -118,15 +118,13 @@ constexpr roalign_settings<Ts...> make_roalign_settings(Ts... xs)
}
template <class T, class U, class V, class W, class Settings>
__device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W& y_t, Settings s)
__device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, W& y_t, Settings s)
{
auto index = make_index();
const auto x = x_t.begin();
const auto rois = rois_t.begin();
const auto ind = ind_t.begin();
auto out_ptr = y_t.begin();
// input shape
auto x_lens = x_t.get_shape().lens;
auto channel_num = x_lens[1];
......@@ -176,25 +174,25 @@ __device__ void roialign(const T& x_t, const U& rois_t, const V& ind_t, const W&
const auto offset_x = x + ((batch_ind * channel_num + c) * in_dims[0] * in_dims[1]);
if constexpr(s.is_avg_pooling)
{
out_ptr[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
s.roi_offset,
avg_pool{});
y_t[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
s.roi_offset,
avg_pool{});
}
else
{
out_ptr[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
s.roi_offset,
max_pool{});
y_t[i] = calc_pooling(offset_x,
roi_starts,
bin_size,
{ph, pw},
bin_grid_size,
in_dims,
s.roi_offset,
max_pool{});
}
}
}
......
......@@ -29,11 +29,23 @@ struct tensor_view
constexpr Shape get_shape() const { return Shape{}; }
constexpr auto size() const { return get_shape().elements(); }
template <class U>
constexpr T& operator[](U i) const
{
    MIGRAPHX_ASSERT(get_shape().index(i) < get_shape().element_space());
    return x[get_shape().index(i)];
}
struct index_to_offset
{
    index_int offset;
    template <class U>
    constexpr index_to_offset(U i) : offset(Shape{}.index(i))
    {
    }
};
constexpr T& operator[](MIGRAPHX_CAPTURE_SOURCE_LOCATION(index_to_offset) i) const
{
    index_to_offset ito = i;
    MIGRAPHX_WARN(ito.offset < get_shape().element_space(),
                  i,
                  "Out of bounds access at offset: ",
                  ito.offset);
    return x[ito.offset];
}
constexpr T* data() const { return x; }
......
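The tensor_view change funnels every subscript through an index_to_offset conversion so the computed offset can be checked before the dereference (MIGRAPHX_WARN replacing the hard MIGRAPHX_ASSERT, and MIGRAPHX_CAPTURE_SOURCE_LOCATION presumably attaching the caller's location to the report). A simplified host-side sketch of the conversion trick, with a plain assert standing in for those macros:

#include <cassert>
#include <cstddef>
#include <iostream>

// Simplified stand-in for the bounds-checked operator[]: the implicit
// conversion to index_to_offset computes the offset once, so the check
// lives in a single place. The real code derives the offset from
// Shape::index(i) and warns rather than asserting.
template <class T, std::size_t N>
struct checked_view
{
    T* x;

    struct index_to_offset
    {
        std::size_t offset;
        index_to_offset(std::size_t i) : offset(i) {} // implicit on purpose
    };

    T& operator[](index_to_offset ito) const
    {
        assert(ito.offset < N && "Out of bounds access");
        return x[ito.offset];
    }
};

int main()
{
    int data[4] = {0, 1, 2, 3};
    checked_view<int, 4> v{data};
    std::cout << v[2] << "\n"; // ok; v[7] would trip the assert
}

Accepting a named conversion struct instead of a template parameter U is what lets the subscript keep taking both flat and multi-indices while checking one canonical offset.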
......@@ -60,10 +60,19 @@ constexpr auto common_vec_size()
})(vec_size<Ts>()...);
}
// Bools cannot be used as a vector type, so convert them to uint8
template <class T>
__device__ __host__ T* remove_bool(T* x)
{
return x;
}
inline __device__ __host__ uint8_t* remove_bool(bool* x) { return reinterpret_cast<uint8_t*>(x); }
template <index_int N, class T>
__device__ __host__ auto as_vec(T* x)
{
if constexpr(N == 0)
if constexpr(N < 2)
return x;
else
return reinterpret_cast<vec<T, N>*>(x);
......
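Two details in this hunk: remove_bool retags bool* as uint8_t* because bool is not a valid vector element type, and the vectorization guard changed from N == 0 to N < 2, so a vector size of 1 now also takes the scalar path (matching find_vectorize_size returning _c<1> instead of _c<0> below). A host-side sketch, assuming a clang-style ext_vector_type in the spirit of the kernel's vec<T, N> (HIP device code compiles with clang):

#include <cstdint>
#include <iostream>

// Clang-style vector type, similar in spirit to the kernel's vec<T, N>.
template <class T, int N>
using vec = T __attribute__((ext_vector_type(N)));

// Bools cannot be vector elements, so retag the pointer as uint8_t*.
template <class T>
T* remove_bool(T* x)
{
    return x;
}
inline std::uint8_t* remove_bool(bool* x) { return reinterpret_cast<std::uint8_t*>(x); }

template <int N, class T>
auto as_vec(T* x)
{
    if constexpr(N < 2) // N == 0 or N == 1: nothing to vectorize
        return x;
    else
        return reinterpret_cast<vec<T, N>*>(x); // same reinterpret the kernel does
}

int main()
{
    float data[8] = {0, 1, 2, 3, 4, 5, 6, 7};
    auto* v = as_vec<4>(data); // vec<float, 4>*
    std::cout << v[1][0] << "\n"; // 4

    bool flags[4] = {true, false, true, true};
    auto* b = as_vec<4>(remove_bool(flags)); // vec<uint8_t, 4>*
    std::cout << int(b[0][2]) << "\n"; // 1
}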
......@@ -50,19 +50,10 @@ constexpr auto shape_step(Shape s, Axis)
});
}
// Bools cannot be used as a vector type, so convert them to uint8
template <class T>
__device__ __host__ T* remove_bool(T* x)
{
return x;
}
inline __device__ __host__ uint8_t* remove_bool(bool* x) { return reinterpret_cast<uint8_t*>(x); }
template <index_int N, class T, class Axis>
__device__ __host__ auto as_vec(T x, Axis axis)
{
if constexpr(N == 0)
if constexpr(N < 2)
return x;
else
return make_tensor_view(as_vec<N>(remove_bool(x.data())),
......@@ -72,7 +63,7 @@ __device__ __host__ auto as_vec(T x, Axis axis)
template <index_int N, class T, class Axis>
constexpr auto tensor_step(T x, Axis axis)
{
if constexpr(N == 0)
if constexpr(N < 2)
{
return x;
}
......@@ -157,11 +148,11 @@ constexpr auto find_vectorize_size(P pred)
else if constexpr(decltype(pred(_c<2>)){})
return _c<2>;
else
return _c<0>;
return _c<1>;
}
template <class T>
__host__ __device__ auto vectorize(T x)
__host__ __device__ auto auto_vectorize(T x)
{
if constexpr(tensor_vec_size<T>() == 0)
{
......@@ -194,7 +185,7 @@ inline __device__ __host__ auto auto_vectorize_impl(F f, Ts... xs)
{
MIGRAPHX_ASSERT(s.strides[axis] == 0 or s.strides[axis] == 1);
MIGRAPHX_ASSERT(s.lens[axis] > 0);
MIGRAPHX_ASSERT(n == 0 or s.lens[axis] % n == 0);
MIGRAPHX_ASSERT(n == 1 or s.lens[axis] % n == 0);
if constexpr(s.strides[axis] == 0)
return tensor_step<n>(x, axis);
else
......@@ -215,7 +206,32 @@ inline __device__ __host__ auto auto_vectorize_impl(F f, Ts... xs)
inline __device__ __host__ auto auto_vectorize()
{
return [](auto... xs) { return [=](auto f) { auto_vectorize_impl(f, xs...); }; };
return make_transform([](auto f, auto... xs) { auto_vectorize_impl(f, xs...); });
}
template <index_int N, index_int Axis, class T>
__device__ __host__ auto vectorize_tensor(T x)
{
constexpr auto shape = get_shape_c<T>{};
if constexpr(shape.strides[Axis] == 0)
return tensor_step<N>(x, _c<Axis>);
else
return as_vec<N>(x, _c<Axis>);
}
template <index_int N, index_int Axis>
__device__ __host__ auto vectorize()
{
return make_transform([](auto f, auto... xs) {
if constexpr(N < 2)
{
f(xs...);
}
else
{
f(vectorize_tensor<N, Axis>(xs)...);
}
});
}
} // namespace migraphx
......
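Both auto_vectorize and the new explicit vectorize<N, Axis> are now expressed through make_transform, which packages an f(g, xs...) continuation into the curried transform(xs...)(g) shape that the replaced hand-written lambda had. A hedged sketch of what make_transform plausibly looks like, inferred from that lambda (the real definition lives elsewhere in the codebase):

#include <iostream>

// Inferred shape of make_transform: not the library's definition, just the
// curried form of the lambda it replaces. transform(xs...) returns a callable
// that takes the kernel body g and invokes f(g, xs...).
template <class F>
constexpr auto make_transform(F f)
{
    return [=](auto... xs) { return [=](auto g) { f(g, xs...); }; };
}

int main()
{
    auto doubled = make_transform([](auto g, auto... xs) { g(2 * xs...); });
    doubled(3, 4)([](int a, int b) { std::cout << a + b << "\n"; }); // prints: 14
}

With that shape, a transform wraps a kernel body: vectorize<N, Axis>() either forwards the tensors untouched (N < 2) or rewrites each with vectorize_tensor<N, Axis> before invoking f.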
......@@ -505,7 +505,7 @@ struct ref_unary : auto_register_op<ref_unary<Op>>
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(1);
auto s = inputs.at(0);
const auto& s = inputs.at(0);
return {s.type(), s.lens()};
}
......
function(add_api_test TEST_NAME TEST_SRC TEST_DIR)
set(NAME test_api_${TEST_NAME})
add_executable(${NAME} EXCLUDE_FROM_ALL ${TEST_SRC})
......@@ -10,6 +9,7 @@ function(add_api_test TEST_NAME TEST_SRC TEST_DIR)
add_dependencies(check ${NAME})
endfunction()
add_api_test(array_base test_array_base.cpp ${TEST_ONNX_DIR})
add_api_test(assign test_assign.cpp ${TEST_ONNX_DIR})
add_api_test(custom_op test_custom_op.cpp ${TEST_ONNX_DIR})
add_api_test(compile_options test_compile_options.cpp ${TEST_ONNX_DIR})
......@@ -19,7 +19,8 @@ add_api_test(ref test_cpu.cpp ${TEST_ONNX_DIR})
add_api_test(save_load test_save_load.cpp ${TEST_ONNX_DIR})
add_api_test(op test_op_construct.cpp ${TEST_ONNX_DIR})
add_api_test(tf_parser test_tf_parser.cpp ${TEST_TF_DIR})
# GPU-based tests
if(MIGRAPHX_ENABLE_GPU)
add_api_test(gpu test_gpu.cpp ${TEST_ONNX_DIR})
# GPU-based tests
target_link_libraries(test_api_gpu migraphx_gpu)
endif()
#include <migraphx/migraphx.hpp>
#include "test.hpp"
struct array2 : migraphx::array_base<array2>
{
std::vector<int> v;
array2() = default;
array2(std::initializer_list<int> x) : v(x) {}
std::size_t size() const { return v.size(); }
int operator[](std::size_t i) const { return v[i]; }
};
TEST_CASE(iterators)
{
array2 a = {1, 2, 3};
EXPECT(bool{std::equal(a.begin(), a.end(), a.v.begin())});
}
TEST_CASE(front_back)
{
array2 a = {1, 2, 3};
EXPECT(a.front() == 1);
EXPECT(a.back() == 3);
}
TEST_CASE(empty)
{
array2 a = {1, 2, 3};
EXPECT(not a.empty());
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
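In the test above, array2 supplies only size() and operator[]; the iteration, front/back, and empty checks all come from the migraphx::array_base CRTP base. A minimal sketch of how such a base can synthesize those helpers (illustrative, not the API's actual definition):

#include <cassert>
#include <cstddef>
#include <initializer_list>
#include <vector>

// Illustrative CRTP base deriving container helpers from size() and
// operator[] alone; migraphx::array_base presumably works along these lines.
template <class Derived>
struct array_base_sketch
{
    const Derived& self() const { return static_cast<const Derived&>(*this); }

    struct iterator
    {
        const Derived* d;
        std::size_t i;
        auto operator*() const { return (*d)[i]; }
        iterator& operator++()
        {
            ++i;
            return *this;
        }
        bool operator!=(const iterator& rhs) const { return i != rhs.i; }
    };

    iterator begin() const { return {&self(), 0}; }
    iterator end() const { return {&self(), self().size()}; }
    bool empty() const { return self().size() == 0; }
    auto front() const { return self()[0]; }
    auto back() const { return self()[self().size() - 1]; }
};

struct array2 : array_base_sketch<array2>
{
    std::vector<int> v;
    array2(std::initializer_list<int> x) : v(x) {}
    std::size_t size() const { return v.size(); }
    int operator[](std::size_t i) const { return v[i]; }
};

int main()
{
    array2 a = {1, 2, 3};
    std::size_t k = 0;
    for(int x : a) // range-for via the synthesized begin()/end()
        assert(x == a.v[k++]);
    assert(a.front() == 1 and a.back() == 3 and not a.empty());
}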
#include <numeric>
#include <hip/hip_runtime_api.h>
#include <migraphx/migraphx.h>
#include <migraphx/migraphx.hpp>
#include "test.hpp"
......@@ -38,6 +39,7 @@ TEST_CASE(load_and_run_ctx)
pp.add(name, migraphx::argument::generate(param_shapes[name]));
}
auto ctx = p.experimental_get_context();
EXPECT(ctx.get_queue<hipStream_t>() != nullptr);
p.eval(pp);
ctx.finish();
}
......
......@@ -3,23 +3,21 @@
#include <migraphx/migraphx.hpp>
#include "test.hpp"
TEST_CASE(add_op)
TEST_CASE(add_literals)
{
migraphx::program p;
migraphx::module m = p.get_main_module();
migraphx::shape param_shape{migraphx_shape_float_type, {3, 3}};
auto x = m.add_parameter("x", param_shape);
auto y = m.add_parameter("y", param_shape);
std::vector<float> x_values(9, 1);
auto x = m.add_literal(param_shape, x_values.data());
std::vector<float> y_values(9, -1);
auto y = m.add_literal(param_shape, y_values.data());
auto add_op = migraphx::operation("add");
auto r = m.add_instruction(add_op, {x, y});
m.add_return({r});
// run on ref target
p.compile(migraphx::target("ref"));
migraphx::program_parameters pp;
std::vector<float> x_data(9, 1);
std::vector<float> y_data(9, -1);
pp.add("x", migraphx::argument(param_shape, x_data.data()));
pp.add("y", migraphx::argument(param_shape, y_data.data()));
auto outputs = p.eval(pp);
auto output = outputs[0];
std::vector<float> expected(9, 0);
......@@ -60,16 +58,16 @@ TEST_CASE(if_then_else_op)
p.compile(migraphx::target("ref"));
auto outputs =
p.eval({{"cond", migraphx::argument(cond_s, &cond)}, {"x", x_arg}, {"y", y_arg}});
return outputs;
return outputs[0];
};
// then branch
auto then_res = run_prog(true);
CHECK(bool{then_res[0] == x_arg});
CHECK(bool{then_res == x_arg});
// else branch
auto else_res = run_prog(false);
CHECK(bool{else_res[0] == y_arg});
CHECK(bool{else_res == y_arg});
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
......@@ -3,6 +3,7 @@
#include <migraphx/make_op.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/program.hpp>
#include <migraphx/par_for.hpp>
#include <migraphx/gpu/kernel.hpp>
#include <migraphx/gpu/target.hpp>
#include <migraphx/gpu/hip.hpp>
......@@ -109,6 +110,24 @@ int main() {}
)__migraphx__";
// NOLINTNEXTLINE
const std::string math_template = R"__migraphx__(
#include <migraphx/kernels/pointwise.hpp>
#include <migraphx/kernels/math.hpp>
extern "C" {
__global__ void kernel(${type}* p)
{
auto x = *p;
*p = migraphx::implicit_conversion(migraphx::${invoke});
}
}
int main() {}
)__migraphx__";
migraphx::src_file make_src_file(const std::string& name, const std::string& content)
{
return {name, std::make_pair(content.data(), content.data() + content.size())};
......@@ -248,4 +267,66 @@ TEST_CASE(compile_pointwise)
EXPECT(result == output_literal.get_argument());
}
TEST_CASE(compile_math)
{
std::vector<std::string> math_invoke = {
// clang-format off
"abs(x)",
"acos(x)",
"acosh(x)",
"asin(x)",
"asinh(x)",
"atan(x)",
"atanh(x)",
"ceil(x)",
"cos(x)",
"cosh(x)",
"erf(x)",
"exp(x)",
"floor(x)",
"isnan(x)",
"log(x)",
"max(x, x)",
"min(x, x)",
"pow(x, 0)",
"pow(x, x)",
"round(x)",
"rsqrt(x)",
"sin(x)",
"sinh(x)",
"sqrt(x)",
"tan(x)",
"tanh(x)",
"where(true, x, x)",
// clang-format on
};
std::vector<std::string> data_types;
auto vec_sizes = {2, 4, 6};
for(auto&& t : migraphx::shape::types())
{
if(contains({migraphx::shape::bool_type, migraphx::shape::tuple_type}, t))
continue;
auto name = migraphx::shape::cpp_type(t);
if(t == migraphx::shape::half_type)
name.insert(0, "migraphx::");
data_types.push_back(name);
migraphx::transform(vec_sizes, std::back_inserter(data_types), [&](auto i) {
return "migraphx::vec<" + name + ", " + std::to_string(i) + ">";
});
}
migraphx::shape input{migraphx::shape::float_type, {5, 2}};
migraphx::gpu::hip_compile_options options;
options.global = 1024;
options.local = 1024;
options.inputs = {input};
options.output = input;
migraphx::par_for(math_invoke.size() * data_types.size(), 1, [&](auto i) {
const auto& t = data_types[i % data_types.size()];
const auto& invoke = math_invoke[i / data_types.size()];
auto src = migraphx::interpolate_string(math_template, {{"type", t}, {"invoke", invoke}});
auto co = migraphx::gpu::compile_hip_code_object(src, options);
(void)co;
});
}
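compile_math stamps out one kernel per (data type, math call) pair by substituting ${type} and ${invoke} into math_template, then compiles the whole matrix in parallel with par_for. A minimal stand-in for the ${key} substitution that migraphx::interpolate_string performs here (a sketch, not the library's implementation):

#include <cstddef>
#include <iostream>
#include <map>
#include <string>

// Sketch of ${key} placeholder substitution as used with math_template above.
std::string interpolate_sketch(std::string s, const std::map<std::string, std::string>& vars)
{
    for(const auto& [key, value] : vars)
    {
        const std::string token = "${" + key + "}";
        for(std::size_t pos = s.find(token); pos != std::string::npos;
            pos = s.find(token, pos + value.size()))
            s.replace(pos, token.size(), value);
    }
    return s;
}

int main()
{
    std::cout << interpolate_sketch("__global__ void kernel(${type}* p) { *p = ${invoke}; }",
                                    {{"type", "float"}, {"invoke", "sin(*p)"}})
              << "\n";
}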
int main(int argc, const char* argv[]) { test::run(argc, argv); }
[binary ONNX protobuf: gathernd_batch_dims_test.onnx (GatherND node with batch_dims=1; inputs 'data', 'indices'; output 'y')]
[binary ONNX protobuf: gathernd_test.onnx (GatherND node; inputs 'data', 'indices'; output 'y')]
......@@ -1686,6 +1686,34 @@ def fastgelu_test():
return ([node], [x], [y])
@onnx_test
def gathernd_test():
    x = helper.make_tensor_value_info('data', TensorProto.FLOAT, [2, 2])
    i = helper.make_tensor_value_info('indices', TensorProto.INT64, [2, 2])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2])

    node = onnx.helper.make_node('GatherND',
                                 inputs=['data', 'indices'],
                                 outputs=['y'])

    return ([node], [x, i], [y])


@onnx_test
def gathernd_batch_dims_test():
    x = helper.make_tensor_value_info('data', TensorProto.FLOAT, [2, 2, 2])
    i = helper.make_tensor_value_info('indices', TensorProto.INT64, [2, 1])
    y = helper.make_tensor_value_info('y', TensorProto.FLOAT, [2, 2])

    node = onnx.helper.make_node(
        'GatherND',
        inputs=['data', 'indices'],
        outputs=['y'],
        batch_dims=1,
    )

    return ([node], [x, i], [y])
@onnx_test
def gemm_test():
x = helper.make_tensor_value_info('0', TensorProto.FLOAT, [5, 7])
......
......@@ -1582,6 +1582,31 @@ TEST_CASE(gather_elements_axis1_test)
EXPECT(p == prog);
}
TEST_CASE(gathernd_test)
{
migraphx::program p;
auto* mm = p.get_main_module();
auto l0 = mm->add_parameter("data", migraphx::shape{migraphx::shape::float_type, {2, 2}});
auto l1 = mm->add_parameter("indices", migraphx::shape{migraphx::shape::int64_type, {2, 2}});
mm->add_instruction(migraphx::make_op("gathernd"), l0, l1);
auto prog = optimize_onnx("gathernd_test.onnx");
EXPECT(p == prog);
}
TEST_CASE(gathernd_batch_dims_test)
{
migraphx::program p;
auto* mm = p.get_main_module();
auto l0 = mm->add_parameter("data", migraphx::shape{migraphx::shape::float_type, {2, 2, 2}});
auto l1 = mm->add_parameter("indices", migraphx::shape{migraphx::shape::int64_type, {2, 1}});
int batch_dims = 1;
mm->add_instruction(migraphx::make_op("gathernd", {{"batch_dims", batch_dims}}), l0, l1);
auto prog = optimize_onnx("gathernd_batch_dims_test.onnx");
EXPECT(p == prog);
}
TEST_CASE(gemm_test)
{
migraphx::program p;
......
......@@ -268,9 +268,6 @@ def create_backend_test(testname=None, target_device=None):
backend_test.exclude(r'test_expand_shape_model2_cpu')
backend_test.exclude(r'test_expand_shape_model3_cpu')
backend_test.exclude(r'test_expand_shape_model4_cpu')
backend_test.exclude(r'test_gathernd_example_float32_cpu')
backend_test.exclude(r'test_gathernd_example_int32_batch_dim1_cpu')
backend_test.exclude(r'test_gathernd_example_int32_cpu')
backend_test.exclude(r'test_identity_sequence_cpu')
backend_test.exclude(r'test_maxpool_2d_uint8_cpu')
backend_test.exclude(r'test_negative_log_likelihood_loss_*')
......
import migraphx
import migraphx, array, sys
def create_buffer(t, data, shape):
    a = array.array(t, data)
    m = memoryview(a.tobytes())
    return m.cast(t, shape)
def test_add_op():
    p = migraphx.program()
    mm = p.get_main_module()
    param_shape = migraphx.shape(lens=[3, 3], type="float")
    x = mm.add_parameter("x", param_shape)
    y = mm.add_parameter("y", param_shape)
    x = mm.add_literal(create_buffer('f', [1.0] * 9, (3, 3)))
    y = mm.add_literal(create_buffer('f', [2.0] * 9, (3, 3)))
    add_op = mm.add_instruction(migraphx.op("add"), [x, y])
    mm.add_return([add_op])
    p.compile(migraphx.get_target("ref"))
    params = {}
    params["x"] = migraphx.generate_argument(param_shape)
    params["y"] = migraphx.generate_argument(param_shape)
    output = p.run(params)[-1].tolist()
    assert output == [
        a + b for a, b in zip(params["x"].tolist(), params["y"].tolist())
    ]
    assert output == list([3.0] * 9)
def test_if_then_else():
......@@ -60,5 +61,6 @@ def test_if_then_else():
if __name__ == "__main__":
    test_add_op()
    if sys.version_info >= (3, 0):
        test_add_op()
        test_if_then_else()
import migraphx, sys
try:
    import numpy as np
except:
    sys.exit()


def test_add_op():
    p = migraphx.program()
    mm = p.get_main_module()
    x = mm.add_literal(np.ones((3, 3), dtype='float32'))
    y = mm.add_literal(2 * np.ones((3, 3), dtype='float32'))
    add_op = mm.add_instruction(migraphx.op("add"), [x, y])
    mm.add_return([add_op])
    p.compile(migraphx.get_target("ref"))
    params = {}
    output = p.run(params)[-1].tolist()
    assert output == list(3 * np.ones((9), dtype='float32'))


if __name__ == "__main__":
    test_add_op()
......@@ -1653,6 +1653,203 @@ TEST_CASE(gather_test)
}
}
TEST_CASE(gathernd_test)
{
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape ds{migraphx::shape::float_type, {2, 2}};
migraphx::shape is{migraphx::shape::int64_type, {2, 2}};
std::vector<float> data_vec(2 * 2);
std::iota(data_vec.begin(), data_vec.end(), 0);
std::vector<int64_t> indices_vec{0, 0, 1, 1};
auto data = mm->add_literal(migraphx::literal{ds, data_vec});
auto indices = mm->add_literal(migraphx::literal{is, indices_vec});
mm->add_instruction(migraphx::make_op("gathernd"), data, indices);
p.compile(migraphx::ref::target{});
auto result = p.eval({}).back();
std::vector<float> res_data{};
std::vector<float> gold{0, 3};
result.visit([&](auto output) { res_data.assign(output.begin(), output.end()); });
EXPECT(migraphx::verify_range(res_data, gold));
}
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape ds{migraphx::shape::float_type, {2, 2}};
migraphx::shape is{migraphx::shape::int64_type, {2, 1}};
std::vector<float> data_vec(2 * 2);
std::iota(data_vec.begin(), data_vec.end(), 0);
std::vector<int64_t> indices_vec{1, 0};
auto data = mm->add_literal(migraphx::literal{ds, data_vec});
auto indices = mm->add_literal(migraphx::literal{is, indices_vec});
mm->add_instruction(migraphx::make_op("gathernd"), data, indices);
p.compile(migraphx::ref::target{});
auto result = p.eval({}).back();
std::vector<float> res_data{};
std::vector<float> gold{2, 3, 0, 1};
result.visit([&](auto output) { res_data.assign(output.begin(), output.end()); });
EXPECT(migraphx::verify_range(res_data, gold));
}
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape ds{migraphx::shape::float_type, {2, 3, 1}};
migraphx::shape is{migraphx::shape::int64_type, {2, 2, 1}};
std::vector<float> data_vec(2 * 3 * 1);
std::iota(data_vec.begin(), data_vec.end(), 0);
std::vector<int64_t> indices_vec{1, 0, 0, 1};
auto data = mm->add_literal(migraphx::literal{ds, data_vec});
auto indices = mm->add_literal(migraphx::literal{is, indices_vec});
mm->add_instruction(migraphx::make_op("gathernd"), data, indices);
p.compile(migraphx::ref::target{});
auto result = p.eval({}).back();
std::vector<float> res_data{};
std::vector<float> gold{3, 4, 5, 0, 1, 2, 0, 1, 2, 3, 4, 5};
result.visit([&](auto output) { res_data.assign(output.begin(), output.end()); });
EXPECT(migraphx::verify_range(res_data, gold));
}
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape ds{migraphx::shape::float_type, {2, 3, 2, 3}};
migraphx::shape is{migraphx::shape::int64_type, {2, 2, 2}};
std::vector<float> data_vec(2 * 3 * 2 * 3);
std::iota(data_vec.begin(), data_vec.end(), 0);
std::vector<int64_t> indices_vec{0, 0, 0, 1, 0, 0, 0, 1};
const int batch_dims = 1;
auto data = mm->add_literal(migraphx::literal{ds, data_vec});
auto indices = mm->add_literal(migraphx::literal{is, indices_vec});
mm->add_instruction(
migraphx::make_op("gathernd", {{"batch_dims", batch_dims}}), data, indices);
p.compile(migraphx::ref::target{});
auto result = p.eval({}).back();
std::vector<float> res_data{};
std::vector<float> gold{0, 1, 2, 3, 4, 5, 18, 19, 20, 21, 22, 23};
result.visit([&](auto output) { res_data.assign(output.begin(), output.end()); });
EXPECT(migraphx::verify_range(res_data, gold));
}
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape ds{migraphx::shape::float_type, {2, 3, 1, 3}};
migraphx::shape is{migraphx::shape::int64_type, {2, 3, 2}};
std::vector<float> data_vec(2 * 3 * 1 * 3);
std::iota(data_vec.begin(), data_vec.end(), 0);
std::vector<int64_t> indices_vec{0, 0, 0, 1, 0, 2, 0, 2, 0, 1, 0, 0};
const int batch_dims = 2;
auto data = mm->add_literal(migraphx::literal{ds, data_vec});
auto indices = mm->add_literal(migraphx::literal{is, indices_vec});
mm->add_instruction(
migraphx::make_op("gathernd", {{"batch_dims", batch_dims}}), data, indices);
p.compile(migraphx::ref::target{});
auto result = p.eval({}).back();
std::vector<float> res_data{};
std::vector<float> gold{0, 4, 8, 11, 13, 15};
result.visit([&](auto output) { res_data.assign(output.begin(), output.end()); });
EXPECT(migraphx::verify_range(res_data, gold));
}
{
// k > r - batch_dims
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape ds{migraphx::shape::float_type, {2, 3, 1, 3}};
migraphx::shape is{migraphx::shape::int64_type, {2, 3, 3}};
std::vector<float> data_vec(2 * 3 * 1 * 3);
std::iota(data_vec.begin(), data_vec.end(), 0);
std::vector<int64_t> indices_vec(2 * 3 * 3, 0);
const int batch_dims = 2;
auto data = mm->add_literal(migraphx::literal{ds, data_vec});
auto indices = mm->add_literal(migraphx::literal{is, indices_vec});
EXPECT(test::throws([&] {
mm->add_instruction(
migraphx::make_op("gathernd", {{"batch_dims", batch_dims}}), data, indices);
}));
}
}
TEST_CASE(gathernd_negative_index_test)
{
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape ds{migraphx::shape::float_type, {2, 2}};
migraphx::shape is{migraphx::shape::int64_type, {2, 1, 1}};
std::vector<float> data_vec(2 * 2);
std::iota(data_vec.begin(), data_vec.end(), 0);
std::vector<int64_t> indices_vec{-1, 0};
auto data = mm->add_literal(migraphx::literal{ds, data_vec});
auto indices = mm->add_literal(migraphx::literal{is, indices_vec});
mm->add_instruction(migraphx::make_op("gathernd"), data, indices);
p.compile(migraphx::ref::target{});
auto result = p.eval({}).back();
std::vector<float> res_data{};
std::vector<float> gold{2, 3, 0, 1};
result.visit([&](auto output) { res_data.assign(output.begin(), output.end()); });
EXPECT(migraphx::verify_range(res_data, gold));
}
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape ds{migraphx::shape::float_type, {2, 2}};
migraphx::shape is{migraphx::shape::int64_type, {2, 1, 1}};
std::vector<float> data_vec(2 * 2);
std::iota(data_vec.begin(), data_vec.end(), 0);
std::vector<int64_t> indices_vec{-3, 0};
auto data = mm->add_literal(migraphx::literal{ds, data_vec});
auto indices = mm->add_literal(migraphx::literal{is, indices_vec});
mm->add_instruction(migraphx::make_op("gathernd"), data, indices);
p.compile(migraphx::ref::target{});
EXPECT(test::throws([&] { p.eval({}); }));
}
}
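As a concrete reading of the first case above: with data = [[0, 1], [2, 3]] and indices = [[0, 0], [1, 1]], each row of indices is a full coordinate into data, so the output is {data[0][0], data[1][1]} = {0, 3}, the gold vector. A tiny host-side sketch of that rank-2, batch_dims = 0 case (not the operator's general implementation):

#include <array>
#include <cstdint>
#include <iostream>
#include <vector>

// GatherND, rank-2 data, full index tuples, batch_dims = 0: every row of
// indices selects exactly one scalar from data.
std::vector<float> gathernd_2d(const std::vector<std::vector<float>>& data,
                               const std::vector<std::array<std::int64_t, 2>>& indices)
{
    std::vector<float> out;
    for(const auto& idx : indices)
        out.push_back(data[idx[0]][idx[1]]);
    return out;
}

int main()
{
    auto out = gathernd_2d({{0, 1}, {2, 3}}, {{0, 0}, {1, 1}});
    std::cout << out[0] << " " << out[1] << "\n"; // prints: 0 3
}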
TEST_CASE(globalavgpool_test)
{
migraphx::program p;
......
#include "verify_program.hpp"
#include <migraphx/program.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/make_op.hpp>
struct test_gathernd_batch_dims_1 : verify_program<test_gathernd_batch_dims_1>
{
migraphx::program create_program() const
{
migraphx::program p;
auto* mm = p.get_main_module();
migraphx::shape ds{migraphx::shape::float_type, {2, 3, 2, 3}};
migraphx::shape is{migraphx::shape::int64_type, {2, 3, 2}};
std::vector<int64_t> indices{1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0};
auto a0 = mm->add_parameter("data", ds);
auto a1 = mm->add_literal(migraphx::literal{is, indices});
int batch_dims = 1;
mm->add_instruction(migraphx::make_op("gathernd", {{"batch_dims", batch_dims}}), a0, a1);
return p;
}
};