Commit 4e3ca586 authored by Khalique

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into broadcast_attr

parents 1775c5ad 31b2c735
@@ -5,6 +5,7 @@
 #include <migraphx/operators.hpp>
 #include <migraphx/shape_for_each.hpp>
 #include <migraphx/iterator_for.hpp>
+#include <migraphx/par_dfor.hpp>
 #include <migraphx/cpu/gemm.hpp>
 #include <unordered_map>
 #include <utility>
@@ -72,7 +73,7 @@ struct cpu_batch_norm_inference
         visit_all(output, input, mini_batch_mean, mini_batch_variance, arg_gamma, arg_bias)(
             [&](auto result, auto buffer, auto mean, auto variance, auto gamma, auto bias) {
-                dfor(num_batch, num_channels, image_height, image_width)(
+                par_dfor(num_batch, num_channels, image_height, image_width)(
                     [&](std::size_t n, std::size_t c, std::size_t h, std::size_t w) {
                         assert((variance(c) + epsilon) > 0);
                         result(n, c, h, w) = gamma(c) * (buffer(n, c, h, w) - mean(c)) /
@@ -87,7 +88,7 @@ struct cpu_batch_norm_inference
         visit_all(output, input, mini_batch_mean, mini_batch_mean, arg_gamma, arg_bias)(
             [&](auto result, auto buffer, auto mean, auto variance, auto gamma, auto bias) {
-                dfor(num_batch, num_channels, image_height, image_width)(
+                par_dfor(num_batch, num_channels, image_height, image_width)(
                     [&](std::size_t n, std::size_t c, std::size_t h, std::size_t w) {
                         assert((variance(c, h, w) + epsilon) > 0);
                         result(n, c, h, w) = gamma(c, h, w) *
@@ -112,28 +113,33 @@ struct cpu_convolution
     {
         argument result{output_shape};
         visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
-            auto in_h = input.get_shape().lens()[2];
-            auto in_w = input.get_shape().lens()[3];
+            auto in = input.get_shape().lens();
+            auto in_h = in[2];
+            auto in_w = in[3];

-            auto wei_c = weights.get_shape().lens()[1];
-            auto wei_h = weights.get_shape().lens()[2];
-            auto wei_w = weights.get_shape().lens()[3];
+            auto wei = weights.get_shape().lens();
+            auto wei_n = wei[0];
+            auto wei_c = wei[1];
+            auto wei_h = wei[2];
+            auto wei_w = wei[3];

-            dfor(output_shape.lens()[0],
+            par_dfor(output_shape.lens()[0],
                  output_shape.lens()[1],
                  output_shape.lens()[2],
                  output_shape.lens()[3])(
                 [&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
                     const int start_x = i * op.stride[0] - op.padding[0];
                     const int start_y = j * op.stride[1] - op.padding[1];
+                    const int group_id = w / (wei_n / op.group);

                     double acc = 0;
                     dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
                         const int in_x = start_x + x;
                         const int in_y = start_y + y;
+                        const int in_ch = group_id * wei_c + k;
                         if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
                         {
-                            acc += input(o, k, in_x, in_y) * weights(w, k, x, y);
+                            acc += input(o, in_ch, in_x, in_y) * weights(w, k, x, y);
                         }
                     });
                     output(o, w, i, j) = acc;
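Note on the grouped-convolution indexing introduced above: the output channels are split into op.group contiguous blocks, and each block reads only its own slice of input channels. A standalone sketch of the mapping (hypothetical helper, not part of the commit):

#include <cstddef>

// Maps an output channel (w) and weight channel (k) to the input channel a
// grouped convolution reads. Assumes wei_n is divisible by the group count,
// as the kernel above does.
std::size_t grouped_input_channel(
    std::size_t w, std::size_t k, std::size_t wei_n, std::size_t wei_c, std::size_t group)
{
    const std::size_t group_id = w / (wei_n / group); // which group w belongs to
    return group_id * wei_c + k;                      // channel inside that group's input slice
}

With group == 1, group_id is always 0 and in_ch reduces to k, recovering the old behavior; in the depthwise case exercised by test_group_conv below (4 output channels, 4 groups, wei_c == 1), output channel w reads only input channel w.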
@@ -240,7 +246,7 @@ struct cpu_pooling
             auto in_h = input.get_shape().lens()[2];
             auto in_w = input.get_shape().lens()[3];

-            dfor(output_shape.lens()[0],
+            par_dfor(output_shape.lens()[0],
                  output_shape.lens()[1],
                  output_shape.lens()[2],
                  output_shape.lens()[3])(
@@ -299,24 +305,7 @@ struct cpu_concat
     shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
     argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
-        argument result{output_shape};
-        std::vector<std::size_t> coffsets = op.compute_offsets(output_shape, args);
-        for(std::size_t l = 0; l < args.size(); l++)
-        {
-            auto argl = args[l];
-            std::size_t nelements = argl.get_shape().elements();
-            visit_all(result, argl)([&](auto output, auto input) {
-                auto slice_shape =
-                    shape{output_shape.type(), input.get_shape().lens(), output_shape.strides()};
-                auto slice = make_view(slice_shape, output.data() + coffsets[l]);
-                // cppcheck-suppress useStlAlgorithm
-                for(std::size_t i = 0; i < nelements; i++)
-                {
-                    slice[i] = input[i];
-                }
-            });
-        }
-        return result;
+        return op.compute(output_shape, std::move(args));
     }
 };
@@ -334,6 +323,18 @@ struct cpu_gemm
     }
 };

+struct cpu_gather
+{
+    op::gather op;
+    std::string name() const { return "cpu::gather"; }
+    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
+    {
+        return op.compute(output_shape, std::move(args));
+    }
+};
+
 struct identity_op
 {
     std::string name() const { return "cpu::identity"; }
@@ -663,6 +664,7 @@ struct cpu_apply
         extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
         apply_map["contiguous"] = extend_op<cpu_contiguous, op::contiguous>();
         apply_map["concat"] = extend_op<cpu_concat, op::concat>();
+        apply_map["gather"] = extend_op<cpu_gather, op::gather>();
         apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
         apply_map["elu"] = extend_op<cpu_unary<elu_op>, op::elu>();
         apply_map["identity"] = simple_op<cpu_unary<identity_op>>();
...
@@ -28,6 +28,7 @@ add_library(migraphx_device
     device/contiguous.cpp
     device/mul.cpp
    device/concat.cpp
+    device/gather.cpp
 )
 set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
 rocm_clang_tidy_check(migraphx_device)
@@ -56,6 +57,7 @@ add_library(migraphx_gpu
     sigmoid.cpp
     abs.cpp
     elu.cpp
+    gather.cpp
 )
 set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
 rocm_clang_tidy_check(migraphx_gpu)
...
@@ -41,11 +41,11 @@ argument miopen_convolution::compute(context& ctx,
 shape miopen_convolution::compile(context& ctx,
                                   const shape& output_shape,
-                                  std::vector<instruction_ref> inputs)
+                                  std::vector<shape> inputs)
 {
     shape workspace_shape{};
-    auto x_desc = make_tensor(inputs[0]->get_shape());
-    auto w_desc = make_tensor(inputs[1]->get_shape());
+    auto x_desc = make_tensor(inputs[0]);
+    auto w_desc = make_tensor(inputs[1]);
     auto y_desc = make_tensor(output_shape);

     std::size_t workspace_size = 0;
@@ -57,8 +57,8 @@ shape miopen_convolution::compile(context& ctx,
                                      &workspace_size);
     workspace_shape = shape{shape::int8_type, {workspace_size}};

-    auto x = to_gpu(generate_argument(inputs[0]->get_shape()));
-    auto w = to_gpu(generate_argument(inputs[1]->get_shape()));
+    auto x = to_gpu(generate_argument(inputs[0]));
+    auto w = to_gpu(generate_argument(inputs[1]));
     auto y = allocate_gpu(output_shape);
     auto workspace = allocate_gpu(workspace_shape);
@@ -80,10 +80,21 @@ shape miopen_convolution::compile(context& ctx,
                                  false);
     if(status != miopenStatusSuccess)
         MIGRAPHX_THROW("Find convolution failed");
+    handle = ctx.get_stream().get_miopen();
     algo = perf.fwd_algo;
     return shape{shape::int8_type, {perf.memory}};
 }

+void miopen_convolution::finalize(context& ctx,
+                                  const shape& output_shape,
+                                  std::vector<shape> inputs)
+{
+    if(handle == ctx.get_stream().get_miopen())
+        return;
+    // TODO: Check that workspace hasn't changed
+    compile(ctx, output_shape, std::move(inputs));
+}
+
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
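compile now takes input shapes instead of instruction_refs, records the MIOpen handle it tuned against, and finalize re-runs the tuning only when the program is later finalized against a different handle. A generic, runnable sketch of that caching pattern (hypothetical types; the real op stores a miopenHandle_t):

#include <cstdio>

struct cached_plan
{
    void* handle = nullptr; // handle the plan was last compiled against

    void compile(void* h) { handle = h; /* expensive algorithm search here */ }

    void finalize(void* h)
    {
        if(handle == h)
            return; // already tuned for this handle
        compile(h); // handle changed: tune again
    }
};

int main()
{
    cached_plan plan;
    int stream_a = 0;
    int stream_b = 0;
    plan.compile(&stream_a);
    plan.finalize(&stream_a); // no-op, same handle
    plan.finalize(&stream_b); // recompiles for the new handle
    std::printf("handle now %p\n", plan.handle);
    return 0;
}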
#include <migraphx/shape.hpp>
#include <migraphx/argument.hpp>
#include <migraphx/gpu/device/gather.hpp>
#include <migraphx/gpu/device/tensor.hpp>
#include <migraphx/gpu/device/launch.hpp>
#include <migraphx/gpu/device/types.hpp>
#include <migraphx/gpu/hip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
argument gather(hipStream_t stream,
const migraphx::shape& output_shape,
std::vector<migraphx::argument> args,
std::size_t axis)
{
visit_all(args.back(), args[0])([&](auto output, auto input) {
std::size_t nelements = output_shape.elements();
args[1].visit([&](auto indices) {
visit_tensor_size(output_shape.lens().size(), [&](auto ndim) {
const auto* indices_ptr = device_cast(indices.data());
auto* outptr = device_cast(output.data());
const auto* inptr = device_cast(input.data());
hip_tensor_descriptor<ndim> desc_input(input.get_shape());
hip_tensor_descriptor<ndim> desc_output(output.get_shape());
gs_launch(stream, nelements)([=](auto i) {
auto lens = desc_output.multi(i);
lens[axis] = indices_ptr[lens[axis]];
outptr[i] = inptr[desc_input.linear(lens)];
});
});
});
});
return args.back();
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
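The kernel above takes each linear output index, expands it to a multi-index, replaces the coordinate along axis with the looked-up entry from indices, and reads the input at the remapped location. The same semantics on a row-major 2-D matrix, as a plain CPU sketch (illustrative helper, not part of the commit; assumes every index is in range):

#include <cstddef>
#include <vector>

// out keeps the data's shape except along `axis`, which becomes the number
// of indices; coordinates along `axis` are remapped through `indices`,
// mirroring `lens[axis] = indices_ptr[lens[axis]]` in the HIP kernel.
std::vector<float> gather2d(const std::vector<float>& data,
                            std::size_t rows,
                            std::size_t cols,
                            const std::vector<int>& indices,
                            std::size_t axis)
{
    const std::size_t out_rows = (axis == 0) ? indices.size() : rows;
    const std::size_t out_cols = (axis == 1) ? indices.size() : cols;
    std::vector<float> out(out_rows * out_cols);
    for(std::size_t i = 0; i < out_rows; i++)
    {
        for(std::size_t j = 0; j < out_cols; j++)
        {
            const std::size_t r = (axis == 0) ? std::size_t(indices[i]) : i;
            const std::size_t c = (axis == 1) ? std::size_t(indices[j]) : j;
            out[i * out_cols + j] = data[r * cols + c];
        }
    }
    return out;
}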
@@ -14,9 +14,6 @@ namespace gpu {

 void eliminate_workspace::apply(program& p) const
 {
-    if(!enabled(MIGRAPHX_DISABLE_MEMORY_COLORING{}))
-        return;
-
     std::size_t n = 0;
     std::vector<instruction_ref> allocs;
     for(auto ins : iterator_for(p))
@@ -32,12 +29,15 @@ void eliminate_workspace::apply(program& p) const
             allocs.push_back(ins);
         }
     }
-    auto ws = p.add_parameter("workspace", shape{shape::int8_type, {n}});
-    for(auto&& a : allocs)
+    if(n > 0)
     {
-        p.replace_instruction(a, ws);
-        p.remove_instruction(a);
+        auto ws = p.add_parameter("workspace", shape{shape::int8_type, {n}});
+        for(auto&& a : allocs)
+        {
+            p.replace_instruction(a, ws);
+            p.remove_instruction(a);
+        }
     }
 }

 } // namespace gpu
...
@@ -137,6 +137,8 @@ MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
     auto wei = ins->inputs().at(1)->get_shape();
     assert(wei.lens().size() == 4);
     auto conv = any_cast<miopen_convolution>(ins->get_operator());
+    if(conv.op.group > 1)
+        return false;
     if(wei.lens()[1] > 512 and conv.algo != miopenConvolutionFwdAlgoWinograd)
         return false;
     auto op = conv.op;
@@ -272,11 +274,8 @@ struct miopen_conv_bias
         return f.execute(ctx, fargs, args[0], args[4]);
     }

-    shape compile(context& ctx)
-    {
-        f.compile(ctx);
-        return f.get_workspace(ctx);
-    }
+    void finalize(context& ctx, const shape&, const std::vector<shape>&) { f.compile(ctx); }
+    shape get_workspace(context& ctx) { return f.get_workspace(ctx); }

     int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
 };
@@ -316,12 +315,8 @@ struct miopen_conv_bias_relu
         miopenSetOpArgsActivForward(fargs.get(), relu, &alpha, &beta, 0, 0, 0);
         return f.execute(ctx, fargs, args[0], args[4]);
     }

-    shape compile(context& ctx)
-    {
-        f.compile(ctx);
-        return f.get_workspace(ctx);
-    }
+    void finalize(context& ctx, const shape&, const std::vector<shape>&) { f.compile(ctx); }
+    shape get_workspace(context& ctx) { return f.get_workspace(ctx); }

     int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
 };
@@ -348,7 +343,7 @@ void apply_conv_bias(context& ctx, program& p, match::matcher_result r)
     Op cb{conv_op, input_ins->get_shape(), weights_ins->get_shape(), bias_ins->get_shape()};

     // TODO: Insert ws allocation
-    auto ws = cb.compile(ctx);
+    auto ws = cb.get_workspace(ctx);

     p.replace_instruction(ins, cb, input_ins, weights_ins, old_ws_ins, bias_ins, alloc_ins);
 }
...
#include <migraphx/gpu/gather.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/device/concat.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_gather::compute_shape(std::vector<shape> inputs) const
{
inputs.pop_back();
return op.compute_shape(inputs);
}
argument hip_gather::compute(context& ctx,
const shape& output_shape,
const std::vector<argument>& args) const
{
return device::gather(ctx.get_stream().get(), output_shape, args, op.axis);
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
@@ -27,6 +27,7 @@ struct miopen_convolution
     op::convolution op;
     shared<convolution_descriptor> cd;
     miopenConvFwdAlgorithm_t algo{};
+    miopenHandle_t handle = nullptr;

     template <class Self, class F>
     static auto reflect(Self& self, F f)
@@ -39,7 +40,8 @@ struct miopen_convolution
     shape compute_shape(const std::vector<shape>& inputs) const;
     argument
     compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    shape compile(context& ctx, const shape& output_shape, std::vector<instruction_ref> inputs);
+    shape compile(context& ctx, const shape& output_shape, std::vector<shape> inputs);
+    void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
     int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
 };
...
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_GATHER_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_GATHER_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
argument gather(hipStream_t stream,
const migraphx::shape& output_shape,
std::vector<migraphx::argument> args,
std::size_t axis);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_RTGLIB_GATHER_HPP
#define MIGRAPHX_GUARD_RTGLIB_GATHER_HPP
#include <migraphx/gpu/lowering.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/generate.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/gpu/device/contiguous.hpp>
#include <migraphx/gpu/device/gather.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/gpu/rocblas.hpp>
#include <migraphx/gpu/context.hpp>
#include <utility>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct hip_gather
{
op::gather op;
std::string name() const { return "gpu::gather"; }
shape compute_shape(std::vector<shape> inputs) const;
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
@@ -54,14 +54,19 @@ inline tensor_descriptor make_tensor(const migraphx::shape& s)
 inline convolution_descriptor make_conv(const migraphx::op::convolution& op)
 {
     auto c = make_obj<convolution_descriptor>(&miopenCreateConvolutionDescriptor);
+    miopenConvolutionMode_t c_mode = miopenConvolution;
+    if(op.group > 1)
+        c_mode = miopenGroupConv;
     miopenInitConvolutionDescriptor(c.get(),
-                                    miopenConvolution,
+                                    c_mode,
                                     op.padding[0],
                                     op.padding[1],
                                     op.stride[0],
                                     op.stride[1],
                                     op.dilation[0],
                                     op.dilation[1]);
+    if(op.group > 1)
+        miopenSetConvolutionGroupCount(c.get(), op.group);
     return c;
 }
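A short usage sketch of the helper above (op fields as in migraphx::op::convolution; the group count is assumed to divide both the input and output channel counts):

// Build a descriptor for a 4-group convolution; make_conv selects
// miopenGroupConv and sets the group count on the descriptor.
migraphx::op::convolution op;
op.group = 4;
auto cd = make_conv(op);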
...
@@ -40,8 +40,10 @@
 #include <migraphx/gpu/pooling.hpp>
 #include <migraphx/gpu/gemm.hpp>
 #include <migraphx/gpu/concat.hpp>
+#include <migraphx/gpu/gather.hpp>
 #include <utility>
 #include <functional>
+#include <algorithm>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -89,7 +91,7 @@ struct miopen_apply
         add_extend_op<miopen_contiguous, op::contiguous>("contiguous");
         add_extend_op<hip_concat, op::concat>("concat");
         add_extend_op<miopen_softmax, op::softmax>("softmax");
+        add_extend_op<hip_gather, op::gather>("gather");

         add_convolution_op();
         add_pooling_op();
         add_batch_norm_inference_op();
@@ -128,7 +130,7 @@ struct miopen_apply
         auto&& op = any_cast<op::convolution>(ins->get_operator());
         auto conv = miopen_convolution{op, make_conv(op)};
-        auto ws = conv.compile(ctx, ins->get_shape(), ins->inputs());
+        auto ws = conv.compile(ctx, ins->get_shape(), to_shapes(ins->inputs()));

         auto workspace = insert_allocation(ins, ws, "workspace");
         auto output = insert_allocation(ins, ins->get_shape());
...
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <sstream>
#include "test.hpp"
#include <basic_ops.hpp>
struct sum_cf_op
{
std::string name() const { return "sum_cf"; }
migraphx::argument compute(const migraphx::shape&, std::vector<migraphx::argument> args) const
{
migraphx::argument result;
if(args.size() != 2)
MIGRAPHX_THROW("Wrong args");
if(args[0].get_shape() != args[1].get_shape())
MIGRAPHX_THROW("Wrong args");
if(args[0].get_shape().lens().size() != 1)
MIGRAPHX_THROW("Wrong args");
if(args[0].get_shape().lens().front() != 1)
MIGRAPHX_THROW("Wrong args");
args[0].visit_at([&](auto x) {
args[1].visit_at([&](auto y) { result = migraphx::literal{x + y}.get_argument(); });
});
return result;
}
migraphx::shape compute_shape(std::vector<migraphx::shape> inputs) const
{
if(inputs.size() != 2)
MIGRAPHX_THROW("Wrong inputs");
return inputs.front();
}
};
struct non_computable_cf
{
std::string name() const { return "non_computable"; }
migraphx::shape compute_shape(std::vector<migraphx::shape> inputs) const
{
if(inputs.empty())
return {};
return inputs.front();
}
};
struct test_context
{
void finish() const {}
};
TEST_CASE(literal_test)
{
migraphx::program p;
auto lit = p.add_literal(1);
CHECK(lit->eval() == migraphx::literal{1});
}
TEST_CASE(param_test)
{
migraphx::program p;
auto lit = p.add_parameter("param", migraphx::shape{migraphx::shape::float_type, {1}});
CHECK(lit->eval().empty());
}
TEST_CASE(op_test1)
{
migraphx::program p;
auto one = p.add_literal(1);
auto two = p.add_literal(2);
auto sum = p.add_instruction(sum_cf_op{}, one, two);
CHECK(sum->eval() == migraphx::literal{3});
}
TEST_CASE(op_test2)
{
migraphx::program p;
auto x = p.add_parameter("param", migraphx::shape{migraphx::shape::float_type, {1}});
auto two = p.add_literal(2);
auto sum = p.add_instruction(sum_cf_op{}, x, two);
CHECK(sum->eval().empty());
}
TEST_CASE(op_test3)
{
migraphx::program p;
auto one = p.add_literal(1);
auto two = p.add_literal(2);
auto sum1 = p.add_instruction(sum_op{}, one, two);
auto sum2 = p.add_instruction(sum_cf_op{}, sum1, two);
CHECK(sum2->eval().empty());
}
TEST_CASE(compute_op_c)
{
migraphx::operation op = sum_op{};
auto one = migraphx::literal{1}.get_argument();
auto two = migraphx::literal{2}.get_argument();
EXPECT(test::throws([&] {
op.compute(migraphx::shape{migraphx::shape::float_type, {1}}, {one, two});
}));
}
TEST_CASE(compute_nop_c)
{
migraphx::operation op = non_computable_cf{};
auto one = migraphx::literal{1}.get_argument();
auto two = migraphx::literal{2}.get_argument();
EXPECT(test::throws([&] {
op.compute(migraphx::shape{migraphx::shape::float_type, {1}}, {one, two});
}));
}
TEST_CASE(compute_nop_context)
{
migraphx::operation op = non_computable_cf{};
auto one = migraphx::literal{1}.get_argument();
auto two = migraphx::literal{2}.get_argument();
migraphx::context ctx = test_context{};
EXPECT(test::throws([&] {
op.compute(ctx, migraphx::shape{migraphx::shape::float_type, {1}}, {one, two});
}));
}
int main(int argc, const char* argv[]) { test::run(argc, argv); }
@@ -101,6 +101,49 @@ TEST_CASE(concat_test)
     }
 }

+TEST_CASE(gather_test)
+{
+    {
+        migraphx::program p;
+        std::vector<float> data(3 * 3);
+        std::iota(data.begin(), data.end(), 0.5);
+        migraphx::shape s{migraphx::shape::float_type, {3, 3}};
+        auto a0 = p.add_literal(migraphx::literal{s, data});
+        migraphx::shape s_indices{migraphx::shape::int32_type, {1, 2}};
+        std::vector<int> indices{0, 2};
+        auto a1 = p.add_literal(migraphx::literal{s_indices, indices});
+        std::size_t axis = 0;
+        p.add_instruction(migraphx::op::gather{axis}, a0, a1);
+        p.compile(migraphx::cpu::target{});
+        auto result = p.eval({});
+        std::vector<float> res_data(4 * 5);
+        std::vector<float> golden = {0.5f, 1.5f, 2.5f, 6.5f, 7.5f, 8.5f};
+        result.visit([&](auto output) { res_data.assign(output.begin(), output.end()); });
+        EXPECT(migraphx::verify_range(res_data, golden));
+    }
+
+    {
+        migraphx::program p;
+        std::vector<float> data(3 * 3);
+        std::iota(data.begin(), data.end(), 0.5);
+        migraphx::shape s{migraphx::shape::float_type, {3, 3}};
+        auto a0 = p.add_literal(migraphx::literal{s, data});
+        migraphx::shape s_indices{migraphx::shape::int32_type, {1, 2}};
+        std::vector<int> indices{0, 2};
+        auto a1 = p.add_literal(migraphx::literal{s_indices, indices});
+        std::size_t axis = 1;
+        p.add_instruction(migraphx::op::gather{axis}, a0, a1);
+        p.compile(migraphx::cpu::target{});
+        auto result = p.eval({});
+        std::vector<float> res_data(4 * 5);
+        std::vector<float> golden = {0.5f, 2.5f, 3.5f, 5.5f, 6.5f, 8.5f};
+        result.visit([&](auto output) { res_data.assign(output.begin(), output.end()); });
+        EXPECT(migraphx::verify_range(res_data, golden));
+    }
+}
+
 TEST_CASE(squeeze_test)
 {
     {
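The golden vectors follow directly from the gather semantics: the data literal is the row-major 3x3 matrix 0.5..8.5, so axis 0 with indices {0, 2} selects rows 0 and 2, while axis 1 selects columns 0 and 2. Checked with the gather2d reference sketch from the device gather section (a hypothetical helper, not part of the test suite):

std::vector<float> data = {0.5f, 1.5f, 2.5f, 3.5f, 4.5f, 5.5f, 6.5f, 7.5f, 8.5f};
auto rows = gather2d(data, 3, 3, {0, 2}, 0); // {0.5, 1.5, 2.5, 6.5, 7.5, 8.5}
auto cols = gather2d(data, 3, 3, {0, 2}, 1); // {0.5, 2.5, 3.5, 5.5, 6.5, 8.5}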
...
@@ -63,9 +63,9 @@ struct allocate
     }
 };

-struct fred_op
+struct simple_op
 {
-    std::string name() const { return "fred_op"; }
+    std::string name() const { return "simple_op"; }
     migraphx::shape compute_shape(const std::vector<migraphx::shape>& inputs) const
     {
         migraphx::check_shapes{inputs}.has(1);
@@ -77,44 +77,127 @@ struct fred_op
     {
         return args.at(0);
     }
+    int output_alias(const std::vector<migraphx::shape>&) const { return 0; }
 };

+template <class... Ts>
+migraphx::shape create_shape(Ts... xs)
+{
+    return migraphx::shape{migraphx::shape::float_type, {std::size_t(xs)...}};
+}
+
+using load = migraphx::op::load;
+using identity = migraphx::op::identity;
+
+TEST_CASE(simple)
+{
+    auto create_test_program = [] {
+        migraphx::program p;
+        auto a1 = p.add_instruction(allocate{create_shape(1)});
+        auto p1 = p.add_instruction(simple_op{}, a1);
+        auto a2 = p.add_instruction(allocate{create_shape(1)});
+        auto p2 = p.add_instruction(simple_op{}, a2);
+        std::size_t axis = 0;
+        auto a3 = p.add_instruction(allocate{create_shape(2)});
+        p.add_instruction(concat(axis), p1, p2, a3);
+        return p;
+    };
+    auto create_control_program = [] {
+        migraphx::program p;
+        auto a1 = p.add_instruction(allocate{create_shape(2)});
+        auto l1 = p.add_instruction(load{create_shape(1), 0}, a1);
+        auto p1 = p.add_instruction(simple_op{}, l1);
+        auto l2 = p.add_instruction(load{create_shape(1), 4}, a1);
+        auto p2 = p.add_instruction(simple_op{}, l2);
+        p.add_instruction(identity{}, a1, p1, p2);
+        return p;
+    };
+
+    auto p1 = create_test_program();
+    auto p2 = create_control_program();
+    p1.compile(eliminate_concat_target{});
+    EXPECT(p1 == p2);
+}
+
+TEST_CASE(nested)
+{
+    auto concat_test_program = [](auto& p) {
+        auto a1 = p.add_instruction(allocate{create_shape(1)});
+        auto p1 = p.add_instruction(simple_op{}, a1);
+        auto a2 = p.add_instruction(allocate{create_shape(1)});
+        auto p2 = p.add_instruction(simple_op{}, a2);
+        std::size_t axis = 0;
+        auto a3 = p.add_instruction(allocate{create_shape(2)});
+        return p.add_instruction(concat(axis), p1, p2, a3);
+    };
+    auto create_test_program = [&] {
+        migraphx::program p;
+        auto concat1 = concat_test_program(p);
+        auto concat2 = concat_test_program(p);
+        std::size_t axis = 0;
+        auto a1 = p.add_instruction(allocate{create_shape(4)});
+        p.add_instruction(concat(axis), concat1, concat2, a1);
+        return p;
+    };
+    auto concat_control_program = [](auto& p, auto a1) {
+        auto l1 = p.add_instruction(load{create_shape(1), 0}, a1);
+        auto p1 = p.add_instruction(simple_op{}, l1);
+        auto l2 = p.add_instruction(load{create_shape(1), 4}, a1);
+        auto p2 = p.add_instruction(simple_op{}, l2);
+        return p.add_instruction(identity{}, a1, p1, p2);
+    };
+    auto create_control_program = [&] {
+        migraphx::program p;
+        auto a1 = p.add_instruction(allocate{create_shape(4)});
+        auto l1 = p.add_instruction(load{create_shape(2), 0}, a1);
+        auto concat1 = concat_control_program(p, l1);
+        auto l2 = p.add_instruction(load{create_shape(2), 8}, a1);
+        auto concat2 = concat_control_program(p, l2);
+        p.add_instruction(identity{}, a1, concat1, concat2);
+        return p;
+    };
+
+    auto p1 = create_test_program();
+    auto p2 = create_control_program();
+    p1.compile(eliminate_concat_target{});
+    EXPECT(p1 == p2);
+}
+
 TEST_CASE(basic)
 {
-    auto create_test_program = []() {
+    auto create_test_program = [] {
         migraphx::program p;
         auto a1 =
             p.add_instruction(allocate{migraphx::shape{migraphx::shape::float_type, {1, 2, 8, 8}}});
-        auto p1 = p.add_instruction(fred_op{}, a1);
+        auto p1 = p.add_instruction(simple_op{}, a1);
         auto a2 =
             p.add_instruction(allocate{migraphx::shape{migraphx::shape::float_type, {1, 3, 8, 8}}});
-        auto p2 = p.add_instruction(fred_op{}, a2);
+        auto p2 = p.add_instruction(simple_op{}, a2);
         auto a3 =
             p.add_instruction(allocate{migraphx::shape{migraphx::shape::float_type, {1, 5, 8, 8}}});
-        auto p3 = p.add_instruction(fred_op{}, a3);
+        auto p3 = p.add_instruction(simple_op{}, a3);
         std::size_t axis = 1;
         auto a4 = p.add_instruction(
             allocate{migraphx::shape{migraphx::shape::float_type, {1, 10, 8, 8}}});
         p.add_instruction(concat(axis), p1, p2, p3, a4);
         return p;
     };
-    auto create_control_program = []() {
+    auto create_control_program = [] {
         migraphx::program p;
         auto a1 = p.add_instruction(
             allocate{migraphx::shape{migraphx::shape::float_type, {1, 10, 8, 8}}});
         auto l1 = p.add_instruction(
-            migraphx::op::load{migraphx::shape{migraphx::shape::float_type, {1, 2, 8, 8}}, 0},
-            {a1});
-        auto p1 = p.add_instruction(fred_op{}, l1);
+            load{migraphx::shape{migraphx::shape::float_type, {1, 2, 8, 8}}, 0}, {a1});
+        auto p1 = p.add_instruction(simple_op{}, l1);
         auto l2 = p.add_instruction(
-            migraphx::op::load{migraphx::shape{migraphx::shape::float_type, {1, 3, 8, 8}}, 512},
-            {a1});
-        auto p2 = p.add_instruction(fred_op{}, l2);
+            load{migraphx::shape{migraphx::shape::float_type, {1, 3, 8, 8}}, 512}, {a1});
+        auto p2 = p.add_instruction(simple_op{}, l2);
         auto l3 = p.add_instruction(
-            migraphx::op::load{migraphx::shape{migraphx::shape::float_type, {1, 5, 8, 8}}, 1280},
-            {a1});
-        auto p3 = p.add_instruction(fred_op{}, l3);
-        p.add_instruction(migraphx::op::identity{}, {a1, p1, p2, p3});
+            load{migraphx::shape{migraphx::shape::float_type, {1, 5, 8, 8}}, 1280}, {a1});
+        auto p3 = p.add_instruction(simple_op{}, l3);
+        p.add_instruction(identity{}, {a1, p1, p2, p3});
         return p;
     };
@@ -127,34 +210,34 @@ TEST_CASE(basic)

 TEST_CASE(wont_work)
 {
-    auto create_test_program = []() {
+    auto create_test_program = [] {
         migraphx::program p;
         auto a1 =
             p.add_instruction(allocate{migraphx::shape{migraphx::shape::float_type, {2, 2, 8, 8}}});
-        auto p1 = p.add_instruction(fred_op{}, a1);
+        auto p1 = p.add_instruction(simple_op{}, a1);
         auto a2 =
             p.add_instruction(allocate{migraphx::shape{migraphx::shape::float_type, {2, 3, 8, 8}}});
-        auto p2 = p.add_instruction(fred_op{}, a2);
+        auto p2 = p.add_instruction(simple_op{}, a2);
         auto a3 =
             p.add_instruction(allocate{migraphx::shape{migraphx::shape::float_type, {2, 5, 8, 8}}});
-        auto p3 = p.add_instruction(fred_op{}, a3);
+        auto p3 = p.add_instruction(simple_op{}, a3);
         std::size_t axis = 1;
         auto a4 = p.add_instruction(
             allocate{migraphx::shape{migraphx::shape::float_type, {2, 10, 8, 8}}});
         p.add_instruction(concat(axis), p1, p2, p3, a4);
         return p;
     };
-    auto create_control_program = []() {
+    auto create_control_program = [] {
         migraphx::program p;
         auto a1 =
             p.add_instruction(allocate{migraphx::shape{migraphx::shape::float_type, {2, 2, 8, 8}}});
-        auto p1 = p.add_instruction(fred_op{}, a1);
+        auto p1 = p.add_instruction(simple_op{}, a1);
         auto a2 =
             p.add_instruction(allocate{migraphx::shape{migraphx::shape::float_type, {2, 3, 8, 8}}});
-        auto p2 = p.add_instruction(fred_op{}, a2);
+        auto p2 = p.add_instruction(simple_op{}, a2);
         auto a3 =
             p.add_instruction(allocate{migraphx::shape{migraphx::shape::float_type, {2, 5, 8, 8}}});
-        auto p3 = p.add_instruction(fred_op{}, a3);
+        auto p3 = p.add_instruction(simple_op{}, a3);
         std::size_t axis = 1;
         auto a4 = p.add_instruction(
             allocate{migraphx::shape{migraphx::shape::float_type, {2, 10, 8, 8}}});
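The byte offsets in the control programs come straight from the block sizes (float_type elements are 4 bytes): in TEST_CASE(basic) the {1, 2, 8, 8} slice occupies 2 * 8 * 8 * 4 = 512 bytes at offset 0, so the next loads land at 512 and 512 + 3 * 8 * 8 * 4 = 1280.

// Offsets used by the basic control program, derived from element counts:
std::size_t o1 = 0;                  // start of the {1, 2, 8, 8} block
std::size_t o2 = o1 + 2 * 8 * 8 * 4; // 512: start of the {1, 3, 8, 8} block
std::size_t o3 = o2 + 3 * 8 * 8 * 4; // 1280: start of the {1, 5, 8, 8} block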
...
@@ -8,9 +8,57 @@

 struct id_target
 {
+    struct context
+    {
+        void finish() const {}
+    };
+    migraphx::context ctx = context{};
+
     std::string name() const { return "id"; }
     std::vector<migraphx::pass> get_passes(migraphx::context&) const { return {}; }
-    migraphx::context get_context() const { return {}; }
+    migraphx::context get_context() const { return ctx; }
 };

+struct id_ctx_op
+{
+    std::string name() const { return "id_ctx_op"; }
+    migraphx::argument
+    compute(id_target::context&, const migraphx::shape&, std::vector<migraphx::argument> args) const
+    {
+        if(args.empty())
+            return {};
+        return args.front();
+    }
+    migraphx::shape compute_shape(std::vector<migraphx::shape> inputs) const
+    {
+        if(inputs.empty())
+            return {};
+        return inputs.front();
+    }
+    int output_alias(const std::vector<migraphx::shape>&) const { return 0; }
+};
+
+struct id_ctx_final_op
+{
+    std::string name() const { return "id_ctx_final_op"; }
+    migraphx::argument compute(const migraphx::shape&, std::vector<migraphx::argument> args) const
+    {
+        if(args.empty())
+            return {};
+        return args.front();
+    }
+    void finalize(id_target::context&, const migraphx::shape&, const std::vector<migraphx::shape>&)
+    {
+    }
+    migraphx::shape compute_shape(std::vector<migraphx::shape> inputs) const
+    {
+        if(inputs.empty())
+            return {};
+        return inputs.front();
+    }
+    int output_alias(const std::vector<migraphx::shape>&) const { return 0; }
+};
+
 struct reverse_pass
@@ -224,4 +272,52 @@ TEST_CASE(double_reverse_target_test)
     EXPECT(result != migraphx::literal{4});
 }

+// Check that the program doesn't modify the context directly; only the
+// operators modify the context
+TEST_CASE(eval_context1)
+{
+    migraphx::program p;
+    id_target t{};
+    EXPECT(is_shared(t.ctx, t.get_context()));
+    auto one = p.add_literal(1);
+    auto two = p.add_literal(2);
+    p.add_instruction(sum_op{}, one, two);
+    p.compile(t);
+    EXPECT(is_shared(t.ctx, p.get_context()));
+    p.eval({});
+    EXPECT(is_shared(t.ctx, p.get_context()));
+}
+
+TEST_CASE(eval_context2)
+{
+    migraphx::program p;
+    id_target t{};
+    EXPECT(is_shared(t.ctx, t.get_context()));
+    auto one = p.add_literal(1);
+    auto two = p.add_literal(2);
+    p.add_instruction(id_ctx_op{}, one, two);
+    p.compile(t);
+    EXPECT(is_shared(t.ctx, p.get_context()));
+    p.eval({});
+    // id_ctx_op will modify the context
+    EXPECT(not is_shared(t.ctx, p.get_context()));
+}
+
+TEST_CASE(eval_context3)
+{
+    migraphx::program p;
+    id_target t{};
+    EXPECT(is_shared(t.ctx, t.get_context()));
+    auto one = p.add_literal(1);
+    auto two = p.add_literal(2);
+    p.add_instruction(id_ctx_final_op{}, one, two);
+    p.compile(t);
+    // The finalizer will modify the context
+    EXPECT(not is_shared(t.ctx, p.get_context()));
+    auto ctx = p.get_context();
+    p.eval({});
+    EXPECT(is_shared(ctx, p.get_context()));
+    EXPECT(not is_shared(t.ctx, p.get_context()));
+}
+
 int main(int argc, const char* argv[]) { test::run(argc, argv); }
@@ -534,6 +534,22 @@ struct test_conv2
     }
 };

+struct test_group_conv
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        auto input =
+            p.add_parameter("x", migraphx::shape{migraphx::shape::float_type, {1, 4, 16, 16}});
+        auto weights =
+            p.add_parameter("w", migraphx::shape{migraphx::shape::float_type, {4, 1, 3, 3}});
+        migraphx::op::convolution op;
+        op.group = 4;
+        p.add_instruction(op, input, weights);
+        return p;
+    }
+};
+
 struct test_conv_relu
 {
     migraphx::program create_program() const
@@ -918,6 +934,22 @@ struct test_concat_relu
     }
 };

+struct test_gather
+{
+    migraphx::program create_program() const
+    {
+        migraphx::program p;
+        migraphx::shape s{migraphx::shape::float_type, {3, 3}};
+        migraphx::shape s_indices{migraphx::shape::int32_type, {2, 2}};
+        std::vector<int> indices{1, 2, 2, 1};
+        auto a0 = p.add_parameter("data", s);
+        auto a1 = p.add_literal(migraphx::literal{s_indices, indices});
+        std::size_t axis = 0;
+        p.add_instruction(migraphx::op::gather{axis}, a0, a1);
+        return p;
+    }
+};
+
 void manual_identity()
 {
     migraphx::program p;
@@ -1034,6 +1066,7 @@ int main()
     verify_program<test_softmax2>();
     verify_program<test_conv>();
     verify_program<test_conv2>();
+    verify_program<test_group_conv>();
     verify_program<test_conv_relu>();
     verify_program<test_conv_relu_half>();
     verify_program<test_add_relu>();
...
@@ -224,7 +224,13 @@ inline void run(int argc, const char* argv[])
         std::unordered_map<std::string, std::function<void()>> m(get_test_cases().begin(),
                                                                  get_test_cases().end());
         for(auto&& name : cases)
-            run_test_case(name, m[name]);
+        {
+            auto f = m.find(name);
+            if(f == m.end())
+                std::cout << "[ ERROR ] Test case '" << name << "' not found." << std::endl;
+            else
+                run_test_case(name, f->second);
+        }
     }
 }
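The old lookup m[name] would default-construct an empty std::function for an unknown test name and then invoke it, throwing std::bad_function_call; the new code reports the missing case instead. A minimal standalone illustration of the pitfall:

#include <functional>
#include <string>
#include <unordered_map>

int main()
{
    std::unordered_map<std::string, std::function<void()>> m;
    // m["missing"] would insert an empty std::function, and calling it throws
    // std::bad_function_call; find() avoids both the insertion and the call.
    auto f = m.find("missing");
    return f == m.end() ? 0 : 1;
}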