Merge from develop

bc5d7f75 · Paul · 47c0854d · a5b0afa0 · bc5d7f75 · bc5d7f75
Commit bc5d7f75 authored Feb 15, 2019 by Paul
20 changed files
--- a/src/targets/gpu/device/exp.cpp
+++ b/src/targets/gpu/device/exp.cpp
+#include <migraphx/gpu/device/exp.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+#include <migraphx/gpu/device/types.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// Runs ::exp over `arg` through the nary helper on `stream`, with `result`
+// passed as the output argument. Each value goes through to_hip_type first,
+// which returns gpu_half values as float and every other type unchanged.
+void exp(hipStream_t stream, const argument& result, const argument& arg)
+{
+    auto apply_exp = [](auto x) { return ::exp(to_hip_type(x)); };
+    nary(stream, result, arg)(apply_exp);
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/gather.cpp
+++ b/src/targets/gpu/device/gather.cpp
+#include <migraphx/shape.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/gpu/device/gather.hpp>
+#include <migraphx/gpu/device/tensor.hpp>
+#include <migraphx/gpu/device/launch.hpp>
+#include <migraphx/gpu/device/types.hpp>
+#include <migraphx/gpu/hip.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// Gathers elements of args[0] along `axis` using the indices in args[1],
+// writing into args.back(), which is also the returned argument.
+// A negative `axis` counts back from the number of dimensions of output_shape.
+// For every linear index i over output_shape's elements, the kernel takes the
+// output multi-index, replaces its `axis` coordinate with the index value
+// stored at that coordinate, and reads the input at the resulting position.
+argument gather(hipStream_t stream,
+                const migraphx::shape& output_shape,
+                std::vector<migraphx::argument> args,
+                int axis)
+{
+    // Normalize the axis in signed arithmetic; adding an int to the unsigned
+    // lens().size() directly would rely on wraparound and a narrowing conversion.
+    const int ndims      = static_cast<int>(output_shape.lens().size());
+    const int axis_index = (axis < 0) ? (axis + ndims) : axis;
+    visit_all(args.back(), args[0])([&](auto output, auto input) {
+        std::size_t nelements = output_shape.elements();
+        args[1].visit([&](auto indices) {
+            visit_tensor_size(output_shape.lens().size(), [&](auto ndim) {
+                const auto* indices_ptr = device_cast(indices.data());
+                auto* outptr            = device_cast(output.data());
+                const auto* inptr       = device_cast(input.data());
+                hip_tensor_descriptor<ndim> desc_input(input.get_shape());
+                hip_tensor_descriptor<ndim> desc_output(output.get_shape());
+                gs_launch(stream, nelements)([=](auto i) {
+                    // Output coordinates, with the axis coordinate redirected
+                    // through the indices buffer to address the input.
+                    auto lens        = desc_output.multi(i);
+                    lens[axis_index] = indices_ptr[lens[axis_index]];
+                    outptr[i]        = inptr[desc_input.linear(lens)];
+                });
+            });
+        });
+    });
+    return args.back();
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/include/migraph/gpu/device/launch.hpp
+++ b/src/targets/gpu/device/include/migraph/gpu/device/launch.hpp
-#ifndef MIGRAPH_GUARD_RTGLIB_DEVICE_LAUNCH_HPP
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_LAUNCH_HPP
-#define MIGRAPH_GUARD_RTGLIB_DEVICE_LAUNCH_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_LAUNCH_HPP
 #include <hip/hip_runtime.h>
-#include <migraph/config.hpp>
+#include <migraphx/config.hpp>
-namespace migraph {
+namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
@@ -53,14 +53,14 @@ inline auto gs_launch(hipStream_t stream, std::size_t n, std::size_t local = 102
 // Workaround hcc's broken tile_static macro
 #ifdef tile_static
 #undef tile_static
-#define MIGRAPH_DEVICE_SHARED __attribute__((tile_static))
+#define MIGRAPHX_DEVICE_SHARED __attribute__((tile_static))
 #else
-#define MIGRAPH_DEVICE_SHARED __shared__
+#define MIGRAPHX_DEVICE_SHARED __shared__
 #endif
 } // namespace device
 } // namespace gpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraph
+} // namespace migraphx
 #endif
--- a/src/targets/gpu/device/include/migraph/gpu/device/nary.hpp
+++ b/src/targets/gpu/device/include/migraph/gpu/device/nary.hpp
-#ifndef MIGRAPH_GUARD_RTGLIB_DEVICE_NARY_HPP
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_NARY_HPP
-#define MIGRAPH_GUARD_RTGLIB_DEVICE_NARY_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEVICE_NARY_HPP
-#include <migraph/gpu/device/tensor.hpp>
+#include <migraphx/gpu/device/tensor.hpp>
-#include <migraph/gpu/device/launch.hpp>
+#include <migraphx/gpu/device/launch.hpp>
-#include <migraph/gpu/device/types.hpp>
+#include <migraphx/gpu/device/types.hpp>
-#include <migraph/functional.hpp>
+#include <migraphx/functional.hpp>
-#include <migraph/ranges.hpp>
+#include <migraphx/ranges.hpp>
-#include <migraph/config.hpp>
+#include <migraphx/config.hpp>
-namespace migraph {
+namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
@@ -87,7 +87,7 @@ void trinary_broadcast_vec_impl(hipStream_t stream,
        const std::size_t bdim_vec_len = bdim_len / vec_size;
        launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
-            MIGRAPH_DEVICE_SHARED vec4<type> buffer[2048 / vec_size];
+            MIGRAPHX_DEVICE_SHARED vec4<type> buffer[2048 / vec_size];
            // Load bias into LDS
            for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
            {
@@ -144,7 +144,7 @@ void trinary_broadcast_impl(hipStream_t stream,
        const std::size_t n       = output.size();
        launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
-            MIGRAPH_DEVICE_SHARED type buffer[2048];
+            MIGRAPHX_DEVICE_SHARED type buffer[2048];
            // Load bias into LDS
            for(size_t i = idx.local; i < bdim_len; i += nlocal)
            {
@@ -192,7 +192,7 @@ void binary_broadcast_vec_impl(
        const std::size_t bdim_vec_len = bdim_len / vec_size;
        launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
-            MIGRAPH_DEVICE_SHARED vec4<type> buffer[2048 / vec_size];
+            MIGRAPHX_DEVICE_SHARED vec4<type> buffer[2048 / vec_size];
            // Load bias into LDS
            for(size_t i = idx.local; i < bdim_vec_len; i += nlocal)
            {
@@ -243,7 +243,7 @@ void binary_broadcast_impl(
        const std::size_t n       = output.size();
        launch(stream, nglobal, nlocal)([=](auto idx) __device__ {
-            MIGRAPH_DEVICE_SHARED type buffer[2048];
+            MIGRAPHX_DEVICE_SHARED type buffer[2048];
            // Load bias into LDS
            for(size_t i = idx.local; i < bdim_len; i += nlocal)
            {
@@ -313,6 +313,12 @@ void nary_impl(hipStream_t stream, F f, argument result, Arguments... args)
        nary_nonstandard_impl(stream, f, result, args...);
 }
+// Overload taking only the result argument (no inputs): the call is forwarded
+// directly to nary_standard_impl.
+template <class F>
+void nary_impl(hipStream_t stream, F f, argument result)
+{
+    nary_standard_impl(stream, f, result);
+}
 template <class... Arguments>
 auto nary_nonstandard(hipStream_t stream, argument result, Arguments... args)
 {
@@ -396,7 +402,7 @@ inline auto nary(hipStream_t stream,
 } // namespace device
 } // namespace gpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraph
+} // namespace migraphx
 #endif
--- a/src/targets/gpu/device/include/migraph/gpu/device/tensor.hpp
+++ b/src/targets/gpu/device/include/migraph/gpu/device/tensor.hpp
-#ifndef MIGRAPH_GUARD_RTGLIB_DEAVICE_TENSOR_HPP
+#ifndef MIGRAPHX_GUARD_RTGLIB_DEAVICE_TENSOR_HPP
-#define MIGRAPH_GUARD_RTGLIB_DEAVICE_TENSOR_HPP
+#define MIGRAPHX_GUARD_RTGLIB_DEAVICE_TENSOR_HPP
 #include <hip/hip_runtime.h>
-#include <migraph/functional.hpp>
+#include <migraphx/functional.hpp>
-#include <migraph/config.hpp>
+#include <migraphx/config.hpp>
-namespace migraph {
+namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
@@ -87,7 +87,7 @@ struct hip_tensor_descriptor
 } // namespace device
 } // namespace gpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraph
+} // namespace migraphx
 #endif
--- a/src/targets/gpu/device/include/migraph/gpu/device/types.hpp
+++ b/src/targets/gpu/device/include/migraph/gpu/device/types.hpp
@@ -5,14 +5,14 @@
    file LICENSE_1_0.txt or copy at http://www.boost.org/LICENSE_1_0.txt)
 ==============================================================================*/
-#ifndef MIGRAPH_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
+#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
-#define MIGRAPH_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
+#define MIGRAPHX_GUARD_RTGLIB_GPU_DEVICE_TYPES_HPP
-#include <migraph/half.hpp>
+#include <migraphx/half.hpp>
-#include <migraph/config.hpp>
+#include <migraphx/config.hpp>
-namespace migraph {
+namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
@@ -75,9 +75,18 @@ device_type<T>* device_cast(T* x)
    return reinterpret_cast<device_type<T>*>(x);
 }
+// Generic case: the value is returned unchanged, with its type preserved.
+template <class T>
+T to_hip_type(T x)
+{
+    return x;
+}
+// HIP doesn't support __fp16, so gpu_half values are returned as float
+inline float to_hip_type(gpu_half x) { return x; }
 } // namespace device
 } // namespace gpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraph
+} // namespace migraphx
 #endif
--- a/src/targets/gpu/device/log.cpp
+++ b/src/targets/gpu/device/log.cpp
+#include <migraphx/gpu/device/log.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+#include <migraphx/gpu/device/types.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// Runs ::log over `arg` through the nary helper on `stream`, with `result`
+// passed as the output argument. Each value goes through to_hip_type first,
+// which returns gpu_half values as float and every other type unchanged.
+void log(hipStream_t stream, const argument& result, const argument& arg)
+{
+    auto apply_log = [](auto x) { return ::log(to_hip_type(x)); };
+    nary(stream, result, arg)(apply_log);
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/max.cpp
+++ b/src/targets/gpu/device/max.cpp
+#include <migraphx/gpu/device/max.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+#include <migraphx/gpu/device/types.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// Combines `arg1` and `arg2` through the nary helper on `stream` using
+// std::max, with `result` passed as the output argument. Both operands go
+// through to_hip_type before being compared.
+void max(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
+{
+    auto pick_larger = [](auto x, auto y) {
+        return std::max(to_hip_type(x), to_hip_type(y));
+    };
+    nary(stream, result, arg1, arg2)(pick_larger);
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/min.cpp
+++ b/src/targets/gpu/device/min.cpp
+#include <migraphx/gpu/device/min.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+#include <migraphx/gpu/device/types.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// Combines `arg1` and `arg2` through the nary helper on `stream` using
+// std::min, with `result` passed as the output argument. Both operands go
+// through to_hip_type before being compared.
+void min(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
+{
+    auto pick_smaller = [](auto x, auto y) {
+        return std::min(to_hip_type(x), to_hip_type(y));
+    };
+    nary(stream, result, arg1, arg2)(pick_smaller);
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/mul.cpp
+++ b/src/targets/gpu/device/mul.cpp
-#include <migraph/gpu/device/mul.hpp>
+#include <migraphx/gpu/device/mul.hpp>
-#include <migraph/gpu/device/nary.hpp>
+#include <migraphx/gpu/device/nary.hpp>
-namespace migraph {
+namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {
@@ -22,5 +22,5 @@ void mul(hipStream_t stream,
 } // namespace device
 } // namespace gpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraph
+} // namespace migraphx
--- a/src/targets/gpu/device/pad.cpp
+++ b/src/targets/gpu/device/pad.cpp
+#include <migraphx/shape.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+#include <migraphx/gpu/device/pad.hpp>
+#include <migraphx/gpu/device/tensor.hpp>
+#include <migraphx/gpu/device/launch.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// Copies `arg1` into `result` at a position shifted by the leading entries of
+// `pads`, after `result` has been run through nary with a functor that returns
+// `value`. Returns `result`.
+argument
+pad(hipStream_t stream, argument result, argument arg1, float value, std::vector<std::int64_t> pads)
+{
+    std::size_t nelements = arg1.get_shape().elements();
+    // Presumably initializes every element of result to the pad value before the
+    // input is copied in (nary is defined elsewhere) - confirm against nary.
+    nary(stream, result)([=] { return value; });
+    visit_all(result, arg1)([&](auto output, auto input) {
+        visit_tensor_size(result.get_shape().lens().size(), [&](auto ndim) {
+            // Only the first ndim entries of pads are used, one offset per dimension.
+            std::size_t offsets[ndim];
+            std::copy(pads.begin(), pads.begin() + ndim, offsets);
+            auto* outptr      = output.data();
+            const auto* inptr = input.data();
+            hip_tensor_descriptor<ndim> desc_input(input.get_shape());
+            hip_tensor_descriptor<ndim> desc_output(output.get_shape());
+            // Launched over the number of elements of the input (arg1).
+            gs_launch(stream, nelements)([=](auto i) {
+                // Multi-index of input element i, shifted by the per-dimension offsets
+                // to obtain the destination position in the output.
+                auto idx = desc_input.multi(i);
+                for(std::size_t j = 0; j < ndim; j++)
+                {
+                    idx[j] += offsets[j];
+                }
+                outptr[desc_output.linear(idx)] = inptr[i];
+            });
+        });
+    });
+    return result;
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/sin.cpp
+++ b/src/targets/gpu/device/sin.cpp
+#include <migraphx/gpu/device/sin.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+#include <migraphx/gpu/device/types.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// Runs ::sin over `arg` through the nary helper on `stream`, with `result`
+// passed as the output argument. Each value goes through to_hip_type first,
+// which returns gpu_half values as float and every other type unchanged.
+void sin(hipStream_t stream, const argument& result, const argument& arg)
+{
+    auto apply_sin = [](auto x) { return ::sin(to_hip_type(x)); };
+    nary(stream, result, arg)(apply_sin);
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/sinh.cpp
+++ b/src/targets/gpu/device/sinh.cpp
+#include <migraphx/gpu/device/sinh.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+#include <migraphx/gpu/device/types.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// Runs ::sinh over `arg` through the nary helper on `stream`, with `result`
+// passed as the output argument. Each value goes through to_hip_type first,
+// which returns gpu_half values as float and every other type unchanged.
+void sinh(hipStream_t stream, const argument& result, const argument& arg)
+{
+    auto apply_sinh = [](auto x) { return ::sinh(to_hip_type(x)); };
+    nary(stream, result, arg)(apply_sinh);
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/sub.cpp
+++ b/src/targets/gpu/device/sub.cpp
+#include <migraphx/gpu/device/sub.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// Combines `arg1` and `arg2` through the nary helper on `stream`, with
+// `result` passed as the output argument.
+// NOTE(review): the functor returns y - x, i.e. its second parameter minus its
+// first - confirm this matches the operand order used by the callers.
+void sub(hipStream_t stream, const argument& result, const argument& arg1, const argument& arg2)
+{
+    auto subtract = [](auto x, auto y) { return y - x; };
+    nary(stream, result, arg1, arg2)(subtract);
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/tan.cpp
+++ b/src/targets/gpu/device/tan.cpp
+#include <migraphx/gpu/device/tan.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+#include <migraphx/gpu/device/types.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+// Runs ::tan over `arg` through the nary helper on `stream`, with `result`
+// passed as the output argument. Each value goes through to_hip_type first,
+// which returns gpu_half values as float and every other type unchanged.
+void tan(hipStream_t stream, const argument& result, const argument& arg)
+{
+    auto apply_tan = [](auto x) { return ::tan(to_hip_type(x)); };
+    nary(stream, result, arg)(apply_tan);
+}
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/eliminate_workspace.cpp
+++ b/src/targets/gpu/eliminate_workspace.cpp
-#include <migraph/gpu/eliminate_workspace.hpp>
+#include <migraphx/gpu/eliminate_workspace.hpp>
-#include <migraph/gpu/hip.hpp>
+#include <migraphx/gpu/hip.hpp>
-#include <migraph/program.hpp>
+#include <migraphx/program.hpp>
-#include <migraph/instruction.hpp>
+#include <migraphx/instruction.hpp>
-#include <migraph/operators.hpp>
+#include <migraphx/operators.hpp>
-#include <migraph/iterator_for.hpp>
+#include <migraphx/iterator_for.hpp>
-#include <migraph/ranges.hpp>
+#include <migraphx/ranges.hpp>
-#include <migraph/stringutils.hpp>
+#include <migraphx/stringutils.hpp>
-#include <migraph/pass_config.hpp>
+#include <migraphx/pass_config.hpp>
-namespace migraph {
+namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 void eliminate_workspace::apply(program& p) const
 {
-    if(!enabled(MIGRAPH_DISABLE_MEMORY_COLORING{}))
-        return;
    std::size_t n = 0;
    std::vector<instruction_ref> allocs;
    for(auto ins : iterator_for(p))
@@ -32,14 +29,17 @@ void eliminate_workspace::apply(program& p) const
            allocs.push_back(ins);
        }
    }
-    auto ws = p.add_parameter("workspace", shape{shape::int8_type, {n}});
+    if(n > 0)
-    for(auto&& a : allocs)
    {
-        p.replace_instruction(a, ws);
+        auto ws = p.add_parameter("workspace", shape{shape::int8_type, {n}});
-        p.remove_instruction(a);
+        for(auto&& a : allocs)
+        {
+            p.replace_instruction(a, ws);
+            p.remove_instruction(a);
+        }
    }
 }
 } // namespace gpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraph
+} // namespace migraphx
--- a/src/targets/gpu/elu.cpp
+++ b/src/targets/gpu/elu.cpp
+#include <migraphx/gpu/elu.hpp>
+#include <migraphx/operators.hpp>
+#include <migraphx/manage_ptr.hpp>
+#include <migraphx/gpu/miopen.hpp>
+#include <utility>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+// Validates that exactly two non-broadcasted shapes are supplied and returns
+// the second one as the output shape.
+shape miopen_elu::compute_shape(const std::vector<shape>& inputs) const
+{
+    check_shapes{inputs, *this}.has(2).not_broadcasted();
+    const shape& result_shape = inputs.at(1);
+    return result_shape;
+}
+// Runs the activation described by `ad` through miopenActivationForward.
+// args[0] is the input and args[1] is the output buffer, which is returned.
+// alpha = 1 and beta = 0 are the scaling factors handed to MIOpen.
+// Throws via MIGRAPHX_THROW if MIOpen does not report success.
+argument miopen_elu::compute(context& ctx,
+                             const shape& output_shape,
+                             const std::vector<argument>& args) const
+{
+    float alpha = 1;
+    float beta  = 0;
+    auto x_desc = make_tensor(args[0].get_shape());
+    auto y_desc = make_tensor(output_shape);
+    auto status = miopenActivationForward(ctx.get_stream().get_miopen(),
+                                          ad.get(),
+                                          &alpha,
+                                          x_desc.get(),
+                                          args[0].implicit(),
+                                          &beta,
+                                          y_desc.get(),
+                                          args[1].implicit());
+    // Report a failed call instead of silently returning an unwritten buffer.
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("Running elu activation failed");
+    return args[1];
+}
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
-#include <migraph/gpu/fuse_ops.hpp>
+#include <migraphx/gpu/fuse_ops.hpp>
-#include <migraph/matcher.hpp>
+#include <migraphx/matcher.hpp>
-#include <migraph/gpu/miopen.hpp>
+#include <migraphx/gpu/miopen.hpp>
-#include <migraph/gpu/convolution.hpp>
+#include <migraphx/gpu/convolution.hpp>
-#include <migraph/gpu/device/add_relu.hpp>
+#include <migraphx/gpu/device/add_relu.hpp>
-#include <migraph/instruction.hpp>
+#include <migraphx/instruction.hpp>
-namespace migraph {
+namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 struct fusion
@@ -38,7 +38,7 @@ struct fusion
        op_t result;
        auto status = miopenFusionPlanGetOp(fp.get(), i, &result);
        if(status != miopenStatusSuccess)
-            MIGRAPH_THROW("Failed retrieving operator at " + std::to_string(i));
+            MIGRAPHX_THROW("Failed retrieving operator at " + std::to_string(i));
        return result;
    }
@@ -51,7 +51,7 @@ struct fusion
        auto t      = keep_alive(make_tensor(b));
        auto status = miopenCreateOpBiasForward(fp.get(), &result, t.get());
        if(status != miopenStatusSuccess)
-            MIGRAPH_THROW("Creating operator failed");
+            MIGRAPHX_THROW("Creating operator failed");
        return result;
    }
@@ -60,7 +60,7 @@ struct fusion
        op_t result;
        auto status = miopenCreateOpActivationForward(fp.get(), &result, miopenActivationRELU);
        if(status != miopenStatusSuccess)
-            MIGRAPH_THROW("Creating operator failed");
+            MIGRAPHX_THROW("Creating operator failed");
        return result;
    }
@@ -71,7 +71,7 @@ struct fusion
        auto t      = keep_alive(make_tensor(weights));
        auto status = miopenCreateOpConvForward(fp.get(), &result, cd.get(), t.get());
        if(status != miopenStatusSuccess)
-            MIGRAPH_THROW("Creating operator failed");
+            MIGRAPHX_THROW("Creating operator failed");
        return result;
    }
@@ -91,7 +91,7 @@ struct fusion
    {
        auto status = miopenCompileFusionPlan(ctx.get_stream().get_miopen(), fp.get());
        if(status != miopenStatusSuccess)
-            MIGRAPH_THROW("Compiling fusion plan failed");
+            MIGRAPHX_THROW("Compiling fusion plan failed");
    }
    argument execute(context& ctx,
@@ -109,12 +109,12 @@ struct fusion
                                              y.implicit(),
                                              fargs.get());
        if(status != miopenStatusSuccess)
-            MIGRAPH_THROW("Failed to execute fusion plan");
+            MIGRAPHX_THROW("Failed to execute fusion plan");
        return y;
    }
 };
-MIGRAPH_PRED_MATCHER(bias_shape, instruction_ref ins)
+MIGRAPHX_PRED_MATCHER(bias_shape, instruction_ref ins)
 {
    auto&& s = ins->get_shape();
    return s.broadcasted() and s.strides().size() == 4 and s.strides()[0] == 0 and
@@ -128,7 +128,7 @@ std::array<T, sizeof...(Ts) + 1> make_array(T x, Ts... xs)
    return {std::move(x), std::move(static_cast<T>(xs))...};
 }
-MIGRAPH_PRED_MATCHER(fusable_conv, instruction_ref ins)
+MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
 {
    if(ins->name() != "gpu::convolution")
        return false;
@@ -137,6 +137,8 @@ MIGRAPH_PRED_MATCHER(fusable_conv, instruction_ref ins)
    auto wei = ins->inputs().at(1)->get_shape();
    assert(wei.lens().size() == 4);
    auto conv = any_cast<miopen_convolution>(ins->get_operator());
+    if(conv.op.group > 1)
+        return false;
    if(wei.lens()[1] > 512 and conv.algo != miopenConvolutionFwdAlgoWinograd)
        return false;
    auto op = conv.op;
@@ -265,17 +267,15 @@ struct miopen_conv_bias
    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
    {
        auto fargs  = make_fused_args();
-        float alpha = 1, beta = 0;
+        float alpha = 1;
+        float beta  = 0;
        miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit());
        miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit());
        return f.execute(ctx, fargs, args[0], args[4]);
    }
-    shape compile(context& ctx)
+    void finalize(context& ctx, const shape&, const std::vector<shape>&) { f.compile(ctx); }
-    {
+    shape get_workspace(context& ctx) { return f.get_workspace(ctx); }
-        f.compile(ctx);
-        return f.get_workspace(ctx);
-    }
    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
 };
@@ -308,18 +308,15 @@ struct miopen_conv_bias_relu
    argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
    {
        auto fargs  = make_fused_args();
-        float alpha = 1, beta = 0;
+        float alpha = 1;
+        float beta  = 0;
        miopenSetOpArgsConvForward(fargs.get(), conv, &alpha, &beta, args[1].implicit());
        miopenSetOpArgsBiasForward(fargs.get(), bias, &alpha, &beta, args[3].implicit());
        miopenSetOpArgsActivForward(fargs.get(), relu, &alpha, &beta, 0, 0, 0);
        return f.execute(ctx, fargs, args[0], args[4]);
    }
+    void finalize(context& ctx, const shape&, const std::vector<shape>&) { f.compile(ctx); }
-    shape compile(context& ctx)
+    shape get_workspace(context& ctx) { return f.get_workspace(ctx); }
-    {
-        f.compile(ctx);
-        return f.get_workspace(ctx);
-    }
    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
 };
@@ -346,8 +343,8 @@ void apply_conv_bias(context& ctx, program& p, match::matcher_result r)
    Op cb{conv_op, input_ins->get_shape(), weights_ins->get_shape(), bias_ins->get_shape()};
    // TODO: Insert ws allocation
-    auto ws = cb.compile(ctx);
+    auto ws = cb.get_workspace(ctx);
+    (void)ws;
    p.replace_instruction(ins, cb, input_ins, weights_ins, old_ws_ins, bias_ins, alloc_ins);
 }
@@ -389,5 +386,5 @@ void fuse_ops::apply(program& p) const
 }
 } // namespace gpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraph
+} // namespace migraphx
--- a/src/targets/gpu/gather.cpp
+++ b/src/targets/gpu/gather.cpp
+#include <migraphx/gpu/gather.hpp>
+#include <migraphx/operators.hpp>
+#include <migraphx/manage_ptr.hpp>
+#include <migraphx/gpu/miopen.hpp>
+#include <migraphx/gpu/device/concat.hpp>
+#include <utility>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+// Drops the last input shape and delegates to the wrapped op's compute_shape.
+// NOTE(review): the trailing entry is presumably the preallocated output
+// buffer - confirm against the lowering code.
+shape hip_gather::compute_shape(std::vector<shape> inputs) const
+{
+    inputs.pop_back();
+    auto gathered_shape = op.compute_shape(inputs);
+    return gathered_shape;
+}
+// Forwards to device::gather on the context's stream, using the axis stored
+// in the wrapped op, and returns whatever device::gather returns.
+argument hip_gather::compute(context& ctx,
+                             const shape& output_shape,
+                             const std::vector<argument>& args) const
+{
+    auto stream = ctx.get_stream().get();
+    return device::gather(stream, output_shape, args, op.axis);
+}
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/gemm.cpp
+++ b/src/targets/gpu/gemm.cpp
-#include <migraph/gpu/gemm.hpp>
+#include <migraphx/gpu/gemm.hpp>
-#include <migraph/operators.hpp>
+#include <migraphx/operators.hpp>
-#include <migraph/manage_ptr.hpp>
+#include <migraphx/manage_ptr.hpp>
-#include <migraph/gpu/miopen.hpp>
+#include <migraphx/gpu/miopen.hpp>
 #include <utility>
-namespace migraph {
+namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 template <class... Ts>
@@ -29,7 +29,7 @@ void generic_rocblas_gemm(shape::as<half>, Ts&&... xs)
 template <class T, class... Ts>
 void generic_rocblas_gemm(shape::as<T>, Ts&&...)
 {
-    MIGRAPH_THROW("Type unsupported by rocblas");
+    MIGRAPHX_THROW("Type unsupported by rocblas");
 }
 template <class T>
@@ -107,9 +107,10 @@ argument miopen_gemm::compute(context& ctx,
                             ldc);
    });
    return args[2];
 }
 } // namespace gpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraph
+} // namespace migraphx