gaoqiong / MIGraphX / Commits / a5c1c7f6

Unverified commit a5c1c7f6, authored Feb 10, 2019 by Paul Fultz II; committed by GitHub on Feb 10, 2019.

    Merge branch 'develop' into mem_color_ordering_fix

Parents: 462a4920, d516b099

Changes: 303 files in the full commit; this page shows 20 changed files with 374 additions and 162 deletions (+374, -162).
src/targets/cpu/gemm.cpp                            +2    -2
src/targets/cpu/include/migraphx/cpu/context.hpp    +4    -4
src/targets/cpu/include/migraphx/cpu/gemm.hpp       +4    -4
src/targets/cpu/include/migraphx/cpu/lowering.hpp   +4    -4
src/targets/cpu/include/migraphx/cpu/target.hpp     +4    -4
src/targets/cpu/lowering.cpp                        +181  -46
src/targets/cpu/target.cpp                          +9    -3
src/targets/gpu/CMakeLists.txt                      +22   -4
src/targets/gpu/abs.cpp                             +39   -0
src/targets/gpu/add.cpp                             +0    -55
src/targets/gpu/batchnorm.cpp                       +4    -3
src/targets/gpu/concat.cpp                          +2    -2
src/targets/gpu/contiguous.cpp                      +2    -2
src/targets/gpu/convolution.cpp                     +37   -23
src/targets/gpu/device/acos.cpp                     +18   -0
src/targets/gpu/device/add.cpp                      +2    -2
src/targets/gpu/device/add_relu.cpp                 +2    -2
src/targets/gpu/device/asin.cpp                     +18   -0
src/targets/gpu/device/atan.cpp                     +18   -0
src/targets/gpu/device/concat.cpp                   +2    -2
src/targets/cpu/gemm.cpp

@@ -4,7 +4,7 @@
 #include <blaze/math/CustomMatrix.h>

 namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace cpu {

 template <class T>
@@ -94,5 +94,5 @@ void migemm(
 }
 } // namespace cpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
src/targets/cpu/include/migraphx/cpu/context.hpp

-#ifndef MIGRAPH_GUARD_RTGLIB_CONTEXT_HPP
-#define MIGRAPH_GUARD_RTGLIB_CONTEXT_HPP
+#ifndef MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP
+#define MIGRAPHX_GUARD_RTGLIB_CONTEXT_HPP

 #include <migraphx/config.hpp>

 namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace cpu {

 struct context
@@ -13,7 +13,7 @@ struct context
 };

 } // namespace cpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

 #endif
src/targets/cpu/include/migraphx/cpu/gemm.hpp

-#ifndef MIGRAPH_GUARD_RTGLIB_CPU_GEMM_HPP
-#define MIGRAPH_GUARD_RTGLIB_CPU_GEMM_HPP
+#ifndef MIGRAPHX_GUARD_RTGLIB_CPU_GEMM_HPP
+#define MIGRAPHX_GUARD_RTGLIB_CPU_GEMM_HPP

 #include <migraphx/argument.hpp>
 #include <migraphx/config.hpp>

 namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace cpu {

 void migemm(const argument& c_arg,
             const argument& a_arg,
             const argument& b_arg,
             float alpha,
             float beta);

 } // namespace cpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

 #endif
src/targets/cpu/include/migraphx/cpu/lowering.hpp

-#ifndef MIGRAPH_GUARD_RTGLIB_CPU_LOWERING_HPP
-#define MIGRAPH_GUARD_RTGLIB_CPU_LOWERING_HPP
+#ifndef MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP
+#define MIGRAPHX_GUARD_RTGLIB_CPU_LOWERING_HPP

 #include <migraphx/program.hpp>
 #include <migraphx/config.hpp>

 namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace cpu {

 struct lowering
@@ -15,7 +15,7 @@ struct lowering
 };

 } // namespace cpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

 #endif
src/targets/cpu/include/migraphx/cpu/target.hpp

-#ifndef MIGRAPH_GUARD_MIGRAPHLIB_CPU_TARGET_HPP
-#define MIGRAPH_GUARD_MIGRAPHLIB_CPU_TARGET_HPP
+#ifndef MIGRAPHX_GUARD_MIGRAPHLIB_CPU_TARGET_HPP
+#define MIGRAPHX_GUARD_MIGRAPHLIB_CPU_TARGET_HPP

 #include <migraphx/program.hpp>
 #include <migraphx/cpu/context.hpp>
 #include <migraphx/config.hpp>

 namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace cpu {

 struct target
@@ -17,7 +17,7 @@ struct target
 };

 } // namespace cpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

 #endif
src/targets/cpu/lowering.cpp

@@ -5,12 +5,13 @@
 #include <migraphx/operators.hpp>
 #include <migraphx/shape_for_each.hpp>
 #include <migraphx/iterator_for.hpp>
+#include <migraphx/par_dfor.hpp>
 #include <migraphx/cpu/gemm.hpp>
 #include <unordered_map>
 #include <utility>

 namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace cpu {

 template <typename T>
@@ -19,6 +20,14 @@ T zero(const T&)
     return T(0);
 }

+template <class T>
+typename std::conditional_t<std::is_integral<T>{},
+                            std::make_signed<T>,
+                            std::enable_if<true, T>>::type
+make_signed(T x)
+{
+    return x;
+}
+
 //
 // cpu implemenataion of batch norm for inference
 //
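A note on the make_signed helper added above: std::abs has no overload for unsigned integer types (a call like std::abs(7u) is ambiguous between the int/long/double overloads), so the trait maps integral T to std::make_signed<T> and passes any other T, such as float, through via std::enable_if<true, T>. A self-contained sketch of the same trick; the test values are illustrative only:

#include <cstdlib>
#include <type_traits>

// Integral T -> its signed counterpart; non-integral T (e.g. float) -> T.
template <class T>
typename std::conditional_t<std::is_integral<T>{},
                            std::make_signed<T>,
                            std::enable_if<true, T>>::type
make_signed(T x)
{
    return x;
}

int main()
{
    unsigned int u = 7;              // std::abs(u) would be ambiguous
    auto s         = make_signed(u); // s has type int
    return std::abs(s) == 7 ? 0 : 1;
}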
@@ -64,7 +73,7 @@ struct cpu_batch_norm_inference
             visit_all(output, input, mini_batch_mean, mini_batch_variance, arg_gamma, arg_bias)(
                 [&](auto result, auto buffer, auto mean, auto variance, auto gamma, auto bias) {
-                    dfor(num_batch, num_channels, image_height, image_width)(
+                    par_dfor(num_batch, num_channels, image_height, image_width)(
                         [&](std::size_t n, std::size_t c, std::size_t h, std::size_t w) {
                             assert((variance(c) + epsilon) > 0);
                             result(n, c, h, w) = gamma(c) * (buffer(n, c, h, w) - mean(c)) /
@@ -79,7 +88,7 @@ struct cpu_batch_norm_inference
             visit_all(output, input, mini_batch_mean, mini_batch_mean, arg_gamma, arg_bias)(
                 [&](auto result, auto buffer, auto mean, auto variance, auto gamma, auto bias) {
-                    dfor(num_batch, num_channels, image_height, image_width)(
+                    par_dfor(num_batch, num_channels, image_height, image_width)(
                         [&](std::size_t n, std::size_t c, std::size_t h, std::size_t w) {
                             assert((variance(c, h, w) + epsilon) > 0);
                             result(n, c, h, w) = gamma(c, h, w) *
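The recurring change in this file is dfor(...) becoming par_dfor(...). Both helpers are curried: par_dfor(d0, d1, ...) returns a callable that takes the loop body, so par_dfor(n, c, h, w)([&](...) { ... }) runs the body over the whole index space. The diff does not show their definitions; the sketch below captures the assumed contract of the parallel variant for two dimensions (the name and the threading strategy are illustrative, not MIGraphX's actual implementation):

#include <algorithm>
#include <cstddef>
#include <thread>
#include <vector>

// Hypothetical par_dfor-style helper: run f(i, j) for every (i, j) in
// [0, n) x [0, m), striding the outer dimension across hardware threads.
template <class F>
void par_dfor_sketch(std::size_t n, std::size_t m, F f)
{
    std::size_t nthreads = std::max(1u, std::thread::hardware_concurrency());
    std::vector<std::thread> workers;
    for(std::size_t t = 0; t < nthreads; t++)
    {
        workers.emplace_back([=] {
            for(std::size_t i = t; i < n; i += nthreads)
                for(std::size_t j = 0; j < m; j++)
                    f(i, j);
        });
    }
    for(auto& w : workers)
        w.join();
}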
@@ -94,6 +103,43 @@ struct cpu_batch_norm_inference
     }
 };

+struct cpu_lrn
+{
+    op::lrn op;
+    std::string name() const { return "cpu::lrn"; }
+    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
+    argument compute(context&, shape output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        visit_all(result, args[0])([&](auto output, auto input) {
+            int n_batch         = output_shape.lens()[0];
+            int channels        = output_shape.lens()[1];
+            int height          = output_shape.lens()[2];
+            int width           = output_shape.lens()[3];
+            float alphaoverarea = op.alpha / op.size;
+            int radius          = (op.size - 1) / 2;
+
+            par_dfor(n_batch, height, width)([&](int b, int h, int w) {
+                float scale = 0;
+                dfor(channels)([&](int c) {
+                    auto start = (c - radius) < 0 ? 0 : (c - radius);
+                    auto end   = (c + radius) > channels ? channels : (c + radius);
+                    for(auto k = start; k < end; ++k)
+                    {
+                        scale += std::pow(input(b, k, h, w), 2);
+                    }
+                    scale *= alphaoverarea;
+                    scale += op.bias;
+                    scale = std::pow(scale, -op.beta);
+                    output(b, c, h, w) = input(b, c, h, w) * scale;
+                });
+            });
+        });
+        return result;
+    }
+};
+
 struct cpu_convolution
 {
     op::convolution op;
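cpu_lrn above is a CPU implementation of cross-channel local response normalization. With radius = (size - 1) / 2 and alphaoverarea = alpha / size as in the code, the standard per-element definition the variables correspond to is:

\mathrm{output}(b,c,h,w) \;=\; \mathrm{input}(b,c,h,w)\cdot
\Bigl(\mathrm{bias} \;+\; \tfrac{\alpha}{\mathrm{size}}
\sum_{k=\max(0,\,c-r)}^{\min(C,\,c+r)-1} \mathrm{input}(b,k,h,w)^{2}\Bigr)^{-\beta},
\qquad r = \tfrac{\mathrm{size}-1}{2}

Note that in the code as committed, scale is declared once per (b, h, w) and then updated inside the channel loop, so the bias/power steps apply cumulatively across channels rather than freshly per channel; the formula above gives the usual per-channel reading.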
@@ -104,28 +150,33 @@ struct cpu_convolution
     {
         argument result{output_shape};
         visit_all(result, args[0], args[1])([&](auto output, auto input, auto weights) {
-            auto in_h  = input.get_shape().lens()[2];
-            auto in_w  = input.get_shape().lens()[3];
-            auto wei_c = weights.get_shape().lens()[1];
-            auto wei_h = weights.get_shape().lens()[2];
-            auto wei_w = weights.get_shape().lens()[3];
-            dfor(output_shape.lens()[0],
-                 output_shape.lens()[1],
-                 output_shape.lens()[2],
-                 output_shape.lens()[3])(
+            auto in   = input.get_shape().lens();
+            auto in_h = in[2];
+            auto in_w = in[3];
+
+            auto wei   = weights.get_shape().lens();
+            auto wei_n = wei[0];
+            auto wei_c = wei[1];
+            auto wei_h = wei[2];
+            auto wei_w = wei[3];
+
+            par_dfor(output_shape.lens()[0],
+                     output_shape.lens()[1],
+                     output_shape.lens()[2],
+                     output_shape.lens()[3])(
                 [&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
-                    const int start_x = i * op.stride[0] - op.padding[0];
-                    const int start_y = j * op.stride[1] - op.padding[1];
+                    const int start_x  = i * op.stride[0] - op.padding[0];
+                    const int start_y  = j * op.stride[1] - op.padding[1];
+                    const int group_id = w / (wei_n / op.group);

                     double acc = 0;
                     dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
-                        const int in_x = start_x + x;
-                        const int in_y = start_y + y;
+                        const int in_x  = start_x + x;
+                        const int in_y  = start_y + y;
+                        const int in_ch = group_id * wei_c + k;
                         if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
                         {
-                            acc += input(o, k, in_x, in_y) * weights(w, k, x, y);
+                            acc += input(o, in_ch, in_x, in_y) * weights(w, k, x, y);
                         }
                     });
                     output(o, w, i, j) = acc;
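The convolution rewrite above adds grouped-convolution indexing: each output channel w belongs to group group_id = w / (wei_n / op.group), and weight channel k now reads input channel in_ch = group_id * wei_c + k rather than k itself. A worked example with hypothetical sizes:

// Hypothetical sizes: wei_n = 4 output channels, op.group = 2 groups,
// wei_c = 3 weight channels per group, hence 6 input channels in total.
//
//   w = 0, 1  ->  group_id = 0  ->  reads input channels 0, 1, 2
//   w = 2, 3  ->  group_id = 1  ->  reads input channels 3, 4, 5
//
// For w = 3, k = 1:  group_id = 3 / (4 / 2) = 1;  in_ch = 1 * 3 + 1 = 4.
// With op.group = 1 this reduces to in_ch = k, the previous behaviour.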
@@ -158,7 +209,8 @@ struct cpu_im2col
         const std::size_t& stride_h = op.stride[0];
         const std::size_t& stride_w = op.stride[1];

-        int kdiv2_h, kdiv2_w;
+        int kdiv2_h;
+        int kdiv2_w;
         kdiv2_h = kernel_h / 2;
         kdiv2_w = kernel_w / 2;
         // calculate output sizes
@@ -231,10 +283,10 @@ struct cpu_pooling
             auto in_h = input.get_shape().lens()[2];
             auto in_w = input.get_shape().lens()[3];

-            dfor(output_shape.lens()[0],
-                 output_shape.lens()[1],
-                 output_shape.lens()[2],
-                 output_shape.lens()[3])(
+            par_dfor(output_shape.lens()[0],
+                     output_shape.lens()[1],
+                     output_shape.lens()[2],
+                     output_shape.lens()[3])(
                 [&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
                     const int start_x0 = i * op.stride[0] - op.padding[0];
                     const int start_y0 = j * op.stride[1] - op.padding[1];
@@ -271,14 +323,33 @@ struct cpu_contiguous
     std::string name() const { return "cpu::contiguous"; }
     shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
     argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
-        argument result{output_shape};
-        visit_all(result, args[0])([&](auto output, auto input) {
-            shape_for_each(output.get_shape(), [&](const auto& idx) {
-                output(idx.begin(), idx.end()) = input(idx.begin(), idx.end());
-            });
-        });
-        return result;
+        return op.compute(output_shape, std::move(args));
+    }
+};
+
+struct cpu_pad
+{
+    op::pad op;
+    std::string name() const { return "cpu::contiguous"; }
+    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
+    {
+        assert(output_shape.standard());
+        argument result{output_shape};
+        result.visit([&](auto output) { std::fill(output.begin(), output.end(), op.value); });
+
+        visit_all(result, args[0])([&](auto output, auto input) {
+            shape_for_each(input.get_shape(), [&](const auto& idx) {
+                std::vector<std::size_t> new_idx(idx.size());
+                std::transform(
+                    idx.begin(), idx.end(), op.pads.begin(), new_idx.begin(), [](auto i, auto j) {
+                        return i + j;
+                    });
+                output(new_idx.begin(), new_idx.end()) = input(idx.begin(), idx.end());
+            });
+        });
+        return result;
     }
 };
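The new cpu_pad fills the whole output with op.value, then copies every input element to output position idx + pads, offsetting each coordinate by the leading pad of that dimension (the trailing pads are covered by the initial fill). A minimal 1-D sketch of the same rule; the values are hypothetical:

#include <cassert>
#include <cstddef>
#include <vector>

int main()
{
    std::vector<int> input = {5, 6};
    std::size_t pads_front = 2, pads_back = 1;
    int value = -1; // stand-in for op.value

    // Fill with the pad value, then shift-copy, as cpu_pad::compute does.
    std::vector<int> output(pads_front + input.size() + pads_back, value);
    for(std::size_t i = 0; i < input.size(); i++)
        output[i + pads_front] = input[i]; // new_idx = idx + pads

    assert((output == std::vector<int>{-1, -1, 5, 6, -1}));
    return 0;
}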
@@ -290,24 +361,7 @@ struct cpu_concat
     shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
     argument compute(context&, const shape& output_shape, std::vector<argument> args) const
     {
-        argument result{output_shape};
-        std::vector<std::size_t> coffsets = op.compute_offsets(output_shape, args);
-        for(std::size_t l = 0; l < args.size(); l++)
-        {
-            auto argl             = args[l];
-            std::size_t nelements = argl.get_shape().elements();
-            visit_all(result, argl)([&](auto output, auto input) {
-                auto slice_shape =
-                    shape{output_shape.type(), input.get_shape().lens(), output_shape.strides()};
-                auto slice = make_view(slice_shape, output.data() + coffsets[l]);
-                // cppcheck-suppress useStlAlgorithm
-                for(std::size_t i = 0; i < nelements; i++)
-                {
-                    slice[i] = input[i];
-                }
-            });
-        }
-        return result;
+        return op.compute(output_shape, std::move(args));
     }
 };
@@ -325,6 +379,18 @@ struct cpu_gemm
     }
 };

+struct cpu_gather
+{
+    op::gather op;
+    std::string name() const { return "cpu::gather"; }
+    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
+    {
+        return op.compute(output_shape, std::move(args));
+    }
+};
+
 struct identity_op
 {
     std::string name() const { return "cpu::identity"; }
@@ -339,7 +405,7 @@ struct abs_op
     std::string name() const { return "cpu::abs"; }
     auto fcn() const
     {
-        return [](auto x) { return std::abs(x); };
+        return [](auto x) { return std::abs(make_signed(x)); };
     }
 };
@@ -352,6 +418,15 @@ struct exp_op
     }
 };

+struct log_op
+{
+    std::string name() const { return "cpu::log"; }
+    auto fcn() const
+    {
+        return [](auto x) { return std::log(x); };
+    }
+};
+
 struct sin_op
 {
     std::string name() const { return "cpu::sin"; }
@@ -406,6 +481,24 @@ struct atan_op
     }
 };

+struct sinh_op
+{
+    std::string name() const { return "cpu::sinh"; }
+    auto fcn() const
+    {
+        return [](auto x) { return std::sinh(x); };
+    }
+};
+
+struct cosh_op
+{
+    std::string name() const { return "cpu::cosh"; }
+    auto fcn() const
+    {
+        return [](auto x) { return std::cosh(x); };
+    }
+};
+
 struct tanh_op
 {
     std::string name() const { return "cpu::tanh"; }
@@ -453,6 +546,17 @@ struct leaky_relu_op
     }
 };

+struct elu_op
+{
+    op::elu op;
+    std::string name() const { return "cpu::elu"; }
+    auto fcn() const
+    {
+        auto& a = op.alpha;
+        return [a](auto x) { return x > 0 ? x : a * std::expm1(x); };
+    }
+};
+
 template <typename Op>
 struct cpu_unary
 {
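elu_op captures op.alpha by value and implements the exponential linear unit; std::expm1(x) computes e^x - 1 with full precision near x = 0, where std::exp(x) - 1 would lose significant digits:

\mathrm{ELU}(x) = \begin{cases} x, & x > 0 \\ \alpha\,(e^{x}-1), & x \le 0 \end{cases}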
@@ -545,6 +649,24 @@ struct div_op
     }
 };

+struct max_op
+{
+    std::string name() const { return "max"; }
+    auto fcn() const
+    {
+        return [](auto x, auto y) { return std::max(x, y); };
+    }
+};
+
+struct min_op
+{
+    std::string name() const { return "min"; }
+    auto fcn() const
+    {
+        return [](auto x, auto y) { return std::min(x, y); };
+    }
+};
+
 template <typename Op>
 struct cpu_binary
 {
@@ -596,22 +718,35 @@ struct cpu_apply
         apply_map["dot"] = extend_op<cpu_gemm, op::dot>();
         apply_map["batch_norm_inference"] =
             extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
+        apply_map["lrn"]        = extend_op<cpu_lrn, op::lrn>();
         apply_map["contiguous"] = extend_op<cpu_contiguous, op::contiguous>();
+        apply_map["pad"]        = extend_op<cpu_pad, op::pad>();
         apply_map["concat"]     = extend_op<cpu_concat, op::concat>();
+        apply_map["gather"]     = extend_op<cpu_gather, op::gather>();
         apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
+        apply_map["elu"]        = extend_op<cpu_unary<elu_op>, op::elu>();

         apply_map["identity"] = simple_op<cpu_unary<identity_op>>();
         apply_map["abs"]      = simple_op<cpu_unary<abs_op>>();
+        apply_map["sinh"]     = simple_op<cpu_unary<sinh_op>>();
+        apply_map["cosh"]     = simple_op<cpu_unary<cosh_op>>();
         apply_map["tanh"]     = simple_op<cpu_unary<tanh_op>>();
         apply_map["sigmoid"]  = simple_op<cpu_unary<sigmoid_op>>();
         apply_map["exp"]      = simple_op<cpu_unary<exp_op>>();
+        apply_map["log"]      = simple_op<cpu_unary<log_op>>();
         apply_map["neg"]      = simple_op<cpu_unary<neg_op>>();
         apply_map["sin"]      = simple_op<cpu_unary<sin_op>>();
         apply_map["cos"]      = simple_op<cpu_unary<cos_op>>();
         apply_map["tan"]      = simple_op<cpu_unary<tan_op>>();
         apply_map["asin"]     = simple_op<cpu_unary<asin_op>>();
         apply_map["acos"]     = simple_op<cpu_unary<acos_op>>();
         apply_map["atan"]     = simple_op<cpu_unary<atan_op>>();
         apply_map["relu"]     = simple_op<cpu_unary<relu_op>>();
         apply_map["add"]      = simple_op<cpu_binary<add_op>>();
         apply_map["sub"]      = simple_op<cpu_binary<sub_op>>();
         apply_map["mul"]      = simple_op<cpu_binary<mul_op>>();
         apply_map["div"]      = simple_op<cpu_binary<div_op>>();
+        apply_map["max"]      = simple_op<cpu_binary<max_op>>();
+        apply_map["min"]      = simple_op<cpu_binary<min_op>>();
         apply_map["softmax"]  = simple_op<softmax2d>();
     }
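The registrations above follow two patterns: extend_op<T, Op> pairs a cpu:: wrapper with the framework operator whose attributes it must carry over (stride, padding, alpha, ...), while simple_op<T> is for stateless wrappers. Every elementwise struct in this file exposes the same two-member protocol, name() and fcn(); a self-contained sketch of how such a struct plugs into a generic unary wrapper (sqrt_op_sketch and apply_unary_sketch are illustrative, not part of the commit):

#include <algorithm>
#include <cmath>
#include <string>
#include <vector>

// Hypothetical op following the same shape as sin_op/log_op above.
struct sqrt_op_sketch
{
    std::string name() const { return "cpu::sqrt"; }
    auto fcn() const
    {
        return [](auto x) { return std::sqrt(x); };
    }
};

// Simplified stand-in for cpu_unary: apply Op::fcn() elementwise.
template <class Op>
std::vector<float> apply_unary_sketch(const Op& op, std::vector<float> in)
{
    std::transform(in.begin(), in.end(), in.begin(), op.fcn());
    return in;
}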
@@ -658,5 +793,5 @@ struct cpu_apply
 void lowering::apply(program& p) const { cpu_apply{&p}.apply(); }

 } // namespace cpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
src/targets/cpu/target.cpp

@@ -2,18 +2,24 @@
 #include <migraphx/cpu/target.hpp>
 #include <migraphx/cpu/lowering.hpp>
 #include <migraphx/auto_contiguous.hpp>
+#include <migraphx/rewrite_rnn.hpp>
+#include <migraphx/dead_code_elimination.hpp>

 namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace cpu {

 std::string target::name() const { return "cpu"; }

 std::vector<pass> target::get_passes(migraphx::context&) const
 {
-    return {auto_contiguous{}, lowering{}};
+    return {auto_contiguous{},
+            rewrite_rnn{},
+            dead_code_elimination{},
+            lowering{},
+            dead_code_elimination{}};
 }

 } // namespace cpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
src/targets/gpu/CMakeLists.txt

@@ -12,15 +12,29 @@ endif()
 add_library(migraphx_device
     device/add.cpp
+    device/max.cpp
+    device/min.cpp
     device/exp.cpp
+    device/log.cpp
     device/sin.cpp
+    device/cos.cpp
+    device/tan.cpp
+    device/sinh.cpp
+    device/cosh.cpp
+    device/asin.cpp
+    device/acos.cpp
+    device/atan.cpp
     device/add_relu.cpp
     device/contiguous.cpp
     device/mul.cpp
     device/concat.cpp
+    device/pad.cpp
+    device/gather.cpp
+    device/sub.cpp
 )
 set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
 rocm_clang_tidy_check(migraphx_device)
-target_link_libraries(migraphx_device migraphx hip::device -Wno-invalid-command-line-argument -amdgpu-target=gfx803 -amdgpu-target=gfx900 -amdgpu-target=gfx903)
+target_link_libraries(migraphx_device migraphx hip::device -Wno-invalid-command-line-argument -amdgpu-target=gfx803 -amdgpu-target=gfx900 -amdgpu-target=gfx906)
 target_include_directories(migraphx_device PUBLIC $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/include>)
 target_include_directories(migraphx_device PRIVATE $<BUILD_INTERFACE:${CMAKE_CURRENT_SOURCE_DIR}/device/include>)

@@ -38,12 +52,16 @@ add_library(migraphx_gpu
     concat.cpp
     relu.cpp
     leaky_relu.cpp
     add.cpp
     sin.cpp
     mul.cpp
     tanh.cpp
     batchnorm.cpp
     write_literals.cpp
     rocblas.cpp
     sigmoid.cpp
+    abs.cpp
     elu.cpp
+    pad.cpp
+    gather.cpp
+    lrn.cpp
 )
 set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
 rocm_clang_tidy_check(migraphx_gpu)
src/targets/gpu/abs.cpp (new file, 0 → 100644)

#include <migraphx/gpu/abs.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <utility>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {

shape miopen_abs::compute_shape(const std::vector<shape>& inputs) const
{
    check_shapes{inputs, *this}.has(2).not_broadcasted();
    return inputs.at(1);
}

argument miopen_abs::compute(context& ctx,
                             const shape& output_shape,
                             const std::vector<argument>& args) const
{
    float alpha = 1;
    float beta  = 0;
    auto x_desc = make_tensor(args[0].get_shape());
    auto y_desc = make_tensor(output_shape);
    miopenActivationForward(ctx.get_stream().get_miopen(),
                            ad.get(),
                            &alpha,
                            x_desc.get(),
                            args[0].implicit(),
                            &beta,
                            y_desc.get(),
                            args[1].implicit());
    return args[1];
}

} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
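In the new miopen_abs, ad is the operator's activation descriptor (it comes from the gpu/abs.hpp header, which this page does not show); abs is expressed as a MIOpen activation so the existing miopenActivationForward path can run it. A hedged sketch of how such a descriptor is typically created with MIOpen's public API; the helper name is illustrative, and alpha/beta/gamma are assumed unused in ABS mode:

#include <miopen/miopen.h>

// Assumption: the ad member used above is an ABS-mode activation descriptor.
inline miopenActivationDescriptor_t make_abs_descriptor_sketch()
{
    miopenActivationDescriptor_t ad = nullptr;
    miopenCreateActivationDescriptor(&ad);
    miopenSetActivationDescriptor(ad, miopenActivationABS, 0.0, 0.0, 0.0);
    return ad;
}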
src/targets/gpu/add.cpp (deleted, 100644 → 0; shown as of parent 462a4920)

#include <migraphx/gpu/add.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/manage_ptr.hpp>
#include <migraphx/config.hpp>
#include <migraphx/gpu/miopen.hpp>
#include <utility>

namespace migraphx {
inline namespace MIGRAPH_INLINE_NS {
namespace gpu {

shape hip_add::compute_shape(const std::vector<shape>& inputs) const
{
    // check_shapes{inputs, *this}.has(3).standard();
    check_shapes{inputs, *this}.has(3);
    return inputs.at(0);
}

argument hip_add::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
    device::add(ctx.get_stream().get(), args[2], args[0], args[1]);
    return args[2];
}

shape miopen_add::compute_shape(const std::vector<shape>& inputs) const
{
    check_shapes{inputs, *this}.has(3).not_broadcasted();
    return inputs.at(0);
}

argument miopen_add::compute(context& ctx,
                             const shape& output_shape,
                             const std::vector<argument>& args) const
{
    float alpha = 1, beta = 0;
    auto a_desc = make_tensor(args[0].get_shape());
    auto b_desc = make_tensor(args[1].get_shape());
    auto c_desc = make_tensor(output_shape);
    miopenOpTensor(ctx.get_stream().get_miopen(),
                   miopenTensorOpAdd,
                   &alpha,
                   a_desc.get(),
                   args[0].implicit(),
                   &alpha,
                   b_desc.get(),
                   args[1].implicit(),
                   &beta,
                   c_desc.get(),
                   args[2].implicit());
    return args[2];
}

} // namespace gpu
} // namespace MIGRAPH_INLINE_NS
} // namespace migraphx
src/targets/gpu/batchnorm.cpp

@@ -5,7 +5,7 @@
 #include <utility>

 namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

 shape miopen_batch_norm_inference::compute_shape(const std::vector<shape>& inputs) const

@@ -22,7 +22,8 @@ argument miopen_batch_norm_inference::compute(context& ctx,
     auto y_desc  = make_tensor(output_shape);
     auto bn_desc = make_tensor(args[3].get_shape());
-    float alpha = 1.0, beta = 0.0f;
+    float alpha = 1.0;
+    float beta  = 0.0f;

     miopenBatchNormalizationForwardInference(ctx.get_stream().get_miopen(),
                                              miopenBatchNormMode_t(op.bn_mode),

@@ -43,5 +44,5 @@ argument miopen_batch_norm_inference::compute(context& ctx,
 }

 } // namespace gpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
src/targets/gpu/concat.cpp

@@ -6,7 +6,7 @@
 #include <utility>

 namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

 shape hip_concat::compute_shape(std::vector<shape> inputs) const

@@ -24,5 +24,5 @@ argument hip_concat::compute(context& ctx,
 }

 } // namespace gpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
src/targets/gpu/contiguous.cpp

@@ -5,7 +5,7 @@
 #include <utility>

 namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

 shape miopen_contiguous::compute_shape(const std::vector<shape>& inputs) const

@@ -25,5 +25,5 @@ argument miopen_contiguous::compute(context& ctx,
 }

 } // namespace gpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
src/targets/gpu/convolution.cpp

@@ -5,7 +5,7 @@
 #include <utility>

 namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

 shape miopen_convolution::compute_shape(const std::vector<shape>& inputs) const

@@ -21,7 +21,8 @@ argument miopen_convolution::compute(context& ctx,
     auto w_desc = make_tensor(args[1].get_shape());
     auto y_desc = make_tensor(output_shape);

-    float alpha = 1, beta = 0;
+    float alpha = 1;
+    float beta  = 0;
     miopenConvolutionForward(ctx.get_stream().get_miopen(),
                              &alpha,
                              x_desc.get(),

@@ -40,11 +41,11 @@ argument miopen_convolution::compute(context& ctx,
 shape miopen_convolution::compile(context& ctx,
                                   const shape& output_shape,
-                                  std::vector<instruction_ref> inputs)
+                                  std::vector<shape> inputs)
 {
     shape workspace_shape{};
-    auto x_desc = make_tensor(inputs[0]->get_shape());
-    auto w_desc = make_tensor(inputs[1]->get_shape());
+    auto x_desc = make_tensor(inputs[0]);
+    auto w_desc = make_tensor(inputs[1]);
     auto y_desc = make_tensor(output_shape);

     std::size_t workspace_size = 0;

@@ -56,31 +57,44 @@ shape miopen_convolution::compile(context& ctx,
                                              &workspace_size);
     workspace_shape = shape{shape::int8_type, {workspace_size}};

-    auto x         = to_gpu(generate_argument(inputs[0]->get_shape()));
-    auto w         = to_gpu(generate_argument(inputs[1]->get_shape()));
+    auto x         = to_gpu(generate_argument(inputs[0]));
+    auto w         = to_gpu(generate_argument(inputs[1]));
     auto y         = allocate_gpu(output_shape);
     auto workspace = allocate_gpu(workspace_shape);

     int algo_count = 1;
     miopenConvAlgoPerf_t perf;
-    miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
-                                          x_desc.get(),
-                                          x.implicit(),
-                                          w_desc.get(),
-                                          w.implicit(),
-                                          cd.get(),
-                                          y_desc.get(),
-                                          y.implicit(),
-                                          1,
-                                          &algo_count,
-                                          &perf,
-                                          workspace.implicit(),
-                                          workspace_size,
-                                          false);
-    algo = perf.fwd_algo;
+    auto status = miopenFindConvolutionForwardAlgorithm(ctx.get_stream().get_miopen(),
+                                                        x_desc.get(),
+                                                        x.implicit(),
+                                                        w_desc.get(),
+                                                        w.implicit(),
+                                                        cd.get(),
+                                                        y_desc.get(),
+                                                        y.implicit(),
+                                                        1,
+                                                        &algo_count,
+                                                        &perf,
+                                                        workspace.implicit(),
+                                                        workspace_size,
+                                                        false);
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("Find convolution failed");
+    handle = ctx.get_stream().get_miopen();
+    algo   = perf.fwd_algo;
     return shape{shape::int8_type, {perf.memory}};
 }

+void miopen_convolution::finalize(context& ctx,
+                                  const shape& output_shape,
+                                  std::vector<shape> inputs)
+{
+    if(handle == ctx.get_stream().get_miopen())
+        return;
+    // TODO: Check that workspace hasn't changed
+    compile(ctx, output_shape, std::move(inputs));
+}
+
 } // namespace gpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
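Two things change in compile() above: its inputs become plain shapes instead of instruction_refs (it only ever needed the shapes), and the result of miopenFindConvolutionForwardAlgorithm is now checked, with the MIOpen handle that was tuned against cached in the handle member. The new finalize() re-runs the search only when the context's handle differs from the cached one, so re-finalizing on the same stream skips the expensive find step. A minimal sketch of that caching idiom with simplified, hypothetical types:

// Hypothetical stand-in for miopenHandle_t.
using handle_t = const void*;

struct tuned_conv_sketch
{
    handle_t handle = nullptr; // handle the algorithm was last tuned on

    void compile(handle_t h)
    {
        // ... expensive algorithm search would run here ...
        handle = h; // remember which handle the result is valid for
    }

    void finalize(handle_t h)
    {
        if(handle == h)
            return; // already tuned for this handle
        compile(h); // context changed: search again
    }
};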
src/targets/gpu/device/acos.cpp (new file, 0 → 100644)

#include <migraphx/gpu/device/acos.hpp>
#include <migraphx/gpu/device/nary.hpp>
#include <migraphx/gpu/device/types.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void acos(hipStream_t stream, const argument& result, const argument& arg)
{
    nary(stream, result, arg)([](auto x) { return ::acos(to_hip_type(x)); });
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
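The three new device files (acos, asin, atan) are identical apart from the math function: nary(stream, result, args...) is MIGraphX's elementwise launcher, the lambda becomes the per-element GPU kernel body, ::acos resolves to the HIP device math function, and to_hip_type presumably converts MIGraphX's element type into one the intrinsic accepts. Under that reading, adding another unary device op is mechanical; a hypothetical cbrt, not part of this commit, would look like:

#include <migraphx/gpu/device/nary.hpp>
#include <migraphx/gpu/device/types.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

// Hypothetical: cube root, following the acos/asin/atan pattern exactly.
void cbrt(hipStream_t stream, const argument& result, const argument& arg)
{
    nary(stream, result, arg)([](auto x) { return ::cbrt(to_hip_type(x)); });
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx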
src/targets/gpu/device/add.cpp

@@ -2,7 +2,7 @@
 #include <migraphx/gpu/device/nary.hpp>

 namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

@@ -22,5 +22,5 @@ void add(hipStream_t stream,
 }

 } // namespace device
 } // namespace gpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
src/targets/gpu/device/add_relu.cpp

@@ -2,7 +2,7 @@
 #include <migraphx/gpu/device/nary.hpp>

 namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

@@ -27,5 +27,5 @@ void add_relu(hipStream_t stream,
 }

 } // namespace device
 } // namespace gpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
src/targets/gpu/device/asin.cpp (new file, 0 → 100644)

#include <migraphx/gpu/device/asin.hpp>
#include <migraphx/gpu/device/nary.hpp>
#include <migraphx/gpu/device/types.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void asin(hipStream_t stream, const argument& result, const argument& arg)
{
    nary(stream, result, arg)([](auto x) { return ::asin(to_hip_type(x)); });
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
src/targets/gpu/device/atan.cpp (new file, 0 → 100644)

#include <migraphx/gpu/device/atan.hpp>
#include <migraphx/gpu/device/nary.hpp>
#include <migraphx/gpu/device/types.hpp>

namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {

void atan(hipStream_t stream, const argument& result, const argument& arg)
{
    nary(stream, result, arg)([](auto x) { return ::atan(to_hip_type(x)); });
}

} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
src/targets/gpu/device/concat.cpp

@@ -5,7 +5,7 @@
 #include <migraphx/gpu/device/launch.hpp>

 namespace migraphx {
-inline namespace MIGRAPH_INLINE_NS {
+inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
 namespace device {

@@ -34,5 +34,5 @@ argument concat(hipStream_t stream,
 }

 } // namespace device
 } // namespace gpu
-} // namespace MIGRAPH_INLINE_NS
+} // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx