Merge branch 'develop' into driver

eb0d8fee · Paul · 65ef35cd · 0d796941 · eb0d8fee · eb0d8fee
Commit eb0d8fee authored Jun 04, 2019 by Paul
20 changed files
--- a/src/targets/gpu/device/convert.cpp
+++ b/src/targets/gpu/device/convert.cpp
+#include <migraphx/gpu/device/convert.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+void convert(hipStream_t stream, const argument& result, const argument& arg)
+{
+    result.visit([&](auto output) {
+        arg.visit([&](auto input) {
+            const auto* input_ptr = device_cast(input.data());
+            auto* output_ptr      = device_cast(output.data());
+            gs_launch(stream,
+                      result.get_shape().elements())([=](auto i) { output_ptr[i] = input_ptr[i]; });
+        });
+    });
+}
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/gather.cpp
+++ b/src/targets/gpu/device/gather.cpp
@@ -16,20 +16,24 @@ argument gather(hipStream_t stream,
                std::vector<migraphx::argument> args,
                int axis)
 {
-    int axis_index = (axis < 0) ? (axis + output_shape.lens().size()) : axis;
+    auto axis_index = (axis < 0) ? (axis + args[0].get_shape().lens().size()) : axis;
    visit_all(args.back(), args[0])([&](auto output, auto input) {
        std::size_t nelements = output_shape.elements();
        args[1].visit([&](auto indices) {
-            visit_tensor_size(output_shape.lens().size(), [&](auto ndim) {
-                const auto* indices_ptr = device_cast(indices.data());
-                auto* outptr            = device_cast(output.data());
-                const auto* inptr       = device_cast(input.data());
-                hip_tensor_descriptor<ndim> desc_input(input.get_shape());
-                hip_tensor_descriptor<ndim> desc_output(output.get_shape());
-                gs_launch(stream, nelements)([=](auto i) {
-                    auto lens        = desc_output.multi(i);
-                    lens[axis_index] = indices_ptr[lens[axis_index]];
-                    outptr[i]        = inptr[desc_input.linear(lens)];
+            const auto* indices_ptr = device_cast(indices.data());
+            auto* out_ptr           = device_cast(output.data());
+            const auto* in_ptr      = device_cast(input.data());
+            auto& input_shape       = args[0].get_shape();
+            auto lens               = input_shape.lens();
+            lens[axis_index]        = args[1].get_shape().elements();
+            migraphx::shape out_comp_shape{output_shape.type(), lens};
+            visit_tensor_size(out_comp_shape.lens().size(), [&](auto n_out_dim) {
+                hip_tensor_descriptor<n_out_dim> desc_input(input_shape);
+                hip_tensor_descriptor<n_out_dim> desc_output(out_comp_shape);
+                gs_launch(stream, nelements)([=](auto ii) {
+                    auto in_idx        = desc_output.multi(ii);
+                    in_idx[axis_index] = indices_ptr[in_idx[axis_index]];
+                    out_ptr[ii]        = in_ptr[desc_input.linear(in_idx)];
                });
            });
        });

--- a/src/targets/gpu/device/logsoftmax.cpp
+++ b/src/targets/gpu/device/logsoftmax.cpp
+#include <migraphx/shape.hpp>
+#include <migraphx/argument.hpp>
+#include <migraphx/gpu/device/logsoftmax.hpp>
+#include <migraphx/gpu/device/tensor.hpp>
+#include <migraphx/gpu/device/launch.hpp>
+#include <migraphx/gpu/device/types.hpp>
+#include <migraphx/gpu/hip.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+argument logsoftmax(hipStream_t stream,
+                    const migraphx::shape& output_shape,
+                    std::vector<migraphx::argument> args,
+                    int axis)
+{
+
+    auto lens              = output_shape.lens();
+    std::size_t batch_size = std::accumulate(
+        lens.begin(), lens.begin() + axis, std::size_t{1}, std::multiplies<std::size_t>());
+    std::size_t n_dims = std::accumulate(
+        lens.begin() + axis, lens.end(), std::size_t{1}, std::multiplies<std::size_t>());
+    migraphx::shape comp_shape{output_shape.type(), {batch_size, n_dims}};
+
+    visit_all(args.back(), args.front())([&](auto output, auto input) {
+        const auto* input_ptr = device_cast(input.data());
+        auto* output_ptr      = device_cast(output.data());
+
+        // each thread is for one item in the batch
+        gs_launch(stream, batch_size)([=](auto i) {
+            std::size_t row_start = i * n_dims;
+            // get max
+            auto batch_max = input_ptr[row_start];
+            for(std::size_t j = 1; j < n_dims; ++j)
+            {
+                auto ind  = row_start + j;
+                batch_max = std::max(to_hip_type(batch_max), to_hip_type(input_ptr[ind]));
+            }
+
+            for(std::size_t j = 0; j < n_dims; ++j)
+            {
+                auto ind        = row_start + j;
+                output_ptr[ind] = input_ptr[ind] - batch_max;
+            }
+
+            auto batch_sum = ::exp(to_hip_type(output_ptr[row_start]));
+            for(std::size_t j = 1; j < n_dims; ++j)
+            {
+                auto ind = row_start + j;
+                batch_sum += ::exp(to_hip_type(output_ptr[ind]));
+            }
+            batch_sum = ::log(to_hip_type(batch_sum));
+
+            for(std::size_t j = 0; j < n_dims; ++j)
+            {
+                auto ind = row_start + j;
+                output_ptr[ind] -= batch_sum;
+            }
+        });
+    });
+
+    return args.back();
+}
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/eliminate_workspace.cpp
+++ b/src/targets/gpu/eliminate_workspace.cpp
@@ -2,7 +2,6 @@
 #include <migraphx/gpu/hip.hpp>
 #include <migraphx/program.hpp>
 #include <migraphx/instruction.hpp>
-#include <migraphx/operators.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/ranges.hpp>
 #include <migraphx/stringutils.hpp>

--- a/src/targets/gpu/elu.cpp
+++ b/src/targets/gpu/elu.cpp
 #include <migraphx/gpu/elu.hpp>
-#include <migraphx/operators.hpp>
-#include <migraphx/manage_ptr.hpp>
-#include <migraphx/gpu/miopen.hpp>
-#include <utility>
+#include <migraphx/gpu/context.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
@@ -3,6 +3,7 @@
 #include <migraphx/gpu/miopen.hpp>
 #include <migraphx/gpu/convolution.hpp>
 #include <migraphx/gpu/device/add_relu.hpp>
+#include <migraphx/gpu/device/add.hpp>
 #include <migraphx/instruction.hpp>

 namespace migraphx {
@@ -139,6 +140,8 @@ MIGRAPHX_PRED_MATCHER(fusable_conv, instruction_ref ins)
    auto conv = any_cast<miopen_convolution>(ins->get_operator());
    if(conv.op.group > 1)
        return false;
+    if(conv.op.padding_mode != op::padding_mode_t::default_)
+        return false;
    if(wei.lens()[1] > 512 and conv.algo != miopenConvolutionFwdAlgoWinograd)
        return false;
    auto op = conv.op;
@@ -159,7 +162,10 @@ struct hip_triadd
        device::add(ctx.get_stream().get(), args.at(3), args.at(0), args.at(1), args.at(2));
        return args.at(3);
    }
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 struct hip_triadd_relu
@@ -175,7 +181,10 @@ struct hip_triadd_relu
        device::add_relu(ctx.get_stream().get(), args.at(3), args.at(0), args.at(1), args.at(2));
        return args.at(3);
    }
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 struct hip_add_relu
@@ -191,7 +200,10 @@ struct hip_add_relu
        device::add_relu(ctx.get_stream().get(), args.at(2), args.at(0), args.at(1));
        return args.at(2);
    }
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 struct find_add_relu
@@ -250,6 +262,12 @@ struct miopen_conv_bias
    fusion::op_t conv;
    fusion::op_t bias;

+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return op::convolution::reflect(self.op, f);
+    }
+
    miopen_conv_bias(op::convolution c, const shape& input, const shape& weights, const shape& b)
        : op(c), f(input)
    {
@@ -276,7 +294,10 @@ struct miopen_conv_bias

    void finalize(context& ctx, const shape&, const std::vector<shape>&) { f.compile(ctx); }
    shape get_workspace(context& ctx) { return f.get_workspace(ctx); }
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 struct miopen_conv_bias_relu
@@ -287,6 +308,12 @@ struct miopen_conv_bias_relu
    fusion::op_t bias;
    fusion::op_t relu;

+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return op::convolution::reflect(self.op, f);
+    }
+
    miopen_conv_bias_relu(op::convolution c,
                          const shape& input,
                          const shape& weights,
@@ -317,7 +344,10 @@ struct miopen_conv_bias_relu
    }
    void finalize(context& ctx, const shape&, const std::vector<shape>&) { f.compile(ctx); }
    shape get_workspace(context& ctx) { return f.get_workspace(ctx); }
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 template <class... Ms>

--- a/src/targets/gpu/gather.cpp
+++ b/src/targets/gpu/gather.cpp
 #include <migraphx/gpu/gather.hpp>
-#include <migraphx/operators.hpp>
-#include <migraphx/manage_ptr.hpp>
-#include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/device/concat.hpp>
-#include <utility>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/device/gather.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

--- a/src/targets/gpu/gemm.cpp
+++ b/src/targets/gpu/gemm.cpp
 #include <migraphx/gpu/gemm.hpp>
-#include <migraphx/operators.hpp>
-#include <migraphx/manage_ptr.hpp>
-#include <migraphx/gpu/miopen.hpp>
-#include <utility>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/device/add.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

 template <class... Ts>
-void generic_rocblas_gemm(shape::as<float>, Ts&&... xs)
+rocblas_status generic_rocblas_scal(shape::as<float>, Ts&&... xs)
 {
-    rocblas_sgemm(std::forward<Ts>(xs)...);
+    return rocblas_sscal(std::forward<Ts>(xs)...);
 }

 template <class... Ts>
-void generic_rocblas_gemm(shape::as<double>, Ts&&... xs)
+rocblas_status generic_rocblas_scal(shape::as<double>, Ts&&... xs)
 {
-    rocblas_dgemm(std::forward<Ts>(xs)...);
+    return rocblas_dscal(std::forward<Ts>(xs)...);
+}
+
+template <class T, class... Ts>
+rocblas_status generic_rocblas_scal(shape::as<T>, Ts&&...)
+{
+    MIGRAPHX_THROW("GENERIC_ROCBLAS_SCAL: type unsupported by rocblas");
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_axpy(shape::as<half>, Ts&&... xs)
+{
+    return rocblas_haxpy(std::forward<Ts>(xs)...);
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_axpy(shape::as<float>, Ts&&... xs)
+{
+    return rocblas_saxpy(std::forward<Ts>(xs)...);
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_axpy(shape::as<double>, Ts&&... xs)
+{
+    return rocblas_daxpy(std::forward<Ts>(xs)...);
+}
+
+template <class T, class... Ts>
+rocblas_status generic_rocblas_axpy(shape::as<T>, Ts&&...)
+{
+    MIGRAPHX_THROW("GENERIC_ROCBLAS_AXPY: type unsupported by rocblas");
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_dot(shape::as<float>, Ts&&... xs)
+{
+    return rocblas_sdot(std::forward<Ts>(xs)...);
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_dot(shape::as<double>, Ts&&... xs)
+{
+    return rocblas_ddot(std::forward<Ts>(xs)...);
+}
+
+template <class T, class... Ts>
+rocblas_status generic_rocblas_dot(shape::as<T>, Ts&&...)
+{
+    MIGRAPHX_THROW("GENERIC_ROCBLAS_DOT: type unsupported by rocblas");
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_gemv(shape::as<float>, Ts&&... xs)
+{
+    return rocblas_sgemv(std::forward<Ts>(xs)...);
 }

 template <class... Ts>
-void generic_rocblas_gemm(shape::as<half>, Ts&&... xs)
+rocblas_status generic_rocblas_gemv(shape::as<double>, Ts&&... xs)
 {
-    rocblas_hgemm(std::forward<Ts>(xs)...);
+    return rocblas_dgemv(std::forward<Ts>(xs)...);
 }

 template <class T, class... Ts>
-void generic_rocblas_gemm(shape::as<T>, Ts&&...)
+rocblas_status generic_rocblas_gemv(shape::as<T>, Ts&&...)
 {
-    MIGRAPHX_THROW("Type unsupported by rocblas");
+    MIGRAPHX_THROW("GENERIC_ROCBLAS_GEMMV: type unsupported by rocblas");
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_batched_gemm(shape::as<float>, Ts&&... xs)
+{
+    return rocblas_sgemm_strided_batched(std::forward<Ts>(xs)...);
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_batched_gemm(shape::as<double>, Ts&&... xs)
+{
+    return rocblas_dgemm_strided_batched(std::forward<Ts>(xs)...);
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_batched_gemm(shape::as<half>, Ts&&... xs)
+{
+    return rocblas_hgemm_strided_batched(std::forward<Ts>(xs)...);
+}
+
+template <class T, class... Ts>
+rocblas_status generic_rocblas_batched_gemm(shape::as<T>, Ts&&...)
+{
+    MIGRAPHX_THROW("GENERIC_ROCBLAS_BATCHED_GEMM: type unsupported by rocblas");
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_gemm(shape::as<float>, Ts&&... xs)
+{
+    return rocblas_sgemm(std::forward<Ts>(xs)...);
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_gemm(shape::as<double>, Ts&&... xs)
+{
+    return rocblas_dgemm(std::forward<Ts>(xs)...);
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_gemm(shape::as<half>, Ts&&... xs)
+{
+    return rocblas_hgemm(std::forward<Ts>(xs)...);
+}
+
+template <class T, class... Ts>
+rocblas_status generic_rocblas_gemm(shape::as<T>, Ts&&...)
+{
+    MIGRAPHX_THROW("GENERIC_ROCBLAS_GEMM: type unsupported by rocblas");
 }

 template <class T>
@@ -69,46 +169,94 @@ rocblas_half to_rocblas_type(half x) { return reinterpret_cast<const rocblas_hal

 shape miopen_gemm::compute_shape(const std::vector<shape>& inputs) const
 {
-    check_shapes{inputs, *this}.has(3);
-    return op.compute_shape({inputs.at(0), inputs.at(1)});
+    std::vector<shape> input_shapes(inputs.begin(), inputs.begin() + inputs.size() - 1);
+    check_shapes{input_shapes}.not_broadcasted();
+    return op.compute_shape(input_shapes);
 }
+
 argument miopen_gemm::compute(context& ctx,
                              const shape& output_shape,
                              const std::vector<argument>& args) const
 {
-    float alpha     = 1.0f;
+    bool is_3inputs = (args.size() == 4);
    float beta      = 0.0f;
-    bool transa     = args[0].get_shape().transposed();
-    bool transb     = args[1].get_shape().transposed();
-    rocblas_int lda = args[0].get_shape().strides()[transa ? 1 : 0];
-    rocblas_int ldb = args[1].get_shape().strides()[transb ? 1 : 0];
-    rocblas_int ldc = args[2].get_shape().strides()[0];
-    rocblas_int m   = output_shape.lens()[0];
-    rocblas_int n   = output_shape.lens()[1];
-    rocblas_int k   = args[0].get_shape().lens()[1];
+    if(is_3inputs)
+    {
+        output_shape.visit_type([&](auto as) {
+            auto to_pointer = [&](auto&& arg) { return to_rocblas_type(as.from(arg.data())); };
+            hipMemcpyAsync(to_pointer(args[3]),
+                           to_pointer(args[2]),
+                           output_shape.bytes(),
+                           hipMemcpyDeviceToDevice,
+                           ctx.get_stream().get());
+        });
+        beta = op.beta;
+    }
+
+    auto a_lens = args[0].get_shape().lens();
+    auto b_lens = args[1].get_shape().lens();
    output_shape.visit_type([&](auto as) {
-        auto alpha_r    = to_rocblas_type(as(alpha));
-        auto beta_r     = to_rocblas_type(as(beta));
+        auto n_dim        = output_shape.lens().size();
+        auto dim_1        = n_dim - 1;
+        auto dim_0        = n_dim - 2;
+        auto alpha_r      = to_rocblas_type(as(op.alpha));
+        auto beta_r       = to_rocblas_type(as(beta));
+        bool transa       = args[0].get_shape().transposed();
+        bool transb       = args[1].get_shape().transposed();
+        rocblas_int lda   = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
+        rocblas_int ldb   = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
+        rocblas_int ldc   = args[2].get_shape().strides()[dim_0];
+        auto out_lens     = output_shape.lens();
+        rocblas_int m     = out_lens[dim_0];
+        rocblas_int n     = out_lens[dim_1];
+        rocblas_int k     = args[0].get_shape().lens()[dim_1];
+        auto num_matrices = std::accumulate(
+            out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
        auto to_pointer = [&](auto&& arg) { return to_rocblas_type(as.from(arg.data())); };
-        generic_rocblas_gemm(as,
-                             ctx.get_stream().get_rocblas(),
-                             transb ? rocblas_operation_transpose : rocblas_operation_none,
-                             transa ? rocblas_operation_transpose : rocblas_operation_none,
-                             n,
-                             m,
-                             k,
-                             &alpha_r,
-                             to_pointer(args[1]),
-                             ldb,
-                             to_pointer(args[0]),
-                             lda,
-                             &beta_r,
-                             to_pointer(args[2]),
-                             ldc);
-
+        if(num_matrices == 1)
+        {
+            generic_rocblas_gemm(as,
+                                 ctx.get_stream().get_rocblas(),
+                                 transb ? rocblas_operation_transpose : rocblas_operation_none,
+                                 transa ? rocblas_operation_transpose : rocblas_operation_none,
+                                 n,
+                                 m,
+                                 k,
+                                 &alpha_r,
+                                 to_pointer(args[1]),
+                                 ldb,
+                                 to_pointer(args[0]),
+                                 lda,
+                                 &beta_r,
+                                 (is_3inputs ? to_pointer(args[3]) : to_pointer(args[2])),
+                                 ldc);
+        }
+        else
+        {
+            generic_rocblas_batched_gemm(
+                as,
+                ctx.get_stream().get_rocblas(),
+                transb ? rocblas_operation_transpose : rocblas_operation_none,
+                transa ? rocblas_operation_transpose : rocblas_operation_none,
+                n,
+                m,
+                k,
+                &alpha_r,
+                to_pointer(args[1]),
+                ldb,
+                k * n,
+                to_pointer(args[0]),
+                lda,
+                m * k,
+                &beta_r,
+                (is_3inputs ? to_pointer(args[3]) : to_pointer(args[2])),
+                ldc,
+                m * n,
+                num_matrices);
+        }
    });

-    return args[2];
+    return (is_3inputs ? args[3] : args[2]);
 }

 } // namespace gpu

--- a/src/targets/gpu/hip.cpp
+++ b/src/targets/gpu/hip.cpp
@@ -43,6 +43,7 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
 template <class T>
 std::vector<T> read_from_gpu(const void* x, std::size_t sz)
 {
+    gpu_sync();
    std::vector<T> result(sz);
    auto status = hipMemcpy(result.data(), x, sz * sizeof(T), hipMemcpyDeviceToHost);
    if(status != hipSuccess)
@@ -52,6 +53,7 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz)

 hip_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false)
 {
+    gpu_sync();
    auto result = allocate_gpu(sz, host);
    auto status = hipMemcpy(result.get(), x, sz, hipMemcpyHostToDevice);
    if(status != hipSuccess)

--- a/src/targets/gpu/include/migraphx/gpu/abs.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/abs.hpp
 #ifndef MIGRAPHX_GUARD_RTGLIB_ABS_HPP
 #define MIGRAPHX_GUARD_RTGLIB_ABS_HPP

-#include <migraphx/gpu/lowering.hpp>
-#include <migraphx/manage_ptr.hpp>
-#include <migraphx/instruction.hpp>
-#include <migraphx/operators.hpp>
-#include <migraphx/generate.hpp>
-#include <migraphx/shape_for_each.hpp>
-#include <migraphx/config.hpp>
+#include <migraphx/shape.hpp>
 #include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/hip.hpp>
-#include <migraphx/dfor.hpp>
-#include <migraphx/gpu/device/contiguous.hpp>
-#include <migraphx/gpu/device/add.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/gpu/rocblas.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <utility>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

+struct context;
+
 struct miopen_abs
 {
    shared<activation_descriptor> ad;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return gpu::reflect(self.ad.get(), f);
+    }
+
    std::string name() const { return "gpu::abs"; }
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/acos.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/acos.hpp
 #ifndef MIGRAPHX_GUARD_RTGLIB_ACOS_HPP
 #define MIGRAPHX_GUARD_RTGLIB_ACOS_HPP

-#include <migraphx/gpu/lowering.hpp>
 #include <migraphx/gpu/oper.hpp>
-#include <migraphx/manage_ptr.hpp>
-#include <migraphx/instruction.hpp>
-#include <migraphx/operators.hpp>
-#include <migraphx/generate.hpp>
-#include <migraphx/shape_for_each.hpp>
-#include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/hip.hpp>
-#include <migraphx/dfor.hpp>
-#include <migraphx/gpu/device/contiguous.hpp>
 #include <migraphx/gpu/device/acos.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/gpu/rocblas.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/config.hpp>
-#include <utility>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

--- a/src/targets/gpu/include/migraphx/gpu/add.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/add.hpp
 #ifndef MIGRAPHX_GUARD_RTGLIB_ADD_HPP
 #define MIGRAPHX_GUARD_RTGLIB_ADD_HPP

-#include <migraphx/gpu/lowering.hpp>
 #include <migraphx/gpu/oper.hpp>
-#include <migraphx/manage_ptr.hpp>
-#include <migraphx/instruction.hpp>
-#include <migraphx/operators.hpp>
-#include <migraphx/generate.hpp>
-#include <migraphx/shape_for_each.hpp>
-#include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/hip.hpp>
-#include <migraphx/dfor.hpp>
-#include <migraphx/gpu/device/contiguous.hpp>
 #include <migraphx/gpu/device/add.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/gpu/rocblas.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/config.hpp>
-#include <utility>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

--- a/src/targets/gpu/include/migraphx/gpu/adjust_allocation.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/adjust_allocation.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_ADJUST_ALLOCATION_HPP
+#define MIGRAPHX_GUARD_RTGLIB_ADJUST_ALLOCATION_HPP
+
+#include <migraphx/program.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/gpu/context.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+namespace gpu {
+
+struct adjust_allocation
+{
+    std::string name() const { return "gpu::adjust_allocation"; }
+    void apply(program& p) const;
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/asin.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/asin.hpp
 #ifndef MIGRAPHX_GUARD_RTGLIB_ASIN_HPP
 #define MIGRAPHX_GUARD_RTGLIB_ASIN_HPP

-#include <migraphx/gpu/lowering.hpp>
 #include <migraphx/gpu/oper.hpp>
-#include <migraphx/manage_ptr.hpp>
-#include <migraphx/instruction.hpp>
-#include <migraphx/operators.hpp>
-#include <migraphx/generate.hpp>
-#include <migraphx/shape_for_each.hpp>
-#include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/hip.hpp>
-#include <migraphx/dfor.hpp>
-#include <migraphx/gpu/device/contiguous.hpp>
 #include <migraphx/gpu/device/asin.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/gpu/rocblas.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/config.hpp>
-#include <utility>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

--- a/src/targets/gpu/include/migraphx/gpu/atan.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/atan.hpp
 #ifndef MIGRAPHX_GUARD_RTGLIB_ATAN_HPP
 #define MIGRAPHX_GUARD_RTGLIB_ATAN_HPP

-#include <migraphx/gpu/lowering.hpp>
 #include <migraphx/gpu/oper.hpp>
-#include <migraphx/manage_ptr.hpp>
-#include <migraphx/instruction.hpp>
-#include <migraphx/operators.hpp>
-#include <migraphx/generate.hpp>
-#include <migraphx/shape_for_each.hpp>
-#include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/hip.hpp>
-#include <migraphx/dfor.hpp>
-#include <migraphx/gpu/device/contiguous.hpp>
 #include <migraphx/gpu/device/atan.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/gpu/rocblas.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/config.hpp>
-#include <utility>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

--- a/src/targets/gpu/include/migraphx/gpu/batchnorm.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/batchnorm.hpp
 #ifndef MIGRAPHX_GUARD_RTGLIB_BATCHNORM_HPP
 #define MIGRAPHX_GUARD_RTGLIB_BATCHNORM_HPP

-#include <migraphx/gpu/lowering.hpp>
-#include <migraphx/manage_ptr.hpp>
-#include <migraphx/instruction.hpp>
-#include <migraphx/operators.hpp>
-#include <migraphx/generate.hpp>
-#include <migraphx/shape_for_each.hpp>
-#include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/hip.hpp>
-#include <migraphx/dfor.hpp>
-#include <migraphx/gpu/device/contiguous.hpp>
-#include <migraphx/gpu/device/add.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/gpu/rocblas.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <migraphx/config.hpp>
-#include <utility>
+#include <migraphx/shape.hpp>
+#include <migraphx/op/batch_norm.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

+struct context;
+
 struct miopen_batch_norm_inference
 {
    op::batch_norm_inference op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "gpu::batch_norm_inference"; }
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/clip.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/clip.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_CLIP_HPP
+#define MIGRAPHX_GUARD_RTGLIB_CLIP_HPP
+
+#include <migraphx/shape.hpp>
+#include <migraphx/op/clip.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+struct context;
+
+struct hip_clip
+{
+    op::clip op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
+    std::string name() const { return "gpu::clip"; }
+    shape compute_shape(std::vector<shape> inputs) const;
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/concat.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/concat.hpp
 #ifndef MIGRAPHX_GUARD_RTGLIB_CONCAT_HPP
 #define MIGRAPHX_GUARD_RTGLIB_CONCAT_HPP

-#include <migraphx/gpu/lowering.hpp>
-#include <migraphx/manage_ptr.hpp>
-#include <migraphx/instruction.hpp>
-#include <migraphx/operators.hpp>
-#include <migraphx/generate.hpp>
-#include <migraphx/shape_for_each.hpp>
-#include <migraphx/config.hpp>
-#include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/hip.hpp>
-#include <migraphx/dfor.hpp>
-#include <migraphx/gpu/device/concat.hpp>
-#include <migraphx/gpu/device/add.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/gpu/rocblas.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <utility>
+#include <migraphx/shape.hpp>
+#include <migraphx/op/concat.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

+struct context;
+
 struct hip_concat
 {
    op::concat op;

+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "gpu::concat"; }
    shape compute_shape(std::vector<shape> inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/context.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/context.hpp
@@ -11,13 +11,19 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_NULL_STREAM)
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NULL_STREAM)
+
+using hip_event_ptr = MIGRAPHX_MANAGE_PTR(hipEvent_t, hipEventDestroy);

 struct hip_device
 {
    hip_device() { add_stream(); }

-    hip_device(std::size_t id) : device_id(id) { add_stream(); }
+    hip_device(std::size_t id, std::size_t n) : device_id(id)
+    {
+        for(std::size_t i = 0; i < n; i++)
+            add_stream();
+    }

    struct stream
    {
@@ -32,7 +38,7 @@ struct hip_device
        static hip_stream_ptr create_stream()
        {
            hipStream_t result = nullptr;
-            auto status        = hipStreamCreate(&result);
+            auto status        = hipStreamCreateWithFlags(&result, hipStreamNonBlocking);
            if(status != hipSuccess)
                MIGRAPHX_THROW("Failed to allocate stream");
            return hip_stream_ptr{result};
@@ -40,7 +46,7 @@ struct hip_device

        hipStream_t get()
        {
-            if(enabled(MIGRAPHX_DISABLE_NULL_STREAM{}))
+            if(not enabled(MIGRAPHX_ENABLE_NULL_STREAM{}))
            {
                setup();
                if(s == nullptr)
@@ -53,7 +59,7 @@ struct hip_device

        auto create_miopen_handle()
        {
-            if(enabled(MIGRAPHX_DISABLE_NULL_STREAM{}))
+            if(not enabled(MIGRAPHX_ENABLE_NULL_STREAM{}))
                return make_obj<miopen_handle>(&miopenCreateWithStream, get());
            else
                return make_obj<miopen_handle>(&miopenCreate);
@@ -77,6 +83,22 @@ struct hip_device
            return rbhandle.get();
        }

+        void wait(hipEvent_t event)
+        {
+            setup();
+            auto status = hipStreamWaitEvent(get(), event, 0);
+            if(status != hipSuccess)
+                MIGRAPHX_THROW("Failed to wait.");
+        }
+
+        void record(hipEvent_t event)
+        {
+            setup();
+            auto status = hipEventRecord(event, get());
+            if(status != hipSuccess)
+                MIGRAPHX_THROW("Failed to record.");
+        }
+
        private:
        std::size_t id                      = 0;
        shared<hip_stream_ptr> s            = nullptr;
@@ -88,8 +110,14 @@ struct hip_device

    stream& get_stream() { return streams.at(current_stream); }

+    stream& get_stream(std::size_t n) { return streams.at(n); }
+
    void set_stream(std::size_t n) { current_stream = n; }

+    std::size_t nstreams() const { return streams.size(); }
+
+    std::size_t stream_id() const { return current_stream; }
+
    private:
    std::size_t device_id      = 0;
    std::size_t current_stream = 0;
@@ -98,7 +126,10 @@ struct hip_device

 struct context
 {
-    context(std::size_t n = 0) : current_device(std::make_shared<hip_device>(n)) {}
+    context(std::size_t device_id = 0, std::size_t n = 4)
+        : current_device(std::make_shared<hip_device>(device_id, n))
+    {
+    }

    hip_device& get_current_device()
    {
@@ -107,13 +138,34 @@ struct context
    }

    hip_device::stream& get_stream() { return get_current_device().get_stream(); }
+    hip_device::stream& get_stream(std::size_t n) { return get_current_device().get_stream(n); }
+
+    void set_stream(std::size_t n) { get_current_device().set_stream(n); }
+
+    void create_events(std::size_t num_of_events)
+    {
+        for(std::size_t i = events.size(); i < num_of_events + 1; ++i)
+            events.emplace_back(create_event());
+    }
+
+    hipEvent_t get_event(std::size_t i) const { return events.at(i).get(); }

    std::vector<argument> literals{};
    void finish() const { gpu_sync(); }

+    static hip_event_ptr create_event()
+    {
+        hipEvent_t event;
+        auto status = hipEventCreateWithFlags(&event, hipEventDisableTiming);
+        if(status != hipSuccess)
+            MIGRAPHX_THROW("Failed to create event");
+        return hip_event_ptr{event};
+    }
+
    private:
    // TODO: Make this a vector to support multiple devices
    std::shared_ptr<hip_device> current_device;
+    std::vector<shared<hip_event_ptr>> events;
 };
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/contiguous.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/contiguous.hpp
 #ifndef MIGRAPHX_GUARD_RTGLIB_CONTIGUOUS_HPP
 #define MIGRAPHX_GUARD_RTGLIB_CONTIGUOUS_HPP

-#include <migraphx/gpu/lowering.hpp>
-#include <migraphx/manage_ptr.hpp>
-#include <migraphx/instruction.hpp>
-#include <migraphx/operators.hpp>
-#include <migraphx/generate.hpp>
-#include <migraphx/shape_for_each.hpp>
-#include <migraphx/config.hpp>
-#include <migraphx/gpu/miopen.hpp>
-#include <migraphx/gpu/hip.hpp>
-#include <migraphx/dfor.hpp>
-#include <migraphx/gpu/device/contiguous.hpp>
-#include <migraphx/gpu/device/add.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/gpu/rocblas.hpp>
-#include <migraphx/gpu/context.hpp>
-#include <utility>
+#include <migraphx/shape.hpp>
+#include <migraphx/op/contiguous.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

+struct context;
+
 struct miopen_contiguous
 {
    op::contiguous op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "gpu::contiguous"; }
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument compute(context&, shape output_shape, const std::vector<argument>& args) const;
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 } // namespace gpu