Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX...

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into conv_same_padding

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX...
Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into conv_same_padding
dbb87db1 · Khalique · 4614de7c · eeb5bad1 · dbb87db1 · dbb87db1
Commit dbb87db1 authored May 20, 2019 by Khalique
20 changed files
--- a/src/schedule.cpp
+++ b/src/schedule.cpp
 #include <migraphx/schedule.hpp>
 #include <migraphx/program.hpp>
 #include <migraphx/instruction.hpp>
-#include <migraphx/operators.hpp>
+#include <migraphx/op/identity.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/dfor.hpp>
 #include <migraphx/functional.hpp>

--- a/src/simplify_algebra.cpp
+++ b/src/simplify_algebra.cpp
 #include <migraphx/simplify_algebra.hpp>
 #include <migraphx/program.hpp>
-#include <migraphx/operators.hpp>
+#include <migraphx/op/add.hpp>
 #include <migraphx/matcher.hpp>
 #include <migraphx/literal.hpp>


--- a/src/simplify_reshapes.cpp
+++ b/src/simplify_reshapes.cpp
 #include <migraphx/simplify_reshapes.hpp>
 #include <migraphx/program.hpp>
 #include <migraphx/instruction.hpp>
-#include <migraphx/operators.hpp>
+#include <migraphx/op/as_shape.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/ranges.hpp>
 #include <unordered_set>

--- a/src/targets/cpu/gemm.cpp
+++ b/src/targets/cpu/gemm.cpp
@@ -55,7 +55,13 @@ void migemm_impl(tensor_view<T> cmat,
    visit_mat(amat, [&](const auto& a) {
        visit_mat(bmat, [&](const auto& b) {
            auto c = make_mat(cmat);
-            c      = (a * b) * alpha + beta * c;
+            c      = beta * c;
+            // This is a simple optimization to avoid
+            // compute A * B if alpha is 0.0
+            if(alpha != 0.0)
+            {
+                c = c + alpha * a * b;
+            }
        });
    });
 }
@@ -95,8 +101,8 @@ void migemm_impl(
 {
    auto lens = amat.get_shape().lens();
    bool batch_mul =
-        std::accumulate(lens.begin(), lens.end(), std::size_t{1}, std::multiplies<std::size_t>()) ==
-        (*lens.rbegin()) * (*(lens.rbegin() + 1));
+        std::accumulate(
+            lens.rbegin() + 2, lens.rend(), std::size_t{1}, std::multiplies<std::size_t>()) == 1;
    if(batch_mul)
    {
        migemm_impl(cmat, amat, bmat, alpha, beta, is_fast_gemm_type<T>{});

--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -48,6 +48,12 @@ struct cpu_batch_norm_inference
 {
    op::batch_norm_inference op;

+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "cpu::batch_norm_inference"; }

    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
@@ -107,6 +113,12 @@ struct cpu_lrn
 {
    op::lrn op;

+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "cpu::lrn"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, shape output_shape, std::vector<argument> args) const
@@ -117,7 +129,7 @@ struct cpu_lrn
            int channels        = output_shape.lens()[1];
            int height          = output_shape.lens()[2];
            int width           = output_shape.lens()[3];
-            float alphaoverarea = op.alpha / op.size;
+            float alphaoverarea = op.alpha / float(op.size);
            int radius          = (op.size - 1) / 2;

            par_dfor(n_batch, height, width)([&](int b, int h, int w) {
@@ -144,6 +156,12 @@ struct cpu_convolution
 {
    op::convolution op;

+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "cpu::convolution"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, shape output_shape, std::vector<argument> args) const
@@ -165,15 +183,15 @@ struct cpu_convolution
                     output_shape.lens()[2],
                     output_shape.lens()[3])(
                [&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
-                    const int start_x  = i * op.stride[0] - op.padding[0];
-                    const int start_y  = j * op.stride[1] - op.padding[1];
-                    const int group_id = w / (wei_n / op.group);
+                    const auto start_x  = i * op.stride[0] - op.padding[0];
+                    const auto start_y  = j * op.stride[1] - op.padding[1];
+                    const auto group_id = w / (wei_n / op.group);

                    double acc = 0;
                    dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
-                        const int in_x  = start_x + x;
-                        const int in_y  = start_y + y;
-                        const int in_ch = group_id * wei_c + k;
+                        const auto in_x  = start_x + x;
+                        const auto in_y  = start_y + y;
+                        const auto in_ch = group_id * wei_c + k;
                        if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
                        {
                            acc += input(o, in_ch, in_x, in_y) * weights(w, k, x, y);
@@ -190,6 +208,12 @@ struct cpu_im2col
 {
    op::im2col op;

+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    static std::string name() { return "cpu::im2col"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }

@@ -209,10 +233,8 @@ struct cpu_im2col
            const std::size_t& stride_h = op.stride[0];
            const std::size_t& stride_w = op.stride[1];

-            int kdiv2_h;
-            int kdiv2_w;
-            kdiv2_h = kernel_h / 2;
-            kdiv2_w = kernel_w / 2;
+            auto kdiv2_h = kernel_h / 2;
+            auto kdiv2_w = kernel_w / 2;
            // calculate output sizes
            const std::size_t col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1;
            const std::size_t col_width  = (width - kernel_w + 2 * pad_w) / stride_w + 1;
@@ -230,8 +252,8 @@ struct cpu_im2col
                    dfor(channels,
                         kernel_h,
                         kernel_w)([&](std::size_t c, std::size_t koffset, std::size_t loffset) {
-                        int idx     = iinput + koffset - kdiv2_h;
-                        int jdx     = jinput + loffset - kdiv2_w;
+                        auto idx    = iinput + koffset - kdiv2_h;
+                        auto jdx    = jinput + loffset - kdiv2_w;
                        col(ldx, p) = ((idx >= 0) && (idx < height) && (jdx >= 0) && (jdx < width))
                                          ? input(0, c, idx, jdx)
                                          : 0;
@@ -273,6 +295,12 @@ struct cpu_pooling
 {
    op::pooling op;

+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "cpu::pooling_" + Op::name(); }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
@@ -317,20 +345,35 @@ struct cpu_pooling
    }
 };

-struct cpu_contiguous
+struct cpu_op
 {
-    op::contiguous op;
-    std::string name() const { return "cpu::contiguous"; }
+    operation op;
+    std::string name() const { return "cpu::" + op.name(); }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
+    argument compute(context&, const shape& output_shape, const std::vector<argument>& args) const
+    {
+        return op.compute(output_shape, args);
+    }
+    friend bool operator==(const cpu_op& x, const cpu_op& y) { return x.op == y.op; }
+    friend bool operator==(const cpu_op& x, const operation& y)
    {
-        return op.compute(output_shape, std::move(args));
+        if(x.name() != y.name())
+            return false;
+        return x == any_cast<cpu_op>(y);
    }
+    friend bool operator==(const operation& x, const cpu_op& y) { return y == x; }
 };

 struct cpu_pad
 {
    op::pad op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "cpu::contiguous"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
@@ -354,184 +397,54 @@ struct cpu_pad
    }
 };

-struct cpu_concat
-{
-    op::concat op;
-    std::string name() const { return "cpu::concat"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        return op.compute(output_shape, std::move(args));
-    }
-};
-
 struct cpu_gemm
 {
    op::dot op;
-    std::string name() const { return "cpu::dot"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }

-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
    {
-        argument result{output_shape};
-        migemm(result, args[0], args[1], op.alpha, op.beta);
-        return result;
+        return migraphx::reflect(self.op, f);
    }
-};
-
-struct cpu_gather
-{
-    op::gather op;
-    std::string name() const { return "cpu::gather"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        return op.compute(output_shape, std::move(args));
-    }
-};
-
-struct identity_op
-{
-    std::string name() const { return "cpu::identity"; }
-    auto fcn() const
-    {
-        return [](auto x) { return x; };
-    }
-};
-
-struct abs_op
-{
-    std::string name() const { return "cpu::abs"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::abs(make_signed(x)); };
-    }
-};
-
-struct exp_op
-{
-    std::string name() const { return "cpu::exp"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::exp(x); };
-    }
-};
-
-struct log_op
-{
-    std::string name() const { return "cpu::log"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::log(x); };
-    }
-};
-
-struct sin_op
-{
-    std::string name() const { return "cpu::sin"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::sin(x); };
-    }
-};
-
-struct cos_op
-{
-    std::string name() const { return "cpu::cos"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::cos(x); };
-    }
-};
-
-struct tan_op
-{
-    std::string name() const { return "cpu::tan"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::tan(x); };
-    }
-};
-
-struct asin_op
-{
-    std::string name() const { return "cpu::asin"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::asin(x); };
-    }
-};
-
-struct acos_op
-{
-    std::string name() const { return "cpu::acos"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::acos(x); };
-    }
-};
-
-struct atan_op
-{
-    std::string name() const { return "cpu::atan"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::atan(x); };
-    }
-};
-
-struct sinh_op
-{
-    std::string name() const { return "cpu::sinh"; }
-    auto fcn() const
+    std::string name() const { return "cpu::dot"; }
+    shape compute_shape(const std::vector<shape>& inputs) const
    {
-        return [](auto x) { return std::sinh(x); };
+        if(inputs.size() == 3)
+        {
+            auto c_shape = inputs.at(2);
+            check_shapes{{c_shape}}.not_broadcasted();
+        }
+        return op.compute_shape(inputs);
    }
-};

-struct cosh_op
-{
-    std::string name() const { return "cpu::cosh"; }
-    auto fcn() const
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
-        return [](auto x) { return std::cosh(x); };
-    }
-};
+        argument result{output_shape};
+        // 3 inputs, it is alpha * A * B + beta * C, then
+        // A and B are matrics, and C is broadcastable to A * B
+        if(args.size() == 3)
+        {
+            // no need to consider the value of args[2]
+            if(op.beta == 0.0f)
+            {
+                result.visit([&](auto output) { std::fill(output.begin(), output.end(), 0); });
+            }
+            else
+            {
+                visit_all(result, args[2])([&](auto output, auto input) {
+                    std::copy(input.begin(), input.end(), output.begin());
+                });
+            }

-struct tanh_op
-{
-    std::string name() const { return "cpu::tanh"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::tanh(x); };
-    }
-};
+            migemm(result, args[0], args[1], op.alpha, op.beta);

-struct sigmoid_op
-{
-    std::string name() const { return "cpu::sigmoid"; }
-    auto fcn() const
-    {
-        return [](auto x) { return 1.f / (1.f + std::exp(-x)); };
-    }
-};
+            return result;
+        }

-struct neg_op
-{
-    std::string name() const { return "cpu::neg"; }
-    auto fcn() const
-    {
-        return [](auto x) { return -x; };
-    }
-};
+        // 2 input arguments
+        migemm(result, args[0], args[1], op.alpha, 0.0f);

-struct relu_op
-{
-    std::string name() const { return "cpu::relu"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::max(decltype(x){0}, x); };
+        return result;
    }
 };

@@ -561,16 +474,45 @@ template <typename Op>
 struct cpu_unary
 {
    Op op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op.op, f);
+    }
    std::string name() const { return op.name(); }
-    shape compute_shape(const std::vector<shape>& inputs) const { return inputs.front(); }
+    shape compute_shape(const std::vector<shape>& inputs) const
+    {
+        check_shapes{inputs}.has(1);
+        auto s = inputs.at(0);
+        if(s.packed())
+        {
+            return s;
+        }
+        else
+        {
+            return {s.type(), s.lens()};
+        }
+    }
+
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
        result.visit([&](auto output) {
            args[0].visit([&](auto input) {
-                std::transform(input.begin(), input.end(), output.begin(), op.fcn());
+                if(input.get_shape().standard())
+                {
+                    std::transform(input.begin(), input.end(), output.begin(), op.fcn());
+                }
+                else
+                {
+                    shape_for_each(output.get_shape(), [&](const auto& idx) {
+                        output(idx.begin(), idx.end()) = op.fcn()(input(idx.begin(), idx.end()));
+                    });
+                }
            });
        });
+
        return result;
    }
 };
@@ -590,20 +532,20 @@ struct softmax2d
            auto nw          = input.get_shape().lens()[3];
            dfor(nb, nh, nw)([&](std::size_t b, std::size_t i, std::size_t j) {
                value_type cmax = std::numeric_limits<value_type>::lowest();
-                for(int c = 0; c < nc; c++)
+                for(std::size_t c = 0; c < nc; c++)
                {
                    cmax = std::max(cmax, input(b, c, i, j));
                }
-                for(int c = 0; c < nc; c++)
+                for(std::size_t c = 0; c < nc; c++)
                {
                    output(b, c, i, j) = std::exp(input(b, c, i, j) - cmax);
                }
                value_type sum = value_type(0);
-                for(int c = 0; c < nc; c++)
+                for(std::size_t c = 0; c < nc; c++)
                {
                    sum += output(b, c, i, j);
                }
-                for(int c = 0; c < nc; c++)
+                for(std::size_t c = 0; c < nc; c++)
                {
                    output(b, c, i, j) = output(b, c, i, j) / sum;
                }
@@ -616,6 +558,13 @@ struct softmax2d
 struct cpu_logsoftmax
 {
    op::logsoftmax op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "cpu::logsoftmax"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }

@@ -682,87 +631,6 @@ struct cpu_logsoftmax
    }
 };

-struct add_op
-{
-    std::string name() const { return "add"; }
-    auto fcn() const
-    {
-        return [](auto x, auto y) { return x + y; };
-    }
-};
-
-struct sub_op
-{
-    std::string name() const { return "sub"; }
-    auto fcn() const
-    {
-        return [](auto x, auto y) { return x - y; };
-    }
-};
-
-struct mul_op
-{
-    std::string name() const { return "mul"; }
-    auto fcn() const
-    {
-        return [](auto x, auto y) { return x * y; };
-    }
-};
-
-struct div_op
-{
-    std::string name() const { return "div"; }
-    auto fcn() const
-    {
-        return [](auto x, auto y) { return x / y; };
-    }
-};
-
-struct max_op
-{
-    std::string name() const { return "max"; }
-    auto fcn() const
-    {
-        return [](auto x, auto y) { return std::max(x, y); };
-    }
-};
-
-struct min_op
-{
-    std::string name() const { return "min"; }
-    auto fcn() const
-    {
-        return [](auto x, auto y) { return std::min(x, y); };
-    }
-};
-
-template <typename Op>
-struct cpu_binary
-{
-    Op op;
-    std::string name() const { return op.name(); }
-    shape compute_shape(const std::vector<shape>& inputs) const { return inputs.front(); }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
-            if(input1.get_shape().packed() and input2.get_shape().packed())
-            {
-                std::transform(
-                    input1.begin(), input1.end(), input2.begin(), output.begin(), op.fcn());
-            }
-            else
-            {
-                shape_for_each(output.get_shape(), [&](const auto& idx) {
-                    output(idx.begin(), idx.end()) =
-                        op.fcn()(input1(idx.begin(), idx.end()), input2(idx.begin(), idx.end()));
-                });
-            }
-        });
-        return result;
-    }
-};
-
 struct cpu_apply
 {
    program* prog;
@@ -782,43 +650,17 @@ struct cpu_apply

    void init()
    {
-        apply_map["im2col"]      = extend_op<cpu_im2col, op::im2col>();
-        apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
-        apply_map["dot"]         = extend_op<cpu_gemm, op::dot>();
        apply_map["batch_norm_inference"] =
            extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
-        apply_map["lrn"]        = extend_op<cpu_lrn, op::lrn>();
-        apply_map["contiguous"] = extend_op<cpu_contiguous, op::contiguous>();
-        apply_map["pad"]        = extend_op<cpu_pad, op::pad>();
-        apply_map["concat"]     = extend_op<cpu_concat, op::concat>();
-        apply_map["gather"]     = extend_op<cpu_gather, op::gather>();
-        apply_map["logsoftmax"] = extend_op<cpu_logsoftmax, op::logsoftmax>();
-        apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
-        apply_map["elu"]        = extend_op<cpu_unary<elu_op>, op::elu>();
-        apply_map["identity"]   = simple_op<cpu_unary<identity_op>>();
-        apply_map["abs"]        = simple_op<cpu_unary<abs_op>>();
-        apply_map["sinh"]       = simple_op<cpu_unary<sinh_op>>();
-        apply_map["cosh"]       = simple_op<cpu_unary<cosh_op>>();
-        apply_map["tanh"]       = simple_op<cpu_unary<tanh_op>>();
-        apply_map["sigmoid"]    = simple_op<cpu_unary<sigmoid_op>>();
-        apply_map["exp"]        = simple_op<cpu_unary<exp_op>>();
-        apply_map["log"]        = simple_op<cpu_unary<log_op>>();
-        apply_map["neg"]        = simple_op<cpu_unary<neg_op>>();
-        apply_map["sin"]        = simple_op<cpu_unary<sin_op>>();
-        apply_map["cos"]        = simple_op<cpu_unary<cos_op>>();
-        apply_map["tan"]        = simple_op<cpu_unary<tan_op>>();
-        apply_map["asin"]       = simple_op<cpu_unary<asin_op>>();
-        apply_map["acos"]       = simple_op<cpu_unary<acos_op>>();
-        apply_map["atan"]       = simple_op<cpu_unary<atan_op>>();
-        apply_map["relu"]       = simple_op<cpu_unary<relu_op>>();
-        apply_map["add"]        = simple_op<cpu_binary<add_op>>();
-        apply_map["sub"]        = simple_op<cpu_binary<sub_op>>();
-        apply_map["mul"]        = simple_op<cpu_binary<mul_op>>();
-        apply_map["div"]        = simple_op<cpu_binary<div_op>>();
-        apply_map["max"]        = simple_op<cpu_binary<max_op>>();
-        apply_map["min"]        = simple_op<cpu_binary<min_op>>();
-
-        apply_map["softmax"] = simple_op<softmax2d>();
+        apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
+        apply_map["dot"]         = extend_op<cpu_gemm, op::dot>();
+        apply_map["elu"]         = extend_op<cpu_unary<elu_op>, op::elu>();
+        apply_map["im2col"]      = extend_op<cpu_im2col, op::im2col>();
+        apply_map["leaky_relu"]  = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
+        apply_map["logsoftmax"]  = extend_op<cpu_logsoftmax, op::logsoftmax>();
+        apply_map["lrn"]         = extend_op<cpu_lrn, op::lrn>();
+        apply_map["pad"]         = extend_op<cpu_pad, op::pad>();
+        apply_map["softmax"]     = simple_op<softmax2d>();
    }

    void apply()
@@ -834,9 +676,18 @@ struct cpu_apply
            {
                apply_map.at(it->name())(it);
            }
+            else if(is_context_free(it->get_operator()))
+            {
+                apply_cpu_op(it);
+            }
        }
    }

+    void apply_cpu_op(instruction_ref ins)
+    {
+        prog->replace_instruction(ins, cpu_op{ins->get_operator()}, ins->inputs());
+    }
+
    template <class T>
    void apply_simple_op(instruction_ref ins)
    {

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -32,6 +32,7 @@ add_library(migraphx_device
    device/pad.cpp
    device/gather.cpp
    device/sub.cpp
+    device/clip.cpp
 )
 set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
 rocm_clang_tidy_check(migraphx_device)
@@ -65,6 +66,8 @@ add_library(migraphx_gpu
    gather.cpp
    lrn.cpp
    schedule_model.cpp
+    adjust_allocation.cpp
+    clip.cpp
 )
 set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
 rocm_clang_tidy_check(migraphx_gpu)

--- a/src/targets/gpu/abs.cpp
+++ b/src/targets/gpu/abs.cpp
@@ -7,8 +7,8 @@ namespace gpu {

 shape miopen_abs::compute_shape(const std::vector<shape>& inputs) const
 {
-    check_shapes{inputs, *this}.has(2).not_broadcasted();
-    return inputs.at(1);
+    check_shapes{inputs, *this}.has(2).packed();
+    return inputs.at(0);
 }

 argument miopen_abs::compute(context& ctx,

--- a/src/targets/gpu/adjust_allocation.cpp
+++ b/src/targets/gpu/adjust_allocation.cpp
+#include <migraphx/gpu/adjust_allocation.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/program.hpp>
+#include <migraphx/iterator_for.hpp>
+#include <algorithm>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+void adjust_allocation::apply(program& p) const
+{
+    for(auto ins : iterator_for(p))
+    {
+        // skip instruction with no input
+        if(ins->inputs().empty())
+            continue;
+
+        if(ins->name() == "load")
+            continue;
+
+        auto alias_ins = instruction::get_output_alias(ins, true);
+        if(alias_ins->name() == "hip::allocate")
+        {
+            // shape allocated is different from actual shape
+            // of the instruction, reallocate and replace the previous one
+            if(alias_ins->get_shape() != ins->get_shape())
+            {
+                auto alloc_ins = p.insert_instruction(ins, hip_allocate{ins->get_shape()});
+                p.replace_instruction(alias_ins, alloc_ins);
+            }
+        }
+    }
+}
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/clip.cpp
+++ b/src/targets/gpu/clip.cpp
+#include <migraphx/gpu/clip.hpp>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/device/clip.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+shape hip_clip::compute_shape(std::vector<shape> inputs) const
+{
+    inputs.pop_back();
+    return op.compute_shape(inputs);
+}
+
+argument hip_clip::compute(context& ctx, const shape&, const std::vector<argument>& args) const
+{
+    device::clip(ctx.get_stream().get(), args.back(), args.front(), op.max_val, op.min_val);
+    return args.back();
+}
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/clip.cpp
+++ b/src/targets/gpu/device/clip.cpp
+#include <migraphx/gpu/device/clip.hpp>
+#include <migraphx/gpu/device/nary.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+namespace device {
+
+void clip(hipStream_t stream,
+          const argument& result,
+          const argument& arg1,
+          const float max,
+          const float min)
+{
+    nary(stream, result, arg1)(
+        [max, min](auto x) { return std::min<decltype(x)>(std::max<decltype(x)>(min, x), max); });
+}
+
+} // namespace device
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/device/gather.cpp
+++ b/src/targets/gpu/device/gather.cpp
@@ -16,7 +16,7 @@ argument gather(hipStream_t stream,
                std::vector<migraphx::argument> args,
                int axis)
 {
-    int axis_index = (axis < 0) ? (axis + args[0].get_shape().lens().size()) : axis;
+    auto axis_index = (axis < 0) ? (axis + args[0].get_shape().lens().size()) : axis;
    visit_all(args.back(), args[0])([&](auto output, auto input) {
        std::size_t nelements = output_shape.elements();
        args[1].visit([&](auto indices) {

--- a/src/targets/gpu/eliminate_workspace.cpp
+++ b/src/targets/gpu/eliminate_workspace.cpp
@@ -2,7 +2,6 @@
 #include <migraphx/gpu/hip.hpp>
 #include <migraphx/program.hpp>
 #include <migraphx/instruction.hpp>
-#include <migraphx/operators.hpp>
 #include <migraphx/iterator_for.hpp>
 #include <migraphx/ranges.hpp>
 #include <migraphx/stringutils.hpp>

--- a/src/targets/gpu/fuse_ops.cpp
+++ b/src/targets/gpu/fuse_ops.cpp
@@ -162,7 +162,10 @@ struct hip_triadd
        device::add(ctx.get_stream().get(), args.at(3), args.at(0), args.at(1), args.at(2));
        return args.at(3);
    }
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 struct hip_triadd_relu
@@ -178,7 +181,10 @@ struct hip_triadd_relu
        device::add_relu(ctx.get_stream().get(), args.at(3), args.at(0), args.at(1), args.at(2));
        return args.at(3);
    }
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 struct hip_add_relu
@@ -194,7 +200,10 @@ struct hip_add_relu
        device::add_relu(ctx.get_stream().get(), args.at(2), args.at(0), args.at(1));
        return args.at(2);
    }
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 struct find_add_relu
@@ -285,7 +294,10 @@ struct miopen_conv_bias

    void finalize(context& ctx, const shape&, const std::vector<shape>&) { f.compile(ctx); }
    shape get_workspace(context& ctx) { return f.get_workspace(ctx); }
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 struct miopen_conv_bias_relu
@@ -332,7 +344,10 @@ struct miopen_conv_bias_relu
    }
    void finalize(context& ctx, const shape&, const std::vector<shape>&) { f.compile(ctx); }
    shape get_workspace(context& ctx) { return f.get_workspace(ctx); }
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 template <class... Ms>

--- a/src/targets/gpu/gemm.cpp
+++ b/src/targets/gpu/gemm.cpp
 #include <migraphx/gpu/gemm.hpp>
 #include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/device/add.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {

 template <class... Ts>
-void generic_rocblas_batched_gemm(shape::as<float>, Ts&&... xs)
+rocblas_status generic_rocblas_scal(shape::as<float>, Ts&&... xs)
 {
-    rocblas_sgemm_strided_batched(std::forward<Ts>(xs)...);
+    return rocblas_sscal(std::forward<Ts>(xs)...);
 }

 template <class... Ts>
-void generic_rocblas_batched_gemm(shape::as<double>, Ts&&... xs)
+rocblas_status generic_rocblas_scal(shape::as<double>, Ts&&... xs)
 {
-    rocblas_dgemm_strided_batched(std::forward<Ts>(xs)...);
+    return rocblas_dscal(std::forward<Ts>(xs)...);
+}
+
+template <class T, class... Ts>
+rocblas_status generic_rocblas_scal(shape::as<T>, Ts&&...)
+{
+    MIGRAPHX_THROW("GENERIC_ROCBLAS_SCAL: type unsupported by rocblas");
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_axpy(shape::as<half>, Ts&&... xs)
+{
+    return rocblas_haxpy(std::forward<Ts>(xs)...);
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_axpy(shape::as<float>, Ts&&... xs)
+{
+    return rocblas_saxpy(std::forward<Ts>(xs)...);
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_axpy(shape::as<double>, Ts&&... xs)
+{
+    return rocblas_daxpy(std::forward<Ts>(xs)...);
+}
+
+template <class T, class... Ts>
+rocblas_status generic_rocblas_axpy(shape::as<T>, Ts&&...)
+{
+    MIGRAPHX_THROW("GENERIC_ROCBLAS_AXPY: type unsupported by rocblas");
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_dot(shape::as<float>, Ts&&... xs)
+{
+    return rocblas_sdot(std::forward<Ts>(xs)...);
 }

 template <class... Ts>
-void generic_rocblas_batched_gemm(shape::as<half>, Ts&&... xs)
+rocblas_status generic_rocblas_dot(shape::as<double>, Ts&&... xs)
 {
-    rocblas_hgemm_strided_batched(std::forward<Ts>(xs)...);
+    return rocblas_ddot(std::forward<Ts>(xs)...);
 }

 template <class T, class... Ts>
-void generic_rocblas_batched_gemm(shape::as<T>, Ts&&...)
+rocblas_status generic_rocblas_dot(shape::as<T>, Ts&&...)
+{
+    MIGRAPHX_THROW("GENERIC_ROCBLAS_DOT: type unsupported by rocblas");
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_gemv(shape::as<float>, Ts&&... xs)
+{
+    return rocblas_sgemv(std::forward<Ts>(xs)...);
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_gemv(shape::as<double>, Ts&&... xs)
+{
+    return rocblas_dgemv(std::forward<Ts>(xs)...);
+}
+
+template <class T, class... Ts>
+rocblas_status generic_rocblas_gemv(shape::as<T>, Ts&&...)
+{
+    MIGRAPHX_THROW("GENERIC_ROCBLAS_GEMMV: type unsupported by rocblas");
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_batched_gemm(shape::as<float>, Ts&&... xs)
+{
+    return rocblas_sgemm_strided_batched(std::forward<Ts>(xs)...);
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_batched_gemm(shape::as<double>, Ts&&... xs)
+{
+    return rocblas_dgemm_strided_batched(std::forward<Ts>(xs)...);
+}
+
+template <class... Ts>
+rocblas_status generic_rocblas_batched_gemm(shape::as<half>, Ts&&... xs)
+{
+    return rocblas_hgemm_strided_batched(std::forward<Ts>(xs)...);
+}
+
+template <class T, class... Ts>
+rocblas_status generic_rocblas_batched_gemm(shape::as<T>, Ts&&...)
 {
    MIGRAPHX_THROW("GENERIC_ROCBLAS_BATCHED_GEMM: type unsupported by rocblas");
 }

 template <class... Ts>
-void generic_rocblas_gemm(shape::as<float>, Ts&&... xs)
+rocblas_status generic_rocblas_gemm(shape::as<float>, Ts&&... xs)
 {
-    rocblas_sgemm(std::forward<Ts>(xs)...);
+    return rocblas_sgemm(std::forward<Ts>(xs)...);
 }

 template <class... Ts>
-void generic_rocblas_gemm(shape::as<double>, Ts&&... xs)
+rocblas_status generic_rocblas_gemm(shape::as<double>, Ts&&... xs)
 {
-    rocblas_dgemm(std::forward<Ts>(xs)...);
+    return rocblas_dgemm(std::forward<Ts>(xs)...);
 }

 template <class... Ts>
-void generic_rocblas_gemm(shape::as<half>, Ts&&... xs)
+rocblas_status generic_rocblas_gemm(shape::as<half>, Ts&&... xs)
 {
-    rocblas_hgemm(std::forward<Ts>(xs)...);
+    return rocblas_hgemm(std::forward<Ts>(xs)...);
 }

 template <class T, class... Ts>
-void generic_rocblas_gemm(shape::as<T>, Ts&&...)
+rocblas_status generic_rocblas_gemm(shape::as<T>, Ts&&...)
 {
    MIGRAPHX_THROW("GENERIC_ROCBLAS_GEMM: type unsupported by rocblas");
 }
@@ -90,56 +169,94 @@ rocblas_half to_rocblas_type(half x) { return reinterpret_cast<const rocblas_hal

 shape miopen_gemm::compute_shape(const std::vector<shape>& inputs) const
 {
-    check_shapes{inputs, *this}.has(3);
-    return op.compute_shape({inputs.at(0), inputs.at(1)});
+    std::vector<shape> input_shapes(inputs.begin(), inputs.begin() + inputs.size() - 1);
+    check_shapes{input_shapes}.not_broadcasted();
+    return op.compute_shape(input_shapes);
 }
+
 argument miopen_gemm::compute(context& ctx,
                              const shape& output_shape,
                              const std::vector<argument>& args) const
 {
-    float alpha        = 1.0f;
-    float beta         = 0.0f;
-    bool transa        = args[0].get_shape().transposed();
-    bool transb        = args[1].get_shape().transposed();
-    std::size_t n_dims = args[0].get_shape().lens().size();
-    std::size_t dim_0  = n_dims - 2;
-    std::size_t dim_1  = n_dims - 1;
-    rocblas_int lda    = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
-    rocblas_int ldb    = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
-    rocblas_int ldc    = args[2].get_shape().strides()[dim_0];
-    auto out_lens      = output_shape.lens();
-    rocblas_int m      = out_lens[dim_0];
-    rocblas_int n      = out_lens[dim_1];
-    rocblas_int k      = args[0].get_shape().lens()[dim_1];
-    auto batch_num     = std::accumulate(
-        out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
+    bool is_3inputs = (args.size() == 4);
+    float beta      = 0.0f;
+    if(is_3inputs)
+    {
+        output_shape.visit_type([&](auto as) {
+            auto to_pointer = [&](auto&& arg) { return to_rocblas_type(as.from(arg.data())); };
+            hipMemcpyAsync(to_pointer(args[3]),
+                           to_pointer(args[2]),
+                           output_shape.bytes(),
+                           hipMemcpyDeviceToDevice,
+                           ctx.get_stream().get());
+        });
+        beta = op.beta;
+    }
+
+    auto a_lens = args[0].get_shape().lens();
+    auto b_lens = args[1].get_shape().lens();
    output_shape.visit_type([&](auto as) {
-        auto alpha_r    = to_rocblas_type(as(alpha));
-        auto beta_r     = to_rocblas_type(as(beta));
+        auto n_dim        = output_shape.lens().size();
+        auto dim_1        = n_dim - 1;
+        auto dim_0        = n_dim - 2;
+        auto alpha_r      = to_rocblas_type(as(op.alpha));
+        auto beta_r       = to_rocblas_type(as(beta));
+        bool transa       = args[0].get_shape().transposed();
+        bool transb       = args[1].get_shape().transposed();
+        rocblas_int lda   = args[0].get_shape().strides()[transa ? dim_1 : dim_0];
+        rocblas_int ldb   = args[1].get_shape().strides()[transb ? dim_1 : dim_0];
+        rocblas_int ldc   = args[2].get_shape().strides()[dim_0];
+        auto out_lens     = output_shape.lens();
+        rocblas_int m     = out_lens[dim_0];
+        rocblas_int n     = out_lens[dim_1];
+        rocblas_int k     = args[0].get_shape().lens()[dim_1];
+        auto num_matrices = std::accumulate(
+            out_lens.rbegin() + 2, out_lens.rend(), std::size_t{1}, std::multiplies<std::size_t>());
        auto to_pointer = [&](auto&& arg) { return to_rocblas_type(as.from(arg.data())); };
-        generic_rocblas_batched_gemm(as,
-                                     ctx.get_stream().get_rocblas(),
-                                     transb ? rocblas_operation_transpose : rocblas_operation_none,
-                                     transa ? rocblas_operation_transpose : rocblas_operation_none,
-                                     n,
-                                     m,
-                                     k,
-                                     &alpha_r,
-                                     to_pointer(args[1]),
-                                     ldb,
-                                     k * n,
-                                     to_pointer(args[0]),
-                                     lda,
-                                     m * k,
-                                     &beta_r,
-                                     to_pointer(args[2]),
-                                     ldc,
-                                     m * n,
-                                     batch_num);
-
+        if(num_matrices == 1)
+        {
+            generic_rocblas_gemm(as,
+                                 ctx.get_stream().get_rocblas(),
+                                 transb ? rocblas_operation_transpose : rocblas_operation_none,
+                                 transa ? rocblas_operation_transpose : rocblas_operation_none,
+                                 n,
+                                 m,
+                                 k,
+                                 &alpha_r,
+                                 to_pointer(args[1]),
+                                 ldb,
+                                 to_pointer(args[0]),
+                                 lda,
+                                 &beta_r,
+                                 (is_3inputs ? to_pointer(args[3]) : to_pointer(args[2])),
+                                 ldc);
+        }
+        else
+        {
+            generic_rocblas_batched_gemm(
+                as,
+                ctx.get_stream().get_rocblas(),
+                transb ? rocblas_operation_transpose : rocblas_operation_none,
+                transa ? rocblas_operation_transpose : rocblas_operation_none,
+                n,
+                m,
+                k,
+                &alpha_r,
+                to_pointer(args[1]),
+                ldb,
+                k * n,
+                to_pointer(args[0]),
+                lda,
+                m * k,
+                &beta_r,
+                (is_3inputs ? to_pointer(args[3]) : to_pointer(args[2])),
+                ldc,
+                m * n,
+                num_matrices);
+        }
    });

-    return args[2];
+    return (is_3inputs ? args[3] : args[2]);
 }

 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/abs.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/abs.hpp
@@ -13,11 +13,21 @@ struct context;
 struct miopen_abs
 {
    shared<activation_descriptor> ad;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return gpu::reflect(self.ad.get(), f);
+    }
+
    std::string name() const { return "gpu::abs"; }
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/adjust_allocation.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/adjust_allocation.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_ADJUST_ALLOCATION_HPP
+#define MIGRAPHX_GUARD_RTGLIB_ADJUST_ALLOCATION_HPP
+
+#include <migraphx/program.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/gpu/context.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+namespace gpu {
+
+struct adjust_allocation
+{
+    std::string name() const { return "gpu::adjust_allocation"; }
+    void apply(program& p) const;
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/batchnorm.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/batchnorm.hpp
@@ -2,7 +2,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_BATCHNORM_HPP

 #include <migraphx/shape.hpp>
-#include <migraphx/operators.hpp>
+#include <migraphx/op/batch_norm.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -13,11 +13,21 @@ struct context;
 struct miopen_batch_norm_inference
 {
    op::batch_norm_inference op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "gpu::batch_norm_inference"; }
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/clip.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/clip.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_CLIP_HPP
+#define MIGRAPHX_GUARD_RTGLIB_CLIP_HPP
+
+#include <migraphx/shape.hpp>
+#include <migraphx/op/clip.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+struct context;
+
+struct hip_clip
+{
+    op::clip op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
+    std::string name() const { return "gpu::clip"; }
+    shape compute_shape(std::vector<shape> inputs) const;
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/concat.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/concat.hpp
@@ -2,7 +2,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_CONCAT_HPP

 #include <migraphx/shape.hpp>
-#include <migraphx/operators.hpp>
+#include <migraphx/op/concat.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -14,11 +14,20 @@ struct hip_concat
 {
    op::concat op;

+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "gpu::concat"; }
    shape compute_shape(std::vector<shape> inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 } // namespace gpu

--- a/src/targets/gpu/include/migraphx/gpu/contiguous.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/contiguous.hpp
@@ -2,7 +2,7 @@
 #define MIGRAPHX_GUARD_RTGLIB_CONTIGUOUS_HPP

 #include <migraphx/shape.hpp>
-#include <migraphx/operators.hpp>
+#include <migraphx/op/contiguous.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -13,10 +13,20 @@ struct context;
 struct miopen_contiguous
 {
    op::contiguous op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "gpu::contiguous"; }
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument compute(context&, shape output_shape, const std::vector<argument>& args) const;
-    int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
+    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
+    {
+        return shapes.size() - 1;
+    }
 };

 } // namespace gpu