Commit bf6f82d8 authored by Paul

Merge from develop

parents 6a0797e2 b93f5320
......@@ -63,11 +63,11 @@ bool memory_coloring_impl::allocate(interval_ptr interval)
}
}
long long offset = 0;
std::size_t offset = 0;
while(!conflict_queue.empty())
{
live_range* range = conflict_queue.top();
long long iter_offset = range->offset;
std::size_t iter_offset = range->offset;
if(offset > iter_offset)
{
offset = std::max(offset, iter_offset + range->size);
......@@ -97,7 +97,7 @@ void memory_coloring_impl::build()
if(num_of_instrs == 0)
return;
int cur_points = num_of_instrs * 2;
auto cur_points = num_of_instrs * 2;
instruction_ref iter = p_program->end();
instruction_ref begin = p_program->begin();
std::vector<instruction_ref> dead_instrs;
......@@ -193,13 +193,13 @@ void memory_coloring_impl::rewrite()
continue;
std::size_t offset = 0;
if(interval->get_offset() == invalid_offset)
if(interval->get_offset() != invalid_offset)
{
assert(interval->result.bytes() == 0);
offset = interval->get_offset();
}
else
{
offset = interval->get_offset();
assert(interval->result.bytes() == 0);
}
if(is_allocate(ins))
......@@ -207,15 +207,6 @@ void memory_coloring_impl::rewrite()
p_program->replace_instruction(
ins, op::load{ins->get_shape(), offset}, scratch_param);
}
else if(is_literal(ins))
{
#if 0
auto pre = p_program->add_literal(ins->lit);
bool pre_copy = (interval->get_begin() < earliest_end_point);
p_program->replace_instruction(
ins, write_literal{offset, pre_copy}, scratch_param, pre);
#endif
}
}
}
MIGRAPHX_DEBUG(dump("---After rewrite---"));
......
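The allocate() hunk above is the conflict scan of the memory-coloring pass: the candidate offset starts at zero and gets pushed past every already-placed live range it would overlap. A stripped-down sketch of that first-fit idea, with hypothetical names and a plain sorted vector standing in for the real conflict queue:

    #include <algorithm>
    #include <cstddef>
    #include <vector>

    struct placed_range { std::size_t offset; std::size_t size; };

    // Lowest offset at which `size` bytes fit without overlapping any already-placed
    // range; `placed` is assumed to be sorted by offset.
    std::size_t first_fit_offset(const std::vector<placed_range>& placed, std::size_t size)
    {
        std::size_t offset = 0;
        for(const auto& r : placed)
        {
            if(offset + size <= r.offset)
                break;                                    // the gap before r is big enough
            offset = std::max(offset, r.offset + r.size); // otherwise jump past r
        }
        return offset;
    }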
......@@ -21,15 +21,15 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
static const int invalid_offset = -1;
static const std::size_t invalid_offset = std::numeric_limits<std::size_t>::max();
struct live_range
{
int begin; // begin point in the instruction stream.
int end; // end point in the instruction stream.
long long offset; // offset to base pointer of allocated memory trunk.
int vn; // value number that identifies this live_range.
long long size; // size of required memory in bytes
std::size_t begin; // begin point in the instruction stream.
std::size_t end; // end point in the instruction stream.
std::size_t offset; // offset to base pointer of allocated memory trunk.
std::size_t vn; // value number that identifies this live_range.
std::size_t size; // size of required memory in bytes
#ifdef MIGRAPHX_DEBUG_OPT
void dump();
#endif
......@@ -45,9 +45,9 @@ struct live_interval
is_live_on_entry = false;
}
void add_use(int use) { use_points.push_front(use); }
int get_begin() const { return segment.begin; }
int get_end() const { return segment.end; }
void add_use(std::size_t use) { use_points.push_front(use); }
std::size_t get_begin() const { return segment.begin; }
std::size_t get_end() const { return segment.end; }
long long get_offset() const { return segment.offset; }
#ifdef MIGRAPHX_DEBUG_OPT
......@@ -55,9 +55,9 @@ struct live_interval
#endif
live_range segment;
int id;
std::list<int> use_points;
int def_point;
std::size_t id;
std::list<std::size_t> use_points;
std::size_t def_point;
shape result;
bool is_literal;
bool is_live_on_entry;
......@@ -111,8 +111,8 @@ struct memory_coloring_impl
{
if((range1.size == 0) || (range2.size == 0))
return false;
long long end1 = range1.offset + range1.size - 1;
long long end2 = range2.offset + range2.size - 1;
auto end1 = range1.offset + range1.size - 1;
auto end2 = range2.offset + range2.size - 1;
return ((end1 < range2.offset) || (end2 < range1.offset));
}
void verify();
......@@ -125,8 +125,8 @@ struct memory_coloring_impl
{
bool operator()(const interval_ptr i1, const interval_ptr i2) const
{
int len1 = i1->get_end() - i1->get_begin();
int len2 = i2->get_end() - i2->get_begin();
auto len1 = i1->get_end() - i1->get_begin();
auto len2 = i2->get_end() - i2->get_begin();
if(len1 != len2)
{
return (len1 < len2);
......@@ -158,7 +158,7 @@ struct memory_coloring_impl
int num_of_lives;
int max_value_number;
long long required_bytes;
std::size_t required_bytes;
// The earliest program point where a live interval ends.
int earliest_end_point;
// The latest program point where a live interval ends.
......
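One way to read the sentinel change above: once offsets are std::size_t, a literal -1 would only work by wrapping around, so the new code spells the maximum value out. A minimal standalone illustration (not taken from the tree):

    #include <cstddef>
    #include <limits>

    static const std::size_t invalid_offset = std::numeric_limits<std::size_t>::max();

    // -1 converted to an unsigned type produces the same bit pattern, but naming the
    // limit avoids sign-conversion warnings and makes the "no offset yet" intent explicit.
    static_assert(invalid_offset == static_cast<std::size_t>(-1), "same sentinel value");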
......@@ -63,11 +63,16 @@ static void print_program(const program& p, F print_func)
for(auto ins : iterator_for(p))
{
std::string var_name = "@" + std::to_string(count);
std::string var_name;
if(ins->name() == "@param")
{
var_name = any_cast<builtin::param>(ins->get_operator()).parameter;
}
else
{
var_name = "@" + std::to_string(count);
count++;
}
names.emplace(ins, var_name);
// TODO: Use all_of
......@@ -78,17 +83,77 @@ static void print_program(const program& p, F print_func)
}
print_func(ins, names);
count++;
}
}
program::program() : impl(std::make_unique<program_impl>()) {}
program::program(program&&) noexcept = default;
program& program::operator=(program&&) noexcept = default;
program::~program() noexcept = default;
// copy constructor
program::program(const program& p) { assign(p); }
// copy assignment operator
program& program::operator=(program p)
{
std::swap(p.impl, this->impl);
return *this;
}
void program::assign(const program& p)
{
// clean the current program
if(!impl)
{
impl = std::make_unique<program_impl>();
}
else if(!impl->instructions.empty())
{
impl->instructions.clear();
}
impl->ctx = p.impl->ctx;
std::unordered_map<instruction_ref, instruction_ref> ins_map;
for(auto ins : iterator_for(p))
{
instruction_ref copy_ins{};
if(ins->name() == "@literal")
{
auto l = ins->get_literal();
copy_ins = impl->instructions.insert(impl->instructions.end(), instruction{l});
}
else if(ins->name() == "@param")
{
auto&& name = any_cast<builtin::param>(ins->get_operator()).parameter;
auto s = ins->get_shape();
copy_ins = impl->instructions.insert(impl->instructions.end(),
{builtin::param{name}, std::move(s), {}});
}
else if(ins->name() == "@outline")
{
auto s = ins->get_shape();
copy_ins =
impl->instructions.insert(impl->instructions.end(), {builtin::outline{s}, s, {}});
}
else
{
// retrieve its mapped inputs
auto inputs = ins->inputs();
// ensure all inputs have their corresponding copy instructions
assert(std::all_of(
inputs.begin(), inputs.end(), [&](auto i) { return ins_map.count(i) > 0; }));
std::vector<instruction_ref> copy_inputs(inputs.size());
std::transform(inputs.begin(), inputs.end(), copy_inputs.begin(), [&](auto i) {
return ins_map[i];
});
copy_ins = add_instruction(ins->get_operator(), copy_inputs);
}
ins_map[ins] = copy_ins;
}
}
instruction_ref program::add_instruction(const operation& op, std::vector<instruction_ref> args)
{
return insert_instruction(impl->instructions.end(), op, std::move(args));
......
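The new copy assignment above is the copy-and-swap idiom: the parameter is taken by value, so the copy constructor (which calls assign) does the actual copying, and swapping the impl pointers installs the copy with a strong exception guarantee. A self-contained sketch of the same shape, using hypothetical names (widget, widget_impl):

    #include <memory>
    #include <utility>

    struct widget_impl { int state = 0; };

    struct widget
    {
        std::unique_ptr<widget_impl> impl = std::make_unique<widget_impl>();

        widget() = default;
        widget(const widget& w) : impl(std::make_unique<widget_impl>(*w.impl)) {}

        widget& operator=(widget w)        // by value: the copy is already made on entry
        {
            std::swap(w.impl, this->impl); // steal the freshly built state
            return *this;                  // the old impl is freed with the temporary w
        }
    };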
......@@ -22,22 +22,32 @@ bool skip_propogate(instruction_ref ins)
void propagate_constant::apply(program& p) const
{
for(auto i : iterator_for(p))
{
if(i->name() != "@literal")
continue;
if(i->outputs().empty())
continue;
fix([&](auto self, auto ins) {
if(not skip_propogate(ins))
std::unordered_set<instruction_ref> children(ins->outputs().begin(),
ins->outputs().end());
for(auto child : children)
{
auto r = ins->eval();
if(skip_propogate(child))
{
self(child);
continue;
}
auto r = child->eval();
if(not r.empty())
{
assert(r.get_shape() == ins->get_shape());
assert(r.get_shape() == child->get_shape());
auto l = p.add_literal(r.get_shape(), r.data());
p.replace_instruction(ins, l);
return;
self(p.replace_instruction(child, l));
}
}
std::unordered_set<instruction_ref> children(ins->inputs().begin(), ins->inputs().end());
for(auto child : children)
self(child);
})(std::prev(p.end()));
})(i);
}
}
} // namespace MIGRAPHX_INLINE_NS
......
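The reworked pass now starts from each literal and folds forward through its readers, instead of recursing backwards from the end of the program. A condensed sketch of that forward walk, leaving out the skip_propogate filter and the copy of the reader set that the real code makes before mutating the program:

    // Fold every reader of `lit` that evaluates to a constant, then recurse into the
    // literal that replaces it, so chains of constant ops collapse in one pass.
    // (program and instruction_ref are the MIGraphX types used throughout this diff.)
    void propagate_from(program& p, instruction_ref lit)
    {
        for(auto reader : lit->outputs())
        {
            auto r = reader->eval();      // an empty result means it is not constant-foldable
            if(r.empty())
                continue;
            auto folded = p.add_literal(r.get_shape(), r.data());
            propagate_from(p, p.replace_instruction(reader, folded));
        }
    }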
......@@ -48,6 +48,12 @@ struct cpu_batch_norm_inference
{
op::batch_norm_inference op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::batch_norm_inference"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
......@@ -107,6 +113,12 @@ struct cpu_lrn
{
op::lrn op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::lrn"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
......@@ -117,7 +129,7 @@ struct cpu_lrn
int channels = output_shape.lens()[1];
int height = output_shape.lens()[2];
int width = output_shape.lens()[3];
float alphaoverarea = op.alpha / op.size;
float alphaoverarea = op.alpha / float(op.size);
int radius = (op.size - 1) / 2;
par_dfor(n_batch, height, width)([&](int b, int h, int w) {
......@@ -144,6 +156,12 @@ struct cpu_convolution
{
op::convolution op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::convolution"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
......@@ -165,15 +183,15 @@ struct cpu_convolution
output_shape.lens()[2],
output_shape.lens()[3])(
[&](std::size_t o, std::size_t w, std::size_t i, std::size_t j) {
const int start_x = i * op.stride[0] - op.padding[0];
const int start_y = j * op.stride[1] - op.padding[1];
const int group_id = w / (wei_n / op.group);
const auto start_x = i * op.stride[0] - op.padding[0];
const auto start_y = j * op.stride[1] - op.padding[1];
const auto group_id = w / (wei_n / op.group);
double acc = 0;
dfor(wei_c, wei_h, wei_w)([&](std::size_t k, std::size_t x, std::size_t y) {
const int in_x = start_x + x;
const int in_y = start_y + y;
const int in_ch = group_id * wei_c + k;
const auto in_x = start_x + x;
const auto in_y = start_y + y;
const auto in_ch = group_id * wei_c + k;
if(in_x >= 0 && in_x < in_h && in_y >= 0 && in_y < in_w)
{
acc += input(o, in_ch, in_x, in_y) * weights(w, k, x, y);
......@@ -190,6 +208,12 @@ struct cpu_im2col
{
op::im2col op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
static std::string name() { return "cpu::im2col"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
......@@ -209,10 +233,8 @@ struct cpu_im2col
const std::size_t& stride_h = op.stride[0];
const std::size_t& stride_w = op.stride[1];
int kdiv2_h;
int kdiv2_w;
kdiv2_h = kernel_h / 2;
kdiv2_w = kernel_w / 2;
auto kdiv2_h = kernel_h / 2;
auto kdiv2_w = kernel_w / 2;
// calculate output sizes
const std::size_t col_height = (height - kernel_h + 2 * pad_h) / stride_h + 1;
const std::size_t col_width = (width - kernel_w + 2 * pad_w) / stride_w + 1;
......@@ -230,8 +252,8 @@ struct cpu_im2col
dfor(channels,
kernel_h,
kernel_w)([&](std::size_t c, std::size_t koffset, std::size_t loffset) {
int idx = iinput + koffset - kdiv2_h;
int jdx = jinput + loffset - kdiv2_w;
auto idx = iinput + koffset - kdiv2_h;
auto jdx = jinput + loffset - kdiv2_w;
col(ldx, p) = ((idx >= 0) && (idx < height) && (jdx >= 0) && (jdx < width))
? input(0, c, idx, jdx)
: 0;
......@@ -273,6 +295,12 @@ struct cpu_pooling
{
op::pooling op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::pooling_" + Op::name(); }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
......@@ -317,20 +345,35 @@ struct cpu_pooling
}
};
struct cpu_contiguous
struct cpu_op
{
op::contiguous op;
std::string name() const { return "cpu::contiguous"; }
operation op;
std::string name() const { return "cpu::" + op.name(); }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
argument compute(context&, const shape& output_shape, const std::vector<argument>& args) const
{
return op.compute(output_shape, std::move(args));
return op.compute(output_shape, args);
}
friend bool operator==(const cpu_op& x, const cpu_op& y) { return x.op == y.op; }
friend bool operator==(const cpu_op& x, const operation& y)
{
if(x.name() != y.name())
return false;
return x == any_cast<cpu_op>(y);
}
friend bool operator==(const operation& x, const cpu_op& y) { return y == x; }
};
struct cpu_pad
{
op::pad op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::contiguous"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
......@@ -354,20 +397,15 @@ struct cpu_pad
}
};
struct cpu_concat
{
op::concat op;
std::string name() const { return "cpu::concat"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
return op.compute(output_shape, std::move(args));
}
};
struct cpu_gemm
{
op::dot op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::dot"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
......@@ -410,162 +448,6 @@ struct cpu_gemm
}
};
struct cpu_gather
{
op::gather op;
std::string name() const { return "cpu::gather"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
return op.compute(output_shape, std::move(args));
}
};
struct identity_op
{
std::string name() const { return "cpu::identity"; }
auto fcn() const
{
return [](auto x) { return x; };
}
};
struct abs_op
{
std::string name() const { return "cpu::abs"; }
auto fcn() const
{
return [](auto x) { return std::abs(make_signed(x)); };
}
};
struct exp_op
{
std::string name() const { return "cpu::exp"; }
auto fcn() const
{
return [](auto x) { return std::exp(x); };
}
};
struct log_op
{
std::string name() const { return "cpu::log"; }
auto fcn() const
{
return [](auto x) { return std::log(x); };
}
};
struct sin_op
{
std::string name() const { return "cpu::sin"; }
auto fcn() const
{
return [](auto x) { return std::sin(x); };
}
};
struct cos_op
{
std::string name() const { return "cpu::cos"; }
auto fcn() const
{
return [](auto x) { return std::cos(x); };
}
};
struct tan_op
{
std::string name() const { return "cpu::tan"; }
auto fcn() const
{
return [](auto x) { return std::tan(x); };
}
};
struct asin_op
{
std::string name() const { return "cpu::asin"; }
auto fcn() const
{
return [](auto x) { return std::asin(x); };
}
};
struct acos_op
{
std::string name() const { return "cpu::acos"; }
auto fcn() const
{
return [](auto x) { return std::acos(x); };
}
};
struct atan_op
{
std::string name() const { return "cpu::atan"; }
auto fcn() const
{
return [](auto x) { return std::atan(x); };
}
};
struct sinh_op
{
std::string name() const { return "cpu::sinh"; }
auto fcn() const
{
return [](auto x) { return std::sinh(x); };
}
};
struct cosh_op
{
std::string name() const { return "cpu::cosh"; }
auto fcn() const
{
return [](auto x) { return std::cosh(x); };
}
};
struct tanh_op
{
std::string name() const { return "cpu::tanh"; }
auto fcn() const
{
return [](auto x) { return std::tanh(x); };
}
};
struct sigmoid_op
{
std::string name() const { return "cpu::sigmoid"; }
auto fcn() const
{
return [](auto x) { return 1.f / (1.f + std::exp(-x)); };
}
};
struct neg_op
{
std::string name() const { return "cpu::neg"; }
auto fcn() const
{
return [](auto x) { return -x; };
}
};
struct relu_op
{
std::string name() const { return "cpu::relu"; }
auto fcn() const
{
return [](auto x) { return std::max(decltype(x){0}, x); };
}
};
struct leaky_relu_op
{
op::leaky_relu op;
......@@ -592,14 +474,42 @@ template <typename Op>
struct cpu_unary
{
Op op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op.op, f);
}
std::string name() const { return op.name(); }
shape compute_shape(const std::vector<shape>& inputs) const { return inputs.front(); }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs}.has(1);
auto s = inputs.at(0);
if(s.packed())
{
return s;
}
else
{
return {s.type(), s.lens()};
}
}
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
result.visit([&](auto output) {
args[0].visit([&](auto input) {
if(input.get_shape().standard())
{
std::transform(input.begin(), input.end(), output.begin(), op.fcn());
}
else
{
shape_for_each(output.get_shape(), [&](const auto& idx) {
output(idx.begin(), idx.end()) = op.fcn()(input(idx.begin(), idx.end()));
});
}
});
});
......@@ -622,20 +532,20 @@ struct softmax2d
auto nw = input.get_shape().lens()[3];
dfor(nb, nh, nw)([&](std::size_t b, std::size_t i, std::size_t j) {
value_type cmax = std::numeric_limits<value_type>::lowest();
for(int c = 0; c < nc; c++)
for(std::size_t c = 0; c < nc; c++)
{
cmax = std::max(cmax, input(b, c, i, j));
}
for(int c = 0; c < nc; c++)
for(std::size_t c = 0; c < nc; c++)
{
output(b, c, i, j) = std::exp(input(b, c, i, j) - cmax);
}
value_type sum = value_type(0);
for(int c = 0; c < nc; c++)
for(std::size_t c = 0; c < nc; c++)
{
sum += output(b, c, i, j);
}
for(int c = 0; c < nc; c++)
for(std::size_t c = 0; c < nc; c++)
{
output(b, c, i, j) = output(b, c, i, j) / sum;
}
......@@ -648,6 +558,13 @@ struct softmax2d
struct cpu_logsoftmax
{
op::logsoftmax op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "cpu::logsoftmax"; }
shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
......@@ -714,87 +631,6 @@ struct cpu_logsoftmax
}
};
struct add_op
{
std::string name() const { return "add"; }
auto fcn() const
{
return [](auto x, auto y) { return x + y; };
}
};
struct sub_op
{
std::string name() const { return "sub"; }
auto fcn() const
{
return [](auto x, auto y) { return x - y; };
}
};
struct mul_op
{
std::string name() const { return "mul"; }
auto fcn() const
{
return [](auto x, auto y) { return x * y; };
}
};
struct div_op
{
std::string name() const { return "div"; }
auto fcn() const
{
return [](auto x, auto y) { return x / y; };
}
};
struct max_op
{
std::string name() const { return "max"; }
auto fcn() const
{
return [](auto x, auto y) { return std::max(x, y); };
}
};
struct min_op
{
std::string name() const { return "min"; }
auto fcn() const
{
return [](auto x, auto y) { return std::min(x, y); };
}
};
template <typename Op>
struct cpu_binary
{
Op op;
std::string name() const { return op.name(); }
shape compute_shape(const std::vector<shape>& inputs) const { return inputs.front(); }
argument compute(context&, const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
if(input1.get_shape().packed() and input2.get_shape().packed())
{
std::transform(
input1.begin(), input1.end(), input2.begin(), output.begin(), op.fcn());
}
else
{
shape_for_each(output.get_shape(), [&](const auto& idx) {
output(idx.begin(), idx.end()) =
op.fcn()(input1(idx.begin(), idx.end()), input2(idx.begin(), idx.end()));
});
}
});
return result;
}
};
struct cpu_apply
{
program* prog;
......@@ -814,42 +650,16 @@ struct cpu_apply
void init()
{
apply_map["im2col"] = extend_op<cpu_im2col, op::im2col>();
apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
apply_map["dot"] = extend_op<cpu_gemm, op::dot>();
apply_map["batch_norm_inference"] =
extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
apply_map["dot"] = extend_op<cpu_gemm, op::dot>();
apply_map["elu"] = extend_op<cpu_unary<elu_op>, op::elu>();
apply_map["im2col"] = extend_op<cpu_im2col, op::im2col>();
apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
apply_map["logsoftmax"] = extend_op<cpu_logsoftmax, op::logsoftmax>();
apply_map["lrn"] = extend_op<cpu_lrn, op::lrn>();
apply_map["contiguous"] = extend_op<cpu_contiguous, op::contiguous>();
apply_map["pad"] = extend_op<cpu_pad, op::pad>();
apply_map["concat"] = extend_op<cpu_concat, op::concat>();
apply_map["gather"] = extend_op<cpu_gather, op::gather>();
apply_map["logsoftmax"] = extend_op<cpu_logsoftmax, op::logsoftmax>();
apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
apply_map["elu"] = extend_op<cpu_unary<elu_op>, op::elu>();
apply_map["identity"] = simple_op<cpu_unary<identity_op>>();
apply_map["abs"] = simple_op<cpu_unary<abs_op>>();
apply_map["sinh"] = simple_op<cpu_unary<sinh_op>>();
apply_map["cosh"] = simple_op<cpu_unary<cosh_op>>();
apply_map["tanh"] = simple_op<cpu_unary<tanh_op>>();
apply_map["sigmoid"] = simple_op<cpu_unary<sigmoid_op>>();
apply_map["exp"] = simple_op<cpu_unary<exp_op>>();
apply_map["log"] = simple_op<cpu_unary<log_op>>();
apply_map["neg"] = simple_op<cpu_unary<neg_op>>();
apply_map["sin"] = simple_op<cpu_unary<sin_op>>();
apply_map["cos"] = simple_op<cpu_unary<cos_op>>();
apply_map["tan"] = simple_op<cpu_unary<tan_op>>();
apply_map["asin"] = simple_op<cpu_unary<asin_op>>();
apply_map["acos"] = simple_op<cpu_unary<acos_op>>();
apply_map["atan"] = simple_op<cpu_unary<atan_op>>();
apply_map["relu"] = simple_op<cpu_unary<relu_op>>();
apply_map["add"] = simple_op<cpu_binary<add_op>>();
apply_map["sub"] = simple_op<cpu_binary<sub_op>>();
apply_map["mul"] = simple_op<cpu_binary<mul_op>>();
apply_map["div"] = simple_op<cpu_binary<div_op>>();
apply_map["max"] = simple_op<cpu_binary<max_op>>();
apply_map["min"] = simple_op<cpu_binary<min_op>>();
apply_map["softmax"] = simple_op<softmax2d>();
}
......@@ -866,9 +676,18 @@ struct cpu_apply
{
apply_map.at(it->name())(it);
}
else if(is_context_free(it->get_operator()))
{
apply_cpu_op(it);
}
}
}
void apply_cpu_op(instruction_ref ins)
{
prog->replace_instruction(ins, cpu_op{ins->get_operator()}, ins->inputs());
}
template <class T>
void apply_simple_op(instruction_ref ins)
{
......
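Taken together, the new cpu_op wrapper and the is_context_free fallback in cpu_apply mean that context-free reference operators no longer need a dedicated cpu_* struct each (which is why cpu_contiguous, cpu_concat and cpu_gather disappear above): the lowering can wrap the operator as-is. The fallback is the single call already shown in apply_cpu_op; as a usage sketch, with ins assumed to be an instruction_ref visited during lowering:

    // Lower any context-free reference op by wrapping it, instead of maintaining
    // one hand-written cpu_* struct per operator.
    if(is_context_free(ins->get_operator()))
        prog->replace_instruction(ins, cpu_op{ins->get_operator()}, ins->inputs());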
......@@ -32,6 +32,7 @@ add_library(migraphx_device
device/pad.cpp
device/gather.cpp
device/sub.cpp
device/clip.cpp
)
set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
rocm_clang_tidy_check(migraphx_device)
......@@ -66,6 +67,7 @@ add_library(migraphx_gpu
lrn.cpp
schedule_model.cpp
adjust_allocation.cpp
clip.cpp
)
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
rocm_clang_tidy_check(migraphx_gpu)
......
......@@ -7,7 +7,7 @@ namespace gpu {
shape miopen_abs::compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(2).not_broadcasted();
check_shapes{inputs, *this}.has(2).packed();
return inputs.at(0);
}
......
#include <migraphx/gpu/clip.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/gpu/device/clip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
shape hip_clip::compute_shape(std::vector<shape> inputs) const
{
inputs.pop_back();
return op.compute_shape(inputs);
}
argument hip_clip::compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
device::clip(ctx.get_stream().get(), args.back(), args.front(), op.max_val, op.min_val);
return args.back();
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#include <migraphx/gpu/device/clip.hpp>
#include <migraphx/gpu/device/nary.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void clip(hipStream_t stream,
const argument& result,
const argument& arg1,
const float max,
const float min)
{
nary(stream, result, arg1)(
[max, min](auto x) { return std::min<decltype(x)>(std::max<decltype(x)>(min, x), max); });
}
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -16,7 +16,7 @@ argument gather(hipStream_t stream,
std::vector<migraphx::argument> args,
int axis)
{
int axis_index = (axis < 0) ? (axis + args[0].get_shape().lens().size()) : axis;
auto axis_index = (axis < 0) ? (axis + args[0].get_shape().lens().size()) : axis;
visit_all(args.back(), args[0])([&](auto output, auto input) {
std::size_t nelements = output_shape.elements();
args[1].visit([&](auto indices) {
......
......@@ -162,7 +162,10 @@ struct hip_triadd
device::add(ctx.get_stream().get(), args.at(3), args.at(0), args.at(1), args.at(2));
return args.at(3);
}
int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
struct hip_triadd_relu
......@@ -178,7 +181,10 @@ struct hip_triadd_relu
device::add_relu(ctx.get_stream().get(), args.at(3), args.at(0), args.at(1), args.at(2));
return args.at(3);
}
int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
struct hip_add_relu
......@@ -194,7 +200,10 @@ struct hip_add_relu
device::add_relu(ctx.get_stream().get(), args.at(2), args.at(0), args.at(1));
return args.at(2);
}
int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
struct find_add_relu
......@@ -285,7 +294,10 @@ struct miopen_conv_bias
void finalize(context& ctx, const shape&, const std::vector<shape>&) { f.compile(ctx); }
shape get_workspace(context& ctx) { return f.get_workspace(ctx); }
int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
struct miopen_conv_bias_relu
......@@ -332,7 +344,10 @@ struct miopen_conv_bias_relu
}
void finalize(context& ctx, const shape&, const std::vector<shape>&) { f.compile(ctx); }
shape get_workspace(context& ctx) { return f.get_workspace(ctx); }
int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
template <class... Ms>
......
......@@ -13,11 +13,21 @@ struct context;
struct miopen_abs
{
shared<activation_descriptor> ad;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return gpu::reflect(self.ad.get(), f);
}
std::string name() const { return "gpu::abs"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace gpu
......
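The reflect additions scattered through these hunks all follow one forwarding pattern: a cpu/gpu wrapper that merely holds an inner operator (or a MIOpen descriptor, as in gpu::abs above) exposes that inner state to the reflection visitor, so the wrapper compares, hashes and prints the same way as the operator it wraps. A hedged illustration of the shape of the idiom, with a made-up wrapper name:

    struct gpu_wrapper_example             // hypothetical wrapper, for illustration only
    {
        op::pooling op;                    // the wrapped reference operator

        // Forward reflection to the wrapped op so equality and printing of the
        // wrapper are driven by the op's own fields.
        template <class Self, class F>
        static auto reflect(Self& self, F f)
        {
            return migraphx::reflect(self.op, f);
        }
    };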
......@@ -13,11 +13,21 @@ struct context;
struct miopen_batch_norm_inference
{
op::batch_norm_inference op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "gpu::batch_norm_inference"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace gpu
......
#ifndef MIGRAPHX_GUARD_RTGLIB_CLIP_HPP
#define MIGRAPHX_GUARD_RTGLIB_CLIP_HPP
#include <migraphx/shape.hpp>
#include <migraphx/op/clip.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct context;
struct hip_clip
{
op::clip op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "gpu::clip"; }
shape compute_shape(std::vector<shape> inputs) const;
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
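hip_clip follows the convention visible throughout these GPU headers: the last argument is a pre-allocated output buffer, so compute_shape drops it before asking the wrapped op for a shape, compute writes into args.back(), and output_alias reports that the result aliases that trailing argument. Restated as a pattern (a paraphrase of the code above, not a new API):

    shape compute_shape(std::vector<shape> inputs) const
    {
        inputs.pop_back();                 // the last shape is the output buffer, not a real input
        return op.compute_shape(inputs);
    }

    std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
    {
        return shapes.size() - 1;          // the result shares storage with that trailing argument
    }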
......@@ -14,11 +14,20 @@ struct hip_concat
{
op::concat op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "gpu::concat"; }
shape compute_shape(std::vector<shape> inputs) const;
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace gpu
......
......@@ -13,10 +13,20 @@ struct context;
struct miopen_contiguous
{
op::contiguous op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "gpu::contiguous"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument compute(context&, shape output_shape, const std::vector<argument>& args) const;
int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace gpu
......
......@@ -31,7 +31,10 @@ struct miopen_convolution
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
shape compile(context& ctx, const shape& output_shape, std::vector<shape> inputs);
void finalize(context& ctx, const shape& output_shape, std::vector<shape> inputs);
int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace gpu
......
#ifndef MIGRAPHX_GUARD_RTGLIB_DEVICE_CLIP_HPP
#define MIGRAPHX_GUARD_RTGLIB_DEVICE_CLIP_HPP
#include <migraphx/argument.hpp>
#include <migraphx/config.hpp>
#include <hip/hip_runtime_api.h>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
namespace device {
void clip(hipStream_t stream, const argument& result, const argument& arg1, float max, float min);
} // namespace device
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -13,11 +13,21 @@ struct context;
struct miopen_elu
{
shared<activation_descriptor> ad;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return gpu::reflect(self.ad.get(), f);
}
std::string name() const { return "gpu::elu"; }
shape compute_shape(const std::vector<shape>& inputs) const;
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace gpu
......
......@@ -14,11 +14,21 @@ struct context;
struct hip_gather
{
op::gather op;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return migraphx::reflect(self.op, f);
}
std::string name() const { return "gpu::gather"; }
shape compute_shape(std::vector<shape> inputs) const;
argument
compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
int output_alias(const std::vector<shape>& shapes) const { return shapes.size() - 1; }
std::ptrdiff_t output_alias(const std::vector<shape>& shapes) const
{
return shapes.size() - 1;
}
};
} // namespace gpu
......