Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into rnn_optimization

b8090620 · Shucai Xiao · c2db3b96 · 3540f1b9 · b8090620 · b8090620
Commit b8090620 authored Jun 10, 2019 by Shucai Xiao
20 changed files
--- a/src/include/migraphx/op/unsqueeze.hpp
+++ b/src/include/migraphx/op/unsqueeze.hpp
@@ -29,6 +29,7 @@ struct unsqueeze
    std::string name() const { return "unsqueeze"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
+        check_shapes{inputs, *this}.has(1).standard_or_scalar();
        auto input_shape = inputs[0];
        auto type        = input_shape.type();
        auto old_lens    = input_shape.lens();

--- a/src/include/migraphx/operation.hpp
+++ b/src/include/migraphx/operation.hpp
@@ -69,7 +69,7 @@ auto operator<<(std::ostream& os, const T& x) -> decltype(os << x.name())
 {
    os << x.name();
    char delim = '[';
-    reflect_each(x, [&](auto& y, auto name) {
+    reflect_each(x, [&](auto&& y, auto name) {
        os << delim;
        os << name << "=";
        stream_write_value(os, y);
@@ -87,6 +87,8 @@ namespace operation_equal {
 template <class T, class U>
 auto operator==(const T& x, const U& y) -> decltype(x.name() == y.name())
 {
+    static_assert(is_reflectable<T>{} or sizeof(T) <= 1,
+                  "Missing equality operator or reflect method.");
    if(x.name() != y.name())
        return false;
    const auto& yy = any_cast<T>(y);

--- a/src/include/migraphx/operators.hpp
+++ b/src/include/migraphx/operators.hpp
@@ -11,9 +11,11 @@
 #include <migraphx/op/batch_norm.hpp>
 #include <migraphx/op/binary.hpp>
 #include <migraphx/op/broadcast.hpp>
+#include <migraphx/op/clip.hpp>
 #include <migraphx/op/common.hpp>
 #include <migraphx/op/concat.hpp>
 #include <migraphx/op/contiguous.hpp>
+#include <migraphx/op/convert.hpp>
 #include <migraphx/op/convolution.hpp>
 #include <migraphx/op/cosh.hpp>
 #include <migraphx/op/cos.hpp>

--- a/src/include/migraphx/pad_calc.hpp
+++ b/src/include/migraphx/pad_calc.hpp
+#ifndef MIGRAPHX_GUARD_OPERATORS_PAD_CALC_HPP
+#define MIGRAPHX_GUARD_OPERATORS_PAD_CALC_HPP
+
+#include <utility>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+/// Returns (dilation * (weight_dim - 1)) / 2 using unsigned integer division.
+///
+/// NOTE(review): presumably the per-side padding for a dilated kernel whose
+/// extent along one dimension is weight_dim -- confirm against the callers.
+///
+/// @param weight_dim extent of the weight along the dimension
+/// @param dilation   dilation factor along the same dimension
+/// @return the computed padding; 0 when weight_dim is 0
+inline std::size_t calculate_padding(std::size_t weight_dim, std::size_t dilation)
+{
+    // weight_dim is unsigned, so `weight_dim - 1` would wrap around to a huge
+    // value for a zero-sized weight; report "no padding" instead.
+    if(weight_dim == 0)
+        return 0;
+    return (dilation * (weight_dim - 1)) / 2;
+}
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/include/migraphx/quantization.hpp
+++ b/src/include/migraphx/quantization.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_QUANTIZATION_HPP
+#define MIGRAPHX_GUARD_RTGLIB_QUANTIZATION_HPP
+
+#include <string>
+#include <vector>
+#include <migraphx/instruction_ref.hpp>
+#include <migraphx/operation.hpp>
+#include <migraphx/config.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+struct program;
+
+/// Rewrites prog so that every instruction whose name appears in ins_names
+/// receives its float/double inputs through a convert-to-half instruction.
+/// The special name "all" selects every instruction.
+void quantize(program& prog, const std::vector<std::string>& ins_names);
+/// Equivalent to quantize(prog, {"all"}).
+void quantize(program& prog);
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/include/migraphx/raw_data.hpp
+++ b/src/include/migraphx/raw_data.hpp
@@ -27,7 +27,8 @@ struct raw_data : raw_data_base
    template <class Stream>
    friend Stream& operator<<(Stream& os, const Derived& d)
    {
-        d.visit([&](auto x) { os << x; });
+        if(not d.empty())
+            d.visit([&](auto x) { os << x; });
        return os;
    }

@@ -40,8 +41,11 @@ struct raw_data : raw_data_base
    template <class Visitor>
    void visit_at(Visitor v, std::size_t n = 0) const
    {
-        auto&& s      = static_cast<const Derived&>(*this).get_shape();
-        auto&& buffer = static_cast<const Derived&>(*this).data();
+        auto&& derived = static_cast<const Derived&>(*this);
+        if(derived.empty())
+            MIGRAPHX_THROW("Visiting empty data!");
+        auto&& s      = derived.get_shape();
+        auto&& buffer = derived.data();
        s.visit_type([&](auto as) { v(*(as.from(buffer) + s.index(n))); });
    }

@@ -55,8 +59,11 @@ struct raw_data : raw_data_base
    template <class Visitor>
    void visit(Visitor v) const
    {
-        auto&& s      = static_cast<const Derived&>(*this).get_shape();
-        auto&& buffer = static_cast<const Derived&>(*this).data();
+        auto&& derived = static_cast<const Derived&>(*this);
+        if(derived.empty())
+            MIGRAPHX_THROW("Visiting empty data!");
+        auto&& s      = derived.get_shape();
+        auto&& buffer = derived.data();
        s.visit_type([&](auto as) { v(make_view(s, as.from(buffer))); });
    }


--- a/src/include/migraphx/reflect.hpp
+++ b/src/include/migraphx/reflect.hpp
@@ -11,6 +11,15 @@ inline namespace MIGRAPHX_INLINE_NS {

 namespace detail {

+struct reflect_placeholder // stand-in selector passed to T::reflect when probing reflectability
+{
+    template <class... Ts>
+    int operator()(Ts&&...) const // accepts any arguments and ignores them
+    {
+        return 0;
+    }
+};
+
 template <class T, class Selector>
 auto reflect_impl(rank<1>, T& x, Selector f) -> decltype(T::reflect(x, f))
 {
@@ -23,8 +32,53 @@ auto reflect_impl(rank<0>, T&, Selector)
    return pack();
 }

+template <class T>
+auto reflectable_impl(rank<1>, T&& x)
+    -> decltype(T::reflect(x, reflect_placeholder{}), std::true_type{});
+
+template <class T>
+auto reflectable_impl(rank<0>, T &&) -> decltype(std::false_type{});
+
+template <class T>
+struct remove_rvalue_reference
+{
+    using type = T;
+};
+
+template <class T>
+struct remove_rvalue_reference<T&&>
+{
+    using type = T;
+};
+
+template <class T>
+struct wrapper
+{
+    using type = typename remove_rvalue_reference<T>::type;
+    type data;
+    type get() const { return data; }
+};
+
+template <class T>
+wrapper<T> wrap(std::remove_reference_t<T>& x)
+{
+    return wrapper<T>{std::forward<T>(x)};
+}
+
+template <class... Ts>
+using auto_tuple_t = std::tuple<typename remove_rvalue_reference<Ts>::type...>;
+
+template <class... Ts>
+auto_tuple_t<Ts...> auto_tuple(Ts&&... xs)
+{
+    return auto_tuple_t<Ts...>{std::forward<Ts>(xs)...};
+}
+
 } // namespace detail

+template <class T>
+using is_reflectable = decltype(detail::reflectable_impl(rank<1>{}, std::declval<T>()));
+
 template <class T, class Selector>
 auto reflect(T& x, Selector f)
 {
@@ -34,17 +88,18 @@ auto reflect(T& x, Selector f)
 template <class T>
 auto reflect_tie(T& x)
 {
-    return reflect(x, [](auto&& y, auto&&...) { return std::ref(y); })(
-        [](auto&&... xs) { return std::tie(xs.get()...); });
+    return reflect(x, [](auto&& y, auto&&...) { return detail::wrap<decltype(y)>(y); })(
+        [](auto&&... xs) { return detail::auto_tuple(xs.get()...); });
 }

 template <class T, class F>
 void reflect_each(T& x, F f)
 {
-    return reflect(x, [](auto&& y, auto... ys) { return pack(std::ref(y), ys...); })(
-        [&](auto&&... xs) {
-            each_args([&](auto p) { p([&](auto&& y, auto... ys) { f(y.get(), ys...); }); }, xs...);
-        });
+    return reflect(x, [](auto&& y, auto... ys) {
+        return pack(detail::wrap<decltype(y)>(y), ys...);
+    })([&](auto&&... xs) {
+        each_args([&](auto p) { p([&](auto&& y, auto... ys) { f(y.get(), ys...); }); }, xs...);
+    });
 }

 } // namespace MIGRAPHX_INLINE_NS

--- a/src/include/migraphx/tensor_view.hpp
+++ b/src/include/migraphx/tensor_view.hpp
@@ -12,6 +12,14 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

+template <class T>
+T as_number(T x) // generic overload: hands the value back unchanged
+{
+    return x;
+}
+inline int32_t as_number(int8_t x) { return static_cast<int32_t>(x); } // widen so a stream prints a number, not a character
+inline uint32_t as_number(uint8_t x) { return static_cast<uint32_t>(x); } // same widening for unsigned 8-bit values
+
 template <class T>
 struct tensor_view
 {
@@ -130,10 +138,10 @@ struct tensor_view
    {
        if(!x.empty())
        {
-            os << x.front();
+            os << as_number(x.front());
            for(std::size_t i = 1; i < x.m_shape.elements(); i++)
            {
-                os << ", " << x.m_data[x.m_shape.index(i)];
+                os << ", " << as_number(x.m_data[x.m_shape.index(i)]);
            }
        }
        return os;

--- a/src/instruction.cpp
+++ b/src/instruction.cpp
@@ -28,6 +28,12 @@ void instruction::replace(const shape& r)
    }
 }

+void instruction::replace(operation o) // swaps in a new operator; the arguments are left as they are
+{
+    op = std::move(o);
+    recompute_shape(); // the result shape is recomputed from the new op and the current arguments
+}
+
 void instruction::recompute_shape() { replace(compute_shape(op, arguments)); }

 void instruction::clear_arguments()

--- a/src/onnx/onnx.cpp
+++ b/src/onnx/onnx.cpp
@@ -63,6 +63,7 @@ struct onnx_parser
        add_variadic_op("Max", op::max{});
        add_variadic_op("Min", op::min{});

+        add_mem_op("Clip", &onnx_parser::parse_clip);
        add_mem_op("LRN", &onnx_parser::parse_lrn);
        add_mem_op("ImageScaler", &onnx_parser::parse_imagescaler);
        add_mem_op("LeakyRelu", &onnx_parser::parse_leaky_relu);
@@ -225,6 +226,22 @@ struct onnx_parser
        });
    }

+    instruction_ref parse_clip(const std::string&, // first parameter is not used
+                               const attribute_map& attributes,
+                               std::vector<instruction_ref> args)
+    {
+        op::clip op; // a bound that is not overridden below keeps op::clip's default
+        if(contains(attributes, "max"))
+        {
+            op.max_val = parse_value(attributes.at("max")).at<float>(); // "max" attribute read as float
+        }
+        if(contains(attributes, "min"))
+        {
+            op.min_val = parse_value(attributes.at("min")).at<float>(); // "min" attribute read as float
+        }
+        return prog.add_instruction(op, std::move(args)); // all incoming args are forwarded to the clip instruction
+    }
+
    instruction_ref
    parse_softmax(const std::string&, const attribute_map&, std::vector<instruction_ref> args)
    {

--- a/src/opt/memory_coloring_impl.cpp
+++ b/src/opt/memory_coloring_impl.cpp
@@ -177,7 +177,7 @@ void memory_coloring_impl::build()
 void memory_coloring_impl::rewrite()
 {
    std::vector<std::size_t> dims;
-    dims.push_back(required_bytes / sizeof(float));
+    dims.push_back((required_bytes + sizeof(float) - 1) / sizeof(float));
    shape s                       = {shape::float_type, dims};
    instruction_ref scratch_param = p_program->add_parameter("scratch", s);
    for(auto ins : iterator_for(*p_program))

--- a/src/program.cpp
+++ b/src/program.cpp
@@ -63,11 +63,16 @@ static void print_program(const program& p, F print_func)

    for(auto ins : iterator_for(p))
    {
-        std::string var_name = "@" + std::to_string(count);
+        std::string var_name;
        if(ins->name() == "@param")
        {
            var_name = any_cast<builtin::param>(ins->get_operator()).parameter;
        }
+        else
+        {
+            var_name = "@" + std::to_string(count);
+            count++;
+        }
        names.emplace(ins, var_name);

        // TODO: Use all_of
@@ -78,8 +83,6 @@ static void print_program(const program& p, F print_func)
        }

        print_func(ins, names);
-
-        count++;
    }
 }

@@ -434,13 +437,20 @@ argument program::eval(std::unordered_map<std::string, argument> params) const
 #else
    auto check_context = [](auto f) { return f(); };
 #endif
-    if(enabled(MIGRAPHX_TRACE_EVAL{}))
+
+    auto trace_level = value_of(MIGRAPHX_TRACE_EVAL{});
+
+    if(trace_level > 0)
    {
        return generic_eval(*this, ctx, std::move(params), [&](auto& ins, auto f) {
            ctx.finish();
            std::cout << "Run instruction: ";
            this->debug_print(ins);
-            return check_context(f);
+            auto result = check_context(f); // keep the result so it can be printed below
+            ctx.finish();
+            if(trace_level > 1 and ins->name().front() != '@' and ins->name() != "load") // level 2+: also print results
+                std::cout << "Output: " << result << std::endl;
+            return result;
        });
    }
    else

--- a/src/py/CMakeLists.txt
+++ b/src/py/CMakeLists.txt
@@ -12,12 +12,7 @@ if(MIGRAPHX_ENABLE_PYTHON)
        C_VISIBILITY_PRESET hidden
        CXX_VISIBILITY_PRESET hidden
    )
-    if(MIGRAPHX_ENABLE_TF)
-        target_link_libraries(migraphx_py PRIVATE migraphx migraphx_tf migraphx_cpu)
-        target_compile_definitions(migraphx_py PRIVATE -DENABLE_TF)
-    else()
-        target_link_libraries(migraphx_py PRIVATE migraphx migraphx_onnx migraphx_cpu)
-    endif()
+    target_link_libraries(migraphx_py PRIVATE migraphx migraphx_tf migraphx_onnx migraphx_cpu)
    if(MIGRAPHX_ENABLE_GPU)
        target_link_libraries(migraphx_py PRIVATE migraphx_gpu)
        target_compile_definitions(migraphx_py PRIVATE -DHAVE_GPU)

--- a/src/py/migraphx_py.cpp
+++ b/src/py/migraphx_py.cpp
@@ -2,14 +2,12 @@
 #include <pybind11/pybind11.h>
 #include <pybind11/stl.h>
 #include <migraphx/program.hpp>
+#include <migraphx/quantization.hpp>
 #include <migraphx/generate.hpp>
 #include <migraphx/cpu/target.hpp>
 #include <migraphx/stringutils.hpp>
-#ifdef ENABLE_TF
 #include <migraphx/tf.hpp>
-#else
 #include <migraphx/onnx.hpp>
-#endif

 #ifdef HAVE_GPU
 #include <migraphx/gpu/target.hpp>
@@ -160,16 +158,13 @@ PYBIND11_MODULE(migraphx, m)
        .def("__ne__", std::not_equal_to<migraphx::program>{})
        .def("__repr__", [](const migraphx::program& p) { return migraphx::to_string(p); });

-#ifdef ENABLE_TF
    m.def("parse_tf",
          &migraphx::parse_tf,
          "Parse tf protobuf (default format is nhwc)",
          py::arg("filename"),
          py::arg("is_nhwc") = true);
-#else
    m.def("parse_onnx", &migraphx::parse_onnx);

-#endif
    m.def("get_target", [](const std::string& name) -> migraphx::target {
        if(name == "cpu")
            return migraphx::cpu::target{};
@@ -181,6 +176,10 @@ PYBIND11_MODULE(migraphx, m)
    });

    m.def("generate_argument", &migraphx::generate_argument, py::arg("s"), py::arg("seed") = 0);
+    m.def("quantize", [](migraphx::program& p, std::vector<std::string>& ins_names) {
+        migraphx::quantize(p, ins_names);
+    });
+    m.def("quantize", [](migraphx::program& p) { migraphx::quantize(p, {"all"}); });

 #ifdef HAVE_GPU
    m.def("allocate_gpu", &migraphx::gpu::allocate_gpu, py::arg("s"), py::arg("host") = false);

--- a/src/quantization.cpp
+++ b/src/quantization.cpp
+#include <migraphx/quantization.hpp>
+#include <migraphx/program.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/iterator_for.hpp>
+#include <migraphx/op/convert.hpp>
+#include <migraphx/stringutils.hpp>
+#include <migraphx/ranges.hpp>
+#include <utility>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+// Returns the instruction that converts `ins` to `type`. The first request
+// for a given `ins` inserts an op::convert right after it and records it in
+// `map_fp16`; later requests hand back the recorded instruction.
+instruction_ref insert_fp16(program& prog,
+                            instruction_ref& ins,
+                            shape::type_t type,
+                            std::unordered_map<instruction_ref, instruction_ref>& map_fp16)
+{
+    // A conversion created earlier for this instruction is reused as-is.
+    auto cached = map_fp16.find(ins);
+    if(cached != map_fp16.end())
+        return cached->second;
+
+    // Only fp32/fp64 producers are expected to reach this point.
+    assert(ins->get_shape().type() == shape::float_type ||
+           ins->get_shape().type() == shape::double_type);
+
+    auto converted = prog.insert_instruction(std::next(ins), op::convert{type}, ins);
+    map_fp16[ins]  = converted;
+    return converted;
+}
+
+// Walks every instruction of prog and, for those selected by ins_names,
+// reroutes float/double inputs through a convert-to-half instruction. When
+// the recomputed result type differs from the original one, a convert back
+// to the original type is added after the instruction.
+//
+// prog      - program that is rewritten in place
+// ins_names - names of the instructions to process; "all" selects everything
+void quantize(program& prog, const std::vector<std::string>& ins_names)
+{
+    // Remembers the convert created for each producer so it is inserted once.
+    std::unordered_map<instruction_ref, instruction_ref> map_fp16;
+    // NOTE(review): instructions are inserted into prog while this loop is
+    // running -- presumably iterator_for tolerates that; confirm.
+    for(auto ins : iterator_for(prog))
+    {
+        // all indicates every instruction is converted
+        if((not contains(ins_names, "all")) and (not contains(ins_names, ins->name())))
+        {
+            continue;
+        }
+
+        // Captured before any rewriting so the result can be converted back.
+        shape::type_t orig_type = ins->get_shape().type();
+        // process all inputs, if input is a fp32 or fp64, convert it
+        // to a fp16 by adding a convert operator.
+        auto inputs = ins->inputs();
+        std::vector<instruction_ref> converted_inputs;
+        for(auto input : inputs)
+        {
+            auto s = input->get_shape();
+            if(s.type() == shape::float_type || s.type() == shape::double_type)
+            {
+                // if the input is a convert operator, uses its input
+                // as its current input
+                // NOTE(review): the type of that upstream input is not
+                // checked here -- confirm it is always the half type.
+                instruction_ref input_fp16{};
+                if(input->name() == "convert")
+                {
+                    input_fp16 = input->inputs().front();
+                }
+                else
+                {
+                    input_fp16 = insert_fp16(prog, input, shape::half_type, map_fp16);
+                }
+                converted_inputs.push_back(input_fp16);
+            }
+            else
+            {
+                // Inputs of any other type are passed through untouched.
+                converted_inputs.push_back(input);
+            }
+        }
+
+        // no change for the input, go to the next instruction
+        if(inputs == converted_inputs)
+        {
+            continue;
+        }
+
+        auto op        = ins->get_operator();
+        auto ins_shape = compute_shape(op, converted_inputs);
+        if(ins_shape.type() != orig_type)
+        {
+            // insert another convert instruction to convert it back
+            if(ins == std::prev(prog.end()))
+            {
+                // ins is the last instruction: the convert is appended at the end.
+                prog.add_instruction(op::convert{orig_type}, ins);
+            }
+            else
+            {
+                // check the dead code case to avoid assert
+                // (the outputs are sampled before the new convert is inserted)
+                bool output_empty = ins->outputs().empty();
+                auto ins_orig_type =
+                    prog.insert_instruction(std::next(ins), op::convert{orig_type}, ins);
+                if(!output_empty)
+                {
+                    prog.replace_instruction(ins, ins_orig_type);
+                }
+            }
+        }
+
+        // Finally rebuild ins with the same operator on the converted inputs.
+        prog.replace_instruction(ins, op, converted_inputs);
+    }
+}
+
+// Convenience overload: processes every instruction by passing the "all" selector.
+void quantize(program& prog) { quantize(prog, {"all"}); }
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/simplify_reshapes.cpp
+++ b/src/simplify_reshapes.cpp
@@ -14,7 +14,9 @@ bool is_reshaper(instruction_ref ins)
    // clang-format off
    static const std::unordered_set<std::string> names = {
        "reshape",
-        "contiguous"
+        "contiguous",
+        "squeeze",
+        "unsqueeze"
    };
    // clang-format on
    return contains(names, ins->name());
@@ -45,6 +47,9 @@ void simplify_reshapes::apply(program& p) const
    auto end = std::prev(p.end());
    for(auto ins : iterator_for(p))
    {
+        if(ins == end and ins->name() == "contiguous")
+            continue;
+        // Skip possible dead instructions
        if(ins->outputs().empty() and ins != end)
            continue;
        if(is_reshaper(ins))
@@ -94,13 +99,6 @@ void simplify_reshapes::apply(program& p) const
            p.replace_instruction(ins, t->inputs().front());
        }
    }
-    // Replace all reshapes with as_shape
-    for(auto ins : iterator_for(p))
-    {
-        if(ins->name() != "reshape")
-            continue;
-        p.replace_instruction(ins, op::as_shape{ins->get_shape()}, ins->inputs());
-    }
 }

 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/cpu/lowering.cpp
+++ b/src/targets/cpu/lowering.cpp
@@ -48,6 +48,12 @@ struct cpu_batch_norm_inference
 {
    op::batch_norm_inference op;

+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "cpu::batch_norm_inference"; }

    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
@@ -107,6 +113,12 @@ struct cpu_lrn
 {
    op::lrn op;

+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "cpu::lrn"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, shape output_shape, std::vector<argument> args) const
@@ -144,6 +156,12 @@ struct cpu_convolution
 {
    op::convolution op;

+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "cpu::convolution"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, shape output_shape, std::vector<argument> args) const
@@ -190,6 +208,12 @@ struct cpu_im2col
 {
    op::im2col op;

+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    static std::string name() { return "cpu::im2col"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }

@@ -271,6 +295,12 @@ struct cpu_pooling
 {
    op::pooling op;

+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "cpu::pooling_" + Op::name(); }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
@@ -315,20 +345,35 @@ struct cpu_pooling
    }
 };

-struct cpu_contiguous
+struct cpu_op
 {
-    op::contiguous op;
-    std::string name() const { return "cpu::contiguous"; }
+    operation op;
+    std::string name() const { return "cpu::" + op.name(); }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
+    argument compute(context&, const shape& output_shape, const std::vector<argument>& args) const
+    {
+        return op.compute(output_shape, args);
+    }
+    friend bool operator==(const cpu_op& x, const cpu_op& y) { return x.op == y.op; }
+    friend bool operator==(const cpu_op& x, const operation& y)
    {
-        return op.compute(output_shape, std::move(args));
+        if(x.name() != y.name())
+            return false;
+        return x == any_cast<cpu_op>(y);
    }
+    friend bool operator==(const operation& x, const cpu_op& y) { return y == x; }
 };

 struct cpu_pad
 {
    op::pad op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
+
    std::string name() const { return "cpu::contiguous"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
@@ -352,20 +397,15 @@ struct cpu_pad
    }
 };

-struct cpu_concat
-{
-    op::concat op;
-    std::string name() const { return "cpu::concat"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        return op.compute(output_shape, std::move(args));
-    }
-};
-
 struct cpu_gemm
 {
    op::dot op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op, f);
+    }
    std::string name() const { return "cpu::dot"; }
    shape compute_shape(const std::vector<shape>& inputs) const
    {
@@ -452,162 +492,6 @@ struct cpu_gemm
    }
 };

-struct cpu_gather
-{
-    op::gather op;
-    std::string name() const { return "cpu::gather"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
-
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        return op.compute(output_shape, std::move(args));
-    }
-};
-
-struct identity_op
-{
-    std::string name() const { return "cpu::identity"; }
-    auto fcn() const
-    {
-        return [](auto x) { return x; };
-    }
-};
-
-struct abs_op
-{
-    std::string name() const { return "cpu::abs"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::abs(make_signed(x)); };
-    }
-};
-
-struct exp_op
-{
-    std::string name() const { return "cpu::exp"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::exp(x); };
-    }
-};
-
-struct log_op
-{
-    std::string name() const { return "cpu::log"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::log(x); };
-    }
-};
-
-struct sin_op
-{
-    std::string name() const { return "cpu::sin"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::sin(x); };
-    }
-};
-
-struct cos_op
-{
-    std::string name() const { return "cpu::cos"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::cos(x); };
-    }
-};
-
-struct tan_op
-{
-    std::string name() const { return "cpu::tan"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::tan(x); };
-    }
-};
-
-struct asin_op
-{
-    std::string name() const { return "cpu::asin"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::asin(x); };
-    }
-};
-
-struct acos_op
-{
-    std::string name() const { return "cpu::acos"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::acos(x); };
-    }
-};
-
-struct atan_op
-{
-    std::string name() const { return "cpu::atan"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::atan(x); };
-    }
-};
-
-struct sinh_op
-{
-    std::string name() const { return "cpu::sinh"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::sinh(x); };
-    }
-};
-
-struct cosh_op
-{
-    std::string name() const { return "cpu::cosh"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::cosh(x); };
-    }
-};
-
-struct tanh_op
-{
-    std::string name() const { return "cpu::tanh"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::tanh(x); };
-    }
-};
-
-struct sigmoid_op
-{
-    std::string name() const { return "cpu::sigmoid"; }
-    auto fcn() const
-    {
-        return [](auto x) { return 1.f / (1.f + std::exp(-x)); };
-    }
-};
-
-struct neg_op
-{
-    std::string name() const { return "cpu::neg"; }
-    auto fcn() const
-    {
-        return [](auto x) { return -x; };
-    }
-};
-
-struct relu_op
-{
-    std::string name() const { return "cpu::relu"; }
-    auto fcn() const
-    {
-        return [](auto x) { return std::max(decltype(x){0}, x); };
-    }
-};
-
 struct leaky_relu_op
 {
    op::leaky_relu op;
@@ -634,6 +518,12 @@ template <typename Op>
 struct cpu_unary
 {
    Op op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return migraphx::reflect(self.op.op, f);
+    }
    std::string name() const { return op.name(); }
    shape compute_shape(const std::vector<shape>& inputs) const
    {
@@ -671,78 +561,33 @@ struct cpu_unary
    }
 };

-struct softmax2d
+struct cpu_softmax
 {
-    std::string name() const { return "cpu::softmax2d"; }
-    shape compute_shape(const std::vector<shape>& inputs) const { return inputs.front(); }
-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
+    op::softmax op;
+
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
    {
-        argument result{output_shape};
-        visit_all(result, args[0])([&](auto output, auto input) {
-            using value_type = typename decltype(input)::value_type;
-            auto nb          = input.get_shape().lens()[0];
-            auto nc          = input.get_shape().lens()[1];
-            auto nh          = input.get_shape().lens()[2];
-            auto nw          = input.get_shape().lens()[3];
-            dfor(nb, nh, nw)([&](std::size_t b, std::size_t i, std::size_t j) {
-                value_type cmax = std::numeric_limits<value_type>::lowest();
-                for(std::size_t c = 0; c < nc; c++)
-                {
-                    cmax = std::max(cmax, input(b, c, i, j));
-                }
-                for(std::size_t c = 0; c < nc; c++)
-                {
-                    output(b, c, i, j) = std::exp(input(b, c, i, j) - cmax);
-                }
-                value_type sum = value_type(0);
-                for(std::size_t c = 0; c < nc; c++)
-                {
-                    sum += output(b, c, i, j);
-                }
-                for(std::size_t c = 0; c < nc; c++)
-                {
-                    output(b, c, i, j) = output(b, c, i, j) / sum;
-                }
-            });
-        });
-        return result;
+        return migraphx::reflect(self.op, f);
    }
-};

-struct cpu_logsoftmax
-{
-    op::logsoftmax op;
-    std::string name() const { return "cpu::logsoftmax"; }
+    std::string name() const { return "cpu::softmax"; }
    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }

    template <typename T>
-    std::size_t compute_batch_index(const T& idx, shape& batch_shape, int axis) const
+    std::size_t compute_batch_index(T idx, const shape& batch_shape, int axis) const
    {
-        if(axis == 0)
-        {
-            return 0;
-        }
-        else
-        {
-            std::vector<std::size_t> batch_idx(idx.begin(), idx.begin() + axis);
-            return batch_shape.index(batch_idx.begin(), batch_idx.end());
-        }
+        idx[axis] = 0; // idx is a by-value copy; zeroing the axis maps all elements along it to one slot
+        return batch_shape.index(idx); // batch_shape is only read, hence const (matches cpu_logsoftmax)
    }

    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
        argument result{output_shape};
-        auto lens = output_shape.lens();
-        std::vector<std::size_t> batch_lens{};
-        if(op.axis == 0)
-        {
-            batch_lens.push_back(1);
-        }
-        else
-        {
-            batch_lens.insert(batch_lens.begin(), lens.begin(), lens.begin() + op.axis);
-        }
-        shape batch_shape{migraphx::shape::uint32_type, batch_lens};
+        auto batch_lens     = output_shape.lens();
+        batch_lens[op.axis] = 1;
+        shape batch_shape{shape::int32_type, batch_lens};
+
        visit_all(result, args[0])([&](auto output, auto input) {
            using value_type = typename decltype(input)::value_type;
            std::vector<value_type> batch_max(batch_shape.elements(),
@@ -754,23 +599,19 @@ struct cpu_logsoftmax

            shape_for_each(output_shape, [&](auto idx) {
                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                output(idx.begin(), idx.end()) = input(idx.begin(), idx.end()) - batch_max[index];
+                output(idx.begin(), idx.end()) =
+                    std::exp(input(idx.begin(), idx.end()) - batch_max[index]);
            });

            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
            shape_for_each(output_shape, [&](auto idx) {
                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                batch_sum[index] += std::exp(output(idx.begin(), idx.end()));
+                batch_sum[index] += output(idx.begin(), idx.end());
            });

-            for(std::size_t i = 0; i < batch_sum.size(); ++i)
-            {
-                batch_sum[i] = std::log(batch_sum[i]);
-            }
-
            shape_for_each(output_shape, [&](auto idx) {
                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
-                output(idx.begin(), idx.end()) -= batch_sum[index];
+                output(idx.begin(), idx.end()) /= batch_sum[index];
            });
        });

@@ -778,98 +619,62 @@ struct cpu_logsoftmax
    }
 };

-struct add_op
+struct cpu_logsoftmax
 {
-    std::string name() const { return "add"; }
-    auto fcn() const
-    {
-        return [](auto x, auto y) { return x + y; };
-    }
-};
+    op::logsoftmax op;

-struct sub_op
-{
-    std::string name() const { return "sub"; }
-    auto fcn() const
+    template <class Self, class F>
+    static auto reflect(Self& self, F f) // forwards reflection to the wrapped op member
    {
-        return [](auto x, auto y) { return x - y; };
+        return migraphx::reflect(self.op, f);
    }
-};

-struct mul_op
-{
-    std::string name() const { return "mul"; }
-    auto fcn() const
-    {
-        return [](auto x, auto y) { return x * y; };
-    }
-};
+    std::string name() const { return "cpu::logsoftmax"; } // fixed name string of this CPU operator
+    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); } // delegates to op

-struct div_op
-{
-    std::string name() const { return "div"; }
-    auto fcn() const
+    template <typename T>
+    std::size_t compute_batch_index(T idx, const shape& batch_shape, int axis) const
    {
-        return [](auto x, auto y) { return x / y; };
+        idx[axis] = 0; // idx is a by-value copy; batch_shape has length 1 on this axis
+        return batch_shape.index(idx); // slot used by compute() to index its per-batch vectors
    }
-};

-struct max_op
-{
-    std::string name() const { return "max"; }
-    auto fcn() const
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
    {
-        return [](auto x, auto y) { return std::max(x, y); };
-    }
-};
+        argument result{output_shape};
+        auto batch_lens     = output_shape.lens();
+        batch_lens[op.axis] = 1;
+        shape batch_shape{shape::int32_type, batch_lens};

-struct min_op
-{
-    std::string name() const { return "min"; }
-    auto fcn() const
-    {
-        return [](auto x, auto y) { return std::min(x, y); };
-    }
-};
+        visit_all(result, args[0])([&](auto output, auto input) {
+            using value_type = typename decltype(input)::value_type;
+            std::vector<value_type> batch_max(batch_shape.elements(),
+                                              std::numeric_limits<value_type>::lowest());
+            shape_for_each(output_shape, [&](auto idx) {
+                auto index       = this->compute_batch_index(idx, batch_shape, op.axis);
+                batch_max[index] = std::max(batch_max[index], input(idx.begin(), idx.end()));
+            });

-template <typename Op>
-struct cpu_binary
-{
-    Op op;
-    std::string name() const { return "cpu::" + op.name(); }
-    shape compute_shape(const std::vector<shape>& inputs) const
-    {
-        check_shapes{inputs}.has(2).same_type().same_dims();
-        auto s0 = inputs.at(0);
-        auto s1 = inputs.at(1);
-        if(s0 == s1 and s0.packed())
-        {
-            return s0;
-        }
-        else
-        {
-            return {s0.type(), s0.lens()};
-        }
-    }
+            shape_for_each(output_shape, [&](auto idx) {
+                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
+                output(idx.begin(), idx.end()) = input(idx.begin(), idx.end()) - batch_max[index];
+            });

-    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
-    {
-        argument result{output_shape};
-        visit_all(result, args[0], args[1])([&](auto output, auto input1, auto input2) {
-            auto s1 = input1.get_shape();
-            auto s2 = input2.get_shape();
-            if(s1 == s2 and s1.standard())
-            {
-                std::transform(
-                    input1.begin(), input1.end(), input2.begin(), output.begin(), op.fcn());
-            }
-            else
+            std::vector<value_type> batch_sum(batch_shape.elements(), value_type(0));
+            shape_for_each(output_shape, [&](auto idx) {
+                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
+                batch_sum[index] += std::exp(output(idx.begin(), idx.end()));
+            });
+
+            for(std::size_t i = 0; i < batch_sum.size(); ++i)
            {
-                shape_for_each(output.get_shape(), [&](const auto& idx) {
-                    output(idx.begin(), idx.end()) =
-                        op.fcn()(input1(idx.begin(), idx.end()), input2(idx.begin(), idx.end()));
-                });
+                batch_sum[i] = std::log(batch_sum[i]);
            }
+
+            shape_for_each(output_shape, [&](auto idx) {
+                auto index = this->compute_batch_index(idx, batch_shape, op.axis);
+                output(idx.begin(), idx.end()) -= batch_sum[index];
+            });
        });

        return result;
@@ -895,43 +700,17 @@ struct cpu_apply

    void init()
    {
-        apply_map["im2col"]      = extend_op<cpu_im2col, op::im2col>();
-        apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>();
-        apply_map["dot"]         = extend_op<cpu_gemm, op::dot>();
        apply_map["batch_norm_inference"] =
            extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
-        apply_map["lrn"]        = extend_op<cpu_lrn, op::lrn>();
-        apply_map["contiguous"] = extend_op<cpu_contiguous, op::contiguous>();
-        apply_map["pad"]        = extend_op<cpu_pad, op::pad>();
-        apply_map["concat"]     = extend_op<cpu_concat, op::concat>();
-        apply_map["gather"]     = extend_op<cpu_gather, op::gather>();
-        apply_map["logsoftmax"] = extend_op<cpu_logsoftmax, op::logsoftmax>();
-        apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
-        apply_map["elu"]        = extend_op<cpu_unary<elu_op>, op::elu>();
-        apply_map["identity"]   = simple_op<cpu_unary<identity_op>>();
-        apply_map["abs"]        = simple_op<cpu_unary<abs_op>>();
-        apply_map["sinh"]       = simple_op<cpu_unary<sinh_op>>();
-        apply_map["cosh"]       = simple_op<cpu_unary<cosh_op>>();
-        apply_map["tanh"]       = simple_op<cpu_unary<tanh_op>>();
-        apply_map["sigmoid"]    = simple_op<cpu_unary<sigmoid_op>>();
-        apply_map["exp"]        = simple_op<cpu_unary<exp_op>>();
-        apply_map["log"]        = simple_op<cpu_unary<log_op>>();
-        apply_map["neg"]        = simple_op<cpu_unary<neg_op>>();
-        apply_map["sin"]        = simple_op<cpu_unary<sin_op>>();
-        apply_map["cos"]        = simple_op<cpu_unary<cos_op>>();
-        apply_map["tan"]        = simple_op<cpu_unary<tan_op>>();
-        apply_map["asin"]       = simple_op<cpu_unary<asin_op>>();
-        apply_map["acos"]       = simple_op<cpu_unary<acos_op>>();
-        apply_map["atan"]       = simple_op<cpu_unary<atan_op>>();
-        apply_map["relu"]       = simple_op<cpu_unary<relu_op>>();
-        apply_map["add"]        = simple_op<cpu_binary<add_op>>();
-        apply_map["sub"]        = simple_op<cpu_binary<sub_op>>();
-        apply_map["mul"]        = simple_op<cpu_binary<mul_op>>();
-        apply_map["div"]        = simple_op<cpu_binary<div_op>>();
-        apply_map["max"]        = simple_op<cpu_binary<max_op>>();
-        apply_map["min"]        = simple_op<cpu_binary<min_op>>();
-
-        apply_map["softmax"] = simple_op<softmax2d>();
+        apply_map["convolution"] = extend_op<cpu_convolution, op::convolution>(); // key: op name
+        apply_map["dot"]         = extend_op<cpu_gemm, op::dot>();
+        apply_map["elu"]         = extend_op<cpu_unary<elu_op>, op::elu>();
+        apply_map["im2col"]      = extend_op<cpu_im2col, op::im2col>();
+        apply_map["leaky_relu"]  = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
+        apply_map["logsoftmax"]  = extend_op<cpu_logsoftmax, op::logsoftmax>();
+        apply_map["lrn"]         = extend_op<cpu_lrn, op::lrn>();
+        apply_map["pad"]         = extend_op<cpu_pad, op::pad>();
+        apply_map["softmax"]     = extend_op<cpu_softmax, op::softmax>();
    }

    void apply()
@@ -947,9 +726,18 @@ struct cpu_apply
            {
                apply_map.at(it->name())(it);
            }
+            else if(is_context_free(it->get_operator()))
+            {
+                apply_cpu_op(it);
+            }
        }
    }

+    void apply_cpu_op(instruction_ref ins) // called by apply() for context-free ops
+    { // swaps the instruction for a cpu_op wrapping its operator, keeping its inputs
+        prog->replace_instruction(ins, cpu_op{ins->get_operator()}, ins->inputs());
+    }
+
    template <class T>
    void apply_simple_op(instruction_ref ins)
    {

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -27,11 +27,14 @@ add_library(migraphx_device
    device/add_relu.cpp
    device/contiguous.cpp
    device/logsoftmax.cpp
+    device/softmax.cpp
+    device/convert.cpp
    device/mul.cpp
    device/concat.cpp
    device/pad.cpp
    device/gather.cpp
    device/sub.cpp
+    device/clip.cpp
 )
 set_target_properties(migraphx_device PROPERTIES EXPORT_NAME device)
 rocm_clang_tidy_check(migraphx_device)
@@ -66,6 +69,7 @@ add_library(migraphx_gpu
    lrn.cpp
    schedule_model.cpp
    adjust_allocation.cpp
+    clip.cpp
 )
 set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
 rocm_clang_tidy_check(migraphx_gpu)

--- a/src/targets/gpu/adjust_allocation.cpp
+++ b/src/targets/gpu/adjust_allocation.cpp
@@ -2,7 +2,6 @@
 #include <migraphx/instruction.hpp>
 #include <migraphx/program.hpp>
 #include <migraphx/iterator_for.hpp>
-#include <algorithm>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

--- a/src/targets/gpu/clip.cpp
+++ b/src/targets/gpu/clip.cpp
+#include <migraphx/gpu/clip.hpp>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/gpu/device/clip.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+// Computes the result shape by dropping the last input shape and delegating
+// the remaining shapes to the wrapped op.
+shape hip_clip::compute_shape(std::vector<shape> inputs) const
+{
+    // The trailing entry is not forwarded to op.compute_shape(); presumably it is
+    // the output allocation appended during GPU lowering -- TODO confirm.
+    // NOTE(review): pop_back() on an empty vector is undefined behaviour, so this
+    // relies on callers always passing at least one shape.
+    inputs.pop_back();
+    return op.compute_shape(inputs);
+}
+
+// Invokes device::clip on the context's stream and returns the last argument.
+// The shape parameter is unused.
+argument hip_clip::compute(context& ctx, const shape&, const std::vector<argument>& args) const
+{
+    // args.back() is handed to device::clip and then returned, so it is presumably
+    // the output buffer, with args.front() the tensor being clipped -- TODO confirm
+    // against device::clip's signature.
+    const argument& result = args.back();
+    const argument& input  = args.front();
+    // NOTE(review): max_val is passed before min_val; verify this matches the
+    // parameter order that device::clip declares.
+    device::clip(ctx.get_stream().get(), result, input, op.max_val, op.min_val);
+    return result;
+}
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx