Merge branch 'master' into onnx_parsing_squeeze_slice_concat

7255bc66 · Scott Thornton · bc367f6b · ad414ba9 · 7255bc66 · 7255bc66
Commit 7255bc66 authored Oct 23, 2018 by Scott Thornton
20 changed files
--- a/src/include/migraph/functional.hpp
+++ b/src/include/migraph/functional.hpp
@@ -87,6 +87,11 @@ constexpr void each_args(F f, Ts&&... xs)
    swallow{(f(std::forward<Ts>(xs)), 0)...};
 }
+template <class F>
+constexpr void each_args(F)
+{
+}
 /// Implements a fix-point combinator
 template <class R, class F>
 detail::fix_f<R, F> fix(F f)

--- a/src/include/migraph/operation.hpp
+++ b/src/include/migraph/operation.hpp
@@ -8,7 +8,8 @@
 #include <type_traits>
 #include <utility>
 #include <migraph/shape.hpp>
-#include <migraph/rank.hpp>
+#include <migraph/reflect.hpp>
+#include <migraph/streamutils.hpp>
 #include <migraph/argument.hpp>
 #include <migraph/context.hpp>
 #include <migraph/auto_any_cast.hpp>
@@ -54,11 +55,34 @@ namespace operation_stream {
 template <class T>
 auto operator<<(std::ostream& os, const T& x) -> decltype(os << x.name())
 {
-    return os << x.name();
+    os << x.name();
+    char delim = '[';
+    reflect_each(x, [&](auto& y, auto name) {
+        os << delim;
+        os << name << "=";
+        stream_write_value(os, y);
+        delim = ',';
+    });
+    if(delim == ',')
+        os << "]";
+    return os;
 }
 } // namespace operation_stream
+namespace operation_equal {
+template <class T, class U>
+auto operator==(const T& x, const U& y) -> decltype(x.name() == y.name())
+{
+    if(x.name() != y.name())
+        return false;
+    const auto& yy = any_cast<T>(y);
+    return reflect_tie(x) == reflect_tie(yy);
+}
+} // namespace operation_equal
 template <class T>
 auto compute_op(rank<1>,
                const T& x,
@@ -93,6 +117,7 @@ compute_op(const T& x, context& ctx, const shape& output_shape, const std::vecto
 *      shape compute_shape(const std::vector<shape>& input) const;
 *      argument compute(context& ctx,const shape& output,const std::vector<argument>& input) const;
 *     friend std::ostream & operator<<(std::ostream & os,const operation & op) ;
+ *     friend bool operator==(const operation & x,const operation & y) ;
 * };
 *
 */
@@ -178,6 +203,12 @@ struct operation
        return op.private_detail_te_get_handle().operator_shift_left(os);
    }
+    friend bool operator==(const operation& x, const operation& y)
+    {
+        assert(x.private_detail_te_handle_mem_var);
+        return x.private_detail_te_get_handle().operator==(y);
+    }
    private:
    struct private_detail_te_handle_base_type
    {
@@ -190,6 +221,7 @@ struct operation
        virtual argument
        compute(context& ctx, const shape& output, const std::vector<argument>& input) const = 0;
        virtual std::ostream& operator_shift_left(std::ostream& os) const                    = 0;
+        virtual bool operator==(const operation& y) const                                    = 0;
    };
    template <typename PrivateDetailTypeErasedT>
@@ -242,6 +274,12 @@ struct operation
            return os << private_detail_te_value;
        }
+        bool operator==(const operation& y) const override
+        {
+            using migraph::operation_equal::operator==;
+            return private_detail_te_value == y;
+        }
        PrivateDetailTypeErasedT private_detail_te_value;
    };
@@ -307,6 +345,8 @@ inline const ValueType& any_cast(const operation& x)
    return *y;
 }
+inline bool operator!=(const operation& x, const operation& y) { return !(x == y); }
 #endif
 } // namespace migraph

--- a/src/include/migraph/operators.hpp
+++ b/src/include/migraph/operators.hpp
@@ -35,7 +35,12 @@ struct batch_norm_inference
    bn_infer_mode_t bn_mode = spatial;
-    bool is_test = false;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(
+            f(self.epsilon, "epsilon"), f(self.momentum, "momentum"), f(self.bn_mode, "bn_mode"));
+    }
    shape compute_shape(std::vector<shape> inputs) const
    {
@@ -56,6 +61,16 @@ struct convolution
        valid
    };
    padding_mode_t padding_mode = default_;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.padding, "padding"),
+                    f(self.stride, "stride"),
+                    f(self.dilation, "dilation"),
+                    f(self.padding_mode, "padding_mode"));
+    }
    std::string name() const { return "convolution"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
@@ -110,16 +125,6 @@ struct convolution
            MIGRAPH_THROW("Invalid padding mode");
        }
    }
-    friend std::ostream& operator<<(std::ostream& os, const convolution& op)
-    {
-        os << op.name() << "[";
-        os << "padding={" << stream_range(op.padding) << "}, ";
-        os << "stride={" << stream_range(op.stride) << "}, ";
-        os << "dilation={" << stream_range(op.dilation) << "}";
-        os << "]";
-        return os;
-    }
 };
 struct im2col
@@ -133,6 +138,16 @@ struct im2col
        same,
        valid
    };
+    padding_mode_t padding_mode = default_;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.padding, "padding"),
+                    f(self.stride, "stride"),
+                    f(self.dilation, "dilation"),
+                    f(self.padding_mode, "padding_mode"));
+    }
    std::string name() const { return "im2col"; }
@@ -168,6 +183,16 @@ struct pooling
    std::array<std::size_t, 2> padding = {{0, 0}};
    std::array<std::size_t, 2> stride  = {{1, 1}};
    std::array<std::size_t, 2> lengths = {{1, 1}};
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.mode, "mode"),
+                    f(self.padding, "padding"),
+                    f(self.stride, "stride"),
+                    f(self.lengths, "lengths"));
+    }
    std::string name() const { return "pooling"; }
    shape compute_shape(std::vector<shape> inputs) const
@@ -196,16 +221,6 @@ struct pooling
                            1)),
                }};
    }
-    friend std::ostream& operator<<(std::ostream& os, const pooling& op)
-    {
-        os << op.name() << "[";
-        os << "padding={" << stream_range(op.padding) << "}, ";
-        os << "stride={" << stream_range(op.stride) << "}, ";
-        os << "lengths={" << stream_range(op.lengths) << "}";
-        os << "]";
-        return os;
-    }
 };
 struct activation
@@ -224,9 +239,32 @@ struct activation
    }
 };
+struct leaky_relu
+{
+    std::string name() const { return "leaky_relu"; }
+    float alpha;
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        check_shapes{inputs, *this}.has(1);
+        return inputs.front();
+    }
+    friend std::ostream& operator<<(std::ostream& os, const leaky_relu& op)
+    {
+        os << op.name() << ":" << op.alpha;
+        return os;
+    }
+};
 struct transpose
 {
    std::vector<int64_t> dims;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.dims, "dims"));
+    }
    std::string name() const { return "transpose"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
@@ -258,13 +296,6 @@ struct transpose
    {
        return {std::move(output_shape), std::move(args.front().data)};
    }
-    friend std::ostream& operator<<(std::ostream& os, const transpose& op)
-    {
-        os << op.name() << "[";
-        os << "dims={" << stream_range(op.dims) << "}";
-        os << "]";
-        return os;
-    }
 };
 struct contiguous
@@ -283,11 +314,69 @@ struct contiguous
    }
 };
+struct concat
+{
+    std::size_t axis = 0;
+    std::string name() const { return "concat"; }
+    std::vector<std::size_t> compute_offsets(const shape& output_shape,
+                                             const std::vector<argument> args) const
+    {
+        std::vector<std::size_t> offsets;
+        std::vector<std::size_t> offset(args[0].get_shape().lens().size(), 0);
+        offset[axis] = 0;
+        for(const auto& arg : args)
+        {
+            offsets.push_back(output_shape.index(offset));
+            offset[axis] += arg.get_shape().lens()[axis];
+        }
+        return offsets;
+    }
+    shape compute_shape(std::vector<shape> inputs) const
+    {
+        if(inputs.empty())
+        {
+            MIGRAPH_THROW("Number of input tensors should exceed 0");
+        }
+        const auto& first_shape_lens = inputs.front().lens();
+        const auto& type             = inputs.front().type();
+        for(std::size_t l = 0; l < first_shape_lens.size(); l++)
+        {
+            if(l != axis)
+            {
+                if(!std::all_of(inputs.begin(), inputs.end(), [&](auto s) {
+                       return s.lens()[l] == first_shape_lens[l];
+                   }))
+                {
+                    MIGRAPH_THROW("Non-axis dimensions should match");
+                }
+            }
+        }
+        std::size_t new_dim_axis = 0;
+        for(const auto& input : inputs)
+        {
+            const auto& lens = input.lens();
+            new_dim_axis += lens[axis];
+        }
+        std::vector<std::size_t> new_lens;
+        std::copy(first_shape_lens.begin(), first_shape_lens.end(), std::back_inserter(new_lens));
+        new_lens[axis] = new_dim_axis;
+        return {type, new_lens};
+    }
+};
 struct slice
 {
    std::vector<int64_t> axes;
    std::vector<int64_t> starts;
    std::vector<int64_t> ends;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.axes, "axes"), f(self.starts, "starts"), f(self.ends, "ends"));
+    }
    std::string name() const { return "slice"; }
    auto fix_index(const std::vector<std::size_t>& lens, std::size_t axis, int64_t index) const
@@ -360,6 +449,13 @@ struct slice
 struct squeeze
 {
    std::vector<int64_t> axes;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.axes, "axes"));
+    }
    std::string name() const { return "squeeze"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
@@ -400,6 +496,13 @@ struct squeeze
 struct unsqueeze
 {
    std::vector<int64_t> axes;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.axes, "axes"));
+    }
    std::string name() const { return "unsqueeze"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
@@ -431,6 +534,13 @@ struct unsqueeze
 struct reshape
 {
    std::vector<int64_t> dims;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.dims, "dims"));
+    }
    std::string name() const { return "reshape"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
@@ -470,19 +580,19 @@ struct reshape
    {
        return {std::move(output_shape), std::move(args.front().data)};
    }
-    friend std::ostream& operator<<(std::ostream& os, const reshape& op)
-    {
-        os << op.name() << "[";
-        os << "dims={" << stream_range(op.dims) << "}";
-        os << "]";
-        return os;
-    }
 };
 struct gemm
 {
    float alpha = 1.0;
    float beta  = 0.0;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.alpha, "alpha"), f(self.beta, "beta"));
+    }
    std::string name() const { return "gemm"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
@@ -496,13 +606,6 @@ struct gemm
                          to_string_range(b.lens()) + "}");
        return {t, {a.lens()[0], b.lens()[1]}};
    }
-    friend std::ostream& operator<<(std::ostream& os, const gemm& op)
-    {
-        os << op.name() << "[";
-        os << "]";
-        return os;
-    }
 };
 struct unary
@@ -587,6 +690,13 @@ struct softmax
 struct flatten
 {
    uint64_t axis = 0;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.axis, "axis"));
+    }
    std::string name() const { return "flatten"; }
    shape compute_shape(std::vector<shape> inputs) const
    {
@@ -607,17 +717,17 @@ struct flatten
    {
        return {std::move(output_shape), std::move(args.front().data)};
    }
-    friend std::ostream& operator<<(std::ostream& os, const flatten& op)
-    {
-        os << op.name() << "[";
-        os << "axis=" << op.axis;
-        os << "]";
-        return os;
-    }
 };
 struct broadcast
 {
    uint64_t axis = 0;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.axis, "axis"));
+    }
    shape broadcast_shape;
    std::string name() const { return "broadcast"; }
    shape compute_shape(std::vector<shape> inputs) const
@@ -649,18 +759,10 @@ struct broadcast
    {
        return {std::move(output_shape), std::move(args.at(0).data)};
    }
-    friend std::ostream& operator<<(std::ostream& os, const broadcast& op)
-    {
-        os << op.name() << "[";
-        os << "axis=" << op.axis;
-        os << "]";
-        return os;
-    }
 };
 struct binary
 {
-    uint64_t broadcast = 0;
    shape compute_shape(std::vector<shape> inputs) const
    {
        check_shapes{inputs}.has(2).same_type().same_dims();
@@ -692,6 +794,13 @@ struct load
 {
    shape s;
    std::size_t offset = 0;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.s, "shape"), f(self.offset, "offset"));
+    }
    std::string name() const { return "load"; }
    shape compute_shape(const std::vector<shape>& inputs) const
    {
@@ -707,6 +816,13 @@ struct load
 struct outline
 {
    shape s;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.s, "shape"));
+    }
    std::string name() const { return "outline"; }
    shape compute_shape(const std::vector<shape>& inputs) const
    {

--- a/src/include/migraph/reflect.hpp
+++ b/src/include/migraph/reflect.hpp
+#ifndef MIGRAPH_GUARD_RTGLIB_REFLECT_HPP
+#define MIGRAPH_GUARD_RTGLIB_REFLECT_HPP
+#include <migraph/functional.hpp>
+#include <migraph/rank.hpp>
+#include <functional>
+namespace migraph {
+namespace detail {
+template <class T, class Selector>
+auto reflect_impl(rank<1>, T& x, Selector f) -> decltype(T::reflect(x, f))
+{
+    return T::reflect(x, std::move(f));
+}
+template <class T, class Selector>
+auto reflect_impl(rank<0>, T&, Selector)
+{
+    return pack();
+}
+} // namespace detail
+template <class T, class Selector>
+auto reflect(T& x, Selector f)
+{
+    return detail::reflect_impl(rank<1>{}, x, std::move(f));
+}
+template <class T>
+auto reflect_tie(T& x)
+{
+    return reflect(x, [](auto&& y, auto&&...) { return std::ref(y); })(
+        [](auto&&... xs) { return std::tie(xs.get()...); });
+}
+template <class T, class F>
+void reflect_each(T& x, F f)
+{
+    return reflect(x, [](auto&& y, auto... ys) { return pack(std::ref(y), ys...); })(
+        [&](auto&&... xs) {
+            each_args([&](auto p) { p([&](auto&& y, auto... ys) { f(y.get(), ys...); }); }, xs...);
+        });
+}
+} // namespace migraph
+#endif
--- a/src/include/migraph/streamutils.hpp
+++ b/src/include/migraph/streamutils.hpp
@@ -3,6 +3,7 @@
 #include <ostream>
 #include <algorithm>
+#include <migraph/rank.hpp>
 namespace migraph {
@@ -31,6 +32,28 @@ inline stream_range_container<Range> stream_range(const Range& r)
    return {r};
 }
+namespace detail {
+template <class Range>
+auto stream_write_value_impl(rank<1>, std::ostream& os, const Range& r)
+    -> decltype(r.begin(), r.end(), void())
+{
+    os << stream_range(r);
+}
+template <class T>
+void stream_write_value_impl(rank<0>, std::ostream& os, const T& x)
+{
+    os << x;
+}
+} // namespace detail
+template <class T>
+void stream_write_value(std::ostream& os, const T& x)
+{
+    detail::stream_write_value_impl(rank<1>{}, os, x);
+}
 } // namespace migraph
 #endif
--- a/src/onnx/onnx.cpp
+++ b/src/onnx/onnx.cpp
@@ -56,6 +56,7 @@ struct onnx_parser
        add_generic_op("Sub", op::sub{});
        add_generic_op("Sum", op::add{});
+        add_mem_op("LeakyRelu", &onnx_parser::parse_leaky_relu);
        add_mem_op("Constant", &onnx_parser::parse_constant);
        add_mem_op("Conv", &onnx_parser::parse_conv);
        add_mem_op("MaxPool", &onnx_parser::parse_pooling);
@@ -305,10 +306,24 @@ struct onnx_parser
                          ? op::batch_norm_inference::spatial
                          : op::batch_norm_inference::per_activation;
        }
-        op::batch_norm_inference op{epsilon, momentum, bn_mode, is_test};
+        (void)is_test;
+        op::batch_norm_inference op{epsilon, momentum, bn_mode};
        return prog.add_instruction(op, std::move(args));
    }
+    instruction_ref parse_leaky_relu(const std::string&,
+                                     attribute_map attributes,
+                                     std::vector<instruction_ref> args)
+    {
+        float alpha = 0.01;
+        if(contains(attributes, "alpha"))
+        {
+            alpha = parse_value(attributes.at("alpha")).at<float>();
+        }
+        op::leaky_relu op{alpha};
+        return prog.add_instruction(op, args.front());
+    }
    void parse_from(std::istream& is)
    {
        onnx::ModelProto model;

--- a/src/opt/memory_coloring_impl.cpp
+++ b/src/opt/memory_coloring_impl.cpp
@@ -192,7 +192,7 @@ void memory_coloring_impl::register_operand_alias()
    operand_alias["@param"]            = -1;
    operand_alias["transpose"]         = 0;
    operand_alias["flatten"]           = 0;
-    operand_alias["broadcast"]         = 1;
+    operand_alias["broadcast"]         = 0;
    operand_alias["reshape"]           = 0;
    operand_alias["pass"]              = 0;
 }

--- a/src/targets/cpu/cpu_lowering.cpp
+++ b/src/targets/cpu/cpu_lowering.cpp
@@ -282,6 +282,34 @@ struct cpu_contiguous
    }
 };
+struct cpu_concat
+{
+    op::concat op;
+    std::string name() const { return "cpu::concat"; }
+    shape compute_shape(const std::vector<shape>& inputs) const { return op.compute_shape(inputs); }
+    argument compute(context&, const shape& output_shape, std::vector<argument> args) const
+    {
+        argument result{output_shape};
+        std::vector<std::size_t> coffsets = op.compute_offsets(output_shape, args);
+        for(std::size_t l = 0; l < args.size(); l++)
+        {
+            auto argl             = args[l];
+            std::size_t nelements = argl.get_shape().elements();
+            visit_all(result, argl)([&](auto output, auto input) {
+                auto slice_shape =
+                    shape{output_shape.type(), input.get_shape().lens(), output_shape.strides()};
+                auto slice = make_view(slice_shape, output.data() + coffsets[l]);
+                // cppcheck-suppress useStlAlgorithm
+                for(std::size_t i = 0; i < nelements; i++)
+                {
+                    slice[i] = input[i];
+                }
+            });
+        }
+        return result;
+    }
+};
 struct cpu_gemm
 {
    op::gemm op;
@@ -413,6 +441,17 @@ struct relu_op
    }
 };
+struct leaky_relu_op
+{
+    op::leaky_relu op;
+    std::string name() const { return "cpu::leaky_relu"; }
+    auto fcn() const
+    {
+        auto& a = op.alpha;
+        return [a](auto x) { return x > 0 ? x : x * a; };
+    }
+};
 template <typename Op>
 struct cpu_unary
 {
@@ -557,19 +596,20 @@ struct cpu_apply
        apply_map["batch_norm_inference"] =
            extend_op<cpu_batch_norm_inference, op::batch_norm_inference>();
        apply_map["contiguous"] = extend_op<cpu_contiguous, op::contiguous>();
+        apply_map["concat"]     = extend_op<cpu_concat, op::concat>();
-        apply_map["identity"] = simple_op<cpu_unary<identity_op>>();
+        apply_map["leaky_relu"] = extend_op<cpu_unary<leaky_relu_op>, op::leaky_relu>();
-        apply_map["tanh"]     = simple_op<cpu_unary<tanh_op>>();
+        apply_map["identity"]   = simple_op<cpu_unary<identity_op>>();
-        apply_map["sigmoid"]  = simple_op<cpu_unary<sigmoid_op>>();
+        apply_map["tanh"]       = simple_op<cpu_unary<tanh_op>>();
-        apply_map["exp"]      = simple_op<cpu_unary<exp_op>>();
+        apply_map["sigmoid"]    = simple_op<cpu_unary<sigmoid_op>>();
-        apply_map["neg"]      = simple_op<cpu_unary<neg_op>>();
+        apply_map["exp"]        = simple_op<cpu_unary<exp_op>>();
-        apply_map["sin"]      = simple_op<cpu_unary<sin_op>>();
+        apply_map["neg"]        = simple_op<cpu_unary<neg_op>>();
-        apply_map["cos"]      = simple_op<cpu_unary<cos_op>>();
+        apply_map["sin"]        = simple_op<cpu_unary<sin_op>>();
-        apply_map["tan"]      = simple_op<cpu_unary<tan_op>>();
+        apply_map["cos"]        = simple_op<cpu_unary<cos_op>>();
-        apply_map["add"]      = simple_op<cpu_binary<add_op>>();
+        apply_map["tan"]        = simple_op<cpu_unary<tan_op>>();
-        apply_map["sub"]      = simple_op<cpu_binary<sub_op>>();
+        apply_map["add"]        = simple_op<cpu_binary<add_op>>();
-        apply_map["mul"]      = simple_op<cpu_binary<mul_op>>();
+        apply_map["sub"]        = simple_op<cpu_binary<sub_op>>();
-        apply_map["div"]      = simple_op<cpu_binary<div_op>>();
+        apply_map["mul"]        = simple_op<cpu_binary<mul_op>>();
+        apply_map["div"]        = simple_op<cpu_binary<div_op>>();
        apply_map["softmax"] = simple_op<softmax2d>();
    }

--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -14,6 +14,7 @@ add_library(migraph_device
    device/add.cpp
    device/add_relu.cpp
    device/contiguous.cpp
+    device/concat.cpp
 )
 rocm_clang_tidy_check(migraph_device)
 target_link_libraries(migraph_device migraph hip::device)
@@ -31,7 +32,9 @@ add_library(migraph_gpu
    convolution.cpp
    softmax.cpp
    contiguous.cpp
+    concat.cpp
    relu.cpp
+    leaky_relu.cpp
    add.cpp
    batchnorm.cpp
    write_literals.cpp

--- a/src/targets/gpu/concat.cpp
+++ b/src/targets/gpu/concat.cpp
+#include <migraph/gpu/concat.hpp>
+#include <migraph/operators.hpp>
+#include <migraph/manage_ptr.hpp>
+#include <migraph/gpu/miopen.hpp>
+#include <migraph/gpu/device/concat.hpp>
+#include <utility>
+namespace migraph {
+namespace gpu {
+shape hip_concat::compute_shape(std::vector<shape> inputs) const
+{
+    inputs.pop_back();
+    return op.compute_shape(inputs);
+}
+argument
+hip_concat::compute(context&, const shape& output_shape, const std::vector<argument>& args) const
+{
+    std::vector<std::size_t> offsets = op.compute_offsets(output_shape, args);
+    return device::concat(output_shape, args, offsets);
+}
+} // namespace gpu
+} // namespace migraph
--- a/src/targets/gpu/device/concat.cpp
+++ b/src/targets/gpu/device/concat.cpp
+#include <migraph/shape.hpp>
+#include <migraph/argument.hpp>
+#include <migraph/gpu/device/concat.hpp>
+#include <migraph/gpu/device/tensor.hpp>
+#include <migraph/gpu/device/launch.hpp>
+namespace migraph {
+namespace gpu {
+namespace device {
+argument concat(const migraph::shape& output_shape,
+                std::vector<migraph::argument> args,
+                std::vector<std::size_t> offsets)
+{
+    // migraph::argument& result = args.back();
+    for(std::size_t l = 0; l < args.size() - 1; l++)
+    {
+        auto argl             = args[l];
+        std::size_t nelements = argl.get_shape().elements();
+        visit_all(args.back(), argl)([&](auto output, auto input) {
+            visit_tensor_size(output_shape.lens().size(), [&](auto ndim) {
+                auto* outptr      = output.data() + offsets[l];
+                const auto* inptr = input.data();
+                hip_tensor_descriptor<ndim> desc_input(input.get_shape());
+                hip_tensor_descriptor<ndim> desc_output(output.get_shape());
+                gs_launch(nelements)(
+                    [=](auto i) { outptr[desc_output.linear(desc_input.multi(i))] = inptr[i]; });
+            });
+        });
+    }
+    // return result;
+    return args.back();
+}
+} // namespace device
+} // namespace gpu
+} // namespace migraph
--- a/src/targets/gpu/include/migraph/gpu/concat.hpp
+++ b/src/targets/gpu/include/migraph/gpu/concat.hpp
+#ifndef MIGRAPH_GUARD_RTGLIB_CONCAT_HPP
+#define MIGRAPH_GUARD_RTGLIB_CONCAT_HPP
+#include <migraph/gpu/lowering.hpp>
+#include <migraph/manage_ptr.hpp>
+#include <migraph/instruction.hpp>
+#include <migraph/operators.hpp>
+#include <migraph/generate.hpp>
+#include <migraph/shape_for_each.hpp>
+#include <migraph/gpu/miopen.hpp>
+#include <migraph/gpu/hip.hpp>
+#include <migraph/dfor.hpp>
+#include <migraph/gpu/device/concat.hpp>
+#include <migraph/gpu/device/add.hpp>
+#include <migraph/iterator_for.hpp>
+#include <migraph/gpu/rocblas.hpp>
+#include <migraph/gpu/context.hpp>
+#include <utility>
+namespace migraph {
+namespace gpu {
+struct hip_concat
+{
+    op::concat op;
+    std::string name() const { return "gpu::concat"; }
+    shape compute_shape(std::vector<shape> inputs) const;
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
+};
+} // namespace gpu
+} // namespace migraph
+#endif
--- a/src/targets/gpu/include/migraph/gpu/convolution.hpp
+++ b/src/targets/gpu/include/migraph/gpu/convolution.hpp
@@ -26,19 +26,18 @@ struct miopen_convolution
    shared<convolution_descriptor> cd;
    miopenConvFwdAlgorithm_t algo{};
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        // TODO: Add algo
+        return op::convolution::reflect(self.op, f);
+    }
    std::string name() const { return "gpu::convolution"; }
    shape compute_shape(const std::vector<shape>& inputs) const;
    argument
    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
    shape compile(context& ctx, const shape& output_shape, std::vector<instruction_ref> inputs);
-    friend std::ostream& operator<<(std::ostream& os, const miopen_convolution& self)
-    {
-        os << self.name() << "[";
-        os << self.op << ", ";
-        os << "algo=" << self.algo;
-        os << "]";
-        return os;
-    }
 };
 } // namespace gpu

--- a/src/targets/gpu/include/migraph/gpu/device/concat.hpp
+++ b/src/targets/gpu/include/migraph/gpu/device/concat.hpp
+#ifndef MIGRAPH_GUARD_RTGLIB_DEVICE_CONCAT_HPP
+#define MIGRAPH_GUARD_RTGLIB_DEVICE_CONCAT_HPP
+namespace migraph {
+namespace gpu {
+namespace device {
+argument
+concat(const shape& output_shape, std::vector<argument> args, std::vector<std::size_t> offsets);
+} // namespace device
+} // namespace gpu
+} // namespace migraph
+#endif
--- a/src/targets/gpu/include/migraph/gpu/leaky_relu.hpp
+++ b/src/targets/gpu/include/migraph/gpu/leaky_relu.hpp
+#ifndef MIGRAPH_GUARD_RTGLIB_LEAKY_RELU_HPP
+#define MIGRAPH_GUARD_RTGLIB_LEAKY_RELU_HPP
+#include <migraph/gpu/lowering.hpp>
+#include <migraph/manage_ptr.hpp>
+#include <migraph/instruction.hpp>
+#include <migraph/operators.hpp>
+#include <migraph/generate.hpp>
+#include <migraph/shape_for_each.hpp>
+#include <migraph/gpu/miopen.hpp>
+#include <migraph/gpu/hip.hpp>
+#include <migraph/dfor.hpp>
+#include <migraph/gpu/device/contiguous.hpp>
+#include <migraph/gpu/device/add.hpp>
+#include <migraph/iterator_for.hpp>
+#include <migraph/gpu/rocblas.hpp>
+#include <migraph/gpu/context.hpp>
+#include <utility>
+namespace migraph {
+namespace gpu {
+struct miopen_leaky_relu
+{
+    shared<activation_descriptor> ad;
+    std::string name() const { return "gpu::leaky_relu"; }
+    shape compute_shape(const std::vector<shape>& inputs) const;
+    argument
+    compute(context& ctx, const shape& output_shape, const std::vector<argument>& args) const;
+};
+} // namespace gpu
+} // namespace migraph
+#endif
--- a/src/targets/gpu/include/migraph/gpu/miopen.hpp
+++ b/src/targets/gpu/include/migraph/gpu/miopen.hpp
@@ -87,6 +87,13 @@ inline activation_descriptor make_relu()
    return ad;
 }
+inline activation_descriptor make_leaky_relu(double alpha)
+{
+    auto ad = make_obj<activation_descriptor>(&miopenCreateActivationDescriptor);
+    miopenSetActivationDescriptor(ad.get(), miopenActivationLEAKYRELU, alpha, 0, 0);
+    return ad;
+}
 inline fusion_plan_descriptor make_fusion_plan(const shape& input)
 {
    auto t = make_tensor(input);

--- a/src/targets/gpu/leaky_relu.cpp
+++ b/src/targets/gpu/leaky_relu.cpp
+#include <migraph/gpu/leaky_relu.hpp>
+#include <migraph/operators.hpp>
+#include <migraph/manage_ptr.hpp>
+#include <migraph/gpu/miopen.hpp>
+#include <utility>
+namespace migraph {
+namespace gpu {
+shape miopen_leaky_relu::compute_shape(const std::vector<shape>& inputs) const
+{
+    check_shapes{inputs, *this}.has(2).not_broadcasted();
+    return inputs.at(1);
+}
+argument miopen_leaky_relu::compute(context& ctx,
+                                    const shape& output_shape,
+                                    const std::vector<argument>& args) const
+{
+    float alpha = 1, beta = 0;
+    auto x_desc = make_tensor(args[0].get_shape());
+    auto y_desc = make_tensor(output_shape);
+    miopenActivationForward(ctx.handle.get(),
+                            ad.get(),
+                            &alpha,
+                            x_desc.get(),
+                            args[0].implicit(),
+                            &beta,
+                            y_desc.get(),
+                            args[1].implicit());
+    return args[1];
+}
+} // namespace gpu
+} // namespace migraph
--- a/src/targets/gpu/lowering.cpp
+++ b/src/targets/gpu/lowering.cpp
@@ -16,11 +16,13 @@
 #include <migraph/gpu/convolution.hpp>
 #include <migraph/gpu/contiguous.hpp>
 #include <migraph/gpu/relu.hpp>
+#include <migraph/gpu/leaky_relu.hpp>
 #include <migraph/gpu/softmax.hpp>
 #include <migraph/gpu/add.hpp>
 #include <migraph/gpu/batchnorm.hpp>
 #include <migraph/gpu/pooling.hpp>
 #include <migraph/gpu/gemm.hpp>
+#include <migraph/gpu/concat.hpp>
 #include <utility>
 namespace migraph {
@@ -51,6 +53,10 @@ struct miopen_apply
            {
                check_shape(s, apply_activation(it));
            }
+            else if(it->name() == "leaky_relu")
+            {
+                check_shape(s, apply_leaky_relu(it));
+            }
            else if(it->name() == "pooling")
            {
                check_shape(s, apply_pooling(it));
@@ -67,6 +73,10 @@ struct miopen_apply
            {
                check_shape(s, apply_contiguous(it));
            }
+            else if(it->name() == "concat")
+            {
+                check_shape(s, apply_concat(it));
+            }
            else if(it->name() == "batch_norm_inference")
            {
                check_shape(s, apply_batch_norm_inference(it));
@@ -129,6 +139,16 @@ struct miopen_apply
        return ins;
    }
+    instruction_ref apply_leaky_relu(instruction_ref ins)
+    {
+        auto&& op = any_cast<op::leaky_relu>(ins->get_operator());
+        auto ad   = make_leaky_relu(op.alpha);
+        auto output = insert_allocation(ins, ins->get_shape());
+        return prog->replace_instruction(
+            ins, miopen_leaky_relu{std::move(ad)}, ins->inputs().at(0), output);
+    }
    instruction_ref apply_softmax(instruction_ref ins)
    {
        auto&& op   = any_cast<op::softmax>(ins->get_operator());
@@ -158,6 +178,15 @@ struct miopen_apply
        return prog->replace_instruction(ins, miopen_contiguous{op}, ins->inputs().at(0), output);
    }
+    instruction_ref apply_concat(instruction_ref ins)
+    {
+        auto&& op                         = any_cast<op::concat>(ins->get_operator());
+        auto output                       = insert_allocation(ins, ins->get_shape());
+        std::vector<instruction_ref> refs = ins->inputs();
+        refs.push_back(output);
+        return prog->replace_instruction(ins, hip_concat{op}, refs);
+    }
    instruction_ref apply_batch_norm_inference(instruction_ref ins)
    {
        auto&& op       = any_cast<op::batch_norm_inference>(ins->get_operator());

--- a/test/cpu_ops_test.cpp
+++ b/test/cpu_ops_test.cpp
@@ -47,6 +47,56 @@ void slice_test()
    }
 }
+void concat_test()
+{
+    {
+        migraph::program p;
+        std::size_t axis       = 1;
+        std::vector<int> data0 = {0, 1, 5, 6};
+        std::vector<int> data1 = {2, 3, 4, 7, 8, 9};
+        std::vector<int> data2 = {10, 20};
+        migraph::shape s0{migraph::shape::int32_type, {2, 2}};
+        migraph::shape s1{migraph::shape::int32_type, {2, 3}};
+        migraph::shape s2{migraph::shape::int32_type, {2, 1}};
+        auto l0 = p.add_literal(migraph::literal{s0, data0});
+        auto l1 = p.add_literal(migraph::literal{s1, data1});
+        auto l2 = p.add_literal(migraph::literal{s2, data2});
+        p.add_instruction(migraph::op::concat{axis}, l0, l1, l2);
+        p.compile(migraph::cpu::cpu_target{});
+        auto result           = p.eval({});
+        std::vector<int> gold = {0, 1, 2, 3, 4, 10, 5, 6, 7, 8, 9, 20};
+        std::vector<int> results_vector(2 * 6);
+        result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
+        EXPECT(migraph::verify_range(results_vector, gold));
+        EXPECT(migraph::verify_range(result.get_shape().lens(), std::vector<std::size_t>({2, 6})));
+        EXPECT(
+            migraph::verify_range(result.get_shape().strides(), std::vector<std::size_t>({6, 1})));
+    }
+    {
+        migraph::program p;
+        std::size_t axis       = 0;
+        std::vector<int> data0 = {0, 1, 2, 3};
+        std::vector<int> data1 = {4, 5, 6, 7, 8, 9};
+        std::vector<int> data2 = {10, 11};
+        migraph::shape s0{migraph::shape::int32_type, {2, 2}};
+        migraph::shape s1{migraph::shape::int32_type, {3, 2}};
+        migraph::shape s2{migraph::shape::int32_type, {1, 2}};
+        auto l0 = p.add_literal(migraph::literal{s0, data0});
+        auto l1 = p.add_literal(migraph::literal{s1, data1});
+        auto l2 = p.add_literal(migraph::literal{s2, data2});
+        p.add_instruction(migraph::op::concat{axis}, l0, l1, l2);
+        p.compile(migraph::cpu::cpu_target{});
+        auto result           = p.eval({});
+        std::vector<int> gold = {0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11};
+        std::vector<int> results_vector(6 * 2);
+        result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
+        EXPECT(migraph::verify_range(results_vector, gold));
+        EXPECT(migraph::verify_range(result.get_shape().lens(), std::vector<std::size_t>({6, 2})));
+        EXPECT(
+            migraph::verify_range(result.get_shape().strides(), std::vector<std::size_t>({2, 1})));
+    }
+}
 void squeeze_test()
 {
    {
@@ -461,6 +511,34 @@ void div_test()
    EXPECT(migraph::verify_range(results_vector, gold));
 }
+void relu_test()
+{
+    migraph::program p;
+    migraph::shape s{migraph::shape::float_type, {3}};
+    auto l = p.add_literal(migraph::literal{s, {-1.f, 0.f, 1.f}});
+    p.add_instruction(migraph::op::activation{"relu"}, l);
+    p.compile(migraph::cpu::cpu_target{});
+    auto result = p.eval({});
+    std::vector<float> results_vector(3);
+    result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
+    std::vector<float> gold = {0.f, 0.f, 1.f};
+    EXPECT(migraph::verify_range(results_vector, gold));
+}
+void leaky_relu_test()
+{
+    migraph::program p;
+    migraph::shape s{migraph::shape::float_type, {3}};
+    auto l = p.add_literal(migraph::literal{s, {-1.f, 0.f, 1.f}});
+    p.add_instruction(migraph::op::leaky_relu{0.01}, l);
+    p.compile(migraph::cpu::cpu_target{});
+    auto result = p.eval({});
+    std::vector<float> results_vector(3);
+    result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
+    std::vector<float> gold = {-0.01f, 0.f, 1.f};
+    EXPECT(migraph::verify_range(results_vector, gold));
+}
 void reshape_test()
 {
    migraph::shape a_shape{migraph::shape::float_type, {24, 1, 1, 1}};
@@ -905,6 +983,7 @@ void contiguous_test()
 int main()
 {
+    concat_test();
    slice_test();
    squeeze_test();
    unsqueeze_test();
@@ -917,6 +996,9 @@ int main()
    add_broadcast_test();
    sub_test();
    mul_test();
+    div_test();
+    relu_test();
+    leaky_relu_test();
    gemm_test<float>();
    gemm_test<double>();
    reshape_test();

--- a/test/fwd_conv_batchnorm_rewrite_test.cpp
+++ b/test/fwd_conv_batchnorm_rewrite_test.cpp
@@ -36,9 +36,9 @@ void fwd_conv_batchnorm_rewrite_test()
    auto create_program = [&]() {
        migraph::program p;
-        auto x        = p.add_literal(xs, xdata);
+        auto x    = p.add_literal(xs, xdata);
-        auto w        = p.add_literal(ws, wdata);
+        auto w    = p.add_literal(ws, wdata);
-        auto conv     = p.add_instruction(migraph::op::convolution{{0, 0}, {1, 1}, {1, 1}}, x, w);
+        auto conv = p.add_instruction(migraph::op::convolution{{{0, 0}}, {{1, 1}}, {{1, 1}}}, x, w);
        auto scale    = p.add_literal(migraph::literal{vars, {3.0f}});
        auto bias     = p.add_literal(migraph::literal{vars, {8.1f}});
        auto mean     = p.add_literal(migraph::literal{vars, {4.0f}});