Commit 4e3ca586 authored by Khalique

Merge branch 'develop' of https://github.com/ROCmSoftwarePlatform/AMDMIGraphX into broadcast_attr

parents 1775c5ad 31b2c735
......@@ -74,7 +74,7 @@
</message>
</rule>
<rule>
<pattern>(fclose|free|hipFree|hipHostFree|hipFreeArray|hipMemFree|hipStreamDestroy|hipEventDestroy|hipArrayDestroy|hipCtxDestroy|hipDestroyTextureObject|hipDestroySurfaceObject) \(</pattern>
<pattern>\\W(fclose|free|hipFree|hipHostFree|hipFreeArray|hipMemFree|hipStreamDestroy|hipEventDestroy|hipArrayDestroy|hipCtxDestroy|hipDestroyTextureObject|hipDestroySurfaceObject) \(</pattern>
<message>
<id>useManagePointer</id>
<severity>style</severity>
......
......@@ -13,8 +13,6 @@ inline namespace MIGRAPHX_INLINE_NS {
void eliminate_allocation::apply(program& p) const
{
assert(alignment > 0);
if(!enabled(MIGRAPHX_DISABLE_MEMORY_COLORING{}))
return;
std::size_t n = 0;
std::vector<std::pair<instruction_ref, std::size_t>> allocs;
......@@ -27,13 +25,16 @@ void eliminate_allocation::apply(program& p) const
std::size_t padding = (alignment - (size % alignment)) % alignment;
n += size + padding;
}
auto mem = p.add_parameter("memory", shape{shape::int8_type, {n}});
for(auto&& pp : allocs)
if(n > 0)
{
auto ins = pp.first;
auto s = ins->get_shape();
auto offset = pp.second;
p.replace_instruction(ins, op::load{s, offset}, mem);
auto mem = p.add_parameter("memory", shape{shape::int8_type, {n}});
for(auto&& pp : allocs)
{
auto ins = pp.first;
auto s = ins->get_shape();
auto offset = pp.second;
p.replace_instruction(ins, op::load{s, offset}, mem);
}
}
}
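The guard added above skips creating the "memory" parameter when there are no allocations (n == 0). The padding formula rounds each allocation up to the next multiple of alignment; a minimal standalone sketch of the packing arithmetic (pack_offsets is a hypothetical name, not part of migraphx):

#include <cstddef>
#include <vector>

// Place each buffer at the running offset, then round the offset up to
// the next multiple of `alignment` so every buffer starts aligned.
std::vector<std::size_t> pack_offsets(const std::vector<std::size_t>& sizes,
                                      std::size_t alignment)
{
    std::vector<std::size_t> offsets;
    std::size_t n = 0;
    for(auto size : sizes)
    {
        offsets.push_back(n);
        std::size_t padding = (alignment - (size % alignment)) % alignment;
        n += size + padding;
    }
    return offsets; // n is now the total pool size to allocate
}

For sizes {10, 24} with alignment 16 this yields offsets {0, 16} and a 48-byte pool.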
......
......@@ -36,14 +36,17 @@ void eliminate_concat::apply(program& p) const
// Where are the allocations for the tensors to be concatenated?
std::vector<instruction_ref> allocations;
for(auto ins2 = ins->inputs().begin(); ins2 != ins->inputs().end() - 1; ins2++)
{
auto last2 = (*ins2)->inputs().back();
if(last2->name() == concat_opt.allocate())
{
allocations.push_back(last2);
}
}
std::transform(
ins->inputs().begin(),
std::prev(ins->inputs().end()),
std::back_inserter(allocations),
[&](instruction_ref x) { return instruction::get_output_alias(x, true); });
if(std::any_of(allocations.begin(), allocations.end(), [&](auto x) {
return x->name() != concat_opt.allocate();
}))
continue;
// Need to sort the allocations, so that we know where to
// insert the "super"-allocation
std::sort(
......@@ -51,15 +54,15 @@ void eliminate_concat::apply(program& p) const
return std::distance(p.begin(), x) < std::distance(p.begin(), y);
});
// Move "super" allocation to the front
auto first = allocations.front();
auto super = p.move_instruction(last, first);
auto first = allocations.front();
auto super = p.move_instruction(last, first);
// Replace each allocation with a load
std::size_t offset = 0;
for(auto x : allocations)
for(auto alloc : allocations)
{
migraphx::op::load op{x->get_shape(), offset};
// migraphx::op::load op{x->get_shape(), 0};
p.replace_instruction(x, op, {super});
offset += x->get_shape().bytes();
op::load op{alloc->get_shape(), offset};
p.replace_instruction(alloc, op, {super});
offset += alloc->get_shape().bytes();
}
std::vector<instruction_ref> args = {super};
std::copy(ins->inputs().begin(), ins->inputs().end() - 1, std::back_inserter(args));
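The rewrite above takes the output alias of each concat input, bails out unless every alias is an allocation, and then folds those allocations into one "super" allocation addressed by byte offsets. A worked example, assuming two float {2,3} inputs concatenated on axis 0:

// super = allocate 48 bytes                   (2 * 2*3 * sizeof(float))
// alloc0 -> op::load{shape{float,{2,3}}, 0}   (super)
// alloc1 -> op::load{shape{float,{2,3}}, 24}  (super)
// Each producer now writes directly into its slice of `super`, so the
// concat only has to alias the super allocation instead of copying.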
......
......@@ -15,11 +15,19 @@ struct check_context
std::string name() const { return "check_context"; }
shape compute_shape(const std::vector<shape>&) const { return {}; }
argument compute(context& ctx, const shape&, const std::vector<argument>&) const
{
this->check(ctx);
return {};
}
void finalize(context& ctx, const shape&, const std::vector<shape>&) const
{
this->check(ctx);
}
void check(context& ctx) const
{
T* x = any_cast<T>(&ctx);
if(x == nullptr)
MIGRAPHX_THROW(std::string("Unexpected context type: ") + ctx.type_id().name());
return {};
}
};
......
......@@ -119,6 +119,13 @@ struct concat_optimization
return (*this).private_detail_te_get_handle().get_concat(op);
}
friend bool is_shared(const concat_optimization& private_detail_x,
const concat_optimization& private_detail_y)
{
return private_detail_x.private_detail_te_handle_mem_var ==
private_detail_y.private_detail_te_handle_mem_var;
}
private:
struct private_detail_te_handle_base_type
{
......
......@@ -95,7 +95,13 @@ struct context
void finish() const
{
assert((*this).private_detail_te_handle_mem_var);
return (*this).private_detail_te_get_handle().finish();
(*this).private_detail_te_get_handle().finish();
}
friend bool is_shared(const context& private_detail_x, const context& private_detail_y)
{
return private_detail_x.private_detail_te_handle_mem_var ==
private_detail_y.private_detail_te_handle_mem_var;
}
private:
......@@ -136,7 +142,7 @@ struct context
const std::type_info& type() const override { return typeid(private_detail_te_value); }
void finish() const override { return private_detail_te_value.finish(); }
void finish() const override { private_detail_te_value.finish(); }
PrivateDetailTypeErasedT private_detail_te_value;
};
......
......@@ -94,6 +94,12 @@ constexpr void each_args(F)
{
}
template <class F, class T>
auto unpack(F f, T& x)
{
return sequence_c<std::tuple_size<T>{}>([&](auto... is) { f(std::get<is>(x)...); });
}
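unpack expands a tuple into a plain argument list by applying f to every element at once. A minimal equivalent written with std::index_sequence instead of migraphx's sequence_c (a sketch, not the library code):

#include <cstddef>
#include <tuple>
#include <utility>

template <class F, class T, std::size_t... Is>
auto unpack_impl(F f, T& x, std::index_sequence<Is...>)
{
    return f(std::get<Is>(x)...);
}

template <class F, class T>
auto unpack(F f, T& x)
{
    return unpack_impl(f, x, std::make_index_sequence<std::tuple_size<T>{}>{});
}

// std::tuple<int, int> t{2, 3};
// unpack([](int a, int b) { return a + b; }, t); // calls f(2, 3) -> 5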
/// Implements a fix-point combinator
template <class R, class F>
detail::fix_f<R, F> fix(F f)
......
......@@ -14,6 +14,7 @@ namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
shape compute_shape(const operation& op, const std::vector<instruction_ref>& args);
std::vector<shape> to_shapes(const std::vector<instruction_ref>& args);
struct instruction
{
......@@ -71,7 +72,11 @@ struct instruction
static void
replace(instruction_ref ins, operation o, const shape& r, std::vector<instruction_ref> args);
static instruction_ref get_output_alias(instruction_ref ins);
argument eval() const;
void finalize(context& ctx);
static instruction_ref get_output_alias(instruction_ref ins, bool shallow = false);
private:
// internal
......
......@@ -26,6 +26,8 @@ struct operation
{
/// A unique name identifying the operation
std::string name() const;
/// An optional method that can be used to finalize the operator before running
void finalize(context& ctx);
/// This is used to compute the resulting shape from an operation. If an
/// operation cannot be run with input shapes, then it should throw an
/// exception.
......@@ -53,6 +55,11 @@ struct operation
friend std::ostream& operator<<(std::ostream& os, const operation& op);
};
/// Returns true if operation does not require a context to run compute
bool is_context_free(const operation& x);
/// Returns true if the operation has a finalize method
bool has_finalize(const operation& x);
#else
namespace operation_stream {
......@@ -89,7 +96,7 @@ auto operator==(const T& x, const U& y) -> decltype(x.name() == y.name())
} // namespace operation_equal
template <class T>
auto compute_op(rank<1>,
auto compute_op(rank<2>,
const T& x,
context& ctx,
const shape& output_shape,
......@@ -99,6 +106,14 @@ auto compute_op(rank<1>,
return x.compute(auto_any_cast(ctx), output_shape, input);
}
template <class T>
auto compute_op(
rank<1>, const T& x, context&, const shape& output_shape, const std::vector<argument>& input)
-> decltype(x.compute(output_shape, input))
{
return x.compute(output_shape, input);
}
template <class T>
argument compute_op(rank<0>, const T& x, context&, const shape&, const std::vector<argument>&)
{
......@@ -110,7 +125,53 @@ template <class T>
argument
compute_op(const T& x, context& ctx, const shape& output_shape, const std::vector<argument>& input)
{
return compute_op(rank<1>{}, x, ctx, output_shape, input);
return compute_op(rank<2>{}, x, ctx, output_shape, input);
}
template <class T>
auto compute_op(rank<2>, const T& x, const shape& output_shape, const std::vector<argument>& input)
-> decltype(x.compute(output_shape, input))
{
return x.compute(output_shape, input);
}
template <class T>
auto compute_op(rank<1>, const T& x, const shape& output_shape, const std::vector<argument>& input)
-> decltype(x.compute(auto_any_cast(std::declval<context&>()), output_shape, input))
{
std::string name = x.name();
MIGRAPHX_THROW("Not computable without a context: " + name);
}
template <class T>
argument compute_op(rank<0>, const T& x, const shape&, const std::vector<argument>&)
{
std::string name = x.name();
MIGRAPHX_THROW("Not computable: " + name);
}
template <class T>
argument compute_op(const T& x, const shape& output_shape, const std::vector<argument>& input)
{
return compute_op(rank<2>{}, x, output_shape, input);
}
template <class T>
auto is_context_free_op(rank<1>,
const T& x,
const shape& output_shape,
const std::vector<argument>& input)
-> decltype(x.compute(output_shape, input), std::true_type{});
template <class T>
auto is_context_free_op(rank<0>, const T&, const shape&, const std::vector<argument>&)
-> std::false_type;
template <class T>
auto is_context_free_op(const T& x) -> decltype(is_context_free_op(
rank<1>{}, x, std::declval<const shape&>(), std::declval<std::vector<argument>>()))
{
return {};
}
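The rank<2>/rank<1>/rank<0> overloads are tag dispatching: rank<N> derives from rank<N-1>, so overload resolution prefers the highest-ranked candidate whose trailing decltype survives SFINAE and falls through to the next rank otherwise. A self-contained sketch of the idiom, assuming a rank template like migraphx's:

#include <iostream>

template <int N>
struct rank : rank<N - 1> {};
template <>
struct rank<0> {};

// Preferred overload: viable only when T has t.compute(); otherwise the
// trailing decltype removes it from the overload set (SFINAE).
template <class T>
auto dispatch(rank<1>, T& t) -> decltype(t.compute())
{
    return t.compute();
}

// Fallback chosen when the rank<1> overload drops out.
template <class T>
int dispatch(rank<0>, T&)
{
    return -1;
}

template <class T>
int dispatch(T& t)
{
    return dispatch(rank<1>{}, t); // start at the highest rank
}

struct with_compute { int compute() const { return 42; } };
struct without {};

int main()
{
    with_compute a;
    without b;
    std::cout << dispatch(a) << " " << dispatch(b) << "\n"; // prints: 42 -1
}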
template <class T>
......@@ -132,15 +193,57 @@ int output_alias_op(const T& x, const std::vector<shape>& shapes)
return output_alias_op(rank<1>{}, x, shapes);
}
template <class T>
auto finalize_op(
rank<1>, T& x, context& ctx, const shape& output_shape, const std::vector<shape>& input)
-> decltype(x.finalize(auto_any_cast(ctx), output_shape, input), void())
{
x.finalize(auto_any_cast(ctx), output_shape, input);
}
template <class T>
void finalize_op(rank<0>, T&, context&, const shape&, const std::vector<shape>&)
{
}
template <class T>
void finalize_op(T& x, context& ctx, const shape& output_shape, const std::vector<shape>& input)
{
finalize_op(rank<1>{}, x, ctx, output_shape, input);
}
template <class T>
auto has_finalize_op(
rank<1>, T& x, context& ctx, const shape& output_shape, const std::vector<shape>& input)
-> decltype(x.finalize(auto_any_cast(ctx), output_shape, input), std::true_type{});
template <class T>
auto has_finalize_op(rank<0>, T&, context&, const shape&, const std::vector<shape>&)
-> std::false_type;
template <class T>
auto has_finalize_op(const T&) -> decltype(has_finalize_op(rank<1>{},
std::declval<T&>(),
std::declval<context&>(),
std::declval<const shape&>(),
std::declval<std::vector<shape>>()))
{
return {};
}
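has_finalize_op and is_context_free_op are built the same way, but as compile-time traits: the rank<1> overload is only declared, never defined, because it is used purely inside decltype, and its std::true_type/std::false_type return becomes the trait's value. A sketch of the pattern (has_foo is a hypothetical name):

#include <type_traits>
#include <utility>

template <int N> struct rank : rank<N - 1> {};
template <> struct rank<0> {};

// Declared only; evaluated inside decltype, so no definition is needed.
template <class T>
auto has_foo(rank<1>, T& t) -> decltype(t.foo(), std::true_type{});
template <class T>
auto has_foo(rank<0>, T&) -> std::false_type;

template <class T>
constexpr bool has_foo_v = decltype(has_foo(rank<1>{}, std::declval<T&>()))::value;

struct a { void foo(); };
struct b {};
static_assert(has_foo_v<a>, "a has foo");
static_assert(!has_foo_v<b>, "b does not");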
/*
* Type-erased interface for:
*
* struct operation
* {
* std::string name() const;
* bool is_context_free() const;
* bool has_finalize() const;
* int output_alias(const std::vector<shape>& input) const;
* void finalize(context& ctx,const shape& output,const std::vector<shape>& input) ;
* shape compute_shape(const std::vector<shape>& input) const;
* argument compute(context& ctx,const shape& output,const std::vector<argument>& input) const;
* argument compute(const shape& output,const std::vector<argument>& input) const;
* friend std::ostream & operator<<(std::ostream & os,const operation & op) ;
* friend bool operator==(const operation & x,const operation & y) ;
* };
......@@ -210,12 +313,30 @@ struct operation
return (*this).private_detail_te_get_handle().name();
}
bool is_context_free() const
{
assert((*this).private_detail_te_handle_mem_var);
return (*this).private_detail_te_get_handle().is_context_free();
}
bool has_finalize() const
{
assert((*this).private_detail_te_handle_mem_var);
return (*this).private_detail_te_get_handle().has_finalize();
}
int output_alias(const std::vector<shape>& input) const
{
assert((*this).private_detail_te_handle_mem_var);
return (*this).private_detail_te_get_handle().output_alias(input);
}
void finalize(context& ctx, const shape& output, const std::vector<shape>& input)
{
assert((*this).private_detail_te_handle_mem_var);
(*this).private_detail_te_get_handle().finalize(ctx, output, input);
}
shape compute_shape(const std::vector<shape>& input) const
{
assert((*this).private_detail_te_handle_mem_var);
......@@ -228,6 +349,12 @@ struct operation
return (*this).private_detail_te_get_handle().compute(ctx, output, input);
}
argument compute(const shape& output, const std::vector<argument>& input) const
{
assert((*this).private_detail_te_handle_mem_var);
return (*this).private_detail_te_get_handle().compute(output, input);
}
friend std::ostream& operator<<(std::ostream& os, const operation& op)
{
assert(op.private_detail_te_handle_mem_var);
......@@ -240,6 +367,12 @@ struct operation
return x.private_detail_te_get_handle().operator==(y);
}
friend bool is_shared(const operation& private_detail_x, const operation& private_detail_y)
{
return private_detail_x.private_detail_te_handle_mem_var ==
private_detail_y.private_detail_te_handle_mem_var;
}
private:
struct private_detail_te_handle_base_type
{
......@@ -247,13 +380,18 @@ struct operation
virtual std::shared_ptr<private_detail_te_handle_base_type> clone() const = 0;
virtual const std::type_info& type() const = 0;
virtual std::string name() const = 0;
virtual int output_alias(const std::vector<shape>& input) const = 0;
virtual shape compute_shape(const std::vector<shape>& input) const = 0;
virtual std::string name() const = 0;
virtual bool is_context_free() const = 0;
virtual bool has_finalize() const = 0;
virtual int output_alias(const std::vector<shape>& input) const = 0;
virtual void
finalize(context& ctx, const shape& output, const std::vector<shape>& input) = 0;
virtual shape compute_shape(const std::vector<shape>& input) const = 0;
virtual argument
compute(context& ctx, const shape& output, const std::vector<argument>& input) const = 0;
virtual std::ostream& operator_shift_left(std::ostream& os) const = 0;
virtual bool operator==(const operation& y) const = 0;
compute(context& ctx, const shape& output, const std::vector<argument>& input) const = 0;
virtual argument compute(const shape& output, const std::vector<argument>& input) const = 0;
virtual std::ostream& operator_shift_left(std::ostream& os) const = 0;
virtual bool operator==(const operation& y) const = 0;
};
template <typename PrivateDetailTypeErasedT>
......@@ -286,12 +424,26 @@ struct operation
std::string name() const override { return private_detail_te_value.name(); }
bool is_context_free() const override
{
return is_context_free_op(private_detail_te_value);
}
bool has_finalize() const override { return has_finalize_op(private_detail_te_value); }
int output_alias(const std::vector<shape>& input) const override
{
return output_alias_op(private_detail_te_value, input);
}
void finalize(context& ctx, const shape& output, const std::vector<shape>& input) override
{
finalize_op(private_detail_te_value, ctx, output, input);
}
shape compute_shape(const std::vector<shape>& input) const override
{
......@@ -306,6 +458,12 @@ struct operation
return compute_op(private_detail_te_value, ctx, output, input);
}
argument compute(const shape& output, const std::vector<argument>& input) const override
{
return compute_op(private_detail_te_value, output, input);
}
std::ostream& operator_shift_left(std::ostream& os) const override
{
using migraphx::operation_stream::operator<<;
......@@ -385,6 +543,22 @@ inline const ValueType& any_cast(const operation& x)
inline bool operator!=(const operation& x, const operation& y) { return !(x == y); }
inline bool is_context_free(const operation& op) { return op.is_context_free(); }
template <class T>
bool is_context_free(const T& x)
{
return is_context_free_op(x);
}
inline bool has_finalize(const operation& op) { return op.has_finalize(); }
template <class T>
bool has_finalize(const T& x)
{
return has_finalize_op(x);
}
#endif
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -6,6 +6,8 @@
#include <migraphx/check_shapes.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/streamutils.hpp>
#include <migraphx/literal.hpp>
#include <migraphx/shape_for_each.hpp>
#include <migraphx/config.hpp>
#include <cmath>
#include <utility>
......@@ -16,7 +18,7 @@ namespace op {
struct not_computable
{
argument compute(context&, const shape&, const std::vector<argument>&) const
argument compute(const shape&, const std::vector<argument>&) const
{
MIGRAPHX_THROW("not computable");
}
......@@ -63,6 +65,7 @@ struct convolution
valid
};
padding_mode_t padding_mode = default_;
int group = 1;
template <class Self, class F>
static auto reflect(Self& self, F f)
......@@ -70,7 +73,8 @@ struct convolution
return pack(f(self.padding, "padding"),
f(self.stride, "stride"),
f(self.dilation, "dilation"),
f(self.padding_mode, "padding_mode"));
f(self.padding_mode, "padding_mode"),
f(self.group, "group"));
}
std::string name() const { return "convolution"; }
......@@ -296,7 +300,7 @@ struct transpose
}
return {t, output_lens, output_strides};
}
argument compute(context&, shape output_shape, std::vector<argument> args) const
argument compute(shape output_shape, std::vector<argument> args) const
{
return {std::move(output_shape), std::move(args.front().data)};
}
......@@ -370,6 +374,27 @@ struct concat
new_lens[axis] = new_dim_axis;
return {type, new_lens};
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
std::vector<std::size_t> coffsets = compute_offsets(output_shape, args);
for(std::size_t l = 0; l < args.size(); l++)
{
auto argl = args[l];
std::size_t nelements = argl.get_shape().elements();
visit_all(result, argl)([&](auto output, auto input) {
auto slice_shape =
shape{output_shape.type(), input.get_shape().lens(), output_shape.strides()};
auto slice = make_view(slice_shape, output.data() + coffsets[l]);
// cppcheck-suppress useStlAlgorithm
for(std::size_t i = 0; i < nelements; i++)
{
slice[i] = input[i];
}
});
}
return result;
}
int output_alias(const std::vector<shape>&) const { return 0; }
};
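The new host-side compute builds, for each input, a view over the output buffer that keeps the input's lens but the output's strides, so a plain element-by-element copy scatters values into the right concatenated slots. A worked example of the layout, assuming two {2,2} tensors concatenated on axis 1:

// output shape {2,4}, row-major strides {4,1}
// view for args[0]: lens {2,2}, strides {4,1}, offset 0 -> writes slots 0,1,4,5
// view for args[1]: lens {2,2}, strides {4,1}, offset 2 -> writes slots 2,3,6,7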
......@@ -437,7 +462,7 @@ struct slice
}
return shape{t, new_lens, old_strides};
}
argument compute(context&, shape output_shape, std::vector<argument> args) const
argument compute(shape output_shape, std::vector<argument> args) const
{
auto input = args[0];
auto offset = compute_offset(input.get_shape()) * output_shape.type_size();
......@@ -487,7 +512,7 @@ struct squeeze
}
return shape{type, new_lens};
}
argument compute(context&, shape output_shape, std::vector<argument> args) const
argument compute(shape output_shape, std::vector<argument> args) const
{
return {std::move(output_shape), std::move(args.front().data)};
}
......@@ -526,7 +551,7 @@ struct unsqueeze
}
return shape{type, new_lens};
}
argument compute(context&, shape output_shape, std::vector<argument> args) const
argument compute(shape output_shape, std::vector<argument> args) const
{
return {std::move(output_shape), std::move(args.front().data)};
}
......@@ -578,13 +603,91 @@ struct reshape
MIGRAPHX_THROW("Wrong number of elements for reshape");
return s;
}
argument compute(context&, shape output_shape, std::vector<argument> args) const
argument compute(shape output_shape, std::vector<argument> args) const
{
return {std::move(output_shape), std::move(args.front().data)};
}
int output_alias(const std::vector<shape>&) const { return 0; }
};
struct as_shape
{
shape s;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.s, "shape"));
}
std::string name() const { return "as_shape"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
check_shapes{inputs, *this}.has(1).standard();
assert(inputs.front().elements() == s.elements());
return s;
}
argument compute(shape output_shape, std::vector<argument> args) const
{
return {std::move(output_shape), std::move(args.front().data)};
}
int output_alias(const std::vector<shape>&) const { return 0; }
};
struct gather
{
std::size_t axis = 0;
std::string name() const { return "gather"; }
shape compute_shape(std::vector<shape> inputs) const
{
check_shapes{inputs, *this}.has(2);
auto lens = inputs[0].lens();
if(axis >= lens.size())
{
MIGRAPHX_THROW("Gather, axis is out of range.");
}
auto type = inputs[0].type();
lens[axis] = inputs[1].elements();
return {type, lens};
}
template <class T>
void compute_index(const T& out_idx,
const std::vector<std::size_t>& vec_indices,
const std::size_t max_dim,
T& in_idx) const
{
in_idx = out_idx;
std::size_t idx = vec_indices.at(out_idx[axis]);
if(idx >= max_dim)
{
MIGRAPHX_THROW("Gather: indices are out of range in input tensor");
}
in_idx[axis] = idx;
}
argument compute(const shape& output_shape, std::vector<argument> args) const
{
argument result{output_shape};
// max dimension in axis
std::size_t max_dim = args[0].get_shape().lens()[axis];
std::vector<std::size_t> vec_indices;
args[1].visit([&](auto indices) { vec_indices.assign(indices.begin(), indices.end()); });
visit_all(result, args[0])([&](auto output, auto input) {
std::vector<std::size_t> in_idx;
shape_for_each(output.get_shape(), [&](const auto& idx) {
this->compute_index(idx, vec_indices, max_dim, in_idx);
output(idx.begin(), idx.end()) = input(in_idx.begin(), in_idx.end());
});
});
return result;
}
int output_alias(const std::vector<shape>&) const { return 0; }
};
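gather replaces the axis dimension of the input with the number of indices and pulls slices by index: for a {3,4} input, axis 0, and indices {2,0}, the output is {2,4} with out[i][j] = in[indices[i]][j]. A standalone sketch of that indexing on flat storage (gather_axis0 is a hypothetical helper, not the migraphx API):

#include <cstddef>
#include <vector>

// out[i][j] = in[indices[i]][j] for a rows x cols input gathered on axis 0.
std::vector<float> gather_axis0(const std::vector<float>& in,
                                std::size_t cols,
                                const std::vector<std::size_t>& indices)
{
    std::vector<float> out(indices.size() * cols);
    for(std::size_t i = 0; i < indices.size(); i++)
        for(std::size_t j = 0; j < cols; j++)
            out[i * cols + j] = in[indices[i] * cols + j];
    return out;
}

// gather_axis0({0,1,2,3, 4,5,6,7, 8,9,10,11}, 4, {2,0})
//   -> {8,9,10,11, 0,1,2,3}   (shape {2,4} from a {3,4} input)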
struct dot
{
float alpha = 1.0;
......@@ -624,7 +727,7 @@ struct identity
{
std::string name() const { return "identity"; }
shape compute_shape(std::vector<shape> inputs) const { return inputs.at(0); }
argument compute(context&, shape output_shape, std::vector<argument> args) const
argument compute(shape output_shape, std::vector<argument> args) const
{
return {std::move(output_shape), std::move(args.at(0).data)};
}
......@@ -742,7 +845,7 @@ struct flatten
std::accumulate(lens.begin() + axis, lens.end(), std::size_t{1}, std::multiplies<>{});
return {inputs.at(0).type(), {x, y}};
}
argument compute(context&, shape output_shape, std::vector<argument> args) const
argument compute(shape output_shape, std::vector<argument> args) const
{
return {std::move(output_shape), std::move(args.front().data)};
}
......@@ -794,7 +897,7 @@ struct broadcast
return {t, broadcast_shape.lens(), std::move(bcast_strides)};
}
}
argument compute(context&, shape output_shape, std::vector<argument> args) const
argument compute(shape output_shape, std::vector<argument> args) const
{
return {std::move(output_shape), std::move(args.at(0).data)};
}
......@@ -836,7 +939,7 @@ struct multibroadcast
}
return {t, output_lens, bcast_strides};
}
argument compute(context&, shape output_shape, std::vector<argument> args) const
argument compute(shape output_shape, std::vector<argument> args) const
{
return {std::move(output_shape), std::move(args.at(0).data)};
}
......@@ -858,7 +961,7 @@ struct scalar
return {t, scalar_bcast.lens(), strides};
}
argument compute(context&, shape output_shape, std::vector<argument> args) const
argument compute(shape output_shape, std::vector<argument> args) const
{
return {std::move(output_shape), std::move(args.at(0).data)};
}
......@@ -923,7 +1026,7 @@ struct load
check_shapes{inputs}.has(1);
return s;
}
argument compute(context&, const shape&, const std::vector<argument>& args) const
argument compute(const shape&, const std::vector<argument>& args) const
{
return {s, args[0].data() + offset};
}
......@@ -946,10 +1049,7 @@ struct outline
check_shapes{inputs, *this}.has(0);
return s;
}
argument compute(context&, const shape&, const std::vector<argument>&) const
{
return {s, nullptr};
}
argument compute(const shape&, const std::vector<argument>&) const { return {s, nullptr}; }
};
} // namespace op
......
#ifndef MIGRAPHX_GUARD_RTGLIB_PAR_DFOR_HPP
#define MIGRAPHX_GUARD_RTGLIB_PAR_DFOR_HPP
#include <migraphx/par_for.hpp>
#include <migraphx/functional.hpp>
#include <array>
#include <numeric>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
template <class... Ts>
auto par_dfor(Ts... xs)
{
return [=](auto f) {
using array_type = std::array<std::size_t, sizeof...(Ts)>;
array_type lens = {{static_cast<std::size_t>(xs)...}};
auto n = std::accumulate(lens.begin(), lens.end(), 1, std::multiplies<std::size_t>{});
const std::size_t min_grain = 8;
if(n > 2 * min_grain)
{
array_type strides;
strides.fill(1);
std::partial_sum(lens.rbegin(),
lens.rend() - 1,
strides.rbegin() + 1,
std::multiplies<std::size_t>());
auto size =
std::accumulate(lens.begin(), lens.end(), 1, std::multiplies<std::size_t>());
par_for(size, min_grain, [&](std::size_t i) {
array_type indices;
std::transform(strides.begin(),
strides.end(),
lens.begin(),
indices.begin(),
[&](size_t stride, size_t len) { return (i / stride) % len; });
migraphx::unpack(f, indices);
});
}
else
{
dfor(xs...)(f);
}
};
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
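par_dfor flattens the nested loops into a single parallel loop of size len0*len1*..., then recovers each multi-dimensional index with index_k = (i / stride_k) % len_k using row-major strides. A small check of that arithmetic:

#include <array>
#include <cassert>
#include <cstddef>

int main()
{
    const std::array<std::size_t, 3> lens{{2, 3, 4}};
    const std::array<std::size_t, 3> strides{{12, 4, 1}}; // row-major partial products
    const std::size_t i = 17;
    std::array<std::size_t, 3> idx{};
    for(std::size_t k = 0; k < 3; k++)
        idx[k] = (i / strides[k]) % lens[k];
    assert((idx == std::array<std::size_t, 3>{{1, 1, 1}})); // 1*12 + 1*4 + 1*1 == 17
}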
#ifndef MIGRAPHX_GUARD_RTGLIB_PAR_FOR_HPP
#define MIGRAPHX_GUARD_RTGLIB_PAR_FOR_HPP
#include <thread>
#include <cmath>
#include <algorithm>
#include <vector>
#include <cassert>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct joinable_thread : std::thread
{
template <class... Xs>
joinable_thread(Xs&&... xs) : std::thread(std::forward<Xs>(xs)...) // NOLINT
{
}
joinable_thread& operator=(joinable_thread&& other) = default;
joinable_thread(joinable_thread&& other) = default;
~joinable_thread()
{
if(this->joinable())
this->join();
}
};
template <class F>
void par_for_impl(std::size_t n, std::size_t threadsize, F f)
{
if(threadsize <= 1)
{
for(std::size_t i = 0; i < n; i++)
f(i);
}
else
{
std::vector<joinable_thread> threads(threadsize);
// Using const here causes gcc 5 to ICE
#if(!defined(__GNUC__) || __GNUC__ != 5)
const
#endif
std::size_t grainsize = std::ceil(static_cast<double>(n) / threads.size());
std::size_t work = 0;
std::generate(threads.begin(), threads.end(), [=, &work] {
auto result = joinable_thread([=] {
std::size_t start = work;
std::size_t last = std::min(n, work + grainsize);
for(std::size_t i = start; i < last; i++)
{
f(i);
}
});
work += grainsize;
return result;
});
assert(work >= n);
}
}
template <class F>
void par_for(std::size_t n, std::size_t min_grain, F f)
{
const auto threadsize =
std::min<std::size_t>(std::thread::hardware_concurrency(), n / min_grain);
par_for_impl(n, threadsize, f);
}
template <class F>
void par_for(std::size_t n, F f)
{
const int min_grain = 8;
par_for(n, min_grain, f);
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
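par_for_impl splits n items into contiguous chunks of grainsize = ceil(n / threadsize), one chunk per thread, with the last chunk clamped to n; par_for itself caps threadsize at n / min_grain so no thread gets fewer than min_grain items. A sketch of the resulting partition for n = 10 across 4 threads:

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>

int main()
{
    const std::size_t n = 10, threads = 4;
    const std::size_t grainsize = std::ceil(static_cast<double>(n) / threads);
    for(std::size_t work = 0; work < n; work += grainsize)
        std::cout << "[" << work << ", " << std::min(n, work + grainsize) << ") ";
    std::cout << "\n"; // prints: [0, 3) [3, 6) [6, 9) [9, 10)
}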
......@@ -105,7 +105,13 @@ struct pass
void apply(program& p) const
{
assert((*this).private_detail_te_handle_mem_var);
return (*this).private_detail_te_get_handle().apply(p);
(*this).private_detail_te_get_handle().apply(p);
}
friend bool is_shared(const pass& private_detail_x, const pass& private_detail_y)
{
return private_detail_x.private_detail_te_handle_mem_var ==
private_detail_y.private_detail_te_handle_mem_var;
}
private:
......@@ -149,7 +155,7 @@ struct pass
std::string name() const override { return private_detail_te_value.name(); }
void apply(program& p) const override { return private_detail_te_value.apply(p); }
void apply(program& p) const override { private_detail_te_value.apply(p); }
PrivateDetailTypeErasedT private_detail_te_value;
};
......
......@@ -91,10 +91,14 @@ struct program
shape get_shape() const;
context& get_context() const;
instruction_ref validate() const;
void compile(const target& t, tracer trace = tracer{});
void finalize();
void perf_report(std::ostream& os, std::size_t n, parameter_map params) const;
void debug_print() const;
......
......@@ -127,6 +127,12 @@ struct target
return (*this).private_detail_te_get_handle().get_context();
}
friend bool is_shared(const target& private_detail_x, const target& private_detail_y)
{
return private_detail_x.private_detail_te_handle_mem_var ==
private_detail_y.private_detail_te_handle_mem_var;
}
private:
struct private_detail_te_handle_base_type
{
......
......@@ -162,25 +162,54 @@ void instruction::replace_argument(instruction_ref old, instruction_ref new_ins)
old->remove_output(*this);
}
std::vector<shape> compute_shapes(const std::vector<instruction_ref>& args)
argument instruction::eval() const
{
std::vector<shape> shapes(args.size());
std::transform(
args.begin(), args.end(), shapes.begin(), [](instruction_ref i) { return i->get_shape(); });
return shapes;
if(op.name() == "@literal")
{
return this->get_literal().get_argument();
}
if(is_context_free(op))
{
std::vector<argument> args;
for(auto&& arg : this->inputs())
{
argument a = arg->eval();
if(a.empty())
return {};
args.push_back(a);
}
return op.compute(result, args);
}
return {};
}
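eval is recursive constant folding: a @literal yields its stored argument, a context-free op folds only if every input folds to a non-empty argument, and anything that needs a context yields an empty argument. An illustration of what does and does not fold, on hypothetical program fragments:

// @literal{2.0f} + @literal{3.0f} via add  -> eval() == argument holding 5.0f
// @param{"x"} + @literal{3.0f} via add     -> eval() == {}  (x is unknown)
// any op whose compute needs a context     -> eval() == {}  (not context-free)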
instruction_ref instruction::get_output_alias(instruction_ref ins)
void instruction::finalize(context& ctx)
{
auto i = ins->get_operator().output_alias(compute_shapes(ins->inputs()));
if(has_finalize(this->op))
this->op.finalize(ctx, this->get_shape(), to_shapes(this->inputs()));
}
instruction_ref instruction::get_output_alias(instruction_ref ins, bool shallow)
{
auto i = ins->get_operator().output_alias(to_shapes(ins->inputs()));
if(i < 0)
return ins;
if(shallow)
return ins->inputs().at(i);
return get_output_alias(ins->inputs().at(i));
}
std::vector<shape> to_shapes(const std::vector<instruction_ref>& args)
{
std::vector<shape> shapes(args.size());
std::transform(
args.begin(), args.end(), shapes.begin(), [](instruction_ref i) { return i->get_shape(); });
return shapes;
}
shape compute_shape(const operation& op, const std::vector<instruction_ref>& args)
{
return op.compute_shape(compute_shapes(args));
return op.compute_shape(to_shapes(args));
}
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -80,6 +80,9 @@ struct onnx_parser
add_mem_op("Unsqueeze", &onnx_parser::parse_unsqueeze);
add_mem_op("Slice", &onnx_parser::parse_slice);
add_mem_op("Concat", &onnx_parser::parse_concat);
add_mem_op("Gather", &onnx_parser::parse_gather);
add_mem_op("Shape", &onnx_parser::parse_shape);
add_mem_op("ConstantFill", &onnx_parser::parse_constant_fill);
add_mem_op("Transpose", &onnx_parser::parse_transpose);
}
......@@ -148,7 +151,7 @@ struct onnx_parser
if(s0->size() > s1->size())
std::swap(s0, s1);
std::vector<std::size_t> output_lens(s1->size());
std::vector<std::size_t> output_lens(*s1);
auto offset = s1->size() - s0->size();
std::transform(s0->begin(),
s0->end(),
......@@ -241,6 +244,10 @@ struct onnx_parser
op.padding_mode = op::convolution::same;
}
}
if(contains(attributes, "group"))
{
op.group = parse_value(attributes.at("group")).at<int>();
}
if(args.size() == 3)
{
uint64_t axis = 1;
......@@ -350,6 +357,18 @@ struct onnx_parser
return prog.add_instruction(op, std::move(args));
}
instruction_ref
parse_gather(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
{
std::size_t axis = 0;
if(contains(attributes, "axis"))
{
axis = parse_value(attributes.at("axis")).at<int>();
}
op::gather op{axis};
return prog.add_instruction(op, std::move(args));
}
instruction_ref
parse_slice(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
{
......@@ -382,7 +401,7 @@ struct onnx_parser
parse_gemm(const std::string&, attribute_map attributes, std::vector<instruction_ref> args)
{
float alpha = 1.0f;
float beta = 0.0f;
float beta = 1.0f;
bool transa = false;
bool transb = false;
if(contains(attributes, "alpha"))
......@@ -406,10 +425,20 @@ struct onnx_parser
auto l2 = (transb) ? prog.add_instruction(op::transpose{perm}, args[1]) : args[1];
if(args.size() == 3)
{
uint64_t axis = 1;
auto l3 = prog.add_instruction(op::dot{alpha, beta}, l1, l2);
auto l4 = prog.add_instruction(op::broadcast{axis, l3->get_shape()}, args[2]);
return prog.add_instruction(op::add{}, l3, l4);
if(beta != 0.f)
{
auto l3 = prog.add_instruction(op::dot{alpha}, l1, l2);
auto l4 = args[2];
if(l4->get_shape().scalar()) // ignore args[2] (no C value added to alpha*A*B)
return l3;
if(beta != 1.f)
{
auto beta_val = prog.add_literal(beta);
auto l5 = prog.add_instruction(op::scalar{args[2]->get_shape()}, beta_val);
l4 = prog.add_instruction(op::mul{}, args[2], l5);
}
return add_broadcastable_binary_op(l3, l4, op::add{});
}
}
return prog.add_instruction(op::dot{alpha, beta}, l1, l2);
}
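ONNX Gemm computes Y = alpha*A*B + beta*C, and the ONNX default for beta is 1, not the 0 used before; the rewrite also folds beta into C with an explicit scalar multiply and drops C entirely when beta == 0. A numeric check of the decomposition on plain floats (a sketch, not the parser code):

#include <cassert>

int main()
{
    const float alpha = 2.0f, beta = 0.5f;
    const float a = 3.0f, b = 4.0f, c = 10.0f;
    const float l3 = alpha * (a * b); // op::dot{alpha} -> alpha*A*B
    const float l4 = beta * c;        // scalar-broadcast beta times args[2]
    assert(l3 + l4 == 29.0f);         // add_broadcastable_binary_op(..., op::add{})
}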
......@@ -509,6 +538,99 @@ struct onnx_parser
return prog.add_instruction(migraphx::op::transpose{perm}, args.front());
}
// Use a literal instruction to replace the shape operator, since the output of
// the shape operator is a literal in migraphx
instruction_ref
parse_shape(const std::string&, const attribute_map&, std::vector<instruction_ref> args)
{
if(args.size() != 1)
MIGRAPHX_THROW("Shape: operator should have 1 operand");
std::vector<std::size_t> arg_shape = args[0]->get_shape().lens();
std::vector<int64_t> vec_shape(arg_shape.size());
migraphx::shape s(migraphx::shape::int64_type, {arg_shape.size()});
std::transform(arg_shape.begin(), arg_shape.end(), vec_shape.begin(), [](auto i) {
return int64_t(i);
});
return prog.add_literal(migraphx::literal{s, vec_shape});
}
// Use a literal instruction to replace the ConstantFill operator. In RNN, the input
// shape and value are fixed, so there is no need to do the actual computation for the
// ConstantFill operator
instruction_ref parse_constant_fill(const std::string&,
attribute_map attributes,
std::vector<instruction_ref> args)
{
int input_as_shape = 0;
int dtype = 1;
float value = 0.0f;
if(contains(attributes, "dtype"))
{
dtype = parse_value(attributes.at("dtype")).at<int>();
}
migraphx::shape::type_t type = get_type(dtype);
if(contains(attributes, "input_as_shape"))
{
input_as_shape = parse_value(attributes.at("input_as_shape")).at<int>();
}
if(contains(attributes, "value"))
{
value = parse_value(attributes.at("value")).at<float>();
}
if(contains(attributes, "extra_shape"))
{
MIGRAPHX_THROW("ConstantFill: cannot handle extra shape attribute");
}
if(input_as_shape == 1)
{
if(args.size() != 1)
{
MIGRAPHX_THROW("ConstantFill: need an input argument as output shape");
}
if(contains(attributes, "shape"))
{
MIGRAPHX_THROW("ConstantFill: cannot set the shape argument and pass in an input "
"at the same time");
}
migraphx::argument in = args[0]->eval();
if(in.empty())
{
MIGRAPHX_THROW("ConstantFill: cannot handle dynamic shape as input");
}
std::vector<std::size_t> dims;
in.visit([&](auto input) { dims.assign(input.begin(), input.end()); });
migraphx::shape s(type, dims);
std::vector<float> values(s.elements(), value);
return prog.add_literal(migraphx::literal(s, values));
}
else if(input_as_shape == 0)
{
if(!contains(attributes, "shape"))
{
MIGRAPHX_THROW("ConstantFill: attribute output shape is needed");
}
literal ls = parse_value(attributes.at("shape"));
std::vector<std::size_t> dims;
ls.visit([&](auto s) { dims.assign(s.begin(), s.end()); });
migraphx::shape s{type, dims};
std::vector<float> values(s.elements(), value);
return prog.add_literal(migraphx::literal(s, values));
}
else
{
MIGRAPHX_THROW("ConstantFill: wrong value of attribute input_as_shape");
}
}
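With input_as_shape == 1 the operator's single input is evaluated at parse time (via instruction::eval above) and read as the output dims; with input_as_shape == 0 the dims come from the shape attribute. Either way the result is a literal filled with value. A minimal sketch of the fill itself (constant_fill is a hypothetical helper):

#include <cstddef>
#include <functional>
#include <numeric>
#include <vector>

// Build the flat data for a tensor of the given dims filled with one value.
std::vector<float> constant_fill(const std::vector<std::size_t>& dims, float value)
{
    auto elements = std::accumulate(
        dims.begin(), dims.end(), std::size_t{1}, std::multiplies<>{});
    return std::vector<float>(elements, value);
}

// constant_fill({2, 3}, 1.5f) -> six elements, all 1.5f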
void parse_from(std::istream& is)
{
onnx::ModelProto model;
......@@ -758,6 +880,28 @@ struct onnx_parser
});
return {shape_type, dims};
}
shape::type_t get_type(int dtype)
{
switch(dtype)
{
case 1: return shape::float_type;
case 2: return shape::uint8_type;
case 3: return shape::int8_type;
case 4: return shape::uint16_type;
case 5: return shape::int16_type;
case 6: return shape::int32_type;
case 7: return shape::int64_type;
case 10: return shape::half_type;
case 11: return shape::double_type;
case 12: return shape::uint32_type;
case 13: return shape::uint64_type;
default:
{
MIGRAPHX_THROW("Prototensor data type " + std::to_string(dtype) + " not supported");
}
}
}
};
program parse_onnx(const std::string& name)
......
......@@ -271,6 +271,8 @@ instruction_ref program::end() const { return impl->instructions.end(); }
shape program::get_shape() const { return impl->instructions.back().get_shape(); }
context& program::get_context() const { return impl->ctx; }
instruction_ref program::validate() const
{
return std::find_if(impl->instructions.begin(),
......@@ -309,6 +311,15 @@ void program::compile(const target& t, tracer trace)
auto index = std::distance(impl->instructions.begin(), invalid);
MIGRAPHX_THROW("Invalid program from compilation at instruction " + std::to_string(index));
}
this->finalize();
}
void program::finalize()
{
for(auto ins : iterator_for(*this))
{
ins->finalize(this->impl->ctx);
}
}
template <class F>
......
......@@ -9,7 +9,18 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
bool is_reshaper(const std::string& name)
// Reshapers that can't handle nonstandard input shapes
bool is_nonstandard_reshaper(instruction_ref ins)
{
// clang-format off
static const std::unordered_set<std::string> names = {
"reshape"
};
// clang-format on
return contains(names, ins->name()) and ins->inputs().front()->name() == "contiguous";
}
bool is_reshaper(instruction_ref ins)
{
// clang-format off
static const std::unordered_set<std::string> names = {
......@@ -19,26 +30,27 @@ bool is_reshaper(const std::string& name)
"contiguous"
};
// clang-format on
return contains(names, name);
return contains(names, ins->name()) and not is_nonstandard_reshaper(ins);
}
void simplify_reshapes::apply(program& p) const
{
for(auto ins : iterator_for(p))
{
if(not is_reshaper(ins->name()))
if(not is_reshaper(ins))
continue;
if(ins->outputs().size() != 1)
continue;
if(is_reshaper(ins->outputs().front()->name()))
if(is_reshaper(ins->outputs().front()))
continue;
// Gather reshapes
std::vector<instruction_ref> reshapes{ins};
while(is_reshaper(reshapes.back()->name()))
while(is_reshaper(reshapes.back()))
{
assert(!reshapes.back()->inputs().empty());
assert(p.has_instruction(reshapes.back()->inputs().front()));
reshapes.push_back(reshapes.back()->inputs().front());
auto input = reshapes.back()->inputs().front();
reshapes.push_back(input);
}
std::pair<instruction_ref, instruction_ref> r{p.end(), p.end()};
......@@ -58,6 +70,13 @@ void simplify_reshapes::apply(program& p) const
p.replace_instruction(r.first, r.second);
}
}
// Replace all reshapes with as_shape
for(auto ins : iterator_for(p))
{
if(ins->name() != "reshape")
continue;
p.replace_instruction(ins, op::as_shape{ins->get_shape()}, ins->inputs());
}
}
} // namespace MIGRAPHX_INLINE_NS
......