Merge pull request #175 from ROCmSoftwarePlatform/stream_execution_checkin

Stream execution checkin

Merge pull request #175 from ROCmSoftwarePlatform/stream_execution_checkin
Stream execution checkin
2b0b77f1 · Paul Fultz II · GitHub · fa38a3a6 · 7ede2b3b · 2b0b77f1
Unverified Commit 2b0b77f1 authored Mar 12, 2019 by Paul Fultz II Committed by GitHub Mar 12, 2019
20 changed files
--- a/src/CMakeLists.txt
+++ b/src/CMakeLists.txt
@@ -17,6 +17,7 @@ add_library(migraphx
    instruction.cpp
    program.cpp
    shape.cpp
+    schedule.cpp
    simplify_algebra.cpp
    simplify_reshapes.cpp
    opt/memory_coloring.cpp

--- a/src/dead_code_elimination.cpp
+++ b/src/dead_code_elimination.cpp
@@ -41,9 +41,9 @@ void dead_code_elimination::apply(program& p) const
        // Skip the last instruction
        if(i == last)
            break;
-        // Skip instruction with empty shape as output unless its a builtin or undefined
-        if(i->get_shape().elements() == 0 and not(i->name().front() == '@') and
-           not(i->name() == "undefined"))
+        // Skip instruction with empty shape as output unless its a builtin or undefined or identity
+        if(i->get_shape().elements() == 0 and i->name().front() != '@' and
+           i->name() != "undefined" and i->name() != "identity")
            continue;
        assert(bidistance(p, i, last) > 0);
        fix([&](auto self, auto leaf) {

--- a/src/include/migraphx/functional.hpp
+++ b/src/include/migraphx/functional.hpp
@@ -137,6 +137,18 @@ auto fold(F f)
    return [=](auto&&... xs) { return fold_impl(f, std::forward<decltype(xs)>(xs)...); };
 }

+template <class F, class Proj>
+auto by(F f, Proj proj)
+{
+    return [=](auto&&... xs) { return f(proj(std::forward<decltype(xs)>(xs))...); };
+}
+
+template <class T>
+auto index_of(T& x)
+{
+    return [&](auto&& y) { return x[y]; };
+}
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx


--- a/src/include/migraphx/iterator_for.hpp
+++ b/src/include/migraphx/iterator_for.hpp
@@ -3,21 +3,64 @@

 #include <cassert>
 #include <type_traits>
+#include <iterator>
 #include <migraphx/config.hpp>

 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

-template <class T>
+struct iterator_for_select
+{
+    template <class T>
+    static T deref(T x)
+    {
+        return x;
+    }
+
+    template <class T>
+    static auto begin(T* x)
+    {
+        return x->begin();
+    }
+
+    template <class T>
+    static auto end(T* x)
+    {
+        return x->end();
+    }
+};
+
+struct iterator_for_select_reverse
+{
+    template <class T>
+    static auto deref(T x)
+    {
+        return std::prev(x.base());
+    }
+
+    template <class T>
+    static auto begin(T* x)
+    {
+        return std::make_reverse_iterator(x->end());
+    }
+
+    template <class T>
+    static auto end(T* x)
+    {
+        return std::make_reverse_iterator(x->begin());
+    }
+};
+
+template <class T, class Selector = iterator_for_select>
 struct iterator_for_range
 {
    T* base;
-    using base_iterator = std::remove_reference_t<decltype(base->begin())>;
+    using base_iterator = std::remove_reference_t<decltype(Selector::begin(base))>;

    struct iterator
    {
        base_iterator i;
-        base_iterator operator*() const { return i; }
+        auto operator*() const { return Selector::deref(i); }
        base_iterator operator++() { return ++i; }
        bool operator!=(const iterator& rhs) const { return i != rhs.i; }
    };
@@ -25,12 +68,12 @@ struct iterator_for_range
    iterator begin()
    {
        assert(base != nullptr);
-        return {base->begin()};
+        return {Selector::begin(base)};
    }
    iterator end()
    {
        assert(base != nullptr);
-        return {base->end()};
+        return {Selector::end(base)};
    }
 };
 template <class T>
@@ -39,6 +82,12 @@ iterator_for_range<T> iterator_for(T& x)
    return {&x};
 }

+template <class T>
+iterator_for_range<T, iterator_for_select_reverse> reverse_iterator_for(T& x)
+{
+    return {&x};
+}
+
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx


--- a/src/include/migraphx/operators.hpp
+++ b/src/include/migraphx/operators.hpp
@@ -1174,9 +1174,19 @@ struct load
    }
    argument compute(const shape&, const std::vector<argument>& args) const
    {
+        if((offset + s.bytes()) > args[0].get_shape().bytes())
+            MIGRAPHX_THROW("Load access is out of bounds");
        return {s, args[0].data() + offset};
    }
    int output_alias(const std::vector<shape>&) const { return 0; }
+
+    friend std::ostream& operator<<(std::ostream& os, const load& op)
+    {
+        os << op.name() << "[";
+        os << "offset=" << op.offset << ",";
+        os << "end=" << (op.offset + op.s.bytes()) << "]";
+        return os;
+    }
 };

 struct outline

--- a/src/include/migraphx/program.hpp
+++ b/src/include/migraphx/program.hpp
@@ -9,6 +9,7 @@
 #include <migraphx/instruction_ref.hpp>
 #include <migraphx/target.hpp>
 #include <migraphx/tracer.hpp>
+#include <migraphx/env.hpp>
 #include <migraphx/config.hpp>
 #include <algorithm>
 #include <iostream>
@@ -16,6 +17,9 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_COMPILE)
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_EVAL)
+
 struct program_impl;

 const operation& get_operation(instruction_ref ins);
@@ -107,6 +111,8 @@ struct program

    void dry_run(parameter_map params) const;

+    void annotate(std::ostream& os, std::function<void(instruction_ref)> a) const;
+
    friend std::ostream& operator<<(std::ostream& os, const program& p);
    friend bool operator==(const program& x, const program& y);
    friend bool operator!=(const program& x, const program& y) { return !(x == y); }

--- a/src/include/migraphx/schedule.hpp
+++ b/src/include/migraphx/schedule.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_SCHEDULE_HPP
+#define MIGRAPHX_GUARD_RTGLIB_SCHEDULE_HPP
+
+#include <string>
+#include <migraphx/instruction_ref.hpp>
+#include <migraphx/schedule_model.hpp>
+#include <migraphx/config.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+struct program;
+
+/**
+ * Schedule instructions for concurrent execution
+ */
+struct schedule
+{
+    schedule_model model{};
+    std::string name() const { return "schedule"; }
+    void apply(program& p) const;
+};
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/include/migraphx/schedule_model.hpp
+++ b/src/include/migraphx/schedule_model.hpp
+#ifndef MIGRAPHX_GUARD_SCHEDULE_MODEL_HPP
+#define MIGRAPHX_GUARD_SCHEDULE_MODEL_HPP
+
+#include <cassert>
+#include <string>
+#include <functional>
+#include <memory>
+#include <type_traits>
+#include <utility>
+
+#include <migraphx/config.hpp>
+#include <migraphx/instruction_ref.hpp>
+#include <vector>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+struct program;
+struct operation;
+
+#ifdef DOXYGEN
+
+/// An interface for target-dependent model for the scheduler
+struct schedule_model
+{
+    /// Get the number of concurrent instruction allowed
+    std::size_t concurrency() const;
+    /// Schedule a concurrent instruction
+    void sched(program& p, instruction_ref ins, std::size_t n) const;
+    // Insert necessary waits before an instruction
+    void wait(program& p, instruction_ref ins, std::size_t wait_id) const;
+    // Insert necessary records after an instruction
+    void record(program& p, instruction_ref ins, std::size_t wait_id) const;
+    /// Compute weights for an operation
+    std::size_t weight(const operation& op) const;
+};
+
+#else
+
+/*
+ * Type-erased interface for:
+ *
+ * struct schedule_model
+ * {
+ *      std::size_t concurrency() const;
+ *      void sched(program& p,instruction_ref ins,std::size_t n) const;
+ *      void wait(program& p,instruction_ref ins,std::size_t wait_id) const;
+ *      void record(program& p,instruction_ref ins,std::size_t wait_id) const;
+ *      std::size_t weight(const operation& op) const;
+ * };
+ *
+ */
+
+struct schedule_model
+{
+    // Constructors
+    schedule_model() = default;
+
+    template <typename PrivateDetailTypeErasedT>
+    schedule_model(PrivateDetailTypeErasedT value)
+        : private_detail_te_handle_mem_var(
+              std::make_shared<private_detail_te_handle_type<
+                  typename std::remove_reference<PrivateDetailTypeErasedT>::type>>(
+                  std::forward<PrivateDetailTypeErasedT>(value)))
+    {
+    }
+
+    // Assignment
+    template <typename PrivateDetailTypeErasedT>
+    schedule_model& operator=(PrivateDetailTypeErasedT value)
+    {
+        if(private_detail_te_handle_mem_var.unique())
+            *private_detail_te_handle_mem_var = std::forward<PrivateDetailTypeErasedT>(value);
+        else if(!private_detail_te_handle_mem_var)
+            private_detail_te_handle_mem_var = std::make_shared<PrivateDetailTypeErasedT>(
+                std::forward<PrivateDetailTypeErasedT>(value));
+        return *this;
+    }
+
+    // Cast
+    template <typename PrivateDetailTypeErasedT>
+    PrivateDetailTypeErasedT* any_cast()
+    {
+        return private_detail_te_get_handle().type() == typeid(PrivateDetailTypeErasedT)
+                   ? std::addressof(static_cast<private_detail_te_handle_type<
+                                        typename std::remove_cv<PrivateDetailTypeErasedT>::type>&>(
+                                        private_detail_te_get_handle())
+                                        .private_detail_te_value)
+                   : nullptr;
+    }
+
+    template <typename PrivateDetailTypeErasedT>
+    const typename std::remove_cv<PrivateDetailTypeErasedT>::type* any_cast() const
+    {
+        return private_detail_te_get_handle().type() == typeid(PrivateDetailTypeErasedT)
+                   ? std::addressof(static_cast<const private_detail_te_handle_type<
+                                        typename std::remove_cv<PrivateDetailTypeErasedT>::type>&>(
+                                        private_detail_te_get_handle())
+                                        .private_detail_te_value)
+                   : nullptr;
+    }
+
+    const std::type_info& type_id() const
+    {
+        if(private_detail_te_handle_empty())
+            return typeid(std::nullptr_t);
+        else
+            return private_detail_te_get_handle().type();
+    }
+
+    std::size_t concurrency() const
+    {
+        assert((*this).private_detail_te_handle_mem_var);
+        return (*this).private_detail_te_get_handle().concurrency();
+    }
+
+    void sched(program& p, instruction_ref ins, std::size_t n) const
+    {
+        assert((*this).private_detail_te_handle_mem_var);
+        (*this).private_detail_te_get_handle().sched(p, ins, n);
+    }
+
+    void wait(program& p, instruction_ref ins, std::size_t wait_id) const
+    {
+        assert((*this).private_detail_te_handle_mem_var);
+        (*this).private_detail_te_get_handle().wait(p, ins, wait_id);
+    }
+
+    void record(program& p, instruction_ref ins, std::size_t wait_id) const
+    {
+        assert((*this).private_detail_te_handle_mem_var);
+        (*this).private_detail_te_get_handle().record(p, ins, wait_id);
+    }
+
+    std::size_t weight(const operation& op) const
+    {
+        assert((*this).private_detail_te_handle_mem_var);
+        return (*this).private_detail_te_get_handle().weight(op);
+    }
+
+    friend bool is_shared(const schedule_model& private_detail_x,
+                          const schedule_model& private_detail_y)
+    {
+        return private_detail_x.private_detail_te_handle_mem_var ==
+               private_detail_y.private_detail_te_handle_mem_var;
+    }
+
+    private:
+    struct private_detail_te_handle_base_type
+    {
+        virtual ~private_detail_te_handle_base_type() {}
+        virtual std::shared_ptr<private_detail_te_handle_base_type> clone() const = 0;
+        virtual const std::type_info& type() const                                = 0;
+
+        virtual std::size_t concurrency() const                                         = 0;
+        virtual void sched(program& p, instruction_ref ins, std::size_t n) const        = 0;
+        virtual void wait(program& p, instruction_ref ins, std::size_t wait_id) const   = 0;
+        virtual void record(program& p, instruction_ref ins, std::size_t wait_id) const = 0;
+        virtual std::size_t weight(const operation& op) const                           = 0;
+    };
+
+    template <typename PrivateDetailTypeErasedT>
+    struct private_detail_te_handle_type : private_detail_te_handle_base_type
+    {
+        template <typename PrivateDetailTypeErasedU = PrivateDetailTypeErasedT>
+        private_detail_te_handle_type(
+            PrivateDetailTypeErasedT value,
+            typename std::enable_if<std::is_reference<PrivateDetailTypeErasedU>::value>::type* =
+                nullptr)
+            : private_detail_te_value(value)
+        {
+        }
+
+        template <typename PrivateDetailTypeErasedU = PrivateDetailTypeErasedT>
+        private_detail_te_handle_type(
+            PrivateDetailTypeErasedT value,
+            typename std::enable_if<!std::is_reference<PrivateDetailTypeErasedU>::value,
+                                    int>::type* = nullptr) noexcept
+            : private_detail_te_value(std::move(value))
+        {
+        }
+
+        std::shared_ptr<private_detail_te_handle_base_type> clone() const override
+        {
+            return std::make_shared<private_detail_te_handle_type>(private_detail_te_value);
+        }
+
+        const std::type_info& type() const override { return typeid(private_detail_te_value); }
+
+        std::size_t concurrency() const override { return private_detail_te_value.concurrency(); }
+
+        void sched(program& p, instruction_ref ins, std::size_t n) const override
+        {
+
+            private_detail_te_value.sched(p, ins, n);
+        }
+
+        void wait(program& p, instruction_ref ins, std::size_t wait_id) const override
+        {
+
+            private_detail_te_value.wait(p, ins, wait_id);
+        }
+
+        void record(program& p, instruction_ref ins, std::size_t wait_id) const override
+        {
+
+            private_detail_te_value.record(p, ins, wait_id);
+        }
+
+        std::size_t weight(const operation& op) const override
+        {
+
+            return private_detail_te_value.weight(op);
+        }
+
+        PrivateDetailTypeErasedT private_detail_te_value;
+    };
+
+    template <typename PrivateDetailTypeErasedT>
+    struct private_detail_te_handle_type<std::reference_wrapper<PrivateDetailTypeErasedT>>
+        : private_detail_te_handle_type<PrivateDetailTypeErasedT&>
+    {
+        private_detail_te_handle_type(std::reference_wrapper<PrivateDetailTypeErasedT> ref)
+            : private_detail_te_handle_type<PrivateDetailTypeErasedT&>(ref.get())
+        {
+        }
+    };
+
+    bool private_detail_te_handle_empty() const
+    {
+        return private_detail_te_handle_mem_var == nullptr;
+    }
+
+    const private_detail_te_handle_base_type& private_detail_te_get_handle() const
+    {
+        assert(private_detail_te_handle_mem_var != nullptr);
+        return *private_detail_te_handle_mem_var;
+    }
+
+    private_detail_te_handle_base_type& private_detail_te_get_handle()
+    {
+        assert(private_detail_te_handle_mem_var != nullptr);
+        if(!private_detail_te_handle_mem_var.unique())
+            private_detail_te_handle_mem_var = private_detail_te_handle_mem_var->clone();
+        return *private_detail_te_handle_mem_var;
+    }
+
+    std::shared_ptr<private_detail_te_handle_base_type> private_detail_te_handle_mem_var;
+};
+
+template <typename ValueType>
+inline const ValueType* any_cast(const schedule_model* x)
+{
+    return x->any_cast<ValueType>();
+}
+
+template <typename ValueType>
+inline ValueType* any_cast(schedule_model* x)
+{
+    return x->any_cast<ValueType>();
+}
+
+template <typename ValueType>
+inline ValueType& any_cast(schedule_model& x)
+{
+    auto* y = x.any_cast<typename std::remove_reference<ValueType>::type>();
+    if(y == nullptr)
+        throw std::bad_cast();
+    return *y;
+}
+
+template <typename ValueType>
+inline const ValueType& any_cast(const schedule_model& x)
+{
+    const auto* y = x.any_cast<typename std::remove_reference<ValueType>::type>();
+    if(y == nullptr)
+        throw std::bad_cast();
+    return *y;
+}
+
+#endif
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/include/migraphx/streamutils.hpp
+++ b/src/include/migraphx/streamutils.hpp
@@ -36,6 +36,8 @@ inline stream_range_container<Range> stream_range(const Range& r)

 namespace detail {

+inline void stream_write_value_impl(rank<2>, std::ostream& os, const std::string& x) { os << x; }
+
 template <class Range>
 auto stream_write_value_impl(rank<1>, std::ostream& os, const Range& r)
    -> decltype(r.begin(), r.end(), void())
@@ -53,7 +55,7 @@ void stream_write_value_impl(rank<0>, std::ostream& os, const T& x)
 template <class T>
 void stream_write_value(std::ostream& os, const T& x)
 {
-    detail::stream_write_value_impl(rank<1>{}, os, x);
+    detail::stream_write_value_impl(rank<2>{}, os, x);
 }

 } // namespace MIGRAPHX_INLINE_NS

--- a/src/include/migraphx/target.hpp
+++ b/src/include/migraphx/target.hpp
@@ -22,10 +22,8 @@ struct target
 {
    /// A unique name used to identify the target
    std::string name() const;
-    /// The transformation passes to be run
    /**
     * @brief The transformation pass to be run during compilation.
-     * @details [long description]
     *
     * @param ctx This is the target-dependent context that is created by `get_context`
     * @return The passes to be ran

--- a/src/opt/common_header.hpp
+++ b/src/opt/common_header.hpp
-#ifndef MIGRAPHX_GUARD_RTGLIB_COMMON_HEADER_HPP
-#define MIGRAPHX_GUARD_RTGLIB_COMMON_HEADER_HPP
-#include <migraphx/program.hpp>
-#include <migraphx/stringutils.hpp>
-#include <migraphx/instruction.hpp>
-#include <migraphx/operators.hpp>
-#include <migraphx/iterator_for.hpp>
-#include <migraphx/pass_config.hpp>
-#include <migraphx/config.hpp>
-
-#include <set>
-#include <list>
-#include <vector>
-#include <queue>
-
-namespace migraphx {
-inline namespace MIGRAPHX_INLINE_NS {
-
-//#define MIGRAPHX_DEBUG_OPT
-
-#ifdef MIGRAPHX_DEBUG_OPT
-#define MIGRAPHX_DEBUG(s) s
-#else
-#define MIGRAPHX_DEBUG(s)
-#endif // MIGRAPHX_DEBUG_OPT
-
-} // namespace MIGRAPHX_INLINE_NS
-} // namespace migraphx
-
-#endif // MIGRAPHX_GUARD_RTGLIB_COMMON_HEADER_HPP
--- a/src/opt/memory_coloring_impl.hpp
+++ b/src/opt/memory_coloring_impl.hpp
 #ifndef MIGRAPHX_GUARD_RTGLIB_MEMORY_COLORING_IMPL_HPP
 #define MIGRAPHX_GUARD_RTGLIB_MEMORY_COLORING_IMPL_HPP
-#include "common_header.hpp"
+#include <migraphx/program.hpp>
+#include <migraphx/stringutils.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/operators.hpp>
+#include <migraphx/iterator_for.hpp>
+#include <migraphx/pass_config.hpp>
 #include <migraphx/config.hpp>

+#include <set>
+#include <list>
+#include <vector>
+#include <queue>
+
+#ifdef MIGRAPHX_DEBUG_OPT
+#define MIGRAPHX_DEBUG(s) s
+#else
+#define MIGRAPHX_DEBUG(s)
+#endif // MIGRAPHX_DEBUG_OPT
+
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {


--- a/src/program.cpp
+++ b/src/program.cpp
@@ -15,9 +15,6 @@
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {

-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_COMPILE)
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_EVAL)
-
 struct program_impl
 {
    // A list is used to keep references to an instruction stable
@@ -107,11 +104,9 @@ instruction_ref program::insert_instruction(instruction_ref ins,
               args.begin(), args.end(), [&](instruction_ref x) { return has_instruction(x); }) &&
           "Argument is not an exisiting instruction");
    assert(not starts_with(op.name(), "@"));
-    // TODO: Use move
    shape r     = compute_shape(op, args);
    auto result = impl->instructions.insert(ins, {op, r, std::move(args)});
    instruction::backreference(result);
-    // assert(result->inputs() == args);
    assert(result->valid(begin()));
    return result;
 }
@@ -510,6 +505,16 @@ void program::perf_report(std::ostream& os, std::size_t n, parameter_map params)
 void program::debug_print() const { std::cout << *this << std::endl; }
 void program::debug_print(instruction_ref ins) const
 {
+    if(ins == this->end())
+    {
+        std::cout << "End instruction" << std::endl;
+        return;
+    }
+    if(not has_instruction(ins))
+    {
+        std::cout << "Instruction not part of program" << std::endl;
+        return;
+    }
    std::stringstream ss;
    print_program(ss, *this, [&](auto x, auto&& names) {
        if(x == ins)
@@ -532,6 +537,11 @@ void program::dry_run(std::unordered_map<std::string, argument> params) const
    generic_eval(*this, ctx, std::move(params), [](auto&&...) { return argument{}; });
 }

+void program::annotate(std::ostream& os, std::function<void(instruction_ref)> a) const
+{
+    print_program(os, *this, [&](auto ins, auto&&) { a(ins); });
+}
+
 bool operator==(const program& x, const program& y) { return to_string(x) == to_string(y); }

 std::ostream& operator<<(std::ostream& os, const program& p)

--- a/src/schedule.cpp
+++ b/src/schedule.cpp
+#include <migraphx/schedule.hpp>
+#include <migraphx/program.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/operators.hpp>
+#include <migraphx/iterator_for.hpp>
+#include <migraphx/dfor.hpp>
+#include <migraphx/functional.hpp>
+#include <migraphx/ranges.hpp>
+#include <unordered_map>
+#include <unordered_set>
+#include <set>
+#include <deque>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+auto get_inputs()
+{
+    return [](auto i) { return i->inputs(); };
+}
+
+auto get_outputs()
+{
+    return [](auto i) { return i->outputs(); };
+}
+
+struct stream_info
+{
+    std::unordered_map<instruction_ref, std::size_t> ins2stream;
+    std::unordered_map<instruction_ref, std::size_t> weights;
+    std::unordered_map<instruction_ref, std::size_t> iweights;
+
+    void accumulate_weights(instruction_ref last, const schedule_model& model)
+    {
+        fix<std::size_t>([&](auto self, auto ins) -> std::size_t {
+            if(not contains(weights, ins))
+            {
+                std::size_t weight = 0;
+                auto&& op          = ins->get_operator();
+                if(not is_context_free(op) and op.name()[0] != '@')
+                    weight = model.weight(op);
+                iweights[ins] = weight;
+                weights[ins] =
+                    std::accumulate(ins->inputs().begin(),
+                                    ins->inputs().end(),
+                                    weight,
+                                    [&](std::size_t w, instruction_ref i) { return w + self(i); });
+            }
+            return weights[ins];
+        })(last);
+    }
+
+    std::vector<instruction_ref>::iterator sort_args(std::vector<instruction_ref>& args)
+    {
+        if(args.size() < 2)
+        {
+            return args.end();
+        }
+
+        const std::size_t min_partition_threshold = 2;
+        auto compare                              = by(std::greater<>{}, [&](auto x) {
+            return std::make_tuple(this->weights[x], x->inputs().size());
+        });
+        std::sort(args.begin(), args.end(), compare);
+
+        auto it = std::lower_bound(std::next(args.begin()),
+                                   args.end(),
+                                   min_partition_threshold,
+                                   [&](auto i, std::size_t w) { return this->weights[i] > w; });
+        assert(it == args.end() or this->weights[*it] <= min_partition_threshold);
+        assert(it == args.end() or std::prev(it) == args.begin() or
+               this->weights[*std::prev(it)] > min_partition_threshold);
+        return it;
+    }
+
+    struct partition
+    {
+        std::size_t weight = 0;
+        std::vector<instruction_ref> instructions{};
+
+        void add(instruction_ref ins, std::size_t w)
+        {
+            weight += w;
+            instructions.push_back(ins);
+        }
+    };
+
+    void assign_streams(program& p, std::size_t n)
+    {
+        partition critical;
+        std::unordered_map<instruction_ref, std::deque<partition>> partitions;
+        partitions.reserve(weights.size());
+        fix([&](auto self, auto ins, auto& part) {
+            assert(ins != p.end());
+            if(contains(partitions, ins))
+                return;
+            assert(p.has_instruction(ins));
+            // Add an entry so we know the instruction was visited
+            partitions[ins];
+            part.add(ins, this->iweights[ins]);
+
+            auto args         = ins->inputs();
+            auto threshold_it = this->sort_args(args);
+
+            if(not args.empty())
+            {
+                assert(threshold_it != args.begin());
+                self(args.front(), part);
+                for(auto i : range(std::next(args.begin()), threshold_it))
+                {
+                    partitions[ins].emplace_back();
+                    self(i, partitions[ins].back());
+                }
+                for(auto i : range(threshold_it, args.end()))
+                {
+                    self(i, part);
+                }
+            }
+            // Sort instructions
+            p.move_instruction(ins, p.end());
+        })(std::prev(p.end()), critical);
+
+        // Set the critical partition to stream 0
+        set_stream(critical, 0);
+        std::vector<std::size_t> streams(n - 1);
+        // Assign streams for the other partitions
+        for(auto&& ins_part : partitions)
+        {
+            std::sort(
+                ins_part.second.begin(), ins_part.second.end(), by(std::greater<>{}, [](auto&& x) {
+                    return std::make_tuple(x.weight, x.instructions.size());
+                }));
+            for(auto&& part : ins_part.second)
+            {
+                auto stream = std::min_element(streams.begin(), streams.end()) - streams.begin();
+                set_stream(part, stream + 1);
+                streams[stream] += part.weight;
+            }
+        }
+    }
+
+    void set_stream(const partition& p, std::size_t n)
+    {
+        for(auto ins : p.instructions)
+            if(iweights[ins] > 0)
+                set_stream(ins, n);
+    }
+
+    void set_stream(instruction_ref ins, std::size_t n)
+    {
+        assert(iweights[ins] > 0);
+        ins2stream[ins] = n;
+    }
+
+    std::size_t get_stream(instruction_ref ins) const { return ins2stream.at(ins); }
+
+    bool has_stream(instruction_ref ins) const { return contains(ins2stream, ins); }
+
+    template <class F>
+    bool different(F f, std::size_t stream) const
+    {
+        bool result = false;
+        f([&](auto s) {
+            if(s != stream)
+            {
+                result = true;
+                return false;
+            }
+            // cppcheck-suppress uselessAssignmentArg
+            stream = s;
+            return true;
+        });
+        return result;
+    }
+
+    template <class F>
+    bool different(F f) const
+    {
+        bool result = false;
+        f([&](auto s) {
+            result = this->different(f, s);
+            return false;
+        });
+        return result;
+    }
+
+    template <class Selector>
+    auto get_streams_from(instruction_ref start, Selector select) const
+    {
+        return [=](auto f) {
+            return fix<bool>([&](auto self, auto ins) {
+                for(auto i : select(ins))
+                {
+                    if(iweights.at(i) == 0)
+                    {
+                        if(not self(i))
+                            return false;
+                    }
+                    else
+                    {
+                        if(not f(this->get_stream(i)))
+                            return false;
+                    }
+                }
+                return true;
+            })(start);
+        };
+    }
+
+    std::unordered_set<std::size_t> get_streams(instruction_ref ins) const
+    {
+        if(has_stream(ins))
+            return {get_stream(ins)};
+        std::unordered_set<std::size_t> result;
+        get_streams_from(ins, get_inputs())([&](auto s) {
+            result.insert(s);
+            return true;
+        });
+        return result;
+    }
+
+    template <class... Ts>
+    bool is_merge_point(instruction_ref ins, Ts... xs) const
+    {
+        return different(get_streams_from(ins, get_inputs()), xs...);
+    }
+
+    template <class... Ts>
+    bool is_split_point(instruction_ref ins, Ts... xs) const
+    {
+        return different(get_streams_from(ins, get_outputs()), xs...);
+    }
+
+    std::vector<instruction_ref> get_recorded_instructions(instruction_ref start)
+    {
+        std::vector<instruction_ref> result;
+        std::unordered_map<std::size_t, instruction_ref> m;
+        fix([&](auto self, auto ins) {
+            for(auto i : ins->inputs())
+            {
+                if(iweights.at(i) == 0)
+                {
+                    self(i);
+                    continue;
+                }
+                auto stream = this->get_stream(i);
+                if(not contains(m, stream))
+                    m[stream] = i;
+                else
+                    m[stream] = std::min(m[stream], i, by(std::less<>{}, [&](auto x) {
+                                             return std::distance(x, start);
+                                         }));
+            }
+        })(start);
+        std::transform(
+            m.begin(), m.end(), std::back_inserter(result), [](auto&& p) { return p.second; });
+        return result;
+    }
+
+    std::unordered_map<instruction_ref, std::vector<std::vector<instruction_ref>>>
+    find_concurrent_instructions(program& p)
+    {
+        std::unordered_map<instruction_ref, std::vector<std::vector<instruction_ref>>> result;
+        std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>> merge_from;
+        result.reserve(p.size());
+        merge_from.reserve(p.size());
+        for(auto ins : reverse_iterator_for(p))
+        {
+            for(auto&& arg : ins->outputs())
+            {
+                if(is_merge_point(arg))
+                    merge_from[ins].insert(arg);
+                merge_from[ins].insert(merge_from[arg].begin(), merge_from[arg].end());
+            }
+
+            auto streams = this->get_streams(ins);
+
+            // Collect concur instructions for each merge point.
+            for(auto& merge : merge_from[ins])
+            {
+                for(auto stream : streams)
+                {
+                    if(result[merge].size() <= stream)
+                        result[merge].resize(stream + 1);
+                    auto&& r = result[merge][stream];
+                    r.push_back(ins);
+                    // Copy inputs if they dont have a stream(and are not a builtin and context
+                    // free). Inputs without a stream can have a implicit dependency
+                    std::copy_if(ins->inputs().begin(),
+                                 ins->inputs().end(),
+                                 std::back_inserter(r),
+                                 [&](auto x) {
+                                     return not this->has_stream(x) and
+                                            not is_context_free(x->get_operator()) and
+                                            x->name().front() != '@';
+                                 });
+                }
+            }
+        }
+        return result;
+    }
+
+    std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>>
+    get_conflicts(program& p)
+    {
+        std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>> conflict_table;
+        auto concur_ins = this->find_concurrent_instructions(p);
+        for(auto&& merge : concur_ins)
+        {
+            dfor(merge.second.size(), merge.second.size())([&](auto i, auto j) {
+                if(i == j)
+                    return;
+                for(auto ins1 : merge.second[i])
+                {
+                    auto p1 = std::distance(ins1, merge.first);
+                    for(auto ins2 : merge.second[j])
+                    {
+                        if(ins1 == ins2)
+                            continue;
+                        auto p2 = std::distance(ins2, merge.first);
+                        // The smaller distance means the instruction occurs later
+                        if(p1 > p2)
+                            conflict_table[ins2].insert(ins1);
+                        else
+                            conflict_table[ins1].insert(ins2);
+                    }
+                }
+            });
+        }
+        // Remove duplicates
+        for(auto&& ip : conflict_table)
+        {
+            auto ins1 = ip.first;
+            for(auto ins2 : ip.second)
+                if(contains(conflict_table[ins2], ins1))
+                    conflict_table[ins2].erase(ins1);
+        }
+        return conflict_table;
+    }
+};
+
+void schedule::apply(program& p) const
+{
+    stream_info si;
+    auto last = std::prev(p.end());
+    si.accumulate_weights(last, model);
+    si.assign_streams(p, model.concurrency());
+
+    if(enabled(MIGRAPHX_TRACE_COMPILE{}))
+    {
+        p.annotate(std::cout, [&](auto ins) {
+            std::cout << ":";
+            std::cout << " weight=" << si.weights.at(ins);
+            std::cout << " input={";
+            si.get_streams_from(ins, get_inputs())([&](auto s) {
+                std::cout << s << ",";
+                return true;
+            });
+            std::cout << "}";
+            if(si.has_stream(ins))
+                std::cout << " stream=" << si.get_stream(ins);
+        });
+        std::cout << std::endl;
+    }
+
+    // Schedule instructions
+    std::size_t wait_id = 0;
+    std::unordered_map<instruction_ref, std::size_t> ins2wait;
+    std::unordered_map<std::size_t, std::unordered_set<std::size_t>> waited_for;
+    std::unordered_map<instruction_ref, std::unordered_set<std::size_t>> ins2waited;
+    ins2wait.reserve(p.size());
+    ins2waited.reserve(p.size());
+    for(auto ins : iterator_for(p))
+    {
+        // Only schedule instructions that have a stream
+        if(not si.has_stream(ins))
+            continue;
+        assert(si.weights[ins] > 0);
+        // Schedule instruction on the stream
+        auto stream = si.get_stream(ins);
+        assert(stream < model.concurrency());
+        model.sched(p, ins, stream);
+        // Insert wait instructions
+        if(si.is_merge_point(ins, stream))
+        {
+            for(auto i : si.get_recorded_instructions(ins))
+            {
+                if(not si.has_stream(i))
+                    continue;
+                auto istream = si.get_stream(i);
+                if(stream == istream)
+                    continue;
+                // Create a new event if it hasn't been recorded
+                if(not contains(ins2wait, i))
+                {
+                    ins2wait[i] = wait_id;
+                    model.record(p, i, wait_id);
+                    wait_id++;
+                }
+                auto w = ins2wait.at(i);
+                // If we already waited for the event on this stream then dont
+                // insert another wait event
+                if(not contains(waited_for[stream], w))
+                    model.wait(p, ins, w);
+                // Store the event as waited
+                waited_for[stream].insert(w);
+                // Store all wait events that have been waited on prior to the recorded instruction
+                waited_for[stream].insert(ins2waited[i].begin(), ins2waited[i].end());
+            }
+        }
+        // Store wait events that have already been waited on
+        if(si.is_split_point(ins, stream))
+        {
+            ins2waited[ins] = waited_for[stream];
+        }
+    }
+
+    // Add memory conflicts
+    auto conflict_table = si.get_conflicts(p);
+    for(auto&& ip : conflict_table)
+    {
+        if(ip.second.empty())
+            continue;
+        std::vector<instruction_ref> args;
+        args.push_back(ip.first);
+        args.insert(args.end(), ip.second.begin(), ip.second.end());
+        p.insert_instruction(std::next(ip.first), op::identity{}, args);
+    }
+}
+
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
--- a/src/targets/gpu/CMakeLists.txt
+++ b/src/targets/gpu/CMakeLists.txt
@@ -64,6 +64,7 @@ add_library(migraphx_gpu
    pad.cpp
    gather.cpp
    lrn.cpp
+    schedule_model.cpp
 )
 set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
 rocm_clang_tidy_check(migraphx_gpu)

--- a/src/targets/gpu/convolution.cpp
+++ b/src/targets/gpu/convolution.cpp
@@ -21,7 +21,7 @@ argument miopen_convolution::compute(context& ctx,

    float alpha = 1;
    float beta  = 0;
-    miopenConvolutionForward(ctx.get_stream().get_miopen(),
+    auto status = miopenConvolutionForward(ctx.get_stream().get_miopen(),
                                           &alpha,
                                           x_desc.get(),
                                           args[0].implicit(),
@@ -34,6 +34,8 @@ argument miopen_convolution::compute(context& ctx,
                                           args[3].implicit(),
                                           args[2].implicit(),
                                           args[2].get_shape().bytes());
+    if(status != miopenStatusSuccess)
+        MIGRAPHX_THROW("Running convolution failed");
    return args[3];
 }

@@ -89,8 +91,11 @@ void miopen_convolution::finalize(context& ctx,
 {
    if(handle == ctx.get_stream().get_miopen())
        return;
-    // TODO: Check that workspace hasn't changed
-    compile(ctx, output_shape, std::move(inputs));
+    // Check that workspace hasn't changed
+    auto size = inputs.at(2).bytes();
+    auto ws   = compile(ctx, output_shape, std::move(inputs));
+    if(ws.bytes() > size)
+        MIGRAPHX_THROW("Workspace has changed during finalization.");
 }

 } // namespace gpu

--- a/src/targets/gpu/hip.cpp
+++ b/src/targets/gpu/hip.cpp
@@ -43,6 +43,7 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
 template <class T>
 std::vector<T> read_from_gpu(const void* x, std::size_t sz)
 {
+    gpu_sync();
    std::vector<T> result(sz);
    auto status = hipMemcpy(result.data(), x, sz * sizeof(T), hipMemcpyDeviceToHost);
    if(status != hipSuccess)
@@ -52,6 +53,7 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz)

 hip_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false)
 {
+    gpu_sync();
    auto result = allocate_gpu(sz, host);
    auto status = hipMemcpy(result.get(), x, sz, hipMemcpyHostToDevice);
    if(status != hipSuccess)

--- a/src/targets/gpu/include/migraphx/gpu/context.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/context.hpp
@@ -13,11 +13,17 @@ namespace gpu {

 MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NULL_STREAM)

+using hip_event_ptr = MIGRAPHX_MANAGE_PTR(hipEvent_t, hipEventDestroy);
+
 struct hip_device
 {
    hip_device() { add_stream(); }

-    hip_device(std::size_t id) : device_id(id) { add_stream(); }
+    hip_device(std::size_t id, std::size_t n) : device_id(id)
+    {
+        for(std::size_t i = 0; i < n; i++)
+            add_stream();
+    }

    struct stream
    {
@@ -32,7 +38,7 @@ struct hip_device
        static hip_stream_ptr create_stream()
        {
            hipStream_t result = nullptr;
-            auto status        = hipStreamCreate(&result);
+            auto status        = hipStreamCreateWithFlags(&result, hipStreamNonBlocking);
            if(status != hipSuccess)
                MIGRAPHX_THROW("Failed to allocate stream");
            return hip_stream_ptr{result};
@@ -77,6 +83,22 @@ struct hip_device
            return rbhandle.get();
        }

+        void wait(hipEvent_t event)
+        {
+            setup();
+            auto status = hipStreamWaitEvent(get(), event, 0);
+            if(status != hipSuccess)
+                MIGRAPHX_THROW("Failed to wait.");
+        }
+
+        void record(hipEvent_t event)
+        {
+            setup();
+            auto status = hipEventRecord(event, get());
+            if(status != hipSuccess)
+                MIGRAPHX_THROW("Failed to record.");
+        }
+
        private:
        std::size_t id                      = 0;
        shared<hip_stream_ptr> s            = nullptr;
@@ -88,8 +110,14 @@ struct hip_device

    stream& get_stream() { return streams.at(current_stream); }

+    stream& get_stream(std::size_t n) { return streams.at(n); }
+
    void set_stream(std::size_t n) { current_stream = n; }

+    std::size_t nstreams() const { return streams.size(); }
+
+    std::size_t stream_id() const { return current_stream; }
+
    private:
    std::size_t device_id      = 0;
    std::size_t current_stream = 0;
@@ -98,7 +126,10 @@ struct hip_device

 struct context
 {
-    context(std::size_t n = 0) : current_device(std::make_shared<hip_device>(n)) {}
+    context(std::size_t device_id = 0, std::size_t n = 4)
+        : current_device(std::make_shared<hip_device>(device_id, n))
+    {
+    }

    hip_device& get_current_device()
    {
@@ -107,13 +138,34 @@ struct context
    }

    hip_device::stream& get_stream() { return get_current_device().get_stream(); }
+    hip_device::stream& get_stream(std::size_t n) { return get_current_device().get_stream(n); }
+
+    void set_stream(std::size_t n) { get_current_device().set_stream(n); }
+
+    void create_events(std::size_t num_of_events)
+    {
+        for(std::size_t i = events.size(); i < num_of_events + 1; ++i)
+            events.emplace_back(create_event());
+    }
+
+    hipEvent_t get_event(std::size_t i) const { return events.at(i).get(); }

    std::vector<argument> literals{};
    void finish() const { gpu_sync(); }

+    static hip_event_ptr create_event()
+    {
+        hipEvent_t event;
+        auto status = hipEventCreateWithFlags(&event, hipEventDisableTiming);
+        if(status != hipSuccess)
+            MIGRAPHX_THROW("Failed to create event");
+        return hip_event_ptr{event};
+    }
+
    private:
    // TODO: Make this a vector to support multiple devices
    std::shared_ptr<hip_device> current_device;
+    std::vector<shared<hip_event_ptr>> events;
 };
 } // namespace gpu
 } // namespace MIGRAPHX_INLINE_NS

--- a/src/targets/gpu/include/migraphx/gpu/schedule_model.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/schedule_model.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_SCHEDULE_MODEL_HPP
+#define MIGRAPHX_GUARD_RTGLIB_GPU_SCHEDULE_MODEL_HPP
+
+#include <migraphx/config.hpp>
+#include <migraphx/instruction_ref.hpp>
+#include <vector>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+
+struct program;
+struct operation;
+
+namespace gpu {
+
+struct schedule_model
+{
+    std::size_t streams = 0;
+    std::size_t concurrency() const;
+    void sched(program& p, instruction_ref ins, std::size_t n) const;
+    void wait(program& p, instruction_ref ins, std::size_t wait_id) const;
+    void record(program& p, instruction_ref ins, std::size_t wait_id) const;
+    std::size_t weight(const operation& op) const;
+};
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+
+#endif
--- a/src/targets/gpu/schedule_model.cpp
+++ b/src/targets/gpu/schedule_model.cpp
+#include <migraphx/gpu/schedule_model.hpp>
+#include <migraphx/gpu/context.hpp>
+#include <migraphx/program.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/operation.hpp>
+
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+
+struct record_event
+{
+    std::size_t event = 0;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.event, "event"));
+    }
+    std::string name() const { return "gpu::record_event"; }
+    shape compute_shape(const std::vector<shape>&) const { return {}; }
+
+    argument compute(context& ctx, const shape&, const std::vector<argument>&) const
+    {
+        ctx.get_stream().record(ctx.get_event(event));
+        return {};
+    }
+
+    void finalize(context& ctx, const shape&, const std::vector<shape>&)
+    {
+        ctx.create_events(event);
+    }
+};
+
+struct wait_event
+{
+    std::size_t event = 0;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.event, "event"));
+    }
+    std::string name() const { return "gpu::wait_event"; }
+    shape compute_shape(const std::vector<shape>&) const { return {}; }
+
+    argument compute(context& ctx, const shape&, const std::vector<argument>&) const
+    {
+        ctx.get_stream().wait(ctx.get_event(event));
+        return {};
+    }
+};
+
+struct set_stream
+{
+    std::size_t stream = 0;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.stream, "stream"));
+    }
+    std::string name() const { return "gpu::set_stream"; }
+    shape compute_shape(const std::vector<shape>&) const { return {}; }
+
+    argument compute(context& ctx, const shape&, const std::vector<argument>&) const
+    {
+        ctx.set_stream(stream);
+        return {};
+    }
+    void finalize(context& ctx, const shape&, const std::vector<shape>&) { ctx.set_stream(stream); }
+};
+
+std::size_t schedule_model::concurrency() const { return streams; }
+void schedule_model::sched(program& p, instruction_ref ins, std::size_t n) const
+{
+    auto last_stream = std::find_if(std::make_reverse_iterator(ins),
+                                    std::make_reverse_iterator(p.begin()),
+                                    [&](auto&& i) { return i.name() == "gpu::set_stream"; });
+    if(last_stream != std::make_reverse_iterator(p.begin()))
+    {
+        auto&& op = any_cast<set_stream>(last_stream->get_operator());
+        // If the same stream was set earlier then skip
+        if(op.stream == n)
+            return;
+    }
+    p.insert_instruction(ins, set_stream{n});
+}
+
+void schedule_model::wait(program& p, instruction_ref ins, std::size_t wait_id) const
+{
+    p.insert_instruction(ins, wait_event{wait_id});
+}
+void schedule_model::record(program& p, instruction_ref ins, std::size_t wait_id) const
+{
+    p.insert_instruction(std::next(ins), record_event{wait_id});
+}
+
+static std::unordered_map<std::string, std::size_t> create_weight_map()
+{
+    return {
+        {"hip::load_literal", 0},
+        {"hip::allocate", 0},
+        {"gpu::convolution", 4},
+        {"gpu::conv_bias_relu", 4},
+        {"gpu::pooling", 2},
+        {"gpu::gemm", 2},
+        {"gpu::concat", 1},
+        {"hip::add_relu", 2},
+    };
+}
+
+static const std::unordered_map<std::string, std::size_t>& weight_map()
+{
+    static std::unordered_map<std::string, std::size_t> m = create_weight_map();
+    return m;
+}
+
+std::size_t schedule_model::weight(const operation& op) const
+{
+    if(weight_map().count(op.name()) == 0)
+    {
+        return 1;
+    }
+    return weight_map().at(op.name());
+}
+
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx