merge in develop

3885c9bc · mei-ye · a5b0afa0 · 3885c9bc · 3885c9bc · 3885c9bc
Commit 3885c9bc authored Feb 12, 2019 by mei-ye
18 changed files
--- a/src/opt/pre_scheduling_impl.hpp
+++ b/src/opt/pre_scheduling_impl.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_PRE_SCHEDULING_IMPL_HPP
+#define MIGRAPHX_GUARD_RTGLIB_PRE_SCHEDULING_IMPL_HPP
+#include <migraphx/common_header.hpp>
+#include <migraphx/instruction_ref.hpp>
+#include <migraphx/operation.hpp>
+#include <migraphx/insert_instruction.hpp>
+namespace migraphx {
+struct dag_node
+{
+    dag_node()
+    {
+        weight         = 0;
+        run_on_cpu     = 0;
+        weight_sum     = 0;
+        ins_ndx        = -1;
+        first_child    = nullptr;
+        stream         = -1;
+        partition      = -1;
+        sched_cycle    = -1;
+        earliest_cycle = -1;
+    }
+    int weight;
+    int run_on_cpu;
+    int weight_sum;
+    int ins_ndx;
+    dag_node* first_child;
+    int stream;
+    int partition;
+    int sched_cycle;
+    int earliest_cycle = -1;
+    instruction_ref ins;
+    bool is_literal() const { return (ins->name() == "@literal"); }
+    bool can_use_stream() const { return (run_on_cpu == 0); }
+#ifdef MIGRAPHX_DEBUG_OPT
+    void dump();
+#endif
+};
+struct dag_partition
+{
+    dag_partition()
+    {
+        num_of_partition = 0;
+        weight_sum.clear();
+    }
+    int create_partition()
+    {
+        weight_sum.push_back(0);
+        return num_of_partition++;
+    }
+    void add_weight(dag_node* node)
+    {
+        if(node->partition >= 0)
+        {
+            assert(node->partition < num_of_partition);
+            weight_sum[node->partition] += node->weight;
+        }
+    }
+    int num_of_partition;
+    std::vector<int> weight_sum;
+};
+struct stream_info
+{
+    stream_info(int n) : num_of_streams(n)
+    {
+        max_cycle = 0;
+        next_cycles.clear();
+        for(auto stream = 0; stream < num_of_streams; ++stream)
+            next_cycles.push_back(0);
+    }
+    std::vector<int> next_cycles;
+    int num_of_streams;
+    int max_cycle;
+};
+struct pre_scheduling_impl
+{
+    pre_scheduling_impl(program* p,
+                        std::function<std::pair<int, int>(const operation&)> w,
+                        int n,
+                        insert_instruction ins,
+                        bool v)
+        : p_program(p),
+          weight_func(std::move(w)),
+          num_of_streams(n),
+          insert_instr(std::move(ins)),
+          enable_verify(v)
+    {
+        instr2_node.clear();
+        instr2_mask.clear();
+        instr2_stream.clear();
+    }
+    void schedule(std::list<dag_node*>&);
+    void compute_weights();
+    int get_stream(stream_info&, dag_node*);
+    void record(stream_info&, dag_node*);
+    void reorder();
+    void run();
+    void splice(std::list<dag_node*>&);
+    void annotate(std::list<dag_node*>&);
+    static bool compare_exit_nodes(dag_node* d1, dag_node* d2)
+    {
+        return (d1->weight_sum > d2->weight_sum);
+    }
+    struct weighted_topology_ordering
+    {
+        bool operator()(const dag_node* d1, const dag_node* d2) const
+        {
+            if(d1->weight_sum < d2->weight_sum)
+            {
+                // smaller weigth_sum is placed on top of the queue.
+                return false;
+            }
+            else if(d1->weight_sum > d2->weight_sum)
+            {
+                return true;
+            }
+            else
+            {
+                // smaller instrution index is placed on top of the queue,
+                return d1->ins_ndx > d2->ins_ndx;
+            }
+        }
+    };
+    struct post_schedule_ordering
+    {
+        bool operator()(const dag_node* d1, const dag_node* d2) const
+        {
+            if(d1->sched_cycle == d2->sched_cycle)
+            {
+                if(d1->stream == d2->stream)
+                {
+                    // smaller instruction index on top of queue.
+                    return d1->ins_ndx > d2->ins_ndx;
+                }
+                else
+                {
+                    // smaller stream on top of queue.
+                    return (d1->stream > d2->stream);
+                }
+            }
+            else
+            {
+                // smaller sched_cycle on top of queue.
+                return (d1->sched_cycle > d2->sched_cycle);
+            }
+        }
+    };
+    bool has_mask(instruction_ref ins, unsigned int m)
+    {
+        if(instr2_mask.find(ins) != instr2_mask.end())
+        {
+            unsigned int mask = instr2_mask[ins];
+            return ((mask & (1u << m)) != 0);
+        }
+        return false;
+    }
+    void add_mask(instruction_ref ins, unsigned int m)
+    {
+        unsigned int mask = (instr2_mask.find(ins) != instr2_mask.end()) ? instr2_mask[ins] : 0;
+        if((mask & (1u << m)) == 0)
+            instr2_mask[ins] = (mask + (1u << m));
+    }
+    void verify();
+#ifdef MIGRAPHX_DEBUG_OPT
+    void dump(const std::string&);
+    void dump_program();
+    void dump(std::list<dag_node*>&);
+#endif
+    static const int min_partition_threshold = 2;
+    private:
+    program* p_program;
+    std::function<std::pair<int, int>(const operation&)> weight_func;
+    int num_of_streams;
+    insert_instruction insert_instr;
+    std::vector<dag_node> nodes;
+    std::vector<dag_node*> exit_nodes;
+    std::unordered_map<instruction_ref, dag_node*> instr2_node;
+    std::unordered_map<instruction_ref, int> instr2_stream;
+    std::unordered_map<instruction_ref, unsigned int> instr2_mask;
+    dag_partition partition_info;
+    bool enable_verify;
+};
+} // namespace migraphx
+#endif
--- a/src/program.cpp
+++ b/src/program.cpp
@@ -6,6 +6,7 @@
 #include <migraphx/ranges.hpp>
 #include <migraphx/time.hpp>
 #include <migraphx/iterator_for.hpp>
+#include <migraphx/pass_config.hpp>
 #include <iostream>
 #include <sstream>
 #include <algorithm>
@@ -52,7 +53,12 @@ static void print_instruction(std::ostream& os,
        }
        os << ")";
    }
+    if(ins->get_stream() >= 0)
+        os << "(stream=" << ins->get_stream() << ")";
+    if(ins->has_mask(wait_event))
+        os << " wait ";
+    if(ins->has_mask(record_event))
+        os << " record=" << ins->get_event();
    os << " -> " << ins->get_shape();
 }
@@ -323,12 +329,18 @@ void program::compile(const target& t, tracer trace)
 void program::finalize()
 {
+    int max_event = -1;
    for(auto ins : iterator_for(*this))
    {
        ins->finalize(this->impl->ctx);
+        max_event = std::max(max_event, ins->get_event());
    }
+    if(max_event >= 0)
+        this->impl->ctx.create_events(max_event + 1);
 }
+void program::finish() { this->impl->ctx.finish(); }
 template <class F>
 argument generic_eval(const program& p,
                      context& ctx,
@@ -340,8 +352,12 @@ argument generic_eval(const program& p,
    results.reserve(p.size() * 2);
    std::vector<argument> values;
    values.reserve(16);
+    bool enable_event_as_instr = enabled(MIGRAPHX_ENABLE_EVENT_AS_INSTRUCTION{});
    for(auto ins : iterator_for(p))
    {
+        int stream = ins->get_stream();
+        // ctx.set_stream(stream);
        if(ins->name() == "@literal")
        {
            results.emplace(ins, trace(ins, [&] { return ins->get_literal().get_argument(); }));
@@ -368,9 +384,26 @@ argument generic_eval(const program& p,
                    assert(results.find(i) != results.end());
                    return results[i];
                });
+            if(!enable_event_as_instr && ins->has_mask(wait_event))
+            {
+                for(auto&& arg : ins->inputs())
+                {
+                    int arg_s = arg->get_stream();
+                    if((arg_s < 0) || (arg_s == stream))
+                        continue;
+                    int event = arg->get_event();
+                    assert(event >= 0);
+                    ctx.wait_event(event);
+                }
+            }
            results.emplace(ins, trace(ins, [&] {
                                return ins->get_operator().compute(ctx, ins->get_shape(), values);
                            }));
+            if(!enable_event_as_instr && ins->has_mask(record_event))
+                ctx.record_event(ins->get_event());
        }
        assert(results.find(ins) != results.end());
    }
@@ -534,6 +567,5 @@ std::ostream& operator<<(std::ostream& os, const program& p)
    print_program(os, p, [](auto&&...) {});
    return os;
 }
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx
--- a/src/targets/cpu/include/migraphx/cpu/context.hpp
+++ b/src/targets/cpu/include/migraphx/cpu/context.hpp
@@ -9,9 +9,12 @@ namespace cpu {
 struct context
 {
-    void finish() const {}
+    void finish() {}
+    void set_stream(int) {}
+    void create_events(int) {}
+    void record_event(int) {}
+    void wait_event(int) {}
 };
 } // namespace cpu
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/src/targets/gpu/include/migraphx/gpu/context.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/context.hpp
@@ -4,6 +4,7 @@
 #include <migraphx/gpu/miopen.hpp>
 #include <migraphx/gpu/rocblas.hpp>
 #include <migraphx/gpu/hip.hpp>
+#include <migraphx/gpu/machine_model.hpp>
 #include <migraphx/env.hpp>
 #include <migraphx/config.hpp>
@@ -11,13 +12,13 @@ namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
 namespace gpu {
-MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_NULL_STREAM)
 struct hip_device
 {
-    hip_device() { add_stream(); }
+    using hip_event_ptr = MIGRAPHX_MANAGE_PTR(hipEvent_t, hipEventDestroy);
+    hip_device() { add_streams(); }
-    hip_device(std::size_t id) : device_id(id) { add_stream(); }
+    hip_device(std::size_t id) : device_id(id) { add_streams(); }
    struct stream
    {
@@ -32,7 +33,8 @@ struct hip_device
        static hip_stream_ptr create_stream()
        {
            hipStream_t result = nullptr;
-            auto status        = hipStreamCreate(&result);
+            auto status        = hipStreamCreateWithFlags(&result, hipStreamNonBlocking);
            if(status != hipSuccess)
                MIGRAPHX_THROW("Failed to allocate stream");
            return hip_stream_ptr{result};
@@ -84,16 +86,61 @@ struct hip_device
        shared<rocblas_handle_ptr> rbhandle = nullptr;
    };
-    void add_stream() { streams.emplace_back(device_id); }
+    static hip_event_ptr create_event()
+    {
+        hipEvent_t event;
+        auto status = hipEventCreateWithFlags(&event, hipEventDisableTiming);
+        if(status != hipSuccess)
+            MIGRAPHX_THROW("Failed to creat event");
+        return hip_event_ptr{event};
+    }
+    void add_streams()
+    {
+        int num_of_streams = 1;
+        assert(streams.empty());
+        if(enabled(MIGRAPHX_DISABLE_NULL_STREAM{}))
+            num_of_streams = stream_info().num_of_streams();
+        for(int i = 0; i < num_of_streams; ++i)
+            streams.emplace_back(device_id);
+    }
    stream& get_stream() { return streams.at(current_stream); }
    void set_stream(std::size_t n) { current_stream = n; }
+    void create_events(int num_of_events)
+    {
+        for(int i = events.size(); i < num_of_events; ++i)
+            events.emplace_back(create_event());
+    }
+    void record_event(int event)
+    {
+        hipEventRecord(events.at(event).get(), streams.at(current_stream).get());
+    }
+    void wait_event(int event)
+    {
+        hipStreamWaitEvent(streams.at(current_stream).get(), events.at(event).get(), 0);
+    }
+    void stream_sync()
+    {
+        if(enabled(MIGRAPHX_DISABLE_NULL_STREAM{}))
+        {
+            int num_of_streams = streams.size();
+            if(num_of_streams > 0)
+            {
+                for(int i = 0; i < num_of_streams; i++)
+                    hipStreamSynchronize(streams.at(i).get());
+            }
+        }
+    }
    private:
    std::size_t device_id      = 0;
    std::size_t current_stream = 0;
    std::vector<stream> streams;
+    std::vector<shared<hip_event_ptr>> events;
 };
 struct context
@@ -107,9 +154,21 @@ struct context
    }
    hip_device::stream& get_stream() { return get_current_device().get_stream(); }
+    void set_stream(int n)
+    {
+        if(n >= 0)
+            get_current_device().set_stream(n);
+    }
+    void create_events(int num_of_events) { get_current_device().create_events(num_of_events); }
+    void record_event(int event) { get_current_device().record_event(event); }
+    void wait_event(int event) { get_current_device().wait_event(event); }
    std::vector<argument> literals{};
-    void finish() const { gpu_sync(); }
+    void finish()
+    {
+        get_current_device().stream_sync();
+        gpu_sync();
+    }
    private:
    // TODO: Make this a vector to support multiple devices

--- a/src/targets/gpu/include/migraphx/gpu/event.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/event.hpp
+#ifndef MIGRAPHX_GUARD_MIGRAPHLIB_EVENT_HPP
+#define MIGRAPHX_GUARD_MIGRAPHLIB_EVENT_HPP
+#include <migraphx/instruction.hpp>
+#include <migraphx/operators.hpp>
+#include <migraphx/config.hpp>
+#include <utility>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+struct create_events
+{
+    int num_of_events = 0;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.num_of_events, "event"));
+    }
+    std::string name() const { return "gpu::create_events"; }
+    shape compute_shape(const std::vector<shape>&) const { return {}; }
+    argument compute(context& ctx, const shape&, const std::vector<argument>&) const
+    {
+        ctx.create_events(num_of_events);
+        return {};
+    }
+};
+struct record_event
+{
+    int event = -1;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.event, "event"));
+    }
+    std::string name() const { return "gpu::record_event"; }
+    shape compute_shape(const std::vector<shape>&) const { return {}; }
+    argument compute(context& ctx, const shape&, const std::vector<argument>&) const
+    {
+        ctx.record_event(event);
+        return {};
+    }
+};
+struct wait_event
+{
+    int event = -1;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.event, "event"));
+    }
+    std::string name() const { return "gpu::wait_event"; }
+    shape compute_shape(const std::vector<shape>&) const { return {}; }
+    argument compute(context& ctx, const shape&, const std::vector<argument>&) const
+    {
+        ctx.wait_event(event);
+        return {};
+    }
+};
+struct set_stream
+{
+    int stream = -1;
+    template <class Self, class F>
+    static auto reflect(Self& self, F f)
+    {
+        return pack(f(self.stream, "stream"));
+    }
+    std::string name() const { return "gpu::set_stream"; }
+    shape compute_shape(const std::vector<shape>&) const { return {}; }
+    argument compute(context& ctx, const shape&, const std::vector<argument>&) const
+    {
+        assert(stream >= 0);
+        ctx.set_stream(stream);
+        return {};
+    }
+    void finalize(context& ctx, const shape&, const std::vector<shape>&) { ctx.set_stream(stream); }
+};
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/find_concur_gpu.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/find_concur_gpu.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_FIND_CONCUR_GPU_HPP
+#define MIGRAPHX_GUARD_RTGLIB_FIND_CONCUR_GPU_HPP
+#include <migraphx/program.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/dom_info.hpp>
+#include <migraphx/common_header.hpp>
+#include <unordered_map>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+struct find_concur_gpu
+{
+    void get_concur(program* p,
+                    int num_of_streams,
+                    std::unordered_map<const instruction*,
+                                       std::vector<std::vector<const instruction*>>>& concur_instrs,
+                    std::unordered_map<const instruction*, int>& instr2_points) const
+    {
+        dom_info info(p);
+        info.compute_dom(true);
+        info.propagate_splits(num_of_streams, concur_instrs, instr2_points);
+    }
+};
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/insert_instruction_gpu.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/insert_instruction_gpu.hpp
+#ifndef MIGRAPHX_GUARD_RTGLIB_INSERT_INSTRUCTION_GPU_HPP
+#define MIGRAPHX_GUARD_RTGLIB_INSERT_INSTRUCTION_GPU_HPP
+#include <migraphx/instruction_ref.hpp>
+#include <migraphx/program.hpp>
+#include <migraphx/config.hpp>
+#include <migraphx/gpu/event.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+namespace gpu {
+struct insert_instruction_gpu
+{
+    void insert_create_events(program* p, instruction_ref ins, int num_of_events)
+    {
+        p->insert_instruction(ins, create_events{num_of_events});
+    }
+    void insert_record_event(program* p, instruction_ref ins, int event)
+    {
+        p->insert_instruction(ins, record_event{event});
+    }
+    void insert_wait_event(program* p, instruction_ref ins, int event)
+    {
+        p->insert_instruction(ins, wait_event{event});
+    }
+    void insert_stream(program* p, instruction_ref ins, int stream)
+    {
+        p->insert_instruction(ins, set_stream{stream});
+    }
+};
+} // namespace gpu
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/include/migraphx/gpu/machine_model.hpp
+++ b/src/targets/gpu/include/migraphx/gpu/machine_model.hpp
+#ifndef MIGRAPHX_GUARD_MIGRAPHLIB_MIOPEN_MACHINE_MODEL_HPP
+#define MIGRAPHX_GUARD_MIGRAPHLIB_MIOPEN_MACHINE_MODEL_HPP
+#include <string>
+#include <unordered_map>
+#include <migraphx/pass_config.hpp>
+#include <migraphx/operation.hpp>
+namespace migraphx {
+namespace gpu {
+MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_DISABLE_NULL_STREAM)
+struct op_info
+{
+    op_info()
+    {
+        // First in pair denotes weight.  Second in pair tells
+        // that the instruction is run ONLY on CPU.
+        weight_map["convolution"]         = std::make_pair(4, 0);
+        weight_map["pooling"]             = std::make_pair(2, 0);
+        weight_map["gemm"]                = std::make_pair(2, 0);
+        weight_map["broadcast"]           = std::make_pair(1, 1);
+        weight_map["multibroadcast"]      = std::make_pair(1, 1);
+        weight_map["contiguous"]          = std::make_pair(1, 1);
+        weight_map["transpose"]           = std::make_pair(1, 1);
+        weight_map["load"]                = std::make_pair(1, 1);
+        weight_map["@param"]              = std::make_pair(1, 1);
+        weight_map["@literal"]            = std::make_pair(1, 1);
+        weight_map["hip::load_literal"]   = std::make_pair(1, 1);
+        weight_map["hip::allocate"]       = std::make_pair(0, 1);
+        weight_map["@outline"]            = std::make_pair(0, 1);
+        weight_map["gpu::convolution"]    = std::make_pair(4, 0);
+        weight_map["gpu::conv_bias_relu"] = std::make_pair(4, 0);
+        weight_map["gpu::pooling"]        = std::make_pair(2, 0);
+        weight_map["gpu::gemm"]           = std::make_pair(2, 0);
+        weight_map["gpu::concat"]         = std::make_pair(1, 0);
+        weight_map["hip::add_relu"]       = std::make_pair(2, 0);
+    }
+    std::pair<int, int> operator()(const operation& op)
+    {
+        if(weight_map.find(op.name()) != weight_map.end())
+        {
+            return weight_map[op.name()];
+        }
+        else
+        {
+            return std::make_pair(1, 0);
+        }
+    }
+    std::unordered_map<std::string, std::pair<int, int>> weight_map;
+};
+struct stream_info
+{
+    int num_of_streams()
+    {
+        if(!enabled(MIGRAPHX_DISABLE_NULL_STREAM{}))
+            return 0;
+        else
+            return 4;
+    }
+};
+} // namespace gpu
+} // namespace migraphx
+#endif
--- a/src/targets/gpu/target.cpp
+++ b/src/targets/gpu/target.cpp
@@ -18,6 +18,10 @@
 #include <migraphx/rewrite_rnn.hpp>
 #include <migraphx/eliminate_concat.hpp>
 #include <migraphx/gpu/concat_gpu_opt.hpp>
+#include <migraphx/pre_scheduling.hpp>
+#include <migraphx/gpu/machine_model.hpp>
+#include <migraphx/gpu/find_concur_gpu.hpp>
+#include <migraphx/gpu/insert_instruction_gpu.hpp>
 namespace migraphx {
 inline namespace MIGRAPHX_INLINE_NS {
@@ -26,6 +30,8 @@ namespace gpu {
 std::vector<pass> target::get_passes(migraphx::context& gctx) const
 {
    auto& ctx                                                        = any_cast<context>(gctx);
+    std::function<std::pair<int, int>(const operation&)> weight_func = op_info();
+    int num_of_streams = stream_info().num_of_streams();
    // clang-format off
    return
    {
@@ -51,7 +57,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx) const
        fuse_ops{&ctx},
        dead_code_elimination{},
        write_literals{&ctx}, 
-        memory_coloring{"hip::allocate"},
+        pre_scheduling{weight_func, num_of_streams, insert_instruction_gpu{}},
+        memory_coloring{"hip::allocate", num_of_streams, find_concur_gpu{}},
        eliminate_workspace{},
        eliminate_allocation{"hip::allocate"},
        check_context<context>{},

--- a/test/const_eval_test.cpp
+++ b/test/const_eval_test.cpp
@@ -47,6 +47,10 @@ struct non_computable_cf
 struct test_context
 {
    void finish() const {}
+    void set_stream(int) {}
+    void create_events(int) {}
+    void record_event(int) {}
+    void wait_event(int) {}
 };
 TEST_CASE(literal_test)

--- a/test/cpu_event_test.cpp
+++ b/test/cpu_event_test.cpp
+#include <migraphx/pre_scheduling.hpp>
+#include <migraphx/operators.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/program.hpp>
+#include <migraphx/cpu/target.hpp>
+#include <basic_ops.hpp>
+#include <test.hpp>
+// This is a test to trigger the code in cpu's context.hpp and runtime
+// codes in program.cpp.
+//
+TEST_CASE(test1)
+{
+    migraphx::program p;
+    auto in1 = p.add_parameter("0", migraphx::shape{migraphx::shape::float_type, {32, 64, 1, 1}});
+    auto in2 = p.add_parameter("1", migraphx::shape{migraphx::shape::float_type, {64, 64, 1, 1}});
+    auto p1  = p.add_instruction(migraphx::op::convolution{}, in1, in2);
+    p1->set_stream(0);
+    auto in3 = p.add_parameter("2", migraphx::shape{migraphx::shape::float_type, {64, 64, 1, 1}});
+    auto p2  = p.add_instruction(migraphx::op::convolution{}, in1, in3);
+    p2->set_stream(1);
+    p2->set_event(0);
+    p2->add_mask(migraphx::record_event);
+    auto p3 = p.add_instruction(migraphx::op::concat{1}, p1, p2);
+    p3->set_stream(0);
+    p3->add_mask(migraphx::wait_event);
+    p.compile(migraphx::cpu::target{});
+    migraphx::program::parameter_map m;
+    for(auto&& x : p.get_parameter_shapes())
+    {
+        m[x.first] = migraphx::generate_argument(x.second);
+    }
+    p.eval(m);
+}
+int main(int argc, const char* argv[]) { test::run(argc, argv); }
--- a/test/eval_test.cpp
+++ b/test/eval_test.cpp
@@ -11,6 +11,10 @@ struct id_target
    struct context
    {
        void finish() const {}
+        void set_stream(int) {}
+        void create_events(int) {}
+        void record_event(int) {}
+        void wait_event(int) {}
    };
    migraphx::context ctx = context{};
    std::string name() const { return "id"; }

--- a/test/gpu/stream_execution.cpp
+++ b/test/gpu/stream_execution.cpp
+#include <test.hpp>
+#include <basic_ops.hpp>
+#include <migraphx/program.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/gpu/target.hpp>
+#include <migraphx/cpu/target.hpp>
+#include <migraphx/gpu/hip.hpp>
+#include <migraphx/verify_args.hpp>
+migraphx::program create_program(bool is_cpu)
+{
+    migraphx::program p;
+    auto in1 = p.add_parameter("0", migraphx::shape{migraphx::shape::float_type, {32, 64, 1, 1}});
+    auto in2 = p.add_parameter("1", migraphx::shape{migraphx::shape::float_type, {64, 64, 1, 1}});
+    auto p1  = p.add_instruction(migraphx::op::convolution{}, in1, in2);
+    auto in3 = p.add_parameter("2", migraphx::shape{migraphx::shape::float_type, {64, 64, 1, 1}});
+    auto p2  = p.add_instruction(migraphx::op::convolution{}, in1, in3);
+    if(is_cpu)
+    {
+        p2->set_event(0);
+        p2->add_mask(migraphx::record_event);
+    }
+    auto p3 = p.add_instruction(migraphx::op::concat{1}, p1, p2);
+    if(is_cpu)
+    {
+        p3->add_mask(migraphx::wait_event);
+    }
+    return p;
+}
+migraphx::argument run_gpu()
+{
+    setenv("MIGRAPHX_DISABLE_NULL_STREAM", "1", 1);
+    migraphx::program p = create_program(false);
+    p.compile(migraphx::gpu::target{});
+    migraphx::program::parameter_map m;
+    for(auto&& x : p.get_parameter_shapes())
+    {
+        m[x.first] = migraphx::gpu::to_gpu(migraphx::generate_argument(x.second));
+    }
+    auto ret_val = migraphx::gpu::from_gpu(p.eval(m));
+    p.finish();
+    return ret_val;
+}
+migraphx::argument run_cpu()
+{
+    migraphx::program p = create_program(true);
+    p.compile(migraphx::cpu::target{});
+    migraphx::program::parameter_map m;
+    for(auto&& x : p.get_parameter_shapes())
+    {
+        m[x.first] = migraphx::generate_argument(x.second);
+    }
+    return p.eval(m);
+}
+void gpu_stream_execution_test()
+{
+    auto result1 = run_gpu();
+    auto result2 = run_cpu();
+    verify_args("test", result2, result1);
+}
+int main() { gpu_stream_execution_test(); }
--- a/test/memory_coloring_test.cpp
+++ b/test/memory_coloring_test.cpp
@@ -2,15 +2,47 @@
 #include <migraphx/operators.hpp>
 #include <migraphx/generate.hpp>
 #include <migraphx/instruction.hpp>
+#include <migraphx/dom_info.hpp>
+#include <migraphx/common_header.hpp>
+#include <migraphx/instruction.hpp>
 #include <basic_ops.hpp>
 #include <test.hpp>
+struct set_stream
+{
+    int stream = -1;
+    std::string name() const { return "gpu::set_stream"; }
+    migraphx::shape compute_shape(const std::vector<migraphx::shape>& inputs) const
+    {
+        if(inputs.empty())
+            return {};
+        else
+            return inputs.front();
+    }
+};
+struct find_concur
+{
+    void get_concur(
+        migraphx::program* p,
+        int num_of_streams,
+        std::unordered_map<const migraphx::instruction*,
+                           std::vector<std::vector<const migraphx::instruction*>>>& concur_instrs,
+        std::unordered_map<const migraphx::instruction*, int>& instr2_points) const
+    {
+        migraphx::dom_info info(p);
+        info.compute_dom(true);
+        info.propagate_splits(num_of_streams, concur_instrs, instr2_points);
+    }
+};
 struct memory_coloring_target
 {
    std::string name() const { return "memory_coloring"; }
    std::vector<migraphx::pass> get_passes(migraphx::context&) const
    {
-        return {migraphx::memory_coloring{"allocate", true}};
+        return {migraphx::memory_coloring{"allocate", 4, find_concur{}, true}};
    }
    migraphx::context get_context() const { return {}; }
 };
@@ -608,4 +640,47 @@ TEST_CASE(literal_test)
    CHECK(lit == result);
 }
+TEST_CASE(concurrent_test)
+{
+    migraphx::program p;
+    auto in = p.add_parameter("0", migraphx::shape{migraphx::shape::float_type, {40}});
+    auto a1 = add_alloc(p, {migraphx::shape::float_type, {40}});
+    auto p1 = p.add_instruction(pass_op{}, a1, in);
+    p.insert_instruction(p1, set_stream{0});
+    p1->set_stream(0);
+    p1->add_mask(migraphx::record_event);
+    auto a2 = add_alloc(p, {migraphx::shape::float_type, {40}});
+    auto p2 = p.add_instruction(pass_op{}, a2, p1);
+    p2->set_stream(0);
+    auto a4 = add_alloc(p, {migraphx::shape::float_type, {40}});
+    auto p4 = p.add_instruction(pass_op{}, a4, p2);
+    p4->set_stream(0);
+    auto a3 = add_alloc(p, {migraphx::shape::float_type, {40}});
+    auto p3 = p.add_instruction(pass_op{}, a3, p1);
+    p3->set_stream(1);
+    p.insert_instruction(p3, set_stream{1});
+    p3->add_mask(migraphx::wait_event);
+    auto a5 = add_alloc(p, {migraphx::shape::float_type, {40}});
+    auto p5 = p.add_instruction(pass_op{}, a5, p3);
+    p5->set_stream(1);
+    p5->add_mask(migraphx::record_event);
+    auto a6 = add_alloc(p, {migraphx::shape::float_type, {40}});
+    auto p6 = p.add_instruction(pass_op{}, a6, p1);
+    p6->set_stream(2);
+    p6->add_mask(migraphx::wait_event);
+    p.insert_instruction(p6, set_stream{2});
+    auto a7 = add_alloc(p, {migraphx::shape::float_type, {40}});
+    auto p7 = p.add_instruction(pass_op{}, a7, p6);
+    p7->set_stream(2);
+    p7->add_mask(migraphx::record_event);
+    auto a8 = add_alloc(p, {migraphx::shape::float_type, {40}});
+    auto p8 = p.add_instruction(migraphx::op::concat{0}, a8, p4, p5, p7);
+    ;
+    p8->set_stream(0);
+    p8->add_mask(migraphx::wait_event);
+    p.insert_instruction(p8, set_stream{0});
+    p.compile(memory_coloring_target{});
+    CHECK(p.get_parameter_shape("scratch").bytes() == 960);
+}
 int main(int argc, const char* argv[]) { test::run(argc, argv); }
--- a/test/stream_execution_test.cpp
+++ b/test/stream_execution_test.cpp
+#include <migraphx/pre_scheduling.hpp>
+#include <migraphx/operators.hpp>
+#include <migraphx/generate.hpp>
+#include <migraphx/instruction.hpp>
+#include <migraphx/program.hpp>
+#include <basic_ops.hpp>
+#include <test.hpp>
+struct set_stream
+{
+    int stream = -1;
+    std::string name() const { return "set_stream"; }
+    migraphx::shape compute_shape(const std::vector<migraphx::shape>& inputs) const
+    {
+        if(inputs.empty())
+            return {};
+        else
+            return inputs.front();
+    }
+};
+struct create_events
+{
+    int num_of_events = 0;
+    std::string name() const { return "gpu::create_events"; }
+    migraphx::shape compute_shape(const std::vector<migraphx::shape>& inputs) const
+    {
+        if(inputs.empty())
+            return {};
+        else
+            return inputs.front();
+    }
+};
+struct weight_func
+{
+    weight_func()
+    {
+        weight_map["@param"]   = std::make_pair(1, 1);
+        weight_map["@literal"] = std::make_pair(1, 1);
+    };
+    std::pair<int, int> operator()(const migraphx::operation& op)
+    {
+        if(weight_map.find(op.name()) != weight_map.end())
+            return weight_map[op.name()];
+        else
+            return std::make_pair(1, 0);
+    }
+    std::unordered_map<std::string, std::pair<int, int>> weight_map;
+};
+struct insert_instruction
+{
+    void insert_stream(migraphx::program* p, migraphx::instruction_ref ins, int stream)
+    {
+        p->insert_instruction(ins, set_stream{stream});
+    }
+    void insert_create_events(migraphx::program*, migraphx::instruction_ref, int) {}
+    void insert_record_event(migraphx::program*, migraphx::instruction_ref, int) {}
+    void insert_wait_event(migraphx::program*, migraphx::instruction_ref, int) {}
+};
+struct stream_execution_target
+{
+    struct context
+    {
+        void finish() const {}
+        void set_stream(int) {}
+        void create_events(int) {}
+        void record_event(int) {}
+        void wait_event(int) {}
+    };
+    migraphx::context ctx = context{};
+    std::string name() const { return "stream_execution"; }
+    std::vector<migraphx::pass> get_passes(migraphx::context&) const
+    {
+        return {migraphx::pre_scheduling{weight_func(), 2, insert_instruction{}, true}};
+    }
+    migraphx::context get_context() const { return {ctx}; }
+};
+TEST_CASE(test1)
+{
+    migraphx::program p;
+    auto in1 =
+        p.add_parameter("0", migraphx::shape{migraphx::shape::float_type, {32, 256, 35, 35}});
+    auto l1 =
+        p.add_literal(migraphx::generate_literal({migraphx::shape::float_type, {64, 256, 1, 1}}));
+    auto p1 = p.add_instruction(migraphx::op::convolution{}, in1, l1);
+    auto l2 =
+        p.add_literal(migraphx::generate_literal({migraphx::shape::float_type, {48, 256, 1, 1}}));
+    auto p2 = p.add_instruction(migraphx::op::convolution{}, in1, l2);
+    p.add_instruction(migraphx::op::concat{1}, p1, p2);
+    p.compile(stream_execution_target{});
+    std::cout << p << std::endl;
+    CHECK(std::count_if(
+              p.begin(), p.end(), [](auto&& ins) { return ins.name() == "set_stream"; }) == 3);
+    CHECK(std::count_if(p.begin(), p.end(), [](auto&& ins) { return ins.get_stream() == 0; }) == 2);
+    CHECK(std::count_if(p.begin(), p.end(), [](auto&& ins) { return ins.get_stream() == 1; }) == 1);
+    CHECK(std::count_if(p.begin(), p.end(), [](auto&& ins) {
+              return ins.has_mask(migraphx::record_event);
+          }) == 1);
+    CHECK(std::count_if(p.begin(), p.end(), [](auto&& ins) {
+              return ins.has_mask(migraphx::wait_event);
+          }) == 1);
+}
+int main(int argc, const char* argv[]) { test::run(argc, argv); }
--- a/tools/include/context.hpp
+++ b/tools/include/context.hpp
@@ -20,19 +20,26 @@ inline namespace MIGRAPHX_INLINE_NS {
 struct context
 {
    /// Wait for any tasks in the context to complete
-    void finish() const;
+    void finish();
+    void set_stream(int ndx);
+    void create_events(int num_of_events);
+    void record_event(int event);
+    void wait_event(int event);
 };
 #else
 <%
 interface('context',
-    virtual('finish', returns='void', const=True)
+    virtual('finish', returns='void'),
+    virtual('set_stream', returns='void', input = 'int'),
+    virtual('create_events', returns='void', input = 'int'),
+    virtual('record_event', returns='void', input = 'int'),
+    virtual('wait_event', returns='void', input = 'int'),
 )
 %>
 #endif
 } // namespace MIGRAPHX_INLINE_NS
 } // namespace migraphx

--- a/tools/include/find_concur.hpp
+++ b/tools/include/find_concur.hpp
+#ifndef MIGRAPHX_GUARD_FIND_CONCUR_HPP
+#define MIGRAPHX_GUARD_FIND_CONCUR_HPP
+#include <cassert>
+#include <string>
+#include <functional>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <unordered_map>
+#include <vector>
+#include <migraphx/instruction.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+struct program;
+#ifdef DOXYGEN
+/// An interface for target-dependent analysis to find concurrent instructions
+/// executing in different streams.
+struct find_concur
+{
+    void get_concur(program* p,
+                    int num_of_streams,
+                    std::unordered_map<const instruction*,
+                                       std::vector<std::vector<const instruction*>>>& concur_instrs,
+                    std::unordered_map<const instruction*, int>& instr2_points);
+} const;
+#else
+<%
+interface('find_concur',
+          virtual('get_concur', returns='void', p = 'program*', num_of_stream = 'int', concur_instrs = 'std::unordered_map<const instruction*, std::vector<std::vector<const instruction*>>>&', input = 'std::unordered_map<const instruction*, int>&', const=True)
+)
+%>
+#endif
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif
--- a/tools/include/insert_instruction.hpp
+++ b/tools/include/insert_instruction.hpp
+#ifndef MIGRAPHX_GUARD_INSERT_INSTRUCTION_HPP
+#define MIGRAPHX_GUARD_INSERT_INSTRUCTION_HPP
+#include <cassert>
+#include <string>
+#include <functional>
+#include <memory>
+#include <type_traits>
+#include <utility>
+#include <migraphx/instruction_ref.hpp>
+namespace migraphx {
+inline namespace MIGRAPHX_INLINE_NS {
+struct program;
+#ifdef DOXYGEN
+/// An interface for target-dependent instruction insertion.
+/// for multi-stream execution.
+struct insert_instruction
+{
+    void insert_create_events(program* p, instruction_ref ins, int num_of_events);
+    void insert_record_event(program* p, instruction_ref ins, int event);
+    void insert_wait_event(program* p, instruction_ref ins, int event);
+    void insert_stream(program* p, instruction_ref ins, int stream);
+};
+#else
+<%
+interface('insert_instruction',
+          virtual('insert_create_events', returns='void', p = 'program*', ins ='instruction_ref', input = 'int'),
+          virtual('insert_record_event', returns='void', p = 'program*', ins ='instruction_ref', input = 'int'),
+          virtual('insert_wait_event', returns='void', p = 'program*', ins = 'instruction_ref', input = 'int'),
+          virtual('insert_stream', returns='void', p = 'program*', ins ='instruction_ref', input = 'int')
+)
+%>
+#endif
+} // namespace MIGRAPHX_INLINE_NS
+} // namespace migraphx
+#endif