Commit c6d92ecc authored by Paul

Schedule concurrently without memory coloring

parent 9cd5cf4e
@@ -5,12 +5,68 @@
#include <migraphx/functional.hpp>
#include <migraphx/ranges.hpp>
#include <unordered_map>
#include <set>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct stream_info
{
    std::unordered_map<instruction_ref, std::size_t> ins2stream;

    void set_stream(instruction_ref ins, std::size_t n)
    {
        ins2stream[ins] = n;
    }

    std::size_t get_stream(instruction_ref ins) const
    {
        return ins2stream.at(ins);
    }

    bool has_stream(instruction_ref ins) const
    {
        return ins2stream.count(ins) > 0;
    }

    // True when the instructions in v are not all assigned to the same stream
    bool different(const std::vector<instruction_ref>& v) const
    {
        if(v.size() < 2)
            return false;
        auto stream = get_stream(v.front());
        return not std::all_of(v.begin(), v.end(), [&](instruction_ref x) {
            return get_stream(x) == stream;
        });
    }

    // An instruction whose outputs are scheduled on different streams
    bool is_split_point(instruction_ref ins) const
    {
        return different(ins->outputs());
    }

    // An instruction whose inputs are computed on different streams
    bool is_merge_point(instruction_ref ins) const
    {
        return different(ins->inputs());
    }

    // The streams, other than the instruction's own, that it must wait on
    std::vector<std::size_t> wait_for(instruction_ref ins) const
    {
        std::set<std::size_t> result;
        auto s = get_stream(ins);
        for(auto i : ins->inputs())
        {
            auto stream = get_stream(i);
            if(stream != s)
                result.insert(stream);
        }
        return {result.begin(), result.end()};
    }
};
void schedule::apply(program& p) const
{
    const std::size_t min_partition_threshold = 2;
    // Compute accumulated weights
    std::unordered_map<instruction_ref, std::size_t> weights;
    auto last = std::prev(p.end());
@@ -26,6 +82,44 @@ void schedule::apply(program& p) const
        return weights[ins];
    })(last);
    // Assign streams
    auto streams = model.concurrency();
    stream_info si;
    for(std::size_t stream = 0; stream < streams; stream++)
    {
        fix([&](auto self, auto ins) {
            // Only assign a stream if one is not already assigned
            if(not si.has_stream(ins))
                si.set_stream(ins, stream);
            instruction_ref child = p.end();
            std::size_t w = 0;
            for(auto i : ins->inputs())
            {
                const auto weight = weights[i];
                // Recurse into inputs that already have a stream assignment or
                // whose weight is too low to justify a separate partition
                if(si.has_stream(i) or weight <= min_partition_threshold)
                {
                    self(i);
                }
                // Otherwise track the heaviest child; lighter heavy children are
                // left unassigned for a later stream's walk to pick up
                else if(weight > w)
                {
                    child = i;
                    w = weight;
                }
            }
            // Follow the heaviest child so its chain stays on the current stream
            if(child != p.end())
                self(child);
        })(last);
    }
    // Assign any remaining instructions to the last stream
    for(auto ins : iterator_for(p))
    {
        if(si.has_stream(ins))
            continue;
        si.set_stream(ins, streams - 1);
    }
    // Topo sort
    fix([&](auto self, auto ins) {
        for(auto i : ins->inputs())
@@ -33,6 +127,21 @@ void schedule::apply(program& p) const
        for(auto i : ins->inputs())
            self(i);
    })(last);
    // Schedule instructions
    for(auto ins : iterator_for(p))
    {
        // A merge point must wait on the streams that produced its inputs
        if(si.is_merge_point(ins))
        {
            assert(not si.wait_for(ins).empty());
            model.wait(p, ins, si.get_stream(ins), si.wait_for(ins));
            continue;
        }
        // Skip instructions that don't use the context (context-free or builtin ops)
        if(is_context_free(ins->get_operator()) or ins->get_operator().name().front() == '@')
            continue;
        model.schedule_instruction(p, ins, si.get_stream(ins));
    }
}
} // namespace MIGRAPHX_INLINE_NS
...
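Note on the scheduling pass above: apply() walks the program once per available stream, each walk greedily following the heaviest not-yet-assigned child so that it claims one long chain of heavy operations, and waits are then inserted wherever an instruction's inputs come from different streams. Below is a minimal standalone sketch of the merge-point bookkeeping, using plain integer node ids in place of instruction_ref and a hypothetical three-node graph; it is illustrative only and not part of this commit.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iostream>
#include <map>
#include <set>
#include <vector>

// Toy stand-in for stream_info: integer node ids replace instruction_ref
struct toy_stream_info
{
    std::map<int, std::size_t> node2stream;

    void set_stream(int node, std::size_t n) { node2stream[node] = n; }
    std::size_t get_stream(int node) const { return node2stream.at(node); }

    // Mirrors stream_info::different: true when nodes span several streams
    bool different(const std::vector<int>& v) const
    {
        if(v.size() < 2)
            return false;
        auto stream = get_stream(v.front());
        return not std::all_of(
            v.begin(), v.end(), [&](int x) { return get_stream(x) == stream; });
    }

    // Mirrors stream_info::wait_for: streams, other than the node's own,
    // that produced one of its inputs
    std::vector<std::size_t> wait_for(int node, const std::vector<int>& inputs) const
    {
        std::set<std::size_t> result;
        auto s = get_stream(node);
        for(int i : inputs)
            if(get_stream(i) != s)
                result.insert(get_stream(i));
        return {result.begin(), result.end()};
    }
};

int main()
{
    // Hypothetical graph: node 2 consumes nodes 0 and 1, which the
    // partitioner put on streams 0 and 1 respectively
    toy_stream_info si;
    si.set_stream(0, 0);
    si.set_stream(1, 1);
    si.set_stream(2, 0);

    std::vector<int> inputs = {0, 1};
    assert(si.different(inputs)); // node 2 is a merge point

    // Node 2 runs on stream 0, so it only needs to wait on stream 1
    for(auto s : si.wait_for(2, inputs))
        std::cout << "wait on stream " << s << "\n";
}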
@@ -43,6 +43,7 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
template <class T>
std::vector<T> read_from_gpu(const void* x, std::size_t sz)
{
    gpu_sync();
    std::vector<T> result(sz);
    auto status = hipMemcpy(result.data(), x, sz * sizeof(T), hipMemcpyDeviceToHost);
    if(status != hipSuccess)
@@ -52,6 +53,7 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz)
hip_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false)
{
    gpu_sync();
    auto result = allocate_gpu(sz, host);
    auto status = hipMemcpy(result.get(), x, sz, hipMemcpyHostToDevice);
    if(status != hipSuccess)
...
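The two gpu_sync() calls are the other half of making concurrency safe: a plain hipMemcpy orders itself only against the null stream, so once kernels run on several streams a copy could touch a buffer that another stream is still writing. This commit does not show the definition of gpu_sync(); a plausible sketch, assuming it is simply a device-wide barrier around hipDeviceSynchronize, would be:

#include <hip/hip_runtime.h>
#include <stdexcept>

// Assumed definition of gpu_sync(): block the host until work queued on
// every stream of the current device has finished
inline void gpu_sync()
{
    auto status = hipDeviceSynchronize();
    if(status != hipSuccess)
        throw std::runtime_error("hip device synchronization failed");
}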
@@ -62,7 +62,7 @@ struct set_stream
    void finalize(context& ctx, const shape&, const std::vector<shape>&) { ctx.set_stream(stream); }
};

std::size_t schedule_model::concurrency() const { return streams; }

void schedule_model::schedule_instruction(program& p, instruction_ref ins, std::size_t n) const
{
    p.insert_instruction(ins, set_stream{n});
@@ -99,7 +99,11 @@ static const std::unordered_map<std::string, std::size_t>& weight_map()
std::size_t schedule_model::weight(const operation& op) const
{
    if(weight_map().count(op.name()) == 0)
    {
        if(is_context_free(op) or op.name()[0] == '@')
            return 0;
        return 1;
    }
    return weight_map().at(op.name());
}
...
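The weight fallback above decides which operations the partitioner will follow onto a separate stream: ops listed in weight_map keep their table weight, builtins (names starting with '@') and context-free ops cost nothing, and any other op gets a nominal weight of 1. A standalone sketch of the same rule, with a stubbed is_context_free and hypothetical table entries:

#include <cstddef>
#include <string>
#include <unordered_map>

// Stub for illustration only; in MIGraphX this is a property of the operation
bool is_context_free_stub(const std::string& name) { return name == "contiguous"; }

std::size_t weight(const std::string& name)
{
    // Hypothetical table entries; the real weight_map lives in schedule_model.cpp
    static const std::unordered_map<std::string, std::size_t> table = {
        {"convolution", 4}, {"gemm", 4}};
    auto it = table.find(name);
    if(it == table.end())
    {
        // Builtins and context-free ops are never worth a separate partition
        if(is_context_free_stub(name) or name[0] == '@')
            return 0;
        return 1; // nominal weight for any other unlisted op
    }
    return it->second;
}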
@@ -54,8 +54,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx) const
        dead_code_elimination{},
        write_literals{&ctx},
        schedule{gpu::schedule_model{ctx.get_current_device().nstreams()}},
        // memory_coloring{"hip::allocate"},
        // eliminate_workspace{},
        eliminate_allocation{"hip::allocate"},
        check_context<context>{},
        dead_code_elimination{}
...
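Commenting out memory_coloring (and eliminate_workspace, which works with it) follows from the commit title: memory coloring lets otherwise independent instructions reuse the same buffer based on liveness in a sequential execution order, and that reuse would presumably introduce races once those instructions run concurrently on different streams.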