Commit c6d92ecc authored by Paul

Schedule concurrently without memory coloring

parent 9cd5cf4e
@@ -5,12 +5,68 @@
#include <migraphx/functional.hpp>
#include <migraphx/ranges.hpp>
#include <unordered_map>
#include <set>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct stream_info
{
    std::unordered_map<instruction_ref, std::size_t> ins2stream;

    void set_stream(instruction_ref ins, std::size_t n)
    {
        ins2stream[ins] = n;
    }

    std::size_t get_stream(instruction_ref ins) const
    {
        return ins2stream.at(ins);
    }

    bool has_stream(instruction_ref ins) const
    {
        return ins2stream.count(ins) > 0;
    }

    // True when the instructions in v are not all assigned to the same stream
    bool different(const std::vector<instruction_ref>& v) const
    {
        if(v.size() < 2)
            return false;
        auto stream = get_stream(v.front());
        return not std::all_of(v.begin(), v.end(), [&](instruction_ref x) {
            return get_stream(x) == stream;
        });
    }

    // An instruction whose outputs are scheduled on different streams
    bool is_split_point(instruction_ref ins) const
    {
        return different(ins->outputs());
    }

    // An instruction whose inputs are computed on different streams
    bool is_merge_point(instruction_ref ins) const
    {
        return different(ins->inputs());
    }

    // The streams, other than the instruction's own, that it must wait on
    std::vector<std::size_t> wait_for(instruction_ref ins) const
    {
        std::set<std::size_t> result;
        auto s = get_stream(ins);
        for(auto i : ins->inputs())
        {
            auto stream = get_stream(i);
            if(stream != s)
                result.insert(stream);
        }
        return {result.begin(), result.end()};
    }
};
void schedule::apply(program& p) const
{
    const std::size_t min_partition_threshold = 2;
    // Compute accumulated weights
    std::unordered_map<instruction_ref, std::size_t> weights;
    auto last = std::prev(p.end());
@@ -26,6 +82,44 @@ void schedule::apply(program& p) const
        return weights[ins];
    })(last);
    // Assign streams
    auto streams = model.concurrency();
    stream_info si;
    for(std::size_t stream = 0; stream < streams; stream++)
    {
        fix([&](auto self, auto ins) {
            // Only assign a stream if one is not already assigned
            if(not si.has_stream(ins))
                si.set_stream(ins, stream);
            instruction_ref child = p.end();
            std::size_t w = 0;
            for(auto i : ins->inputs())
            {
                const auto weight = weights[i];
                // Recurse into inputs that already have a stream assignment or
                // whose weight is too low to justify a separate partition
                if(si.has_stream(i) or weight <= min_partition_threshold)
                {
                    self(i);
                }
                // Otherwise track the heaviest child; lighter heavy children are
                // left unassigned for a later stream's walk to pick up
                else if(weight > w)
                {
                    child = i;
                    w = weight;
                }
            }
            // Follow the heaviest child so its chain stays on the current stream
            if(child != p.end())
                self(child);
        })(last);
    }
    // Assign any remaining instructions to the last stream
    for(auto ins : iterator_for(p))
    {
        if(si.has_stream(ins))
            continue;
        si.set_stream(ins, streams - 1);
    }
    // Topo sort
    fix([&](auto self, auto ins) {
        for(auto i : ins->inputs())
@@ -33,6 +127,21 @@ void schedule::apply(program& p) const
        for(auto i : ins->inputs())
            self(i);
    })(last);
    // Schedule instructions
    for(auto ins : iterator_for(p))
    {
        // A merge point must wait on the streams that produced its inputs
        if(si.is_merge_point(ins))
        {
            assert(not si.wait_for(ins).empty());
            model.wait(p, ins, si.get_stream(ins), si.wait_for(ins));
            continue;
        }
        // Skip instructions that don't use the context (context-free or builtin ops)
        if(is_context_free(ins->get_operator()) or ins->get_operator().name().front() == '@')
            continue;
        model.schedule_instruction(p, ins, si.get_stream(ins));
    }
}
} // namespace MIGRAPHX_INLINE_NS
...
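Note on the scheduling pass above: apply() walks the program once per available stream, each walk greedily following the heaviest not-yet-assigned child so that it claims one long chain of heavy operations, and waits are then inserted wherever an instruction's inputs come from different streams. Below is a minimal standalone sketch of the merge-point bookkeeping, using plain integer node ids in place of instruction_ref and a hypothetical three-node graph; it is illustrative only and not part of this commit.

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <iostream>
#include <map>
#include <set>
#include <vector>

// Toy stand-in for stream_info: integer node ids replace instruction_ref
struct toy_stream_info
{
    std::map<int, std::size_t> node2stream;

    void set_stream(int node, std::size_t n) { node2stream[node] = n; }
    std::size_t get_stream(int node) const { return node2stream.at(node); }

    // Mirrors stream_info::different: true when nodes span several streams
    bool different(const std::vector<int>& v) const
    {
        if(v.size() < 2)
            return false;
        auto stream = get_stream(v.front());
        return not std::all_of(
            v.begin(), v.end(), [&](int x) { return get_stream(x) == stream; });
    }

    // Mirrors stream_info::wait_for: streams, other than the node's own,
    // that produced one of its inputs
    std::vector<std::size_t> wait_for(int node, const std::vector<int>& inputs) const
    {
        std::set<std::size_t> result;
        auto s = get_stream(node);
        for(int i : inputs)
            if(get_stream(i) != s)
                result.insert(get_stream(i));
        return {result.begin(), result.end()};
    }
};

int main()
{
    // Hypothetical graph: node 2 consumes nodes 0 and 1, which the
    // partitioner put on streams 0 and 1 respectively
    toy_stream_info si;
    si.set_stream(0, 0);
    si.set_stream(1, 1);
    si.set_stream(2, 0);

    std::vector<int> inputs = {0, 1};
    assert(si.different(inputs)); // node 2 is a merge point

    // Node 2 runs on stream 0, so it only needs to wait on stream 1
    for(auto s : si.wait_for(2, inputs))
        std::cout << "wait on stream " << s << "\n";
}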
@@ -43,6 +43,7 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
template <class T>
std::vector<T> read_from_gpu(const void* x, std::size_t sz)
{
    gpu_sync();
    std::vector<T> result(sz);
    auto status = hipMemcpy(result.data(), x, sz * sizeof(T), hipMemcpyDeviceToHost);
    if(status != hipSuccess)
@@ -52,6 +53,7 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz)
hip_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false)
{
    gpu_sync();
    auto result = allocate_gpu(sz, host);
    auto status = hipMemcpy(result.get(), x, sz, hipMemcpyHostToDevice);
    if(status != hipSuccess)
...
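The two gpu_sync() calls are the other half of making concurrency safe: a plain hipMemcpy orders itself only against the null stream, so once kernels run on several streams a copy could touch a buffer that another stream is still writing. This commit does not show the definition of gpu_sync(); a plausible sketch, assuming it is simply a device-wide barrier around hipDeviceSynchronize, would be:

#include <hip/hip_runtime.h>
#include <stdexcept>

// Assumed definition of gpu_sync(): block the host until work queued on
// every stream of the current device has finished
inline void gpu_sync()
{
    auto status = hipDeviceSynchronize();
    if(status != hipSuccess)
        throw std::runtime_error("hip device synchronization failed");
}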
@@ -62,7 +62,7 @@ struct set_stream
    void finalize(context& ctx, const shape&, const std::vector<shape>&) { ctx.set_stream(stream); }
};

std::size_t schedule_model::concurrency() const { return streams; }

void schedule_model::schedule_instruction(program& p, instruction_ref ins, std::size_t n) const
{
    p.insert_instruction(ins, set_stream{n});
@@ -99,7 +99,11 @@ static const std::unordered_map<std::string, std::size_t>& weight_map()
std::size_t schedule_model::weight(const operation& op) const
{
    if(weight_map().count(op.name()) == 0)
    {
        if(is_context_free(op) or op.name()[0] == '@')
            return 0;
        return 1;
    }
    return weight_map().at(op.name());
}
...
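The weight fallback above decides which operations the partitioner will follow onto a separate stream: ops listed in weight_map keep their table weight, builtins (names starting with '@') and context-free ops cost nothing, and any other op gets a nominal weight of 1. A standalone sketch of the same rule, with a stubbed is_context_free and hypothetical table entries:

#include <cstddef>
#include <string>
#include <unordered_map>

// Stub for illustration only; in MIGraphX this is a property of the operation
bool is_context_free_stub(const std::string& name) { return name == "contiguous"; }

std::size_t weight(const std::string& name)
{
    // Hypothetical table entries; the real weight_map lives in schedule_model.cpp
    static const std::unordered_map<std::string, std::size_t> table = {
        {"convolution", 4}, {"gemm", 4}};
    auto it = table.find(name);
    if(it == table.end())
    {
        // Builtins and context-free ops are never worth a separate partition
        if(is_context_free_stub(name) or name[0] == '@')
            return 0;
        return 1; // nominal weight for any other unlisted op
    }
    return it->second;
}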
@@ -54,8 +54,8 @@ std::vector<pass> target::get_passes(migraphx::context& gctx) const
        dead_code_elimination{},
        write_literals{&ctx},
        schedule{gpu::schedule_model{ctx.get_current_device().nstreams()}},
        // memory_coloring{"hip::allocate"},
        // eliminate_workspace{},
        eliminate_allocation{"hip::allocate"},
        check_context<context>{},
        dead_code_elimination{}
...
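Commenting out memory_coloring (and eliminate_workspace, which works with it) follows from the commit title: memory coloring lets otherwise independent instructions reuse the same buffer based on liveness in a sequential execution order, and that reuse would presumably introduce races once those instructions run concurrently on different streams.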