Unverified Commit 2b0b77f1 authored by Paul Fultz II's avatar Paul Fultz II Committed by GitHub
Browse files

Merge pull request #175 from ROCmSoftwarePlatform/stream_execution_checkin

Stream execution checkin
parents fa38a3a6 7ede2b3b
......@@ -17,6 +17,7 @@ add_library(migraphx
instruction.cpp
program.cpp
shape.cpp
schedule.cpp
simplify_algebra.cpp
simplify_reshapes.cpp
opt/memory_coloring.cpp
......
......@@ -41,9 +41,9 @@ void dead_code_elimination::apply(program& p) const
// Skip the last instruction
if(i == last)
break;
// Skip instruction with empty shape as output unless its a builtin or undefined
if(i->get_shape().elements() == 0 and not(i->name().front() == '@') and
not(i->name() == "undefined"))
// Skip instruction with empty shape as output unless its a builtin or undefined or identity
if(i->get_shape().elements() == 0 and i->name().front() != '@' and
i->name() != "undefined" and i->name() != "identity")
continue;
assert(bidistance(p, i, last) > 0);
fix([&](auto self, auto leaf) {
......
......@@ -137,6 +137,18 @@ auto fold(F f)
return [=](auto&&... xs) { return fold_impl(f, std::forward<decltype(xs)>(xs)...); };
}
template <class F, class Proj>
auto by(F f, Proj proj)
{
return [=](auto&&... xs) { return f(proj(std::forward<decltype(xs)>(xs))...); };
}
template <class T>
auto index_of(T& x)
{
return [&](auto&& y) { return x[y]; };
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
......@@ -3,21 +3,64 @@
#include <cassert>
#include <type_traits>
#include <iterator>
#include <migraphx/config.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
template <class T>
struct iterator_for_select
{
template <class T>
static T deref(T x)
{
return x;
}
template <class T>
static auto begin(T* x)
{
return x->begin();
}
template <class T>
static auto end(T* x)
{
return x->end();
}
};
struct iterator_for_select_reverse
{
template <class T>
static auto deref(T x)
{
return std::prev(x.base());
}
template <class T>
static auto begin(T* x)
{
return std::make_reverse_iterator(x->end());
}
template <class T>
static auto end(T* x)
{
return std::make_reverse_iterator(x->begin());
}
};
template <class T, class Selector = iterator_for_select>
struct iterator_for_range
{
T* base;
using base_iterator = std::remove_reference_t<decltype(base->begin())>;
using base_iterator = std::remove_reference_t<decltype(Selector::begin(base))>;
struct iterator
{
base_iterator i;
base_iterator operator*() const { return i; }
auto operator*() const { return Selector::deref(i); }
base_iterator operator++() { return ++i; }
bool operator!=(const iterator& rhs) const { return i != rhs.i; }
};
......@@ -25,12 +68,12 @@ struct iterator_for_range
iterator begin()
{
assert(base != nullptr);
return {base->begin()};
return {Selector::begin(base)};
}
iterator end()
{
assert(base != nullptr);
return {base->end()};
return {Selector::end(base)};
}
};
template <class T>
......@@ -39,6 +82,12 @@ iterator_for_range<T> iterator_for(T& x)
return {&x};
}
template <class T>
iterator_for_range<T, iterator_for_select_reverse> reverse_iterator_for(T& x)
{
return {&x};
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......
......@@ -1174,9 +1174,19 @@ struct load
}
argument compute(const shape&, const std::vector<argument>& args) const
{
if((offset + s.bytes()) > args[0].get_shape().bytes())
MIGRAPHX_THROW("Load access is out of bounds");
return {s, args[0].data() + offset};
}
int output_alias(const std::vector<shape>&) const { return 0; }
friend std::ostream& operator<<(std::ostream& os, const load& op)
{
os << op.name() << "[";
os << "offset=" << op.offset << ",";
os << "end=" << (op.offset + op.s.bytes()) << "]";
return os;
}
};
struct outline
......
......@@ -9,6 +9,7 @@
#include <migraphx/instruction_ref.hpp>
#include <migraphx/target.hpp>
#include <migraphx/tracer.hpp>
#include <migraphx/env.hpp>
#include <migraphx/config.hpp>
#include <algorithm>
#include <iostream>
......@@ -16,6 +17,9 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_COMPILE)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_EVAL)
struct program_impl;
const operation& get_operation(instruction_ref ins);
......@@ -107,6 +111,8 @@ struct program
void dry_run(parameter_map params) const;
void annotate(std::ostream& os, std::function<void(instruction_ref)> a) const;
friend std::ostream& operator<<(std::ostream& os, const program& p);
friend bool operator==(const program& x, const program& y);
friend bool operator!=(const program& x, const program& y) { return !(x == y); }
......
#ifndef MIGRAPHX_GUARD_RTGLIB_SCHEDULE_HPP
#define MIGRAPHX_GUARD_RTGLIB_SCHEDULE_HPP
#include <string>
#include <migraphx/instruction_ref.hpp>
#include <migraphx/schedule_model.hpp>
#include <migraphx/config.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct program;
/**
* Schedule instructions for concurrent execution
*/
struct schedule
{
schedule_model model{};
std::string name() const { return "schedule"; }
void apply(program& p) const;
};
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#ifndef MIGRAPHX_GUARD_SCHEDULE_MODEL_HPP
#define MIGRAPHX_GUARD_SCHEDULE_MODEL_HPP
#include <cassert>
#include <string>
#include <functional>
#include <memory>
#include <type_traits>
#include <utility>
#include <migraphx/config.hpp>
#include <migraphx/instruction_ref.hpp>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct program;
struct operation;
#ifdef DOXYGEN
/// An interface for target-dependent model for the scheduler
struct schedule_model
{
/// Get the number of concurrent instruction allowed
std::size_t concurrency() const;
/// Schedule a concurrent instruction
void sched(program& p, instruction_ref ins, std::size_t n) const;
// Insert necessary waits before an instruction
void wait(program& p, instruction_ref ins, std::size_t wait_id) const;
// Insert necessary records after an instruction
void record(program& p, instruction_ref ins, std::size_t wait_id) const;
/// Compute weights for an operation
std::size_t weight(const operation& op) const;
};
#else
/*
* Type-erased interface for:
*
* struct schedule_model
* {
* std::size_t concurrency() const;
* void sched(program& p,instruction_ref ins,std::size_t n) const;
* void wait(program& p,instruction_ref ins,std::size_t wait_id) const;
* void record(program& p,instruction_ref ins,std::size_t wait_id) const;
* std::size_t weight(const operation& op) const;
* };
*
*/
struct schedule_model
{
// Constructors
schedule_model() = default;
template <typename PrivateDetailTypeErasedT>
schedule_model(PrivateDetailTypeErasedT value)
: private_detail_te_handle_mem_var(
std::make_shared<private_detail_te_handle_type<
typename std::remove_reference<PrivateDetailTypeErasedT>::type>>(
std::forward<PrivateDetailTypeErasedT>(value)))
{
}
// Assignment
template <typename PrivateDetailTypeErasedT>
schedule_model& operator=(PrivateDetailTypeErasedT value)
{
if(private_detail_te_handle_mem_var.unique())
*private_detail_te_handle_mem_var = std::forward<PrivateDetailTypeErasedT>(value);
else if(!private_detail_te_handle_mem_var)
private_detail_te_handle_mem_var = std::make_shared<PrivateDetailTypeErasedT>(
std::forward<PrivateDetailTypeErasedT>(value));
return *this;
}
// Cast
template <typename PrivateDetailTypeErasedT>
PrivateDetailTypeErasedT* any_cast()
{
return private_detail_te_get_handle().type() == typeid(PrivateDetailTypeErasedT)
? std::addressof(static_cast<private_detail_te_handle_type<
typename std::remove_cv<PrivateDetailTypeErasedT>::type>&>(
private_detail_te_get_handle())
.private_detail_te_value)
: nullptr;
}
template <typename PrivateDetailTypeErasedT>
const typename std::remove_cv<PrivateDetailTypeErasedT>::type* any_cast() const
{
return private_detail_te_get_handle().type() == typeid(PrivateDetailTypeErasedT)
? std::addressof(static_cast<const private_detail_te_handle_type<
typename std::remove_cv<PrivateDetailTypeErasedT>::type>&>(
private_detail_te_get_handle())
.private_detail_te_value)
: nullptr;
}
const std::type_info& type_id() const
{
if(private_detail_te_handle_empty())
return typeid(std::nullptr_t);
else
return private_detail_te_get_handle().type();
}
std::size_t concurrency() const
{
assert((*this).private_detail_te_handle_mem_var);
return (*this).private_detail_te_get_handle().concurrency();
}
void sched(program& p, instruction_ref ins, std::size_t n) const
{
assert((*this).private_detail_te_handle_mem_var);
(*this).private_detail_te_get_handle().sched(p, ins, n);
}
void wait(program& p, instruction_ref ins, std::size_t wait_id) const
{
assert((*this).private_detail_te_handle_mem_var);
(*this).private_detail_te_get_handle().wait(p, ins, wait_id);
}
void record(program& p, instruction_ref ins, std::size_t wait_id) const
{
assert((*this).private_detail_te_handle_mem_var);
(*this).private_detail_te_get_handle().record(p, ins, wait_id);
}
std::size_t weight(const operation& op) const
{
assert((*this).private_detail_te_handle_mem_var);
return (*this).private_detail_te_get_handle().weight(op);
}
friend bool is_shared(const schedule_model& private_detail_x,
const schedule_model& private_detail_y)
{
return private_detail_x.private_detail_te_handle_mem_var ==
private_detail_y.private_detail_te_handle_mem_var;
}
private:
struct private_detail_te_handle_base_type
{
virtual ~private_detail_te_handle_base_type() {}
virtual std::shared_ptr<private_detail_te_handle_base_type> clone() const = 0;
virtual const std::type_info& type() const = 0;
virtual std::size_t concurrency() const = 0;
virtual void sched(program& p, instruction_ref ins, std::size_t n) const = 0;
virtual void wait(program& p, instruction_ref ins, std::size_t wait_id) const = 0;
virtual void record(program& p, instruction_ref ins, std::size_t wait_id) const = 0;
virtual std::size_t weight(const operation& op) const = 0;
};
template <typename PrivateDetailTypeErasedT>
struct private_detail_te_handle_type : private_detail_te_handle_base_type
{
template <typename PrivateDetailTypeErasedU = PrivateDetailTypeErasedT>
private_detail_te_handle_type(
PrivateDetailTypeErasedT value,
typename std::enable_if<std::is_reference<PrivateDetailTypeErasedU>::value>::type* =
nullptr)
: private_detail_te_value(value)
{
}
template <typename PrivateDetailTypeErasedU = PrivateDetailTypeErasedT>
private_detail_te_handle_type(
PrivateDetailTypeErasedT value,
typename std::enable_if<!std::is_reference<PrivateDetailTypeErasedU>::value,
int>::type* = nullptr) noexcept
: private_detail_te_value(std::move(value))
{
}
std::shared_ptr<private_detail_te_handle_base_type> clone() const override
{
return std::make_shared<private_detail_te_handle_type>(private_detail_te_value);
}
const std::type_info& type() const override { return typeid(private_detail_te_value); }
std::size_t concurrency() const override { return private_detail_te_value.concurrency(); }
void sched(program& p, instruction_ref ins, std::size_t n) const override
{
private_detail_te_value.sched(p, ins, n);
}
void wait(program& p, instruction_ref ins, std::size_t wait_id) const override
{
private_detail_te_value.wait(p, ins, wait_id);
}
void record(program& p, instruction_ref ins, std::size_t wait_id) const override
{
private_detail_te_value.record(p, ins, wait_id);
}
std::size_t weight(const operation& op) const override
{
return private_detail_te_value.weight(op);
}
PrivateDetailTypeErasedT private_detail_te_value;
};
template <typename PrivateDetailTypeErasedT>
struct private_detail_te_handle_type<std::reference_wrapper<PrivateDetailTypeErasedT>>
: private_detail_te_handle_type<PrivateDetailTypeErasedT&>
{
private_detail_te_handle_type(std::reference_wrapper<PrivateDetailTypeErasedT> ref)
: private_detail_te_handle_type<PrivateDetailTypeErasedT&>(ref.get())
{
}
};
bool private_detail_te_handle_empty() const
{
return private_detail_te_handle_mem_var == nullptr;
}
const private_detail_te_handle_base_type& private_detail_te_get_handle() const
{
assert(private_detail_te_handle_mem_var != nullptr);
return *private_detail_te_handle_mem_var;
}
private_detail_te_handle_base_type& private_detail_te_get_handle()
{
assert(private_detail_te_handle_mem_var != nullptr);
if(!private_detail_te_handle_mem_var.unique())
private_detail_te_handle_mem_var = private_detail_te_handle_mem_var->clone();
return *private_detail_te_handle_mem_var;
}
std::shared_ptr<private_detail_te_handle_base_type> private_detail_te_handle_mem_var;
};
template <typename ValueType>
inline const ValueType* any_cast(const schedule_model* x)
{
return x->any_cast<ValueType>();
}
template <typename ValueType>
inline ValueType* any_cast(schedule_model* x)
{
return x->any_cast<ValueType>();
}
template <typename ValueType>
inline ValueType& any_cast(schedule_model& x)
{
auto* y = x.any_cast<typename std::remove_reference<ValueType>::type>();
if(y == nullptr)
throw std::bad_cast();
return *y;
}
template <typename ValueType>
inline const ValueType& any_cast(const schedule_model& x)
{
const auto* y = x.any_cast<typename std::remove_reference<ValueType>::type>();
if(y == nullptr)
throw std::bad_cast();
return *y;
}
#endif
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
......@@ -36,6 +36,8 @@ inline stream_range_container<Range> stream_range(const Range& r)
namespace detail {
inline void stream_write_value_impl(rank<2>, std::ostream& os, const std::string& x) { os << x; }
template <class Range>
auto stream_write_value_impl(rank<1>, std::ostream& os, const Range& r)
-> decltype(r.begin(), r.end(), void())
......@@ -53,7 +55,7 @@ void stream_write_value_impl(rank<0>, std::ostream& os, const T& x)
template <class T>
void stream_write_value(std::ostream& os, const T& x)
{
detail::stream_write_value_impl(rank<1>{}, os, x);
detail::stream_write_value_impl(rank<2>{}, os, x);
}
} // namespace MIGRAPHX_INLINE_NS
......
......@@ -22,10 +22,8 @@ struct target
{
/// A unique name used to identify the target
std::string name() const;
/// The transformation passes to be run
/**
* @brief The transformation pass to be run during compilation.
* @details [long description]
*
* @param ctx This is the target-dependent context that is created by `get_context`
* @return The passes to be ran
......
#ifndef MIGRAPHX_GUARD_RTGLIB_COMMON_HEADER_HPP
#define MIGRAPHX_GUARD_RTGLIB_COMMON_HEADER_HPP
#include <migraphx/program.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/pass_config.hpp>
#include <migraphx/config.hpp>
#include <set>
#include <list>
#include <vector>
#include <queue>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
//#define MIGRAPHX_DEBUG_OPT
#ifdef MIGRAPHX_DEBUG_OPT
#define MIGRAPHX_DEBUG(s) s
#else
#define MIGRAPHX_DEBUG(s)
#endif // MIGRAPHX_DEBUG_OPT
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif // MIGRAPHX_GUARD_RTGLIB_COMMON_HEADER_HPP
#ifndef MIGRAPHX_GUARD_RTGLIB_MEMORY_COLORING_IMPL_HPP
#define MIGRAPHX_GUARD_RTGLIB_MEMORY_COLORING_IMPL_HPP
#include "common_header.hpp"
#include <migraphx/program.hpp>
#include <migraphx/stringutils.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/pass_config.hpp>
#include <migraphx/config.hpp>
#include <set>
#include <list>
#include <vector>
#include <queue>
#ifdef MIGRAPHX_DEBUG_OPT
#define MIGRAPHX_DEBUG(s) s
#else
#define MIGRAPHX_DEBUG(s)
#endif // MIGRAPHX_DEBUG_OPT
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
......
......@@ -15,9 +15,6 @@
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_COMPILE)
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_TRACE_EVAL)
struct program_impl
{
// A list is used to keep references to an instruction stable
......@@ -107,11 +104,9 @@ instruction_ref program::insert_instruction(instruction_ref ins,
args.begin(), args.end(), [&](instruction_ref x) { return has_instruction(x); }) &&
"Argument is not an exisiting instruction");
assert(not starts_with(op.name(), "@"));
// TODO: Use move
shape r = compute_shape(op, args);
auto result = impl->instructions.insert(ins, {op, r, std::move(args)});
instruction::backreference(result);
// assert(result->inputs() == args);
assert(result->valid(begin()));
return result;
}
......@@ -510,6 +505,16 @@ void program::perf_report(std::ostream& os, std::size_t n, parameter_map params)
void program::debug_print() const { std::cout << *this << std::endl; }
void program::debug_print(instruction_ref ins) const
{
if(ins == this->end())
{
std::cout << "End instruction" << std::endl;
return;
}
if(not has_instruction(ins))
{
std::cout << "Instruction not part of program" << std::endl;
return;
}
std::stringstream ss;
print_program(ss, *this, [&](auto x, auto&& names) {
if(x == ins)
......@@ -532,6 +537,11 @@ void program::dry_run(std::unordered_map<std::string, argument> params) const
generic_eval(*this, ctx, std::move(params), [](auto&&...) { return argument{}; });
}
void program::annotate(std::ostream& os, std::function<void(instruction_ref)> a) const
{
print_program(os, *this, [&](auto ins, auto&&) { a(ins); });
}
bool operator==(const program& x, const program& y) { return to_string(x) == to_string(y); }
std::ostream& operator<<(std::ostream& os, const program& p)
......
#include <migraphx/schedule.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/operators.hpp>
#include <migraphx/iterator_for.hpp>
#include <migraphx/dfor.hpp>
#include <migraphx/functional.hpp>
#include <migraphx/ranges.hpp>
#include <unordered_map>
#include <unordered_set>
#include <set>
#include <deque>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
auto get_inputs()
{
return [](auto i) { return i->inputs(); };
}
auto get_outputs()
{
return [](auto i) { return i->outputs(); };
}
struct stream_info
{
std::unordered_map<instruction_ref, std::size_t> ins2stream;
std::unordered_map<instruction_ref, std::size_t> weights;
std::unordered_map<instruction_ref, std::size_t> iweights;
void accumulate_weights(instruction_ref last, const schedule_model& model)
{
fix<std::size_t>([&](auto self, auto ins) -> std::size_t {
if(not contains(weights, ins))
{
std::size_t weight = 0;
auto&& op = ins->get_operator();
if(not is_context_free(op) and op.name()[0] != '@')
weight = model.weight(op);
iweights[ins] = weight;
weights[ins] =
std::accumulate(ins->inputs().begin(),
ins->inputs().end(),
weight,
[&](std::size_t w, instruction_ref i) { return w + self(i); });
}
return weights[ins];
})(last);
}
std::vector<instruction_ref>::iterator sort_args(std::vector<instruction_ref>& args)
{
if(args.size() < 2)
{
return args.end();
}
const std::size_t min_partition_threshold = 2;
auto compare = by(std::greater<>{}, [&](auto x) {
return std::make_tuple(this->weights[x], x->inputs().size());
});
std::sort(args.begin(), args.end(), compare);
auto it = std::lower_bound(std::next(args.begin()),
args.end(),
min_partition_threshold,
[&](auto i, std::size_t w) { return this->weights[i] > w; });
assert(it == args.end() or this->weights[*it] <= min_partition_threshold);
assert(it == args.end() or std::prev(it) == args.begin() or
this->weights[*std::prev(it)] > min_partition_threshold);
return it;
}
struct partition
{
std::size_t weight = 0;
std::vector<instruction_ref> instructions{};
void add(instruction_ref ins, std::size_t w)
{
weight += w;
instructions.push_back(ins);
}
};
void assign_streams(program& p, std::size_t n)
{
partition critical;
std::unordered_map<instruction_ref, std::deque<partition>> partitions;
partitions.reserve(weights.size());
fix([&](auto self, auto ins, auto& part) {
assert(ins != p.end());
if(contains(partitions, ins))
return;
assert(p.has_instruction(ins));
// Add an entry so we know the instruction was visited
partitions[ins];
part.add(ins, this->iweights[ins]);
auto args = ins->inputs();
auto threshold_it = this->sort_args(args);
if(not args.empty())
{
assert(threshold_it != args.begin());
self(args.front(), part);
for(auto i : range(std::next(args.begin()), threshold_it))
{
partitions[ins].emplace_back();
self(i, partitions[ins].back());
}
for(auto i : range(threshold_it, args.end()))
{
self(i, part);
}
}
// Sort instructions
p.move_instruction(ins, p.end());
})(std::prev(p.end()), critical);
// Set the critical partition to stream 0
set_stream(critical, 0);
std::vector<std::size_t> streams(n - 1);
// Assign streams for the other partitions
for(auto&& ins_part : partitions)
{
std::sort(
ins_part.second.begin(), ins_part.second.end(), by(std::greater<>{}, [](auto&& x) {
return std::make_tuple(x.weight, x.instructions.size());
}));
for(auto&& part : ins_part.second)
{
auto stream = std::min_element(streams.begin(), streams.end()) - streams.begin();
set_stream(part, stream + 1);
streams[stream] += part.weight;
}
}
}
void set_stream(const partition& p, std::size_t n)
{
for(auto ins : p.instructions)
if(iweights[ins] > 0)
set_stream(ins, n);
}
void set_stream(instruction_ref ins, std::size_t n)
{
assert(iweights[ins] > 0);
ins2stream[ins] = n;
}
std::size_t get_stream(instruction_ref ins) const { return ins2stream.at(ins); }
bool has_stream(instruction_ref ins) const { return contains(ins2stream, ins); }
template <class F>
bool different(F f, std::size_t stream) const
{
bool result = false;
f([&](auto s) {
if(s != stream)
{
result = true;
return false;
}
// cppcheck-suppress uselessAssignmentArg
stream = s;
return true;
});
return result;
}
template <class F>
bool different(F f) const
{
bool result = false;
f([&](auto s) {
result = this->different(f, s);
return false;
});
return result;
}
template <class Selector>
auto get_streams_from(instruction_ref start, Selector select) const
{
return [=](auto f) {
return fix<bool>([&](auto self, auto ins) {
for(auto i : select(ins))
{
if(iweights.at(i) == 0)
{
if(not self(i))
return false;
}
else
{
if(not f(this->get_stream(i)))
return false;
}
}
return true;
})(start);
};
}
std::unordered_set<std::size_t> get_streams(instruction_ref ins) const
{
if(has_stream(ins))
return {get_stream(ins)};
std::unordered_set<std::size_t> result;
get_streams_from(ins, get_inputs())([&](auto s) {
result.insert(s);
return true;
});
return result;
}
template <class... Ts>
bool is_merge_point(instruction_ref ins, Ts... xs) const
{
return different(get_streams_from(ins, get_inputs()), xs...);
}
template <class... Ts>
bool is_split_point(instruction_ref ins, Ts... xs) const
{
return different(get_streams_from(ins, get_outputs()), xs...);
}
std::vector<instruction_ref> get_recorded_instructions(instruction_ref start)
{
std::vector<instruction_ref> result;
std::unordered_map<std::size_t, instruction_ref> m;
fix([&](auto self, auto ins) {
for(auto i : ins->inputs())
{
if(iweights.at(i) == 0)
{
self(i);
continue;
}
auto stream = this->get_stream(i);
if(not contains(m, stream))
m[stream] = i;
else
m[stream] = std::min(m[stream], i, by(std::less<>{}, [&](auto x) {
return std::distance(x, start);
}));
}
})(start);
std::transform(
m.begin(), m.end(), std::back_inserter(result), [](auto&& p) { return p.second; });
return result;
}
std::unordered_map<instruction_ref, std::vector<std::vector<instruction_ref>>>
find_concurrent_instructions(program& p)
{
std::unordered_map<instruction_ref, std::vector<std::vector<instruction_ref>>> result;
std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>> merge_from;
result.reserve(p.size());
merge_from.reserve(p.size());
for(auto ins : reverse_iterator_for(p))
{
for(auto&& arg : ins->outputs())
{
if(is_merge_point(arg))
merge_from[ins].insert(arg);
merge_from[ins].insert(merge_from[arg].begin(), merge_from[arg].end());
}
auto streams = this->get_streams(ins);
// Collect concur instructions for each merge point.
for(auto& merge : merge_from[ins])
{
for(auto stream : streams)
{
if(result[merge].size() <= stream)
result[merge].resize(stream + 1);
auto&& r = result[merge][stream];
r.push_back(ins);
// Copy inputs if they dont have a stream(and are not a builtin and context
// free). Inputs without a stream can have a implicit dependency
std::copy_if(ins->inputs().begin(),
ins->inputs().end(),
std::back_inserter(r),
[&](auto x) {
return not this->has_stream(x) and
not is_context_free(x->get_operator()) and
x->name().front() != '@';
});
}
}
}
return result;
}
std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>>
get_conflicts(program& p)
{
std::unordered_map<instruction_ref, std::unordered_set<instruction_ref>> conflict_table;
auto concur_ins = this->find_concurrent_instructions(p);
for(auto&& merge : concur_ins)
{
dfor(merge.second.size(), merge.second.size())([&](auto i, auto j) {
if(i == j)
return;
for(auto ins1 : merge.second[i])
{
auto p1 = std::distance(ins1, merge.first);
for(auto ins2 : merge.second[j])
{
if(ins1 == ins2)
continue;
auto p2 = std::distance(ins2, merge.first);
// The smaller distance means the instruction occurs later
if(p1 > p2)
conflict_table[ins2].insert(ins1);
else
conflict_table[ins1].insert(ins2);
}
}
});
}
// Remove duplicates
for(auto&& ip : conflict_table)
{
auto ins1 = ip.first;
for(auto ins2 : ip.second)
if(contains(conflict_table[ins2], ins1))
conflict_table[ins2].erase(ins1);
}
return conflict_table;
}
};
void schedule::apply(program& p) const
{
stream_info si;
auto last = std::prev(p.end());
si.accumulate_weights(last, model);
si.assign_streams(p, model.concurrency());
if(enabled(MIGRAPHX_TRACE_COMPILE{}))
{
p.annotate(std::cout, [&](auto ins) {
std::cout << ":";
std::cout << " weight=" << si.weights.at(ins);
std::cout << " input={";
si.get_streams_from(ins, get_inputs())([&](auto s) {
std::cout << s << ",";
return true;
});
std::cout << "}";
if(si.has_stream(ins))
std::cout << " stream=" << si.get_stream(ins);
});
std::cout << std::endl;
}
// Schedule instructions
std::size_t wait_id = 0;
std::unordered_map<instruction_ref, std::size_t> ins2wait;
std::unordered_map<std::size_t, std::unordered_set<std::size_t>> waited_for;
std::unordered_map<instruction_ref, std::unordered_set<std::size_t>> ins2waited;
ins2wait.reserve(p.size());
ins2waited.reserve(p.size());
for(auto ins : iterator_for(p))
{
// Only schedule instructions that have a stream
if(not si.has_stream(ins))
continue;
assert(si.weights[ins] > 0);
// Schedule instruction on the stream
auto stream = si.get_stream(ins);
assert(stream < model.concurrency());
model.sched(p, ins, stream);
// Insert wait instructions
if(si.is_merge_point(ins, stream))
{
for(auto i : si.get_recorded_instructions(ins))
{
if(not si.has_stream(i))
continue;
auto istream = si.get_stream(i);
if(stream == istream)
continue;
// Create a new event if it hasn't been recorded
if(not contains(ins2wait, i))
{
ins2wait[i] = wait_id;
model.record(p, i, wait_id);
wait_id++;
}
auto w = ins2wait.at(i);
// If we already waited for the event on this stream then dont
// insert another wait event
if(not contains(waited_for[stream], w))
model.wait(p, ins, w);
// Store the event as waited
waited_for[stream].insert(w);
// Store all wait events that have been waited on prior to the recorded instruction
waited_for[stream].insert(ins2waited[i].begin(), ins2waited[i].end());
}
}
// Store wait events that have already been waited on
if(si.is_split_point(ins, stream))
{
ins2waited[ins] = waited_for[stream];
}
}
// Add memory conflicts
auto conflict_table = si.get_conflicts(p);
for(auto&& ip : conflict_table)
{
if(ip.second.empty())
continue;
std::vector<instruction_ref> args;
args.push_back(ip.first);
args.insert(args.end(), ip.second.begin(), ip.second.end());
p.insert_instruction(std::next(ip.first), op::identity{}, args);
}
}
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
......@@ -64,6 +64,7 @@ add_library(migraphx_gpu
pad.cpp
gather.cpp
lrn.cpp
schedule_model.cpp
)
set_target_properties(migraphx_gpu PROPERTIES EXPORT_NAME gpu)
rocm_clang_tidy_check(migraphx_gpu)
......
......@@ -21,7 +21,7 @@ argument miopen_convolution::compute(context& ctx,
float alpha = 1;
float beta = 0;
miopenConvolutionForward(ctx.get_stream().get_miopen(),
auto status = miopenConvolutionForward(ctx.get_stream().get_miopen(),
&alpha,
x_desc.get(),
args[0].implicit(),
......@@ -34,6 +34,8 @@ argument miopen_convolution::compute(context& ctx,
args[3].implicit(),
args[2].implicit(),
args[2].get_shape().bytes());
if(status != miopenStatusSuccess)
MIGRAPHX_THROW("Running convolution failed");
return args[3];
}
......@@ -89,8 +91,11 @@ void miopen_convolution::finalize(context& ctx,
{
if(handle == ctx.get_stream().get_miopen())
return;
// TODO: Check that workspace hasn't changed
compile(ctx, output_shape, std::move(inputs));
// Check that workspace hasn't changed
auto size = inputs.at(2).bytes();
auto ws = compile(ctx, output_shape, std::move(inputs));
if(ws.bytes() > size)
MIGRAPHX_THROW("Workspace has changed during finalization.");
}
} // namespace gpu
......
......@@ -43,6 +43,7 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
template <class T>
std::vector<T> read_from_gpu(const void* x, std::size_t sz)
{
gpu_sync();
std::vector<T> result(sz);
auto status = hipMemcpy(result.data(), x, sz * sizeof(T), hipMemcpyDeviceToHost);
if(status != hipSuccess)
......@@ -52,6 +53,7 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz)
hip_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false)
{
gpu_sync();
auto result = allocate_gpu(sz, host);
auto status = hipMemcpy(result.get(), x, sz, hipMemcpyHostToDevice);
if(status != hipSuccess)
......
......@@ -13,11 +13,17 @@ namespace gpu {
MIGRAPHX_DECLARE_ENV_VAR(MIGRAPHX_ENABLE_NULL_STREAM)
using hip_event_ptr = MIGRAPHX_MANAGE_PTR(hipEvent_t, hipEventDestroy);
struct hip_device
{
hip_device() { add_stream(); }
hip_device(std::size_t id) : device_id(id) { add_stream(); }
hip_device(std::size_t id, std::size_t n) : device_id(id)
{
for(std::size_t i = 0; i < n; i++)
add_stream();
}
struct stream
{
......@@ -32,7 +38,7 @@ struct hip_device
static hip_stream_ptr create_stream()
{
hipStream_t result = nullptr;
auto status = hipStreamCreate(&result);
auto status = hipStreamCreateWithFlags(&result, hipStreamNonBlocking);
if(status != hipSuccess)
MIGRAPHX_THROW("Failed to allocate stream");
return hip_stream_ptr{result};
......@@ -77,6 +83,22 @@ struct hip_device
return rbhandle.get();
}
void wait(hipEvent_t event)
{
setup();
auto status = hipStreamWaitEvent(get(), event, 0);
if(status != hipSuccess)
MIGRAPHX_THROW("Failed to wait.");
}
void record(hipEvent_t event)
{
setup();
auto status = hipEventRecord(event, get());
if(status != hipSuccess)
MIGRAPHX_THROW("Failed to record.");
}
private:
std::size_t id = 0;
shared<hip_stream_ptr> s = nullptr;
......@@ -88,8 +110,14 @@ struct hip_device
stream& get_stream() { return streams.at(current_stream); }
stream& get_stream(std::size_t n) { return streams.at(n); }
void set_stream(std::size_t n) { current_stream = n; }
std::size_t nstreams() const { return streams.size(); }
std::size_t stream_id() const { return current_stream; }
private:
std::size_t device_id = 0;
std::size_t current_stream = 0;
......@@ -98,7 +126,10 @@ struct hip_device
struct context
{
context(std::size_t n = 0) : current_device(std::make_shared<hip_device>(n)) {}
context(std::size_t device_id = 0, std::size_t n = 4)
: current_device(std::make_shared<hip_device>(device_id, n))
{
}
hip_device& get_current_device()
{
......@@ -107,13 +138,34 @@ struct context
}
hip_device::stream& get_stream() { return get_current_device().get_stream(); }
hip_device::stream& get_stream(std::size_t n) { return get_current_device().get_stream(n); }
void set_stream(std::size_t n) { get_current_device().set_stream(n); }
void create_events(std::size_t num_of_events)
{
for(std::size_t i = events.size(); i < num_of_events + 1; ++i)
events.emplace_back(create_event());
}
hipEvent_t get_event(std::size_t i) const { return events.at(i).get(); }
std::vector<argument> literals{};
void finish() const { gpu_sync(); }
static hip_event_ptr create_event()
{
hipEvent_t event;
auto status = hipEventCreateWithFlags(&event, hipEventDisableTiming);
if(status != hipSuccess)
MIGRAPHX_THROW("Failed to create event");
return hip_event_ptr{event};
}
private:
// TODO: Make this a vector to support multiple devices
std::shared_ptr<hip_device> current_device;
std::vector<shared<hip_event_ptr>> events;
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
......
#ifndef MIGRAPHX_GUARD_RTGLIB_GPU_SCHEDULE_MODEL_HPP
#define MIGRAPHX_GUARD_RTGLIB_GPU_SCHEDULE_MODEL_HPP
#include <migraphx/config.hpp>
#include <migraphx/instruction_ref.hpp>
#include <vector>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
struct program;
struct operation;
namespace gpu {
struct schedule_model
{
std::size_t streams = 0;
std::size_t concurrency() const;
void sched(program& p, instruction_ref ins, std::size_t n) const;
void wait(program& p, instruction_ref ins, std::size_t wait_id) const;
void record(program& p, instruction_ref ins, std::size_t wait_id) const;
std::size_t weight(const operation& op) const;
};
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
#endif
#include <migraphx/gpu/schedule_model.hpp>
#include <migraphx/gpu/context.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/operation.hpp>
namespace migraphx {
inline namespace MIGRAPHX_INLINE_NS {
namespace gpu {
struct record_event
{
std::size_t event = 0;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.event, "event"));
}
std::string name() const { return "gpu::record_event"; }
shape compute_shape(const std::vector<shape>&) const { return {}; }
argument compute(context& ctx, const shape&, const std::vector<argument>&) const
{
ctx.get_stream().record(ctx.get_event(event));
return {};
}
void finalize(context& ctx, const shape&, const std::vector<shape>&)
{
ctx.create_events(event);
}
};
struct wait_event
{
std::size_t event = 0;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.event, "event"));
}
std::string name() const { return "gpu::wait_event"; }
shape compute_shape(const std::vector<shape>&) const { return {}; }
argument compute(context& ctx, const shape&, const std::vector<argument>&) const
{
ctx.get_stream().wait(ctx.get_event(event));
return {};
}
};
struct set_stream
{
std::size_t stream = 0;
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.stream, "stream"));
}
std::string name() const { return "gpu::set_stream"; }
shape compute_shape(const std::vector<shape>&) const { return {}; }
argument compute(context& ctx, const shape&, const std::vector<argument>&) const
{
ctx.set_stream(stream);
return {};
}
void finalize(context& ctx, const shape&, const std::vector<shape>&) { ctx.set_stream(stream); }
};
std::size_t schedule_model::concurrency() const { return streams; }
void schedule_model::sched(program& p, instruction_ref ins, std::size_t n) const
{
auto last_stream = std::find_if(std::make_reverse_iterator(ins),
std::make_reverse_iterator(p.begin()),
[&](auto&& i) { return i.name() == "gpu::set_stream"; });
if(last_stream != std::make_reverse_iterator(p.begin()))
{
auto&& op = any_cast<set_stream>(last_stream->get_operator());
// If the same stream was set earlier then skip
if(op.stream == n)
return;
}
p.insert_instruction(ins, set_stream{n});
}
void schedule_model::wait(program& p, instruction_ref ins, std::size_t wait_id) const
{
p.insert_instruction(ins, wait_event{wait_id});
}
void schedule_model::record(program& p, instruction_ref ins, std::size_t wait_id) const
{
p.insert_instruction(std::next(ins), record_event{wait_id});
}
static std::unordered_map<std::string, std::size_t> create_weight_map()
{
return {
{"hip::load_literal", 0},
{"hip::allocate", 0},
{"gpu::convolution", 4},
{"gpu::conv_bias_relu", 4},
{"gpu::pooling", 2},
{"gpu::gemm", 2},
{"gpu::concat", 1},
{"hip::add_relu", 2},
};
}
static const std::unordered_map<std::string, std::size_t>& weight_map()
{
static std::unordered_map<std::string, std::size_t> m = create_weight_map();
return m;
}
std::size_t schedule_model::weight(const operation& op) const
{
if(weight_map().count(op.name()) == 0)
{
return 1;
}
return weight_map().at(op.name());
}
} // namespace gpu
} // namespace MIGRAPHX_INLINE_NS
} // namespace migraphx
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment