Unverified Commit 6887a000 authored by Paul Fultz II's avatar Paul Fultz II Committed by GitHub
Browse files

Use hipStreamSynchronize instead of device sync (#813)

* Use hipStreamSynchronize instead of device sync

* Formatting

* Suppress FPs

* Use sync_stream instead of device

* Formatting

* Fix python bindings

* Formatting
parent a2e33148
...@@ -191,6 +191,8 @@ rocm_enable_cppcheck( ...@@ -191,6 +191,8 @@ rocm_enable_cppcheck(
definePrefix:*test/include/test.hpp definePrefix:*test/include/test.hpp
useSmartPointer:*src/api/api.cpp useSmartPointer:*src/api/api.cpp
useSmartPointer:*make_shared_array.hpp useSmartPointer:*make_shared_array.hpp
constParameter:*src/targets/gpu/*.cpp
constParameter:*src/targets/gpu/*.hpp
FORCE FORCE
INCONCLUSIVE INCONCLUSIVE
RULE_FILE RULE_FILE
......
...@@ -398,7 +398,7 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m) ...@@ -398,7 +398,7 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
m.def("allocate_gpu", &migraphx::gpu::allocate_gpu, py::arg("s"), py::arg("host") = false); m.def("allocate_gpu", &migraphx::gpu::allocate_gpu, py::arg("s"), py::arg("host") = false);
m.def("to_gpu", &migraphx::gpu::to_gpu, py::arg("arg"), py::arg("host") = false); m.def("to_gpu", &migraphx::gpu::to_gpu, py::arg("arg"), py::arg("host") = false);
m.def("from_gpu", &migraphx::gpu::from_gpu); m.def("from_gpu", &migraphx::gpu::from_gpu);
m.def("gpu_sync", &migraphx::gpu::gpu_sync); m.def("gpu_sync", [] { migraphx::gpu::gpu_sync(); });
#endif #endif
#ifdef VERSION_INFO #ifdef VERSION_INFO
......
...@@ -15,6 +15,7 @@ namespace gpu { ...@@ -15,6 +15,7 @@ namespace gpu {
MIGRAPHX_REGISTER_OP(hip_allocate) MIGRAPHX_REGISTER_OP(hip_allocate)
MIGRAPHX_REGISTER_OP(hip_sync_device) MIGRAPHX_REGISTER_OP(hip_sync_device)
MIGRAPHX_REGISTER_OP(hip_sync_stream)
MIGRAPHX_REGISTER_OP(hip_copy_to_gpu) MIGRAPHX_REGISTER_OP(hip_copy_to_gpu)
MIGRAPHX_REGISTER_OP(hip_copy_from_gpu) MIGRAPHX_REGISTER_OP(hip_copy_from_gpu)
MIGRAPHX_REGISTER_OP(hip_copy) MIGRAPHX_REGISTER_OP(hip_copy)
...@@ -146,6 +147,8 @@ void gpu_sync() ...@@ -146,6 +147,8 @@ void gpu_sync()
MIGRAPHX_THROW("hip device synchronization failed: " + hip_error(status)); MIGRAPHX_THROW("hip device synchronization failed: " + hip_error(status));
} }
void gpu_sync(const context& ctx) { ctx.finish(); }
void hip_async_copy(context& ctx, const argument& src, const argument& dst, hipMemcpyKind kind) void hip_async_copy(context& ctx, const argument& src, const argument& dst, hipMemcpyKind kind)
{ {
std::size_t src_size = src.get_shape().bytes(); std::size_t src_size = src.get_shape().bytes();
......
...@@ -97,6 +97,16 @@ struct hip_device ...@@ -97,6 +97,16 @@ struct hip_device
return rbhandle.get(); return rbhandle.get();
} }
void wait() const
{
if(s == nullptr)
return;
setup();
auto status = hipStreamSynchronize(s.get());
if(status != hipSuccess)
MIGRAPHX_THROW("Failed to wait.");
}
void wait(hipEvent_t event) void wait(hipEvent_t event)
{ {
setup(); setup();
...@@ -126,6 +136,10 @@ struct hip_device ...@@ -126,6 +136,10 @@ struct hip_device
stream& get_stream(std::size_t n) { return streams.at(n); } stream& get_stream(std::size_t n) { return streams.at(n); }
const stream& get_stream() const { return streams.at(current_stream); }
const stream& get_stream(std::size_t n) const { return streams.at(n); }
void set_stream(std::size_t n) { current_stream = n; } void set_stream(std::size_t n) { current_stream = n; }
std::size_t nstreams() const { return streams.size(); } std::size_t nstreams() const { return streams.size(); }
...@@ -163,9 +177,21 @@ struct context ...@@ -163,9 +177,21 @@ struct context
return *current_device; return *current_device;
} }
const hip_device& get_current_device() const
{
assert(current_device != nullptr);
return *current_device;
}
hip_device::stream& get_stream() { return get_current_device().get_stream(); } hip_device::stream& get_stream() { return get_current_device().get_stream(); }
hip_device::stream& get_stream(std::size_t n) { return get_current_device().get_stream(n); } hip_device::stream& get_stream(std::size_t n) { return get_current_device().get_stream(n); }
const hip_device::stream& get_stream() const { return get_current_device().get_stream(); }
const hip_device::stream& get_stream(std::size_t n) const
{
return get_current_device().get_stream(n);
}
void set_stream(std::size_t n) { get_current_device().set_stream(n); } void set_stream(std::size_t n) { get_current_device().set_stream(n); }
void create_events(std::size_t num_of_events) void create_events(std::size_t num_of_events)
...@@ -177,7 +203,7 @@ struct context ...@@ -177,7 +203,7 @@ struct context
hipEvent_t get_event(std::size_t i) const { return events.at(i).get(); } hipEvent_t get_event(std::size_t i) const { return events.at(i).get(); }
std::vector<argument> literals{}; std::vector<argument> literals{};
void finish() const { gpu_sync(); } void finish() const { get_stream().wait(); }
static hip_event_ptr create_event() static hip_event_ptr create_event()
{ {
......
...@@ -25,6 +25,7 @@ argument from_gpu(const argument& arg); ...@@ -25,6 +25,7 @@ argument from_gpu(const argument& arg);
void set_device(std::size_t id); void set_device(std::size_t id);
void gpu_sync(); void gpu_sync();
void gpu_sync(const context& ctx);
void gpu_copy(context& ctx, const argument& src, const argument& dst); void gpu_copy(context& ctx, const argument& src, const argument& dst);
void copy_to_gpu(context& ctx, const argument& src, const argument& dst); void copy_to_gpu(context& ctx, const argument& src, const argument& dst);
...@@ -82,6 +83,33 @@ struct hip_sync_device ...@@ -82,6 +83,33 @@ struct hip_sync_device
} }
}; };
struct hip_sync_stream
{
std::string tag{};
template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.tag, "tag"));
}
std::string name() const { return "hip::sync_stream"; }
shape compute_shape(const std::vector<shape>& inputs) const
{
if(inputs.empty())
return {};
return inputs.front();
}
argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
gpu_sync(ctx);
if(args.empty())
return {};
return args.front();
}
};
struct hip_copy_to_gpu struct hip_copy_to_gpu
{ {
std::string name() const { return "hip::copy_to_gpu"; } std::string name() const { return "hip::copy_to_gpu"; }
......
...@@ -26,7 +26,6 @@ ...@@ -26,7 +26,6 @@
#include <migraphx/gpu/equal.hpp> #include <migraphx/gpu/equal.hpp>
#include <migraphx/gpu/gemm.hpp> #include <migraphx/gpu/gemm.hpp>
#include <migraphx/gpu/greater.hpp> #include <migraphx/gpu/greater.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/gpu/int8_conv_pack.hpp> #include <migraphx/gpu/int8_conv_pack.hpp>
#include <migraphx/gpu/leaky_relu.hpp> #include <migraphx/gpu/leaky_relu.hpp>
#include <migraphx/gpu/less.hpp> #include <migraphx/gpu/less.hpp>
...@@ -188,7 +187,7 @@ struct miopen_apply ...@@ -188,7 +187,7 @@ struct miopen_apply
auto pos = std::next(ins); auto pos = std::next(ins);
auto a = insert_allocation(pos, ins->get_shape()); auto a = insert_allocation(pos, ins->get_shape());
auto c = mod->insert_instruction(pos, hip_copy_to_gpu{}, ins, a); auto c = mod->insert_instruction(pos, make_op("hip::copy_to_gpu"), ins, a);
mod->replace_instruction(ins, c); mod->replace_instruction(ins, c);
} }
...@@ -202,14 +201,14 @@ struct miopen_apply ...@@ -202,14 +201,14 @@ struct miopen_apply
// output with copy output // output with copy output
for(const auto& in : inputs) for(const auto& in : inputs)
{ {
auto p_output = mod->insert_instruction(ret, hip_copy_from_gpu{}, in); auto p_output = mod->insert_instruction(ret, make_op("hip::copy_from_gpu"), in);
instruction::replace_argument(ret, in, p_output); instruction::replace_argument(ret, in, p_output);
} }
} }
// else branch to handle legacy program without the return instruction // else branch to handle legacy program without the return instruction
else else
{ {
mod->add_instruction(hip_copy_from_gpu{}, ret); mod->add_instruction(make_op("hip::copy_from_gpu"), ret);
} }
} }
...@@ -233,7 +232,8 @@ struct miopen_apply ...@@ -233,7 +232,8 @@ struct miopen_apply
// Instruction's output is an input of the ret instruction // Instruction's output is an input of the ret instruction
if(offload_copy) if(offload_copy)
{ {
auto result = mod->insert_instruction(ins, hip_allocate{s, std::move(tag)}); auto result = mod->insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(s)}, {"tag", std::move(tag)}}));
return result; return result;
} }
...@@ -247,7 +247,8 @@ struct miopen_apply ...@@ -247,7 +247,8 @@ struct miopen_apply
return mod->add_parameter("output", s); return mod->add_parameter("output", s);
} }
return mod->insert_instruction(ins, hip_allocate{s, std::move(tag)}); return mod->insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(s)}, {"tag", std::move(tag)}}));
} }
void add_convolution_op() void add_convolution_op()
...@@ -300,9 +301,10 @@ struct miopen_apply ...@@ -300,9 +301,10 @@ struct miopen_apply
auto c_alias = instruction::get_output_alias(refs.back()); auto c_alias = instruction::get_output_alias(refs.back());
if(ins == last or refs.back()->outputs().size() > 1 or c_alias->inputs().empty()) if(ins == last or refs.back()->outputs().size() > 1 or c_alias->inputs().empty())
{ {
auto output = insert_allocation(ins, ins->get_shape()); auto output = insert_allocation(ins, ins->get_shape());
auto copy_out = mod->insert_instruction(ins, hip_copy{}, refs.back(), output); auto copy_out =
refs.back() = copy_out; mod->insert_instruction(ins, make_op("hip::copy"), refs.back(), output);
refs.back() = copy_out;
refs.push_back(copy_out); refs.push_back(copy_out);
} }
else else
...@@ -413,8 +415,9 @@ struct miopen_apply ...@@ -413,8 +415,9 @@ struct miopen_apply
{ {
apply_map.emplace("if", [=](instruction_ref ins) { apply_map.emplace("if", [=](instruction_ref ins) {
std::vector<instruction_ref> inputs = ins->inputs(); std::vector<instruction_ref> inputs = ins->inputs();
auto cpu_cond = mod->insert_instruction(ins, hip_copy_from_gpu{}, inputs.front()); auto cpu_cond =
auto sync_cond = mod->insert_instruction(ins, hip_sync_device{}, cpu_cond); mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.front());
auto sync_cond = mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_cond);
inputs.front() = sync_cond; inputs.front() = sync_cond;
std::vector<module_ref> mod_args = ins->module_inputs(); std::vector<module_ref> mod_args = ins->module_inputs();
...@@ -437,7 +440,8 @@ struct miopen_apply ...@@ -437,7 +440,8 @@ struct miopen_apply
} }
else else
{ {
output = mod->insert_instruction(ins, hip_allocate{s}); output = mod->insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(s)}}));
} }
inputs.push_back(output); inputs.push_back(output);
} }
......
#include <migraphx/gpu/sync_device.hpp> #include <migraphx/gpu/sync_device.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/program.hpp> #include <migraphx/program.hpp>
#include <migraphx/instruction.hpp> #include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/iterator_for.hpp> #include <migraphx/iterator_for.hpp>
namespace migraphx { namespace migraphx {
...@@ -18,7 +18,7 @@ void sync_device::apply(module& p) const ...@@ -18,7 +18,7 @@ void sync_device::apply(module& p) const
return (i->name() == "hip::copy_from_gpu"); return (i->name() == "hip::copy_from_gpu");
})) }))
{ {
auto sync_in = p.insert_instruction(last, hip_sync_device{}, inputs); auto sync_in = p.insert_instruction(last, make_op("hip::sync_stream"), inputs);
if(not inputs.empty()) if(not inputs.empty())
{ {
p.replace_instruction(inputs.front(), sync_in); p.replace_instruction(inputs.front(), sync_in);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment