Unverified Commit 6887a000 authored by Paul Fultz II's avatar Paul Fultz II Committed by GitHub
Browse files

Use hipStreamSynchronize instead of device sync (#813)

* Use hipStreamSynchronize instead of device sync

* Formatting

* Suppress FPs

* Use sync_stream instead of device

* Formatting

* Fix python bindings

* Formatting
parent a2e33148
......@@ -191,6 +191,8 @@ rocm_enable_cppcheck(
definePrefix:*test/include/test.hpp
useSmartPointer:*src/api/api.cpp
useSmartPointer:*make_shared_array.hpp
constParameter:*src/targets/gpu/*.cpp
constParameter:*src/targets/gpu/*.hpp
FORCE
INCONCLUSIVE
RULE_FILE
......
......@@ -398,7 +398,7 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
m.def("allocate_gpu", &migraphx::gpu::allocate_gpu, py::arg("s"), py::arg("host") = false);
m.def("to_gpu", &migraphx::gpu::to_gpu, py::arg("arg"), py::arg("host") = false);
m.def("from_gpu", &migraphx::gpu::from_gpu);
m.def("gpu_sync", &migraphx::gpu::gpu_sync);
m.def("gpu_sync", [] { migraphx::gpu::gpu_sync(); });
#endif
#ifdef VERSION_INFO
......
......@@ -15,6 +15,7 @@ namespace gpu {
MIGRAPHX_REGISTER_OP(hip_allocate)
MIGRAPHX_REGISTER_OP(hip_sync_device)
MIGRAPHX_REGISTER_OP(hip_sync_stream)
MIGRAPHX_REGISTER_OP(hip_copy_to_gpu)
MIGRAPHX_REGISTER_OP(hip_copy_from_gpu)
MIGRAPHX_REGISTER_OP(hip_copy)
......@@ -146,6 +147,8 @@ void gpu_sync()
MIGRAPHX_THROW("hip device synchronization failed: " + hip_error(status));
}
// Synchronize only the given context's active stream (rather than the whole
// device); delegates to context::finish().
void gpu_sync(const context& ctx) { ctx.finish(); }
void hip_async_copy(context& ctx, const argument& src, const argument& dst, hipMemcpyKind kind)
{
std::size_t src_size = src.get_shape().bytes();
......
......@@ -97,6 +97,16 @@ struct hip_device
return rbhandle.get();
}
// Block the host until all work queued on this stream has completed.
// A null stream handle means the stream was never created, so there is
// nothing to wait for. setup() is called first — presumably it makes
// this device current before the HIP runtime call (TODO confirm).
// Throws via MIGRAPHX_THROW if hipStreamSynchronize reports an error.
void wait() const
{
if(s == nullptr)
return;
setup();
auto status = hipStreamSynchronize(s.get());
if(status != hipSuccess)
MIGRAPHX_THROW("Failed to wait.");
}
void wait(hipEvent_t event)
{
setup();
......@@ -126,6 +136,10 @@ struct hip_device
stream& get_stream(std::size_t n) { return streams.at(n); }
const stream& get_stream() const { return streams.at(current_stream); }
const stream& get_stream(std::size_t n) const { return streams.at(n); }
void set_stream(std::size_t n) { current_stream = n; }
std::size_t nstreams() const { return streams.size(); }
......@@ -163,9 +177,21 @@ struct context
return *current_device;
}
// Const access to the currently selected device; asserts that a device
// has been set before use (mirrors the non-const overload above).
const hip_device& get_current_device() const
{
assert(current_device != nullptr);
return *current_device;
}
// Stream accessors: all forward to the current device's stream table.
// The no-argument overloads return the device's current stream; the
// indexed overloads return stream n.
hip_device::stream& get_stream() { return get_current_device().get_stream(); }
hip_device::stream& get_stream(std::size_t n) { return get_current_device().get_stream(n); }
const hip_device::stream& get_stream() const { return get_current_device().get_stream(); }
const hip_device::stream& get_stream(std::size_t n) const
{
return get_current_device().get_stream(n);
}
// Select which of the current device's streams subsequent calls use.
void set_stream(std::size_t n) { get_current_device().set_stream(n); }
void create_events(std::size_t num_of_events)
......@@ -177,7 +203,7 @@ struct context
hipEvent_t get_event(std::size_t i) const { return events.at(i).get(); }
std::vector<argument> literals{};
void finish() const { gpu_sync(); }
void finish() const { get_stream().wait(); }
static hip_event_ptr create_event()
{
......
......@@ -25,6 +25,7 @@ argument from_gpu(const argument& arg);
void set_device(std::size_t id);
void gpu_sync();
void gpu_sync(const context& ctx);
void gpu_copy(context& ctx, const argument& src, const argument& dst);
void copy_to_gpu(context& ctx, const argument& src, const argument& dst);
......@@ -82,6 +83,33 @@ struct hip_sync_device
}
};
// Operation that synchronizes the context's stream (via gpu_sync) and
// passes its first input through unchanged, so the wait can be sequenced
// into a program's data flow.
struct hip_sync_stream
{
std::string tag{};

template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.tag, "tag"));
}

std::string name() const { return "hip::sync_stream"; }

// Output shape mirrors the first input; no inputs yields an empty shape.
shape compute_shape(const std::vector<shape>& inputs) const
{
return inputs.empty() ? shape{} : inputs.front();
}

// Wait on the context's stream, then forward the first argument untouched.
argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
gpu_sync(ctx);
return args.empty() ? argument{} : args.front();
}
};
struct hip_copy_to_gpu
{
std::string name() const { return "hip::copy_to_gpu"; }
......
......@@ -26,7 +26,6 @@
#include <migraphx/gpu/equal.hpp>
#include <migraphx/gpu/gemm.hpp>
#include <migraphx/gpu/greater.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/gpu/int8_conv_pack.hpp>
#include <migraphx/gpu/leaky_relu.hpp>
#include <migraphx/gpu/less.hpp>
......@@ -188,7 +187,7 @@ struct miopen_apply
auto pos = std::next(ins);
auto a = insert_allocation(pos, ins->get_shape());
auto c = mod->insert_instruction(pos, hip_copy_to_gpu{}, ins, a);
auto c = mod->insert_instruction(pos, make_op("hip::copy_to_gpu"), ins, a);
mod->replace_instruction(ins, c);
}
......@@ -202,14 +201,14 @@ struct miopen_apply
// output with copy output
for(const auto& in : inputs)
{
auto p_output = mod->insert_instruction(ret, hip_copy_from_gpu{}, in);
auto p_output = mod->insert_instruction(ret, make_op("hip::copy_from_gpu"), in);
instruction::replace_argument(ret, in, p_output);
}
}
// else branch to handle legacy program without the return instruction
else
{
mod->add_instruction(hip_copy_from_gpu{}, ret);
mod->add_instruction(make_op("hip::copy_from_gpu"), ret);
}
}
......@@ -233,7 +232,8 @@ struct miopen_apply
// Instruction's output is an input of the ret instruction
if(offload_copy)
{
auto result = mod->insert_instruction(ins, hip_allocate{s, std::move(tag)});
auto result = mod->insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(s)}, {"tag", std::move(tag)}}));
return result;
}
......@@ -247,7 +247,8 @@ struct miopen_apply
return mod->add_parameter("output", s);
}
return mod->insert_instruction(ins, hip_allocate{s, std::move(tag)});
return mod->insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(s)}, {"tag", std::move(tag)}}));
}
void add_convolution_op()
......@@ -300,9 +301,10 @@ struct miopen_apply
auto c_alias = instruction::get_output_alias(refs.back());
if(ins == last or refs.back()->outputs().size() > 1 or c_alias->inputs().empty())
{
auto output = insert_allocation(ins, ins->get_shape());
auto copy_out = mod->insert_instruction(ins, hip_copy{}, refs.back(), output);
refs.back() = copy_out;
auto output = insert_allocation(ins, ins->get_shape());
auto copy_out =
mod->insert_instruction(ins, make_op("hip::copy"), refs.back(), output);
refs.back() = copy_out;
refs.push_back(copy_out);
}
else
......@@ -413,8 +415,9 @@ struct miopen_apply
{
apply_map.emplace("if", [=](instruction_ref ins) {
std::vector<instruction_ref> inputs = ins->inputs();
auto cpu_cond = mod->insert_instruction(ins, hip_copy_from_gpu{}, inputs.front());
auto sync_cond = mod->insert_instruction(ins, hip_sync_device{}, cpu_cond);
auto cpu_cond =
mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.front());
auto sync_cond = mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_cond);
inputs.front() = sync_cond;
std::vector<module_ref> mod_args = ins->module_inputs();
......@@ -437,7 +440,8 @@ struct miopen_apply
}
else
{
output = mod->insert_instruction(ins, hip_allocate{s});
output = mod->insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(s)}}));
}
inputs.push_back(output);
}
......
#include <migraphx/gpu/sync_device.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/iterator_for.hpp>
namespace migraphx {
......@@ -18,7 +18,7 @@ void sync_device::apply(module& p) const
return (i->name() == "hip::copy_from_gpu");
}))
{
auto sync_in = p.insert_instruction(last, hip_sync_device{}, inputs);
auto sync_in = p.insert_instruction(last, make_op("hip::sync_stream"), inputs);
if(not inputs.empty())
{
p.replace_instruction(inputs.front(), sync_in);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.