Unverified Commit 6887a000 authored by Paul Fultz II's avatar Paul Fultz II Committed by GitHub
Browse files

Use hipStreamSynchronize instead of device sync (#813)

* Use hipStreamSynchronize instead of device sync

* Formatting

* Suppress FPs

* Use sync_stream instead of device

* Formatting

* Fix python bindings

* Formatting
parent a2e33148
......@@ -191,6 +191,8 @@ rocm_enable_cppcheck(
definePrefix:*test/include/test.hpp
useSmartPointer:*src/api/api.cpp
useSmartPointer:*make_shared_array.hpp
constParameter:*src/targets/gpu/*.cpp
constParameter:*src/targets/gpu/*.hpp
FORCE
INCONCLUSIVE
RULE_FILE
......
......@@ -398,7 +398,7 @@ MIGRAPHX_PYBIND11_MODULE(migraphx, m)
m.def("allocate_gpu", &migraphx::gpu::allocate_gpu, py::arg("s"), py::arg("host") = false);
m.def("to_gpu", &migraphx::gpu::to_gpu, py::arg("arg"), py::arg("host") = false);
m.def("from_gpu", &migraphx::gpu::from_gpu);
m.def("gpu_sync", &migraphx::gpu::gpu_sync);
m.def("gpu_sync", [] { migraphx::gpu::gpu_sync(); });
#endif
#ifdef VERSION_INFO
......
......@@ -15,6 +15,7 @@ namespace gpu {
MIGRAPHX_REGISTER_OP(hip_allocate)
MIGRAPHX_REGISTER_OP(hip_sync_device)
MIGRAPHX_REGISTER_OP(hip_sync_stream)
MIGRAPHX_REGISTER_OP(hip_copy_to_gpu)
MIGRAPHX_REGISTER_OP(hip_copy_from_gpu)
MIGRAPHX_REGISTER_OP(hip_copy)
......@@ -146,6 +147,8 @@ void gpu_sync()
MIGRAPHX_THROW("hip device synchronization failed: " + hip_error(status));
}
// Synchronize only the given context's active stream (rather than the whole
// device); delegates to context::finish().
void gpu_sync(const context& ctx) { ctx.finish(); }
void hip_async_copy(context& ctx, const argument& src, const argument& dst, hipMemcpyKind kind)
{
std::size_t src_size = src.get_shape().bytes();
......
......@@ -97,6 +97,16 @@ struct hip_device
return rbhandle.get();
}
// Block the host until all work queued on this stream has completed.
// A null stream handle means the stream was never created, so there is
// nothing to wait for. setup() is called first — presumably it makes
// this device current before the HIP runtime call (TODO confirm).
// Throws via MIGRAPHX_THROW if hipStreamSynchronize reports an error.
void wait() const
{
if(s == nullptr)
return;
setup();
auto status = hipStreamSynchronize(s.get());
if(status != hipSuccess)
MIGRAPHX_THROW("Failed to wait.");
}
void wait(hipEvent_t event)
{
setup();
......@@ -126,6 +136,10 @@ struct hip_device
stream& get_stream(std::size_t n) { return streams.at(n); }
const stream& get_stream() const { return streams.at(current_stream); }
const stream& get_stream(std::size_t n) const { return streams.at(n); }
void set_stream(std::size_t n) { current_stream = n; }
std::size_t nstreams() const { return streams.size(); }
......@@ -163,9 +177,21 @@ struct context
return *current_device;
}
// Const access to the currently selected device; asserts that a device
// has been set before use (mirrors the non-const overload above).
const hip_device& get_current_device() const
{
assert(current_device != nullptr);
return *current_device;
}
// Stream accessors: all forward to the current device's stream table.
// The no-argument overloads return the device's current stream; the
// indexed overloads return stream n.
hip_device::stream& get_stream() { return get_current_device().get_stream(); }
hip_device::stream& get_stream(std::size_t n) { return get_current_device().get_stream(n); }
const hip_device::stream& get_stream() const { return get_current_device().get_stream(); }
const hip_device::stream& get_stream(std::size_t n) const
{
return get_current_device().get_stream(n);
}
// Select which of the current device's streams subsequent calls use.
void set_stream(std::size_t n) { get_current_device().set_stream(n); }
void create_events(std::size_t num_of_events)
......@@ -177,7 +203,7 @@ struct context
hipEvent_t get_event(std::size_t i) const { return events.at(i).get(); }
std::vector<argument> literals{};
void finish() const { gpu_sync(); }
void finish() const { get_stream().wait(); }
static hip_event_ptr create_event()
{
......
......@@ -25,6 +25,7 @@ argument from_gpu(const argument& arg);
void set_device(std::size_t id);
void gpu_sync();
void gpu_sync(const context& ctx);
void gpu_copy(context& ctx, const argument& src, const argument& dst);
void copy_to_gpu(context& ctx, const argument& src, const argument& dst);
......@@ -82,6 +83,33 @@ struct hip_sync_device
}
};
// Operation that synchronizes the context's stream (via gpu_sync) and
// passes its first input through unchanged, so the wait can be sequenced
// into a program's data flow.
struct hip_sync_stream
{
std::string tag{};

template <class Self, class F>
static auto reflect(Self& self, F f)
{
return pack(f(self.tag, "tag"));
}

std::string name() const { return "hip::sync_stream"; }

// Output shape mirrors the first input; no inputs yields an empty shape.
shape compute_shape(const std::vector<shape>& inputs) const
{
return inputs.empty() ? shape{} : inputs.front();
}

// Wait on the context's stream, then forward the first argument untouched.
argument compute(context& ctx, const shape&, const std::vector<argument>& args) const
{
gpu_sync(ctx);
return args.empty() ? argument{} : args.front();
}
};
struct hip_copy_to_gpu
{
std::string name() const { return "hip::copy_to_gpu"; }
......
......@@ -26,7 +26,6 @@
#include <migraphx/gpu/equal.hpp>
#include <migraphx/gpu/gemm.hpp>
#include <migraphx/gpu/greater.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/gpu/int8_conv_pack.hpp>
#include <migraphx/gpu/leaky_relu.hpp>
#include <migraphx/gpu/less.hpp>
......@@ -188,7 +187,7 @@ struct miopen_apply
auto pos = std::next(ins);
auto a = insert_allocation(pos, ins->get_shape());
auto c = mod->insert_instruction(pos, hip_copy_to_gpu{}, ins, a);
auto c = mod->insert_instruction(pos, make_op("hip::copy_to_gpu"), ins, a);
mod->replace_instruction(ins, c);
}
......@@ -202,14 +201,14 @@ struct miopen_apply
// output with copy output
for(const auto& in : inputs)
{
auto p_output = mod->insert_instruction(ret, hip_copy_from_gpu{}, in);
auto p_output = mod->insert_instruction(ret, make_op("hip::copy_from_gpu"), in);
instruction::replace_argument(ret, in, p_output);
}
}
// else branch to handle legacy program without the return instruction
else
{
mod->add_instruction(hip_copy_from_gpu{}, ret);
mod->add_instruction(make_op("hip::copy_from_gpu"), ret);
}
}
......@@ -233,7 +232,8 @@ struct miopen_apply
// Instruction's output is an input of the ret instruction
if(offload_copy)
{
auto result = mod->insert_instruction(ins, hip_allocate{s, std::move(tag)});
auto result = mod->insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(s)}, {"tag", std::move(tag)}}));
return result;
}
......@@ -247,7 +247,8 @@ struct miopen_apply
return mod->add_parameter("output", s);
}
return mod->insert_instruction(ins, hip_allocate{s, std::move(tag)});
return mod->insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(s)}, {"tag", std::move(tag)}}));
}
void add_convolution_op()
......@@ -300,9 +301,10 @@ struct miopen_apply
auto c_alias = instruction::get_output_alias(refs.back());
if(ins == last or refs.back()->outputs().size() > 1 or c_alias->inputs().empty())
{
auto output = insert_allocation(ins, ins->get_shape());
auto copy_out = mod->insert_instruction(ins, hip_copy{}, refs.back(), output);
refs.back() = copy_out;
auto output = insert_allocation(ins, ins->get_shape());
auto copy_out =
mod->insert_instruction(ins, make_op("hip::copy"), refs.back(), output);
refs.back() = copy_out;
refs.push_back(copy_out);
}
else
......@@ -413,8 +415,9 @@ struct miopen_apply
{
apply_map.emplace("if", [=](instruction_ref ins) {
std::vector<instruction_ref> inputs = ins->inputs();
auto cpu_cond = mod->insert_instruction(ins, hip_copy_from_gpu{}, inputs.front());
auto sync_cond = mod->insert_instruction(ins, hip_sync_device{}, cpu_cond);
auto cpu_cond =
mod->insert_instruction(ins, make_op("hip::copy_from_gpu"), inputs.front());
auto sync_cond = mod->insert_instruction(ins, make_op("hip::sync_stream"), cpu_cond);
inputs.front() = sync_cond;
std::vector<module_ref> mod_args = ins->module_inputs();
......@@ -437,7 +440,8 @@ struct miopen_apply
}
else
{
output = mod->insert_instruction(ins, hip_allocate{s});
output = mod->insert_instruction(
ins, make_op("hip::allocate", {{"shape", to_value(s)}}));
}
inputs.push_back(output);
}
......
#include <migraphx/gpu/sync_device.hpp>
#include <migraphx/gpu/hip.hpp>
#include <migraphx/program.hpp>
#include <migraphx/instruction.hpp>
#include <migraphx/make_op.hpp>
#include <migraphx/iterator_for.hpp>
namespace migraphx {
......@@ -18,7 +18,7 @@ void sync_device::apply(module& p) const
return (i->name() == "hip::copy_from_gpu");
}))
{
auto sync_in = p.insert_instruction(last, hip_sync_device{}, inputs);
auto sync_in = p.insert_instruction(last, make_op("hip::sync_stream"), inputs);
if(not inputs.empty())
{
p.replace_instruction(inputs.front(), sync_in);
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment.