Avoid registering host buffer ptr multiple times during hip copies (#1245)

Currently, while copying a host buffer to the device, it first registers/maps the host buffer pointer to address space of the device. If the host buffer has been allocated by the hipHostMalloc then, it is implicitly registered to the device's address space, and no need to register again. This PR adds a check for the same.

Avoid registering host buffer ptr multiple times during hip copies (#1245)
Currently, while copying a host buffer to the device, it first registers/maps the host buffer pointer to address space of the device. If the host buffer has been allocated by the hipHostMalloc then, it is implicitly registered to the device's address space, and no need to register again. This PR adds a check for the same.
7596f3f1 · Umang Yadav · GitHub · afdc3051 · 7596f3f1 · 7596f3f1
Unverified Commit 7596f3f1 authored Jul 29, 2022 by Umang Yadav Committed by GitHub Jul 29, 2022
Show whitespace changes
Inline Side-by-side

Showing with 153 additions and 17 deletions

src/targets/gpu/hip.cpp src/targets/gpu/hip.cpp +54 -17

test/gpu/manage_host_buffer.cpp test/gpu/manage_host_buffer.cpp +99 -0

No files found.
--- a/src/targets/gpu/hip.cpp
+++ b/src/targets/gpu/hip.cpp
@@ -23,13 +23,13 @@
 */
 #include <migraphx/gpu/hip.hpp>
 #include <migraphx/manage_ptr.hpp>
 #include <migraphx/register_op.hpp>
 #include <migraphx/gpu/context.hpp>
 #include <migraphx/gpu/device/contiguous.hpp>
 #include <miopen/miopen.h>
+#include <memory>
+#include <mutex>
 #include <vector>
 namespace migraphx {
@@ -77,12 +77,38 @@ void* get_device_ptr(void* hptr)
    return result;
 }
-hip_ptr allocate_gpu(std::size_t sz, bool host = false)
+struct host_ptr_cache
+{
+    std::unordered_map<void*, std::weak_ptr<void>> cache;
+    std::mutex m;
+    std::shared_ptr<void> get(void* ptr)
+    {
+        std::lock_guard<std::mutex> lock(m);
+        auto it = cache.find(ptr);
+        if(it != cache.end())
+            return it->second.lock();
+        return nullptr;
+    }
+    void put(const std::shared_ptr<void>& p)
+    {
+        std::lock_guard<std::mutex> lock(m);
+        cache[p.get()] = p;
+    }
+};
+static host_ptr_cache& get_host_ptr_cache()
+{
+    static host_ptr_cache cache;
+    return cache;
+}
+std::shared_ptr<void> allocate_gpu(std::size_t sz, bool host = false)
 {
    if(sz > get_available_gpu_memory())
        MIGRAPHX_THROW("Memory not available to allocate buffer: " + std::to_string(sz));
-    void* result = nullptr;
+    void* alloc_ptr = nullptr;
-    auto status  = host ? hipHostMalloc(&result, sz) : hipMalloc(&result, sz);
+    auto status     = host ? hipHostMalloc(&alloc_ptr, sz) : hipMalloc(&alloc_ptr, sz);
    if(status != hipSuccess)
    {
        if(host)
@@ -90,16 +116,28 @@ hip_ptr allocate_gpu(std::size_t sz, bool host = false)
        else
            return allocate_gpu(sz, true);
    }
-    assert(result != nullptr);
+    assert(alloc_ptr != nullptr);
-    return hip_ptr{result};
+    std::shared_ptr<void> result = share(hip_ptr{alloc_ptr});
+    if(host)
+    {
+        get_host_ptr_cache().put(result);
+    }
+    return result;
 }
-hip_host_ptr register_on_gpu(void* ptr, std::size_t sz)
+std::shared_ptr<void> register_on_gpu(void* ptr, std::size_t sz)
 {
+    std::shared_ptr<void> result = get_host_ptr_cache().get(ptr);
+    if(result)
+    {
+        return result;
+    }
    auto status = hipHostRegister(ptr, sz, hipHostRegisterMapped);
    if(status != hipSuccess)
        MIGRAPHX_THROW("Gpu register failed: " + hip_error(status));
-    return hip_host_ptr{ptr};
+    result = share(hip_host_ptr{ptr});
+    get_host_ptr_cache().put(result);
+    return result;
 }
 template <class T>
@@ -115,7 +153,7 @@ std::vector<T> read_from_gpu(const void* x, std::size_t sz)
    return result;
 }
-hip_ptr write_to_gpu(const void* x, std::size_t sz, bool host = false)
+std::shared_ptr<void> write_to_gpu(const void* x, std::size_t sz, bool host = false)
 {
    gpu_sync();
    auto result = allocate_gpu(sz, host);
@@ -137,22 +175,21 @@ hip_ptr write_to_gpu(const T& x)
 argument allocate_gpu(const shape& s, bool host)
 {
-    auto p = share(allocate_gpu(s.bytes() + 1, host));
+    auto p = allocate_gpu(s.bytes() + 1, host);
    return {s, [p]() mutable { return reinterpret_cast<char*>(p.get()); }};
 }
 argument register_on_gpu(const argument& arg)
 {
    auto arg_shared = arg.share();
-    auto p          = share(register_on_gpu(arg_shared.data(), arg_shared.get_shape().bytes()));
+    auto p          = register_on_gpu(arg_shared.data(), arg_shared.get_shape().bytes());
-    return {arg_shared.get_shape(), [p, a = std::move(arg_shared)]() mutable {
+    return {arg_shared.get_shape(),
-                return get_device_ptr(p.get());
+            [p, a = std::move(arg_shared)]() mutable { return get_device_ptr(p.get()); }};
-            }}; // namespace gpu
+}
-} // namespace MIGRAPHX_INLINE_NS
 argument to_gpu(const argument& arg, bool host)
 {
-    auto p = share(write_to_gpu(arg.data(), arg.get_shape().bytes(), host));
+    auto p = write_to_gpu(arg.data(), arg.get_shape().bytes(), host);
    return {arg.get_shape(), p};
 }

--- a/test/gpu/manage_host_buffer.cpp
+++ b/test/gpu/manage_host_buffer.cpp
+/*
+ * The MIT License (MIT)
+ *
+ * Copyright (c) 2015-2022 Advanced Micro Devices, Inc. All rights reserved.
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to deal
+ * in the Software without restriction, including without limitation the rights
+ * to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+ * copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in
+ * all copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
+ * THE SOFTWARE.
+ */
+#include <iostream>
+#include <vector>
+#include <hip/hip_runtime_api.h>
+#include <migraphx/gpu/target.hpp>
+#include <migraphx/verify.hpp>
+#include <test.hpp>
+#include <basic_ops.hpp>
+#include <migraphx/gpu/hip.hpp>
+#include <migraphx/make_op.hpp>
+#define MIGRAPHX_HIP_ASSERT(x) (EXPECT(x == hipSuccess))
+TEST_CASE(host_same_buffer_copy)
+{
+    migraphx::program p;
+    auto* mm = p.get_main_module();
+    migraphx::shape ss{migraphx::shape::float_type, {4, 2}};
+    auto a           = mm->add_parameter("a", ss);
+    auto b           = mm->add_parameter("b", ss);
+    auto aa          = mm->add_instruction(migraphx::make_op("add"), a, a);
+    auto gpu_out     = mm->add_instruction(migraphx::make_op("hip::copy_from_gpu"), aa);
+    auto stream_sync = mm->add_instruction(migraphx::make_op("hip::sync_stream"), gpu_out);
+    auto pass        = mm->add_instruction(unary_pass_op{}, stream_sync);
+    auto alloc       = mm->add_instruction(
+        migraphx::make_op("hip::allocate", {{"shape", migraphx::to_value(ss)}}));
+    auto gpu_in = mm->add_instruction(migraphx::make_op("hip::copy_to_gpu"), pass, alloc);
+    auto aab    = mm->add_instruction(migraphx::make_op("add"), gpu_in, b);
+    mm->add_return({aab});
+    migraphx::parameter_map pp;
+    std::vector<float> a_vec(ss.elements(), -1);
+    std::vector<float> b_vec(ss.elements(), 2);
+    std::vector<float> c_vec(ss.elements(), 0);
+    pp["a"] = migraphx::argument(ss, a_vec.data());
+    pp["b"] = migraphx::argument(ss, b_vec.data());
+    std::vector<float> gpu_result;
+    migraphx::target gpu_t = migraphx::gpu::target{};
+    migraphx::compile_options options;
+    options.offload_copy = true;
+    p.compile(gpu_t, options);
+    auto result = p.eval(pp).back();
+    std::vector<float> results_vector(ss.elements(), -1);
+    result.visit([&](auto output) { results_vector.assign(output.begin(), output.end()); });
+    EXPECT(migraphx::verify_range(c_vec, results_vector));
+}
+TEST_CASE(arguments_lifetime)
+{
+    auto use_on_gpu = [](const migraphx::argument& arg, int c) {
+        auto* arg_ptr = arg.data();
+        MIGRAPHX_HIP_ASSERT(hipSetDevice(0));
+        MIGRAPHX_HIP_ASSERT(hipMemset(arg_ptr, c, arg.get_shape().bytes()));
+        MIGRAPHX_HIP_ASSERT(hipDeviceSynchronize());
+        return;
+    };
+    auto f = [use_on_gpu](const migraphx::argument& input) {
+        auto a = migraphx::gpu::register_on_gpu(input);
+        auto s = a.get_shape();
+        {
+            auto b = migraphx::gpu::register_on_gpu(input);
+            use_on_gpu(b, 0);
+            std::vector<float> expected_b(s.elements(), 0);
+            auto gold = migraphx::argument(s, expected_b.data());
+        }
+        use_on_gpu(a, 1);
+        return true;
+    };
+    migraphx::shape ss{migraphx::shape::float_type, {4, 2}};
+    std::vector<float> x_data(ss.elements(), -1);
+    migraphx::argument x{ss, x_data.data()};
+    EXPECT(f(x));
+}
+int main(int argc, const char* argv[]) { test::run(argc, argv); }