Commit 6787ca76 authored by Ville Pietilä's avatar Ville Pietilä
Browse files

Use pinned host memory for std::vector memory allocations.

parent 2db781e9
gtest-src @ f8d7d77c
Subproject commit f8d7d77c06936315286eb55f8de22cd23c188571
......@@ -8,6 +8,7 @@
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/utility/host_memory_allocator.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
......@@ -408,13 +409,14 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
};
// Argument
struct Argument : public BaseArgument
{
Argument(std::vector<const void*>& p_As,
std::vector<const void*>& p_Bs,
std::vector<std::array<const void*, NumDTensor>>& p_Ds,
std::vector<void*>& p_Es,
std::vector<GemmDesc>& gemm_descs,
Argument(std::vector<const void*, Allocator>& p_As,
std::vector<const void*, Allocator>& p_Bs,
std::vector<std::array<const void*, NumDTensor>, Allocator>& p_Ds,
std::vector<void*, Allocator>& p_Es,
std::vector<GemmDesc, Allocator>& gemm_descs,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
......@@ -533,9 +535,9 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_;
std::vector<GemmKernelArg> gemm_desc_kernel_arg_;
std::vector<Tuple<index_t, index_t>> a_mtx_mraw_kraw_;
std::vector<Tuple<index_t, index_t>> b_mtx_nraw_kraw_;
std::vector<GemmKernelArg, ck::memory::PinnedHostMemoryAllocator<GemmKernelArg>> gemm_desc_kernel_arg_;
std::vector<Tuple<index_t, index_t>, ck::memory::PinnedHostMemoryAllocator<Tuple<index_t, index_t>>> a_mtx_mraw_kraw_;
std::vector<Tuple<index_t, index_t>, ck::memory::PinnedHostMemoryAllocator<Tuple<index_t, index_t>>> b_mtx_nraw_kraw_;
index_t grid_size_;
};
......
......@@ -11,6 +11,7 @@
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/utility/host_memory_allocator.hpp"
#include "ck/utility/common_header.hpp"
#include <ck/utility/loop_scheduler.hpp>
#include "ck/utility/tuple.hpp"
......@@ -537,7 +538,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
std::vector<std::array<const void*, NumDTensor>>& p_Ds_;
std::vector<std::array<index_t, NumDTensor>> stride_Ds_;
std::vector<GemmTransKernelArg> gemm_kernel_args_;
std::vector<GemmTransKernelArg, ck::memory::PinnedHostMemoryAllocator<GemmTransKernelArg>> gemm_kernel_args_;
std::vector<index_t> group_grid_size_;
std::vector<CGridDesc_M_N> elementwise_c_grid_descs_m_n_;
......
......@@ -8,6 +8,7 @@
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/utility/host_memory_allocator.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
......@@ -496,9 +497,9 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
BElementwiseOperation b_element_op_;
CDEElementwiseOperation c_element_op_;
std::vector<GemmBiasTransKernelArg> gemm_desc_kernel_arg_;
std::vector<Tuple<index_t, index_t>> a_mtx_mraw_kraw_;
std::vector<Tuple<index_t, index_t>> b_mtx_nraw_kraw_;
std::vector<GemmBiasTransKernelArg, ck::memory::PinnedHostMemoryAllocator<GemmBiasTransKernelArg>> gemm_desc_kernel_arg_;
std::vector<Tuple<index_t, index_t>, ck::memory::PinnedHostMemoryAllocator<Tuple<index_t, index_t>>> a_mtx_mraw_kraw_;
std::vector<Tuple<index_t, index_t>, ck::memory::PinnedHostMemoryAllocator<Tuple<index_t, index_t>>> b_mtx_nraw_kraw_;
index_t grid_size_;
};
......
......@@ -12,6 +12,7 @@
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/host_memory_allocator.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
......@@ -365,7 +366,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
index_t group_count_;
index_t skipped_group_count_;
std::vector<GemmTransKernelArg> gemm_kernel_args_;
std::vector<GemmTransKernelArg, ck::memory::PinnedHostMemoryAllocator<GemmTransKernelArg>> gemm_kernel_args_;
index_t grid_size_;
};
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstddef>
#include <new>
#include <utility>

#include <hip/hip_runtime.h>

#include "ck/host_utility/hip_check_error.hpp"
namespace ck {
namespace memory {

/// Minimal C++17 Allocator that backs standard containers (std::vector, ...)
/// with HIP pinned (page-locked) host memory via hipHostMalloc/hipHostFree.
/// Pinned allocations enable faster, potentially asynchronous host<->device
/// transfers. The allocator is stateless: all instances are interchangeable.
template <typename T>
struct PinnedHostMemoryAllocator
{
    public:
    using value_type         = T;
    using pointer            = T*;
    using const_pointer      = const T*;
    using void_pointer       = void*;
    using const_void_pointer = const void*;
    using size_type          = std::size_t;
    using difference_type    = std::ptrdiff_t;

    /// Rebind support so containers can allocate their internal node types
    /// with the same pinned-memory strategy.
    template <typename U>
    struct rebind
    {
        using other = PinnedHostMemoryAllocator<U>;
    };

    PinnedHostMemoryAllocator() = default;

    /// Converting constructor required by the Allocator named requirements.
    /// The allocator holds no state, so there is nothing to copy.
    /// NOTE: the original initialized std::allocator<T>(other) in the
    /// mem-initializer list, but std::allocator<T> is not a base class of
    /// this struct — that is ill-formed and fails to compile as soon as the
    /// converting constructor is instantiated (e.g. through rebind).
    template <typename U>
    PinnedHostMemoryAllocator(const PinnedHostMemoryAllocator<U>&) noexcept
    {
    }

    /// Allocate uninitialized pinned host storage for n objects of type T.
    /// Throws (via hip_check_error) if the HIP runtime reports an error.
    T* allocate(std::size_t n)
    {
        T* p = nullptr;
        hip_check_error(hipHostMalloc(&p, n * sizeof(T)));
        return p;
    }

    /// Release pinned host storage previously obtained from allocate().
    void deallocate(T* p, std::size_t)
    {
        hip_check_error(hipHostFree(p));
    }

    /// construct/destroy mirror what std::allocator_traits would synthesize;
    /// kept explicit for container implementations that call them directly.
    /// Global placement-new is selected explicitly via the void* cast so a
    /// class-specific operator new on U cannot interfere.
    template <typename U, typename... Args>
    void construct(U* p, Args&&... args)
    {
        ::new(static_cast<void*>(p)) U(std::forward<Args>(args)...);
    }

    template <typename U>
    void destroy(U* p) noexcept
    {
        p->~U();
    }
};

/// Stateless allocators always compare equal: memory allocated by one
/// instance may be deallocated by any other.
template <typename T, typename U>
bool operator==(const PinnedHostMemoryAllocator<T>&,
                const PinnedHostMemoryAllocator<U>&) noexcept
{
    return true;
}

template <typename T, typename U>
bool operator!=(const PinnedHostMemoryAllocator<T>&,
                const PinnedHostMemoryAllocator<U>&) noexcept
{
    return false;
}

} // namespace memory
} // namespace ck
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment