Commit 6787ca76 authored by Ville Pietilä's avatar Ville Pietilä
Browse files

Use pinned host memory for std::vector memory allocations.

parent 2db781e9
gtest-src @ f8d7d77c
Subproject commit f8d7d77c06936315286eb55f8de22cd23c188571
......@@ -8,6 +8,7 @@
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/utility/host_memory_allocator.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
......@@ -408,13 +409,14 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
};
// Argument
struct Argument : public BaseArgument
{
Argument(std::vector<const void*>& p_As,
std::vector<const void*>& p_Bs,
std::vector<std::array<const void*, NumDTensor>>& p_Ds,
std::vector<void*>& p_Es,
std::vector<GemmDesc>& gemm_descs,
Argument(std::vector<const void*, Allocator>& p_As,
std::vector<const void*, Allocator>& p_Bs,
std::vector<std::array<const void*, NumDTensor>, Allocator>& p_Ds,
std::vector<void*, Allocator>& p_Es,
std::vector<GemmDesc, Allocator>& gemm_descs,
AElementwiseOperation a_element_op,
BElementwiseOperation b_element_op,
CDEElementwiseOperation cde_element_op)
......@@ -533,9 +535,9 @@ struct DeviceGroupedGemmMultipleD_Dl : public DeviceGroupedGemm<ALayout,
BElementwiseOperation b_element_op_;
CDEElementwiseOperation cde_element_op_;
std::vector<GemmKernelArg> gemm_desc_kernel_arg_;
std::vector<Tuple<index_t, index_t>> a_mtx_mraw_kraw_;
std::vector<Tuple<index_t, index_t>> b_mtx_nraw_kraw_;
std::vector<GemmKernelArg, ck::memory::PinnedHostMemoryAllocator<GemmKernelArg>> gemm_desc_kernel_arg_;
std::vector<Tuple<index_t, index_t>, ck::memory::PinnedHostMemoryAllocator<Tuple<index_t, index_t>>> a_mtx_mraw_kraw_;
std::vector<Tuple<index_t, index_t>, ck::memory::PinnedHostMemoryAllocator<Tuple<index_t, index_t>>> b_mtx_nraw_kraw_;
index_t grid_size_;
};
......
......@@ -11,6 +11,7 @@
#include "ck/host_utility/device_prop.hpp"
#include "ck/host_utility/kernel_launch.hpp"
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/utility/host_memory_allocator.hpp"
#include "ck/utility/common_header.hpp"
#include <ck/utility/loop_scheduler.hpp>
#include "ck/utility/tuple.hpp"
......@@ -537,7 +538,7 @@ struct DeviceGroupedGemmMultipleDSplitKXdlCShuffleTwoStage
std::vector<std::array<const void*, NumDTensor>>& p_Ds_;
std::vector<std::array<index_t, NumDTensor>> stride_Ds_;
std::vector<GemmTransKernelArg> gemm_kernel_args_;
std::vector<GemmTransKernelArg, ck::memory::PinnedHostMemoryAllocator<GemmTransKernelArg>> gemm_kernel_args_;
std::vector<index_t> group_grid_size_;
std::vector<CGridDesc_M_N> elementwise_c_grid_descs_m_n_;
......
......@@ -8,6 +8,7 @@
#include <sstream>
#include "ck/utility/common_header.hpp"
#include "ck/utility/host_memory_allocator.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
......@@ -496,9 +497,9 @@ struct DeviceGroupedGemm_Xdl : public DeviceGroupedGemm<ALayout,
BElementwiseOperation b_element_op_;
CDEElementwiseOperation c_element_op_;
std::vector<GemmBiasTransKernelArg> gemm_desc_kernel_arg_;
std::vector<Tuple<index_t, index_t>> a_mtx_mraw_kraw_;
std::vector<Tuple<index_t, index_t>> b_mtx_nraw_kraw_;
std::vector<GemmBiasTransKernelArg, ck::memory::PinnedHostMemoryAllocator<GemmBiasTransKernelArg>> gemm_desc_kernel_arg_;
std::vector<Tuple<index_t, index_t>, ck::memory::PinnedHostMemoryAllocator<Tuple<index_t, index_t>>> a_mtx_mraw_kraw_;
std::vector<Tuple<index_t, index_t>, ck::memory::PinnedHostMemoryAllocator<Tuple<index_t, index_t>>> b_mtx_nraw_kraw_;
index_t grid_size_;
};
......
......@@ -12,6 +12,7 @@
#include "ck/host_utility/hip_check_error.hpp"
#include "ck/utility/common_header.hpp"
#include "ck/utility/tuple.hpp"
#include "ck/utility/host_memory_allocator.hpp"
#include "ck/tensor_description/tensor_descriptor.hpp"
#include "ck/tensor_description/tensor_descriptor_helper.hpp"
#include "ck/tensor_operation/gpu/device/tensor_layout.hpp"
......@@ -365,7 +366,7 @@ struct DeviceGroupedGemmXdlSplitKCShuffle : public DeviceGroupedGemmSplitK<ALayo
index_t group_count_;
index_t skipped_group_count_;
std::vector<GemmTransKernelArg> gemm_kernel_args_;
std::vector<GemmTransKernelArg, ck::memory::PinnedHostMemoryAllocator<GemmTransKernelArg>> gemm_kernel_args_;
index_t grid_size_;
};
......
// SPDX-License-Identifier: MIT
// Copyright (c) 2018-2024, Advanced Micro Devices, Inc. All rights reserved.
#pragma once
#include <cstddef>
#include <new>
#include <utility>

#include <hip/hip_runtime.h>

#include "ck/host_utility/hip_check_error.hpp"
namespace ck {
namespace memory {

/// Minimal C++17 Allocator that backs standard containers (std::vector, ...)
/// with HIP pinned (page-locked) host memory via hipHostMalloc/hipHostFree.
/// Pinned allocations enable faster, potentially asynchronous host<->device
/// transfers. The allocator is stateless: all instances are interchangeable.
template <typename T>
struct PinnedHostMemoryAllocator
{
    public:
    using value_type         = T;
    using pointer            = T*;
    using const_pointer      = const T*;
    using void_pointer       = void*;
    using const_void_pointer = const void*;
    using size_type          = std::size_t;
    using difference_type    = std::ptrdiff_t;

    /// Rebind support so containers can allocate their internal node types
    /// with the same pinned-memory strategy.
    template <typename U>
    struct rebind
    {
        using other = PinnedHostMemoryAllocator<U>;
    };

    PinnedHostMemoryAllocator() = default;

    /// Converting constructor required by the Allocator named requirements.
    /// The allocator holds no state, so there is nothing to copy.
    /// NOTE: the original initialized std::allocator<T>(other) in the
    /// mem-initializer list, but std::allocator<T> is not a base class of
    /// this struct — that is ill-formed and fails to compile as soon as the
    /// converting constructor is instantiated (e.g. through rebind).
    template <typename U>
    PinnedHostMemoryAllocator(const PinnedHostMemoryAllocator<U>&) noexcept
    {
    }

    /// Allocate uninitialized pinned host storage for n objects of type T.
    /// Throws (via hip_check_error) if the HIP runtime reports an error.
    T* allocate(std::size_t n)
    {
        T* p = nullptr;
        hip_check_error(hipHostMalloc(&p, n * sizeof(T)));
        return p;
    }

    /// Release pinned host storage previously obtained from allocate().
    void deallocate(T* p, std::size_t)
    {
        hip_check_error(hipHostFree(p));
    }

    /// construct/destroy mirror what std::allocator_traits would synthesize;
    /// kept explicit for container implementations that call them directly.
    /// Global placement-new is selected explicitly via the void* cast so a
    /// class-specific operator new on U cannot interfere.
    template <typename U, typename... Args>
    void construct(U* p, Args&&... args)
    {
        ::new(static_cast<void*>(p)) U(std::forward<Args>(args)...);
    }

    template <typename U>
    void destroy(U* p) noexcept
    {
        p->~U();
    }
};

/// Stateless allocators always compare equal: memory allocated by one
/// instance may be deallocated by any other.
template <typename T, typename U>
bool operator==(const PinnedHostMemoryAllocator<T>&,
                const PinnedHostMemoryAllocator<U>&) noexcept
{
    return true;
}

template <typename T, typename U>
bool operator!=(const PinnedHostMemoryAllocator<T>&,
                const PinnedHostMemoryAllocator<U>&) noexcept
{
    return false;
}

} // namespace memory
} // namespace ck
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment