"git@developer.sourcefind.cn:yangql/googletest.git" did not exist on "2d1835b086e69570e4c3e0ad6197da509bd0a957"
Unverified Commit 885b5f51 authored by Ziyue Yang's avatar Ziyue Yang Committed by GitHub
Browse files

Benchmarks: Microbenchmark - Improve AMD GPU P2P performance with fine-grained GPU memory (#593)

**Description**
Introduce an option for fine-grained GPU memory for AMD GPUs, which should
improve GPU P2P performance.
parent 9d6b3aae
......@@ -243,6 +243,7 @@ or [AMD](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/1_Utils
### `gpu-copy-bw`
Measure the memory copy bandwidth performed by GPU SM/DMA engine, including device-to-host, host-to-device and device-to-device.
For measurements of peer-to-peer communication performance between AMD GPUs, GPU memory buffers are allocated in `hipDeviceMallocUncached` (previously `hipDeviceMallocFinegrained`) mode to maximize performance.
#### Metrics
......
......@@ -27,6 +27,13 @@ else()
# link hip device lib
add_executable(gpu_copy gpu_copy.cpp)
include(CheckSymbolExists)
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
if(${HIP_UNCACHED_MEMORY})
target_compile_definitions(gpu_copy PRIVATE HIP_UNCACHED_MEMORY)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
target_link_libraries(gpu_copy numa hip::device)
else()
......
......@@ -313,6 +313,25 @@ int SetGpu(int gpu_id) {
return 0;
}
#if defined(__HIP_PLATFORM_AMD__)
// Fine-grained (uncached) memory only pays off for true peer-to-peer copies:
// both endpoints must be device-resident buffers on two distinct GPUs.
bool UseFineGrained(const SubBenchArgs &args) {
    const bool both_ends_on_gpu = args.is_src_dev_gpu && args.is_dst_dev_gpu;
    const bool distinct_gpus = (args.src_gpu_id != args.dst_gpu_id);
    return both_ends_on_gpu && distinct_gpus;
}
// Allocate a `size`-byte device data buffer into *ptr.
// With `use_fine_grained` set, allocate via hipExtMallocWithFlags using
// hipDeviceMallocUncached when the HIP runtime exposes it (HIP_UNCACHED_MEMORY
// is detected at configure time via check_symbol_exists), falling back to the
// older hipDeviceMallocFinegrained flag otherwise. Plain cudaMalloc is used
// when fine-grained memory is not requested.
cudaError_t GpuMallocDataBuf(uint8_t **ptr, uint64_t size, bool use_fine_grained) {
    if (!use_fine_grained) {
        return cudaMalloc(ptr, size);
    }
#if defined(HIP_UNCACHED_MEMORY)
    return hipExtMallocWithFlags(reinterpret_cast<void **>(ptr), size, hipDeviceMallocUncached);
#else
    return hipExtMallocWithFlags(reinterpret_cast<void **>(ptr), size, hipDeviceMallocFinegrained);
#endif
}
#else
// Non-AMD platforms: fine-grained mode does not apply; defer to cudaMalloc.
cudaError_t GpuMallocDataBuf(uint8_t **ptr, uint64_t size) { return cudaMalloc(ptr, size); }
#endif
// Prepare data buffers and streams to be used.
int PrepareBufAndStream(BenchArgs *args) {
cudaError_t cuda_err = cudaSuccess;
......@@ -346,7 +365,11 @@ int PrepareBufAndStream(BenchArgs *args) {
return -1;
}
*(host_buf_ptrs[j]) = nullptr;
cuda_err = cudaMalloc(gpu_buf_ptrs[j], args->size);
#if defined(__HIP_PLATFORM_AMD__)
cuda_err = GpuMallocDataBuf(gpu_buf_ptrs[j], args->size, UseFineGrained(sub));
#else
cuda_err = GpuMallocDataBuf(gpu_buf_ptrs[j], args->size);
#endif
if (cuda_err != cudaSuccess) {
fprintf(stderr, "PrepareBufAndStream::cudaMalloc error: %d\n", cuda_err);
return -1;
......@@ -876,7 +899,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
}
// Prepare source buffers
cuda_err = cudaMalloc(&(src_buffers_gpu[rank]), opts.size);
#if defined(__HIP_PLATFORM_AMD__)
cuda_err = GpuMallocDataBuf(&(src_buffers_gpu[rank]), opts.size, true);
#else
cuda_err = GpuMallocDataBuf(&(src_buffers_gpu[rank]), opts.size);
#endif
if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunAllToAllBench::cudaMalloc for src_buffers_gpu[%d] error: %d\n", cuda_err, rank);
return -1;
......@@ -893,7 +920,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
}
// Prepare destination buffers
cuda_err = cudaMalloc(&(dst_buffers_gpu[rank]), opts.size);
#if defined(__HIP_PLATFORM_AMD__)
cuda_err = GpuMallocDataBuf(&(dst_buffers_gpu[rank]), opts.size, true);
#else
cuda_err = GpuMallocDataBuf(&(dst_buffers_gpu[rank]), opts.size);
#endif
if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunAllToAllBench::cudaMalloc for dst_buffers_gpu[%d] error: %d\n", cuda_err, rank);
return -1;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment