"git@developer.sourcefind.cn:yangql/googletest.git" did not exist on "2d1835b086e69570e4c3e0ad6197da509bd0a957"
Unverified Commit 885b5f51 authored by Ziyue Yang's avatar Ziyue Yang Committed by GitHub
Browse files

Benchmarks: Microbenchmark - Improve AMD GPU P2P performance with fine-grained GPU memory (#593)

**Description**
Introduce an option for fine-grained GPU memory for AMD GPUs, which should
improve GPU P2P performance.
parent 9d6b3aae
......@@ -243,6 +243,7 @@ or [AMD](https://github.com/ROCm-Developer-Tools/HIP/tree/master/samples/1_Utils
### `gpu-copy-bw`
Measure the memory copy bandwidth performed by GPU SM/DMA engine, including device-to-host, host-to-device and device-to-device.
For measurements of peer-to-peer communication performance between AMD GPUs, GPU memory buffers are allocated in `hipDeviceMallocUncached` (previously `hipDeviceMallocFinegrained`) mode to maximize performance.
#### Metrics
......
......@@ -27,6 +27,13 @@ else()
# link hip device lib
add_executable(gpu_copy gpu_copy.cpp)
include(CheckSymbolExists)
check_symbol_exists("hipDeviceMallocUncached" "hip/hip_runtime_api.h" HIP_UNCACHED_MEMORY)
if(${HIP_UNCACHED_MEMORY})
target_compile_definitions(gpu_copy PRIVATE HIP_UNCACHED_MEMORY)
endif()
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O2")
target_link_libraries(gpu_copy numa hip::device)
else()
......
......@@ -313,6 +313,25 @@ int SetGpu(int gpu_id) {
return 0;
}
#if defined(__HIP_PLATFORM_AMD__)
// Fine-grained (uncached) memory only pays off for true peer-to-peer copies:
// both endpoints must be device-resident buffers on two distinct GPUs.
bool UseFineGrained(const SubBenchArgs &args) {
    const bool both_ends_on_gpu = args.is_src_dev_gpu && args.is_dst_dev_gpu;
    const bool distinct_gpus = (args.src_gpu_id != args.dst_gpu_id);
    return both_ends_on_gpu && distinct_gpus;
}
// Allocate a `size`-byte device data buffer into *ptr.
// With `use_fine_grained` set, allocate via hipExtMallocWithFlags using
// hipDeviceMallocUncached when the HIP runtime exposes it (HIP_UNCACHED_MEMORY
// is detected at configure time via check_symbol_exists), falling back to the
// older hipDeviceMallocFinegrained flag otherwise. Plain cudaMalloc is used
// when fine-grained memory is not requested.
cudaError_t GpuMallocDataBuf(uint8_t **ptr, uint64_t size, bool use_fine_grained) {
    if (!use_fine_grained) {
        return cudaMalloc(ptr, size);
    }
#if defined(HIP_UNCACHED_MEMORY)
    return hipExtMallocWithFlags(reinterpret_cast<void **>(ptr), size, hipDeviceMallocUncached);
#else
    return hipExtMallocWithFlags(reinterpret_cast<void **>(ptr), size, hipDeviceMallocFinegrained);
#endif
}
#else
// Non-AMD platforms: fine-grained mode does not apply; defer to cudaMalloc.
cudaError_t GpuMallocDataBuf(uint8_t **ptr, uint64_t size) { return cudaMalloc(ptr, size); }
#endif
// Prepare data buffers and streams to be used.
int PrepareBufAndStream(BenchArgs *args) {
cudaError_t cuda_err = cudaSuccess;
......@@ -346,7 +365,11 @@ int PrepareBufAndStream(BenchArgs *args) {
return -1;
}
*(host_buf_ptrs[j]) = nullptr;
cuda_err = cudaMalloc(gpu_buf_ptrs[j], args->size);
#if defined(__HIP_PLATFORM_AMD__)
cuda_err = GpuMallocDataBuf(gpu_buf_ptrs[j], args->size, UseFineGrained(sub));
#else
cuda_err = GpuMallocDataBuf(gpu_buf_ptrs[j], args->size);
#endif
if (cuda_err != cudaSuccess) {
fprintf(stderr, "PrepareBufAndStream::cudaMalloc error: %d\n", cuda_err);
return -1;
......@@ -876,7 +899,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
}
// Prepare source buffers
cuda_err = cudaMalloc(&(src_buffers_gpu[rank]), opts.size);
#if defined(__HIP_PLATFORM_AMD__)
cuda_err = GpuMallocDataBuf(&(src_buffers_gpu[rank]), opts.size, true);
#else
cuda_err = GpuMallocDataBuf(&(src_buffers_gpu[rank]), opts.size);
#endif
if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunAllToAllBench::cudaMalloc for src_buffers_gpu[%d] error: %d\n", cuda_err, rank);
return -1;
......@@ -893,7 +920,11 @@ int RunAllToAllBench(const Opts &opts, int gpu_count, int src_rank, int dst_rank
}
// Prepare destination buffers
cuda_err = cudaMalloc(&(dst_buffers_gpu[rank]), opts.size);
#if defined(__HIP_PLATFORM_AMD__)
cuda_err = GpuMallocDataBuf(&(dst_buffers_gpu[rank]), opts.size, true);
#else
cuda_err = GpuMallocDataBuf(&(dst_buffers_gpu[rank]), opts.size);
#endif
if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunAllToAllBench::cudaMalloc for dst_buffers_gpu[%d] error: %d\n", cuda_err, rank);
return -1;
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment