Unverified Commit 008e0fe1 authored by Ziyue Yang's avatar Ziyue Yang Committed by GitHub
Browse files

Benchmarks: Add Feature - Add CPU-initiated copy and dtod support to gpu-sm-copy benchmark (#230)

**Description**
This commit does the following:
1) Adds CPU-initiated copy benchmark;
2) Adds dtod benchmark;
3) Supports scanning NUMA nodes and GPUs inside the benchmark program;
4) Changes the name of gpu-sm-copy to gpu-copy.
parent 6a068e25
......@@ -27,6 +27,7 @@ RUN apt-get update && \
jq \
libaio-dev \
libcap2 \
libnuma-dev \
libpci-dev \
libtinfo5 \
libtool \
......
......@@ -27,6 +27,7 @@ RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | APT_KEY_DONT_WARN_ON_D
jq \
libaio-dev \
libcap2 \
libnuma-dev \
libpci-dev \
libtinfo5 \
libtool \
......
......@@ -27,6 +27,7 @@ RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | APT_KEY_DONT_WARN_ON_D
jq \
libaio-dev \
libcap2 \
libnuma-dev \
libpci-dev \
libtinfo5 \
libtool \
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Micro benchmark example for GPU SM copy bandwidth performance.
"""Micro benchmark example for GPU copy bandwidth performance.
Commands to run:
python3 examples/benchmarks/gpu_sm_copy_bw_performance.py
python3 examples/benchmarks/gpu_copy_bw_performance.py
"""
from superbench.benchmarks import BenchmarkRegistry, Platform
......@@ -12,11 +12,11 @@
if __name__ == '__main__':
context = BenchmarkRegistry.create_benchmark_context(
'gpu-sm-copy-bw', platform=Platform.CUDA, parameters='--mem_type dtoh htod'
'gpu-copy-bw', platform=Platform.CUDA, parameters='--mem_type htod dtoh dtod --copy_type sm dma'
)
# For ROCm environment, please specify the benchmark name and the platform as the following.
# context = BenchmarkRegistry.create_benchmark_context(
# 'gpu-sm-copy-bw', platform=Platform.ROCM, parameters='--mem_type dtoh htod'
# 'gpu-copy-bw', platform=Platform.ROCM, parameters='--mem_type htod dtoh dtod --copy_type sm dma'
# )
benchmark = BenchmarkRegistry.launch_benchmark(context)
......
......@@ -18,7 +18,7 @@
from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import RocmMemBwBenchmark
from superbench.benchmarks.micro_benchmarks.rocm_gemm_flops_performance import RocmGemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_sm_copy_bw_performance import GpuSmCopyBwBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark
from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark
from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark
......@@ -26,5 +26,5 @@
'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsBenchmark', 'CudaGemmFlopsBenchmark', 'MemBwBenchmark',
'CudaMemBwBenchmark', 'DiskBenchmark', 'IBLoopbackBenchmark', 'CudaNcclBwBenchmark', 'RocmMemBwBenchmark',
'RocmGemmFlopsBenchmark', 'GpuSmCopyBwBenchmark', 'TCPConnectivityBenchmark', 'GPCNetBenchmark'
'RocmGemmFlopsBenchmark', 'GpuCopyBwBenchmark', 'TCPConnectivityBenchmark', 'GPCNetBenchmark'
]
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the GPU SM Copy Bandwidth Performance benchmark."""
"""Module of the GPU Copy Bandwidth Performance benchmark."""
import os
......@@ -10,8 +10,8 @@
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
class GpuSmCopyBwBenchmark(MicroBenchmarkWithInvoke):
"""The GPU SM copy bandwidth performance benchmark class."""
class GpuCopyBwBenchmark(MicroBenchmarkWithInvoke):
"""The GPU copy bandwidth performance benchmark class."""
def __init__(self, name, parameters=''):
"""Constructor.
......@@ -21,8 +21,9 @@ def __init__(self, name, parameters=''):
"""
super().__init__(name, parameters)
self._bin_name = 'gpu_sm_copy'
self._mem_types = ['htod', 'dtoh']
self._bin_name = 'gpu_copy'
self._mem_types = ['htod', 'dtoh', 'dtod']
self._copy_types = ['sm', 'dma']
def add_parser_arguments(self):
"""Add the specified arguments."""
......@@ -36,6 +37,14 @@ def add_parser_arguments(self):
help='Memory types for benchmark. E.g. {}.'.format(' '.join(self._mem_types)),
)
self._parser.add_argument(
'--copy_type',
type=str,
nargs='+',
default=self._copy_types,
help='Copy types for benchmark. E.g. {}.'.format(' '.join(self._copy_types)),
)
self._parser.add_argument(
'--size',
type=int,
......@@ -63,10 +72,13 @@ def _preprocess(self):
self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)
args = '--size %d --num_loops %d' % (self._args.size, self._args.num_loops)
for mem_type in self._args.mem_type:
command = '%s 0 %s %d %d' % \
(self.__bin_path, mem_type, self._args.size, self._args.num_loops)
self._commands.append(command)
args += ' --%s' % mem_type
for copy_type in self._args.copy_type:
args += ' --%s_copy' % copy_type
self._commands = ['%s %s' % (self.__bin_path, args)]
return True
......@@ -85,9 +97,10 @@ def _process_raw_result(self, cmd_idx, raw_output):
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output)
try:
output_prefix = 'Bandwidth (GB/s): '
assert (raw_output.startswith(output_prefix))
self._result.add_result(self._args.mem_type[cmd_idx], float(raw_output[len(output_prefix):]))
output_lines = [x.strip() for x in raw_output.strip().splitlines()]
for output_line in output_lines:
tag, bw_str = output_line.split()
self._result.add_result(tag, float(bw_str))
except BaseException as e:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
logger.error(
......@@ -100,4 +113,4 @@ def _process_raw_result(self, cmd_idx, raw_output):
return True
BenchmarkRegistry.register_benchmark('gpu-sm-copy-bw', GpuSmCopyBwBenchmark)
BenchmarkRegistry.register_benchmark('gpu-copy-bw', GpuCopyBwBenchmark)
......@@ -3,7 +3,7 @@
cmake_minimum_required(VERSION 3.18)
project(gpu_sm_copy LANGUAGES CXX)
project(gpu_copy LANGUAGES CXX)
find_package(CUDAToolkit QUIET)
......@@ -12,9 +12,8 @@ if(CUDAToolkit_FOUND)
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
include(../cuda_common.cmake)
add_executable(gpu_sm_copy gpu_sm_copy.cu)
set_property(TARGET gpu_sm_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
install(TARGETS gpu_sm_copy RUNTIME DESTINATION bin)
add_executable(gpu_copy gpu_copy.cu)
set_property(TARGET gpu_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
else()
# ROCm environment
include(../rocm_common.cmake)
......@@ -23,16 +22,17 @@ else()
message(STATUS "Found ROCm: " ${HIP_VERSION})
# Convert cuda code to hip code inplace
execute_process(COMMAND hipify-perl -inplace -print-stats gpu_sm_copy.cu
execute_process(COMMAND hipify-perl -inplace -print-stats gpu_copy.cu
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# Add HIP targets
set_source_files_properties(gpu_sm_copy.cu PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
set_source_files_properties(gpu_copy.cu PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
# Link with HIP
hip_add_executable(gpu_sm_copy gpu_sm_copy.cu)
# Install targets
install(TARGETS gpu_sm_copy RUNTIME DESTINATION bin)
hip_add_executable(gpu_copy gpu_copy.cu)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
endif()
endif()
install(TARGETS gpu_copy RUNTIME DESTINATION bin)
target_link_libraries(gpu_copy numa)
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
// GPU SM copy benchmark tests dtoh/htod data transfer bandwidth initiated by GPU SM.
#include <chrono>
#include <cstdio>
#include <cstring>
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
// Argument index used in argument parsing.
// kNumArgs doubles as the expected argc (program name occupies index 0).
enum class ArgIdx { kGpuId = 1, kCopyDirection, kSize, kNumLoops, kNumArgs };
// Stored arguments for this program, parsed from the command line in main().
struct Args {
    // ID of GPU used in this benchmark.
    int gpu_id = 0;
    // Data transfer direction, can be "dtoh" or "htod".
    std::string copy_direction;
    // Data buffer size used, in bytes.
    uint64_t size = 0;
    // Number of loops in data transfer benchmark.
    uint64_t num_loops = 0;
};
// Host and device buffers used by the benchmark; allocated in PrepareBuf()
// and released in DestroyBuf().
struct Buffers {
    // Original data buffer (the expected contents after a transfer).
    uint8_t *data_buf = nullptr;
    // Buffer to validate the correctness of data transfer.
    uint8_t *check_buf = nullptr;
    // Data buffer in host memory (registered with cudaHostRegister).
    uint8_t *host_buf = nullptr;
    // Device pointer of the data buffer in host memory (zero-copy mapping).
    uint8_t *host_buf_dev_ptr = nullptr;
    // Data buffer in device memory.
    uint8_t *dev_buf = nullptr;
};
// Print usage of this program.
void PrintUsage() {
    const char *usage_message = "Usage: gpu_sm_copy "
                                "<gpu-id> "
                                "<copy-direction: dtoh|htod> "
                                "<size> "
                                "<num_loops>\n";
    printf("%s", usage_message);
}
// Prepare data buffers to be used.
// Allocates the source/check/host/device buffers, registers the host buffer
// for zero-copy device access, and initializes the copy source according to
// args.copy_direction. Returns 0 on success, -1 on failure.
int PrepareBuf(const Args &args, Buffers *buffers) {
    cudaError_t cuda_err = cudaSuccess;

    // Generate data to copy. The index must be uint64_t: args.size is
    // uint64_t, and an int counter would overflow (undefined behavior) for
    // buffers larger than INT_MAX bytes.
    buffers->data_buf = static_cast<uint8_t *>(malloc(args.size));
    for (uint64_t i = 0; i < args.size; i++) {
        buffers->data_buf[i] = static_cast<uint8_t>(i % 256);
    }

    // Reset check buffer
    buffers->check_buf = static_cast<uint8_t *>(malloc(args.size));
    memset(buffers->check_buf, 0, args.size);

    // Allocate host buffer and map it into the device address space so the
    // SM copy kernel can read/write it directly.
    buffers->host_buf = static_cast<uint8_t *>(malloc(args.size));
    cuda_err = cudaHostRegister(buffers->host_buf, args.size, cudaHostRegisterMapped);
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "PrepareBuf::cudaHostRegister error: %d\n", cuda_err);
        return -1;
    }
    cuda_err = cudaHostGetDevicePointer((void **)(&(buffers->host_buf_dev_ptr)), buffers->host_buf, 0);
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "PrepareBuf::cudaHostGetDevicePointer error: %d\n", cuda_err);
        return -1;
    }

    // Allocate device buffer
    cuda_err = cudaMalloc(&(buffers->dev_buf), args.size);
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "PrepareBuf::cudaMalloc error: %d\n", cuda_err);
        return -1;
    }

    // Initialize the transfer source (device for dtoh, host for htod).
    if (args.copy_direction == "dtoh") {
        cuda_err = cudaMemcpy(buffers->dev_buf, buffers->data_buf, args.size, cudaMemcpyDefault);
    } else if (args.copy_direction == "htod") {
        cuda_err = cudaMemcpy(buffers->host_buf, buffers->data_buf, args.size, cudaMemcpyDefault);
    } else {
        fprintf(stderr, "Unrecognized copy direction: %s\n", args.copy_direction.c_str());
        return -1;
    }
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "PrepareBuf::cudaMemcpy error: %d\n", cuda_err);
        return -1;
    }

    return 0;
}
// Validate the result of data transfer.
// Copies the transfer destination back into check_buf and compares it against
// the original source data. Returns 0 if the contents match, -1 otherwise.
int CheckBuf(const Args &args, Buffers *buffers) {
    cudaError_t copy_err = cudaSuccess;

    // Fetch the transfer destination back to host memory for comparison.
    if (args.copy_direction == "dtoh") {
        copy_err = cudaMemcpy(buffers->check_buf, buffers->host_buf, args.size, cudaMemcpyDefault);
    } else if (args.copy_direction == "htod") {
        copy_err = cudaMemcpy(buffers->check_buf, buffers->dev_buf, args.size, cudaMemcpyDefault);
    }
    if (copy_err != cudaSuccess) {
        fprintf(stderr, "CheckBuf::cudaMemcpy error: %d\n", copy_err);
        return -1;
    }

    // Byte-wise comparison against the original data.
    if (memcmp(buffers->data_buf, buffers->check_buf, args.size) != 0) {
        fprintf(stderr, "Memory check failed\n");
        return -1;
    }

    return 0;
}
// Destroy data buffers.
// Attempts to release every buffer owned by *buffers; returns -1 if any CUDA
// release call failed, 0 otherwise.
int DestroyBuf(Buffers *buffers) {
    int status = 0;
    cudaError_t err = cudaSuccess;

    // Plain host allocations: free(nullptr) is a no-op, so no guard needed.
    free(buffers->data_buf);
    free(buffers->check_buf);

    // Device buffer.
    if (buffers->dev_buf != nullptr) {
        err = cudaFree(buffers->dev_buf);
        if (err != cudaSuccess) {
            fprintf(stderr, "DestroyBuf::cudaFree error: %d\n", err);
            status = -1;
        }
    }

    // Host buffer: unregister the zero-copy mapping before freeing.
    if (buffers->host_buf != nullptr) {
        err = cudaHostUnregister(buffers->host_buf);
        if (err != cudaSuccess) {
            fprintf(stderr, "DestroyBuf::cudaHostUnregister error: %d\n", err);
            status = -1;
        }
        free(buffers->host_buf);
        // The device-side alias is invalid once the host buffer is gone.
        buffers->host_buf_dev_ptr = nullptr;
    }

    return status;
}
// Unroll depth in SM copy kernel
#define NUM_LOOP_UNROLL 2

// Thread block size
#define NUM_THREADS_IN_BLOCK 128

// Fetch a ulong2 from source memory and write to register
// This kernel references the implementation in
// 1) NCCL:
// https://github.com/NVIDIA/nccl/blob/7e515921295adaab72adf56ea71a0fafb0ecb5f3/src/collectives/device/common_kernel.h#L483
// 2) RCCL:
// https://github.com/ROCmSoftwarePlatform/rccl/blob/5c8380ff5b5925cae4bce00b1879a5f930226e8d/src/collectives/device/common_kernel.h#L268
inline __device__ void FetchULong2(ulong2 &v, const ulong2 *p) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    // HIP path: plain field reads; hipify keeps this branch for AMD builds.
    v.x = p->x;
    v.y = p->y;
#else
    // CUDA path: inline PTX 128-bit volatile vector load from global memory.
    asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory");
#endif
}
// Store a ulong2 from register and write to target memory
// This kernel references the implementation in
// 1) NCCL:
// https://github.com/NVIDIA/nccl/blob/7e515921295adaab72adf56ea71a0fafb0ecb5f3/src/collectives/device/common_kernel.h#L486
// 2) RCCL:
// https://github.com/ROCmSoftwarePlatform/rccl/blob/5c8380ff5b5925cae4bce00b1879a5f930226e8d/src/collectives/device/common_kernel.h#L276
inline __device__ void StoreULong2(ulong2 *p, ulong2 &v) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    // HIP path: plain field writes; hipify keeps this branch for AMD builds.
    p->x = v.x;
    p->y = v.y;
#else
    // CUDA path: inline PTX 128-bit volatile vector store to global memory.
    asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(p), "l"(v.x), "l"(v.y) : "memory");
#endif
}
// Fetch data from source memory into register first, and then write them to target memory
// Stride set to thread block size to best utilize cache
__global__ void SMCopyKernel(ulong2 *tgt, const ulong2 *src) {
    // Each block handles NUM_LOOP_UNROLL * blockDim.x ulong2 elements; within
    // one unroll step, the threads of a block touch consecutive elements.
    uint64_t index = blockIdx.x * blockDim.x * NUM_LOOP_UNROLL + threadIdx.x;
    ulong2 val[NUM_LOOP_UNROLL];
    // Issue all loads before any store so loads and stores are not interleaved.
#pragma unroll
    for (uint64_t i = 0; i < NUM_LOOP_UNROLL; i++)
        FetchULong2(val[i], src + index + i * blockDim.x);
#pragma unroll
    for (uint64_t i = 0; i < NUM_LOOP_UNROLL; i++)
        StoreULong2(tgt + index + i * blockDim.x, val[i]);
}
// Run SM copy kernel benchmark.
// Launches SMCopyKernel args.num_loops times on a dedicated stream, times the
// whole batch with a host-side steady clock, and prints the achieved
// bandwidth in GB/s. Returns 0 on success, -1 on failure.
int BenchSMCopyKernel(const Args &args, Buffers *buffers) {
    cudaError_t cuda_err = cudaSuccess;
    cudaStream_t stream;
    uint8_t *src_buf = nullptr;
    uint8_t *tgt_buf = nullptr;

    // Determine source buffer and target buffer
    if (args.copy_direction == "dtoh") {
        src_buf = buffers->dev_buf;
        tgt_buf = buffers->host_buf_dev_ptr;
    } else {
        src_buf = buffers->host_buf_dev_ptr;
        tgt_buf = buffers->dev_buf;
    }

    // Validate data size: each thread block moves a fixed number of bytes, so
    // the total size must divide evenly into thread blocks.
    uint64_t num_elements_in_thread_block = NUM_LOOP_UNROLL * NUM_THREADS_IN_BLOCK;
    uint64_t num_bytes_in_thread_block = num_elements_in_thread_block * sizeof(ulong2);
    if (args.size % num_bytes_in_thread_block) {
        fprintf(stderr, "Data size should be multiple of %lu\n", num_bytes_in_thread_block);
        return -1;
    }

    // Create stream to launch kernels
    cuda_err = cudaStreamCreate(&stream);
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "BenchSMCopyKernel::cudaStreamCreate error: %d\n", cuda_err);
        return -1;
    }

    // Launch kernels and collect running time.
    // The loop counter must be uint64_t: num_loops is uint64_t, so an int
    // counter would overflow for loop counts above INT_MAX.
    uint64_t num_thread_blocks = args.size / num_bytes_in_thread_block;
    auto start = std::chrono::steady_clock::now();
    for (uint64_t i = 0; i < args.num_loops; i++) {
        SMCopyKernel<<<num_thread_blocks, NUM_THREADS_IN_BLOCK, 0, stream>>>(reinterpret_cast<ulong2 *>(tgt_buf),
                                                                             reinterpret_cast<ulong2 *>(src_buf));
    }
    cuda_err = cudaStreamSynchronize(stream);
    auto end = std::chrono::steady_clock::now();
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "BenchSMCopyKernel::cudaStreamSynchronize error: %d\n", cuda_err);
        // Bug fix: the stream was previously leaked on this error path.
        cudaStreamDestroy(stream);
        return -1;
    }

    // Destroy stream
    cuda_err = cudaStreamDestroy(stream);
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "BenchSMCopyKernel::cudaStreamDestroy error: %d\n", cuda_err);
        return -1;
    }

    // Calculate and display bandwidth if no problem
    double time_in_sec = std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count();
    printf("Bandwidth (GB/s): %g\n", args.size * args.num_loops / time_in_sec / 1e9);

    return 0;
}
// Entry point: parse arguments, set the CUDA device, run the SM copy
// benchmark, validate the transferred data, and clean up. Returns 0 on
// success, -1 on any failure.
int main(int argc, char **argv) {
    int ret = 0;
    int destroy_buf_ret = 0;
    cudaError_t cuda_err = cudaSuccess;

    Args args;
    Buffers buffers;

    // Parse arguments; argc must match the fixed positional layout.
    if (argc != static_cast<int>(ArgIdx::kNumArgs)) {
        PrintUsage();
        return -1;
    }
    args.gpu_id = std::stoi(argv[static_cast<int>(ArgIdx::kGpuId)]);
    args.copy_direction = argv[static_cast<int>(ArgIdx::kCopyDirection)];
    args.size = std::stoul(argv[static_cast<int>(ArgIdx::kSize)]);
    args.num_loops = std::stoul(argv[static_cast<int>(ArgIdx::kNumLoops)]);

    // Set device context
    cuda_err = cudaSetDevice(args.gpu_id);
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice error: %d\n", cuda_err);
        // Bug fix: previously ret was left at 0 here, so a cudaSetDevice
        // failure made the process exit with a success status.
        ret = -1;
        goto destroy_buf;
    }

    // Prepare data buffers
    ret = PrepareBuf(args, &buffers);
    if (ret != 0) {
        goto destroy_buf;
    }

    // Run benchmark
    ret = BenchSMCopyKernel(args, &buffers);
    if (ret != 0) {
        goto destroy_buf;
    }

    // Validate data
    ret = CheckBuf(args, &buffers);

destroy_buf:
    // Destroy buffers; preserve the first failure code if one already occurred.
    destroy_buf_ret = DestroyBuf(&buffers);
    if (ret == 0) {
        ret = destroy_buf_ret;
    }

    return ret;
}
......@@ -70,17 +70,19 @@ superbench:
parallel: no
parameters:
block_devices: []
gpu-sm-copy-bw:
gpu-copy-bw:
enable: true
modes:
- name: local
proc_num: 32
prefix: HIP_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
parallel: no
parameters:
mem_type:
- dtoh
- htod
- dtoh
- dtod
copy_type:
- sm
- dma
ort-models:
enable: false
modes:
......
......@@ -71,17 +71,19 @@ superbench:
parallel: no
parameters:
block_devices: []
gpu-sm-copy-bw:
gpu-copy-bw:
enable: true
modes:
- name: local
proc_num: 32
prefix: HIP_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
parallel: no
parameters:
mem_type:
- dtoh
- htod
- dtoh
- dtod
copy_type:
- sm
- dma
ort-models:
enable: false
modes:
......
......@@ -69,17 +69,19 @@ superbench:
parallel: no
parameters:
block_devices: []
gpu-sm-copy-bw:
gpu-copy-bw:
enable: true
modes:
- name: local
proc_num: 32
prefix: CUDA_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
parallel: no
parameters:
mem_type:
- dtoh
- htod
- dtoh
- dtod
copy_type:
- sm
- dma
cudnn-function:
<<: *default_local_mode
cublas-function:
......
......@@ -63,17 +63,19 @@ superbench:
proc_num: 8
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -c $(({proc_rank}/2))
parallel: yes
gpu-sm-copy-bw:
gpu-copy-bw:
enable: true
modes:
- name: local
proc_num: 32
prefix: CUDA_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
parallel: no
parameters:
mem_type:
- dtoh
- htod
- dtoh
- dtod
copy_type:
- sm
- dma
kernel-launch:
<<: *default_local_mode
gemm-flops:
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for disk-performance benchmark."""
from pathlib import Path
import numbers
import os
import unittest
from tests.helper import decorator
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
class GpuCopyBwBenchmarkTest(unittest.TestCase):
    """Test class for gpu-copy-bw benchmark."""
    def setUp(self):
        """Method called to prepare the test fixture."""
        # Create fake binary file just for testing. An empty executable file
        # is enough for the benchmark's _preprocess() binary lookup to pass.
        os.environ['SB_MICRO_PATH'] = '/tmp/superbench/'
        binary_path = Path(os.getenv('SB_MICRO_PATH'), 'bin')
        binary_path.mkdir(parents=True, exist_ok=True)
        self.__binary_file = binary_path / 'gpu_copy'
        self.__binary_file.touch(mode=0o755, exist_ok=True)

    def tearDown(self):
        """Method called after the test method has been called and the result recorded."""
        self.__binary_file.unlink()

    def _test_gpu_copy_bw_performance_command_generation(self, platform):
        """Test gpu-copy benchmark command generation.

        Args:
            platform (Platform): platform (CUDA or ROCm) to select the benchmark for.
        """
        benchmark_name = 'gpu-copy-bw'
        # Private name-mangled registry lookup; returns the benchmark class
        # registered for this platform.
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, platform)
        assert (benchmark_class)

        size = 1048576
        num_loops = 10000
        mem_types = ['htod', 'dtoh', 'dtod']
        copy_types = ['sm', 'dma']

        parameters = '--mem_type %s --copy_type %s --size %d --num_loops %d' % \
            (' '.join(mem_types), ' '.join(copy_types), size, num_loops)
        benchmark = benchmark_class(benchmark_name, parameters=parameters)

        # Check basic information
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == benchmark_name)
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check parameters specified in BenchmarkContext.
        assert (benchmark._args.mem_type == mem_types)
        assert (benchmark._args.copy_type == copy_types)
        assert (benchmark._args.size == size)
        assert (benchmark._args.num_loops == num_loops)

        # Check command: a single invocation carrying one flag per mem/copy type.
        assert (1 == len(benchmark._commands))
        assert (benchmark._commands[0].startswith(benchmark._GpuCopyBwBenchmark__bin_path))
        for mem_type in mem_types:
            assert ('--%s' % mem_type in benchmark._commands[0])
        for copy_type in copy_types:
            assert ('--%s_copy' % copy_type in benchmark._commands[0])
        assert ('--size %d' % size in benchmark._commands[0])
        assert ('--num_loops %d' % num_loops in benchmark._commands[0])

    @decorator.cuda_test
    def test_gpu_copy_bw_performance_command_generation_cuda(self):
        """Test gpu-copy benchmark command generation, CUDA case."""
        self._test_gpu_copy_bw_performance_command_generation(Platform.CUDA)

    @decorator.rocm_test
    def test_gpu_copy_bw_performance_command_generation_rocm(self):
        """Test gpu-copy benchmark command generation, ROCm case."""
        self._test_gpu_copy_bw_performance_command_generation(Platform.ROCM)

    def _test_gpu_copy_bw_performance_result_parsing(self, platform):
        """Test gpu-copy benchmark result parsing.

        Args:
            platform (Platform): platform (CUDA or ROCm) to select the benchmark for.
        """
        benchmark_name = 'gpu-copy-bw'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, platform)
        assert (benchmark_class)
        benchmark = benchmark_class(benchmark_name, parameters='')

        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == 'gpu-copy-bw')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Positive case - valid raw output: one "<tag> <bandwidth>" pair per line.
        test_raw_output = """
cpu_to_gpu0_by_gpu0_using_sm_under_numa0 26.1755
cpu_to_gpu0_by_gpu0_using_dma_under_numa0 26.1894
gpu0_to_cpu_by_gpu0_using_sm_under_numa0 5.72584
gpu0_to_cpu_by_gpu0_using_dma_under_numa0 26.2623
gpu0_to_gpu0_by_gpu0_using_sm_under_numa0 659.275
gpu0_to_gpu0_by_gpu0_using_dma_under_numa0 636.401
cpu_to_gpu0_by_gpu0_using_sm_under_numa1 26.1589
cpu_to_gpu0_by_gpu0_using_dma_under_numa1 26.18
gpu0_to_cpu_by_gpu0_using_sm_under_numa1 5.07597
gpu0_to_cpu_by_gpu0_using_dma_under_numa1 25.2851
gpu0_to_gpu0_by_gpu0_using_sm_under_numa1 656.825
gpu0_to_gpu0_by_gpu0_using_dma_under_numa1 634.203
"""
        assert (benchmark._process_raw_result(0, test_raw_output))
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (1 == len(benchmark.raw_data))
        # NOTE(review): leftover debug print — consider removing.
        print(test_raw_output.splitlines())
        # Every parsed metric must match the corresponding raw-output line.
        test_raw_output_dict = {x.split()[0]: float(x.split()[1]) for x in test_raw_output.strip().splitlines()}
        assert (len(test_raw_output_dict) == len(benchmark.result))
        for output_key in benchmark.result:
            assert (len(benchmark.result[output_key]) == 1)
            assert (isinstance(benchmark.result[output_key][0], numbers.Number))
            assert (output_key in test_raw_output_dict)
            assert (test_raw_output_dict[output_key] == benchmark.result[output_key][0])

        # Negative case - invalid raw output.
        assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
        assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)

    @decorator.cuda_test
    def test_gpu_copy_bw_performance_result_parsing_cuda(self):
        """Test gpu-copy benchmark result parsing, CUDA case."""
        self._test_gpu_copy_bw_performance_result_parsing(Platform.CUDA)

    @decorator.rocm_test
    def test_gpu_copy_bw_performance_result_parsing_rocm(self):
        """Test gpu-copy benchmark result parsing, ROCm case."""
        self._test_gpu_copy_bw_performance_result_parsing(Platform.ROCM)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for disk-performance benchmark."""
import numbers
from tests.helper import decorator
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
def _test_gpu_sm_copy_bw_performance_impl(platform):
    """Test gpu-sm-copy-bw benchmark.

    Builds the benchmark for the given platform, checks command generation,
    then actually runs the benchmark binary and validates its results.

    Args:
        platform (Platform): platform (CUDA or ROCm) to select the benchmark for.
    """
    benchmark_name = 'gpu-sm-copy-bw'
    # Private name-mangled registry lookup; returns the benchmark class
    # registered for this platform.
    (benchmark_class,
     predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, platform)
    assert (benchmark_class)

    size = 1048576
    num_loops = 10000
    mem_types = ['dtoh', 'htod']

    parameters = '--mem_type %s --size %d --num_loops %d' % \
        (' '.join(mem_types), size, num_loops)
    benchmark = benchmark_class(benchmark_name, parameters=parameters)

    # Check basic information
    assert (benchmark)
    ret = benchmark._preprocess()
    assert (ret is True)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert (benchmark.name == benchmark_name)
    assert (benchmark.type == BenchmarkType.MICRO)

    # Check parameters specified in BenchmarkContext.
    assert (benchmark._args.mem_type == mem_types)
    assert (benchmark._args.size == size)
    assert (benchmark._args.num_loops == num_loops)

    # Check and revise command list: one command per memory type, each using
    # the fixed positional argument layout "<bin> 0 <mem_type> <size> <loops>".
    assert (len(mem_types) == len(benchmark._commands))
    for idx, mem_type in enumerate(mem_types):
        assert (
            benchmark._commands[idx] == '%s 0 %s %d %d' %
            (benchmark._GpuSmCopyBwBenchmark__bin_path, mem_type, size, num_loops)
        )

    # Run benchmark (executes the real binary; requires a GPU environment).
    assert (benchmark._benchmark())

    # Check results and metrics.
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    for idx, mem_type in enumerate(mem_types):
        raw_output_key = 'raw_output_%d' % idx
        assert (raw_output_key in benchmark.raw_data)
        assert (len(benchmark.raw_data[raw_output_key]) == 1)
        assert (isinstance(benchmark.raw_data[raw_output_key][0], str))

        # Each memory type yields a single numeric bandwidth result.
        output_key = mem_type
        assert (output_key in benchmark.result)
        assert (len(benchmark.result[output_key]) == 1)
        assert (isinstance(benchmark.result[output_key][0], numbers.Number))
@decorator.cuda_test
def test_gpu_sm_copy_bw_performance_cuda():
    """Test gpu-sm-copy-bw benchmark, CUDA case."""
    _test_gpu_sm_copy_bw_performance_impl(Platform.CUDA)
@decorator.rocm_test
def test_gpu_sm_copy_bw_performance_rocm():
    """Test gpu-sm-copy-bw benchmark, ROCm case."""
    _test_gpu_sm_copy_bw_performance_impl(Platform.ROCM)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment