Unverified Commit 008e0fe1 authored by Ziyue Yang's avatar Ziyue Yang Committed by GitHub
Browse files

Benchmarks: Add Feature - Add CPU-initiated copy and dtod support to gpu-sm-copy benchmark (#230)

**Description**
This commit does the following:
1) Adds CPU-initiated copy benchmark;
2) Adds dtod benchmark;
3) Supports scanning NUMA nodes and GPUs inside the benchmark program;
4) Changes the name of gpu-sm-copy to gpu-copy.
parent 6a068e25
......@@ -27,6 +27,7 @@ RUN apt-get update && \
jq \
libaio-dev \
libcap2 \
libnuma-dev \
libpci-dev \
libtinfo5 \
libtool \
......
......@@ -27,6 +27,7 @@ RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | APT_KEY_DONT_WARN_ON_D
jq \
libaio-dev \
libcap2 \
libnuma-dev \
libpci-dev \
libtinfo5 \
libtool \
......
......@@ -27,6 +27,7 @@ RUN wget -qO - http://repo.radeon.com/rocm/rocm.gpg.key | APT_KEY_DONT_WARN_ON_D
jq \
libaio-dev \
libcap2 \
libnuma-dev \
libpci-dev \
libtinfo5 \
libtool \
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Micro benchmark example for GPU SM copy bandwidth performance.
"""Micro benchmark example for GPU copy bandwidth performance.
Commands to run:
python3 examples/benchmarks/gpu_sm_copy_bw_performance.py
python3 examples/benchmarks/gpu_copy_bw_performance.py
"""
from superbench.benchmarks import BenchmarkRegistry, Platform
......@@ -12,11 +12,11 @@
if __name__ == '__main__':
context = BenchmarkRegistry.create_benchmark_context(
'gpu-sm-copy-bw', platform=Platform.CUDA, parameters='--mem_type dtoh htod'
'gpu-copy-bw', platform=Platform.CUDA, parameters='--mem_type htod dtoh dtod --copy_type sm dma'
)
# For ROCm environment, please specify the benchmark name and the platform as the following.
# context = BenchmarkRegistry.create_benchmark_context(
# 'gpu-sm-copy-bw', platform=Platform.ROCM, parameters='--mem_type dtoh htod'
# 'gpu-copy-bw', platform=Platform.ROCM, parameters='--mem_type htod dtoh dtod --copy_type sm dma'
# )
benchmark = BenchmarkRegistry.launch_benchmark(context)
......
......@@ -18,7 +18,7 @@
from superbench.benchmarks.micro_benchmarks.cuda_nccl_bw_performance import CudaNcclBwBenchmark
from superbench.benchmarks.micro_benchmarks.rocm_memory_bw_performance import RocmMemBwBenchmark
from superbench.benchmarks.micro_benchmarks.rocm_gemm_flops_performance import RocmGemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_sm_copy_bw_performance import GpuSmCopyBwBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark
from superbench.benchmarks.micro_benchmarks.tcp_connectivity import TCPConnectivityBenchmark
from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark
......@@ -26,5 +26,5 @@
'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsBenchmark', 'CudaGemmFlopsBenchmark', 'MemBwBenchmark',
'CudaMemBwBenchmark', 'DiskBenchmark', 'IBLoopbackBenchmark', 'CudaNcclBwBenchmark', 'RocmMemBwBenchmark',
'RocmGemmFlopsBenchmark', 'GpuSmCopyBwBenchmark', 'TCPConnectivityBenchmark', 'GPCNetBenchmark'
'RocmGemmFlopsBenchmark', 'GpuCopyBwBenchmark', 'TCPConnectivityBenchmark', 'GPCNetBenchmark'
]
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the GPU SM Copy Bandwidth Performance benchmark."""
"""Module of the GPU Copy Bandwidth Performance benchmark."""
import os
......@@ -10,8 +10,8 @@
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
class GpuSmCopyBwBenchmark(MicroBenchmarkWithInvoke):
"""The GPU SM copy bandwidth performance benchmark class."""
class GpuCopyBwBenchmark(MicroBenchmarkWithInvoke):
"""The GPU copy bandwidth performance benchmark class."""
def __init__(self, name, parameters=''):
"""Constructor.
......@@ -21,8 +21,9 @@ def __init__(self, name, parameters=''):
"""
super().__init__(name, parameters)
self._bin_name = 'gpu_sm_copy'
self._mem_types = ['htod', 'dtoh']
self._bin_name = 'gpu_copy'
self._mem_types = ['htod', 'dtoh', 'dtod']
self._copy_types = ['sm', 'dma']
def add_parser_arguments(self):
"""Add the specified arguments."""
......@@ -36,6 +37,14 @@ def add_parser_arguments(self):
help='Memory types for benchmark. E.g. {}.'.format(' '.join(self._mem_types)),
)
self._parser.add_argument(
'--copy_type',
type=str,
nargs='+',
default=self._copy_types,
help='Copy types for benchmark. E.g. {}.'.format(' '.join(self._copy_types)),
)
self._parser.add_argument(
'--size',
type=int,
......@@ -63,10 +72,13 @@ def _preprocess(self):
self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)
args = '--size %d --num_loops %d' % (self._args.size, self._args.num_loops)
for mem_type in self._args.mem_type:
command = '%s 0 %s %d %d' % \
(self.__bin_path, mem_type, self._args.size, self._args.num_loops)
self._commands.append(command)
args += ' --%s' % mem_type
for copy_type in self._args.copy_type:
args += ' --%s_copy' % copy_type
self._commands = ['%s %s' % (self.__bin_path, args)]
return True
......@@ -85,9 +97,10 @@ def _process_raw_result(self, cmd_idx, raw_output):
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output)
try:
output_prefix = 'Bandwidth (GB/s): '
assert (raw_output.startswith(output_prefix))
self._result.add_result(self._args.mem_type[cmd_idx], float(raw_output[len(output_prefix):]))
output_lines = [x.strip() for x in raw_output.strip().splitlines()]
for output_line in output_lines:
tag, bw_str = output_line.split()
self._result.add_result(tag, float(bw_str))
except BaseException as e:
self._result.set_return_code(ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)
logger.error(
......@@ -100,4 +113,4 @@ def _process_raw_result(self, cmd_idx, raw_output):
return True
BenchmarkRegistry.register_benchmark('gpu-sm-copy-bw', GpuSmCopyBwBenchmark)
BenchmarkRegistry.register_benchmark('gpu-copy-bw', GpuCopyBwBenchmark)
......@@ -3,7 +3,7 @@
cmake_minimum_required(VERSION 3.18)
project(gpu_sm_copy LANGUAGES CXX)
project(gpu_copy LANGUAGES CXX)
find_package(CUDAToolkit QUIET)
......@@ -12,9 +12,8 @@ if(CUDAToolkit_FOUND)
message(STATUS "Found CUDA: " ${CUDAToolkit_VERSION})
include(../cuda_common.cmake)
add_executable(gpu_sm_copy gpu_sm_copy.cu)
set_property(TARGET gpu_sm_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
install(TARGETS gpu_sm_copy RUNTIME DESTINATION bin)
add_executable(gpu_copy gpu_copy.cu)
set_property(TARGET gpu_copy PROPERTY CUDA_ARCHITECTURES ${NVCC_ARCHS_SUPPORTED})
else()
# ROCm environment
include(../rocm_common.cmake)
......@@ -23,16 +22,17 @@ else()
message(STATUS "Found ROCm: " ${HIP_VERSION})
# Convert cuda code to hip code inplace
execute_process(COMMAND hipify-perl -inplace -print-stats gpu_sm_copy.cu
execute_process(COMMAND hipify-perl -inplace -print-stats gpu_copy.cu
WORKING_DIRECTORY ${CMAKE_CURRENT_SOURCE_DIR}/)
# Add HIP targets
set_source_files_properties(gpu_sm_copy.cu PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
set_source_files_properties(gpu_copy.cu PROPERTIES HIP_SOURCE_PROPERTY_FORMAT 1)
# Link with HIP
hip_add_executable(gpu_sm_copy gpu_sm_copy.cu)
# Install targets
install(TARGETS gpu_sm_copy RUNTIME DESTINATION bin)
hip_add_executable(gpu_copy gpu_copy.cu)
else()
message(FATAL_ERROR "No CUDA or ROCm environment found.")
endif()
endif()
install(TARGETS gpu_copy RUNTIME DESTINATION bin)
target_link_libraries(gpu_copy numa)
// Copyright (c) Microsoft Corporation.
// Licensed under the MIT License.
// GPU SM copy benchmark tests dtoh/htod data transfer bandwidth initiated by GPU SM.
#include <chrono>
#include <cstdio>
#include <cstring>
#include <string>
#include <cuda.h>
#include <cuda_runtime.h>
// Argument index used in argument parsing.
// kNumArgs doubles as the expected argc (program name occupies index 0).
enum class ArgIdx { kGpuId = 1, kCopyDirection, kSize, kNumLoops, kNumArgs };
// Stored arguments for this program, parsed from the command line in main().
struct Args {
    // ID of GPU used in this benchmark.
    int gpu_id = 0;
    // Data transfer direction, can be "dtoh" or "htod".
    std::string copy_direction;
    // Data buffer size used, in bytes.
    uint64_t size = 0;
    // Number of loops in data transfer benchmark.
    uint64_t num_loops = 0;
};
// Host and device buffers used by the benchmark; allocated in PrepareBuf()
// and released in DestroyBuf().
struct Buffers {
    // Original data buffer (the expected contents after a transfer).
    uint8_t *data_buf = nullptr;
    // Buffer to validate the correctness of data transfer.
    uint8_t *check_buf = nullptr;
    // Data buffer in host memory (registered with cudaHostRegister).
    uint8_t *host_buf = nullptr;
    // Device pointer of the data buffer in host memory (zero-copy mapping).
    uint8_t *host_buf_dev_ptr = nullptr;
    // Data buffer in device memory.
    uint8_t *dev_buf = nullptr;
};
// Print usage of this program.
void PrintUsage() {
    const char *usage_message = "Usage: gpu_sm_copy "
                                "<gpu-id> "
                                "<copy-direction: dtoh|htod> "
                                "<size> "
                                "<num_loops>\n";
    printf("%s", usage_message);
}
// Prepare data buffers to be used.
// Allocates the source/check/host/device buffers, registers the host buffer
// for zero-copy device access, and initializes the copy source according to
// args.copy_direction. Returns 0 on success, -1 on failure.
int PrepareBuf(const Args &args, Buffers *buffers) {
    cudaError_t cuda_err = cudaSuccess;

    // Generate data to copy. The index must be uint64_t: args.size is
    // uint64_t, and an int counter would overflow (undefined behavior) for
    // buffers larger than INT_MAX bytes.
    buffers->data_buf = static_cast<uint8_t *>(malloc(args.size));
    for (uint64_t i = 0; i < args.size; i++) {
        buffers->data_buf[i] = static_cast<uint8_t>(i % 256);
    }

    // Reset check buffer
    buffers->check_buf = static_cast<uint8_t *>(malloc(args.size));
    memset(buffers->check_buf, 0, args.size);

    // Allocate host buffer and map it into the device address space so the
    // SM copy kernel can read/write it directly.
    buffers->host_buf = static_cast<uint8_t *>(malloc(args.size));
    cuda_err = cudaHostRegister(buffers->host_buf, args.size, cudaHostRegisterMapped);
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "PrepareBuf::cudaHostRegister error: %d\n", cuda_err);
        return -1;
    }
    cuda_err = cudaHostGetDevicePointer((void **)(&(buffers->host_buf_dev_ptr)), buffers->host_buf, 0);
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "PrepareBuf::cudaHostGetDevicePointer error: %d\n", cuda_err);
        return -1;
    }

    // Allocate device buffer
    cuda_err = cudaMalloc(&(buffers->dev_buf), args.size);
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "PrepareBuf::cudaMalloc error: %d\n", cuda_err);
        return -1;
    }

    // Initialize the transfer source (device for dtoh, host for htod).
    if (args.copy_direction == "dtoh") {
        cuda_err = cudaMemcpy(buffers->dev_buf, buffers->data_buf, args.size, cudaMemcpyDefault);
    } else if (args.copy_direction == "htod") {
        cuda_err = cudaMemcpy(buffers->host_buf, buffers->data_buf, args.size, cudaMemcpyDefault);
    } else {
        fprintf(stderr, "Unrecognized copy direction: %s\n", args.copy_direction.c_str());
        return -1;
    }
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "PrepareBuf::cudaMemcpy error: %d\n", cuda_err);
        return -1;
    }

    return 0;
}
// Validate the result of data transfer.
// Copies the transfer destination back into check_buf and compares it against
// the original source data. Returns 0 if the contents match, -1 otherwise.
int CheckBuf(const Args &args, Buffers *buffers) {
    cudaError_t copy_err = cudaSuccess;

    // Fetch the transfer destination back to host memory for comparison.
    if (args.copy_direction == "dtoh") {
        copy_err = cudaMemcpy(buffers->check_buf, buffers->host_buf, args.size, cudaMemcpyDefault);
    } else if (args.copy_direction == "htod") {
        copy_err = cudaMemcpy(buffers->check_buf, buffers->dev_buf, args.size, cudaMemcpyDefault);
    }
    if (copy_err != cudaSuccess) {
        fprintf(stderr, "CheckBuf::cudaMemcpy error: %d\n", copy_err);
        return -1;
    }

    // Byte-wise comparison against the original data.
    if (memcmp(buffers->data_buf, buffers->check_buf, args.size) != 0) {
        fprintf(stderr, "Memory check failed\n");
        return -1;
    }

    return 0;
}
// Destroy data buffers.
// Attempts to release every buffer owned by *buffers; returns -1 if any CUDA
// release call failed, 0 otherwise.
int DestroyBuf(Buffers *buffers) {
    int status = 0;
    cudaError_t err = cudaSuccess;

    // Plain host allocations: free(nullptr) is a no-op, so no guard needed.
    free(buffers->data_buf);
    free(buffers->check_buf);

    // Device buffer.
    if (buffers->dev_buf != nullptr) {
        err = cudaFree(buffers->dev_buf);
        if (err != cudaSuccess) {
            fprintf(stderr, "DestroyBuf::cudaFree error: %d\n", err);
            status = -1;
        }
    }

    // Host buffer: unregister the zero-copy mapping before freeing.
    if (buffers->host_buf != nullptr) {
        err = cudaHostUnregister(buffers->host_buf);
        if (err != cudaSuccess) {
            fprintf(stderr, "DestroyBuf::cudaHostUnregister error: %d\n", err);
            status = -1;
        }
        free(buffers->host_buf);
        // The device-side alias is invalid once the host buffer is gone.
        buffers->host_buf_dev_ptr = nullptr;
    }

    return status;
}
// Unroll depth in SM copy kernel
#define NUM_LOOP_UNROLL 2

// Thread block size
#define NUM_THREADS_IN_BLOCK 128

// Fetch a ulong2 from source memory and write to register
// This kernel references the implementation in
// 1) NCCL:
// https://github.com/NVIDIA/nccl/blob/7e515921295adaab72adf56ea71a0fafb0ecb5f3/src/collectives/device/common_kernel.h#L483
// 2) RCCL:
// https://github.com/ROCmSoftwarePlatform/rccl/blob/5c8380ff5b5925cae4bce00b1879a5f930226e8d/src/collectives/device/common_kernel.h#L268
inline __device__ void FetchULong2(ulong2 &v, const ulong2 *p) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    // HIP path: plain field reads; hipify keeps this branch for AMD builds.
    v.x = p->x;
    v.y = p->y;
#else
    // CUDA path: inline PTX 128-bit volatile vector load from global memory.
    asm volatile("ld.volatile.global.v2.u64 {%0,%1}, [%2];" : "=l"(v.x), "=l"(v.y) : "l"(p) : "memory");
#endif
}
// Store a ulong2 from register and write to target memory
// This kernel references the implementation in
// 1) NCCL:
// https://github.com/NVIDIA/nccl/blob/7e515921295adaab72adf56ea71a0fafb0ecb5f3/src/collectives/device/common_kernel.h#L486
// 2) RCCL:
// https://github.com/ROCmSoftwarePlatform/rccl/blob/5c8380ff5b5925cae4bce00b1879a5f930226e8d/src/collectives/device/common_kernel.h#L276
inline __device__ void StoreULong2(ulong2 *p, ulong2 &v) {
#if defined(__HIP_PLATFORM_HCC__) || defined(__HCC__) || defined(__HIPCC__)
    // HIP path: plain field writes; hipify keeps this branch for AMD builds.
    p->x = v.x;
    p->y = v.y;
#else
    // CUDA path: inline PTX 128-bit volatile vector store to global memory.
    asm volatile("st.volatile.global.v2.u64 [%0], {%1,%2};" ::"l"(p), "l"(v.x), "l"(v.y) : "memory");
#endif
}
// Fetch data from source memory into register first, and then write them to target memory
// Stride set to thread block size to best utilize cache
__global__ void SMCopyKernel(ulong2 *tgt, const ulong2 *src) {
    // Each block handles NUM_LOOP_UNROLL * blockDim.x ulong2 elements; within
    // one unroll step, the threads of a block touch consecutive elements.
    uint64_t index = blockIdx.x * blockDim.x * NUM_LOOP_UNROLL + threadIdx.x;
    ulong2 val[NUM_LOOP_UNROLL];
    // Issue all loads before any store so loads and stores are not interleaved.
#pragma unroll
    for (uint64_t i = 0; i < NUM_LOOP_UNROLL; i++)
        FetchULong2(val[i], src + index + i * blockDim.x);
#pragma unroll
    for (uint64_t i = 0; i < NUM_LOOP_UNROLL; i++)
        StoreULong2(tgt + index + i * blockDim.x, val[i]);
}
// Run SM copy kernel benchmark.
// Launches SMCopyKernel args.num_loops times on a dedicated stream, times the
// whole batch with a host-side steady clock, and prints the achieved
// bandwidth in GB/s. Returns 0 on success, -1 on failure.
int BenchSMCopyKernel(const Args &args, Buffers *buffers) {
    cudaError_t cuda_err = cudaSuccess;
    cudaStream_t stream;
    uint8_t *src_buf = nullptr;
    uint8_t *tgt_buf = nullptr;

    // Determine source buffer and target buffer
    if (args.copy_direction == "dtoh") {
        src_buf = buffers->dev_buf;
        tgt_buf = buffers->host_buf_dev_ptr;
    } else {
        src_buf = buffers->host_buf_dev_ptr;
        tgt_buf = buffers->dev_buf;
    }

    // Validate data size: each thread block moves a fixed number of bytes, so
    // the total size must divide evenly into thread blocks.
    uint64_t num_elements_in_thread_block = NUM_LOOP_UNROLL * NUM_THREADS_IN_BLOCK;
    uint64_t num_bytes_in_thread_block = num_elements_in_thread_block * sizeof(ulong2);
    if (args.size % num_bytes_in_thread_block) {
        fprintf(stderr, "Data size should be multiple of %lu\n", num_bytes_in_thread_block);
        return -1;
    }

    // Create stream to launch kernels
    cuda_err = cudaStreamCreate(&stream);
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "BenchSMCopyKernel::cudaStreamCreate error: %d\n", cuda_err);
        return -1;
    }

    // Launch kernels and collect running time.
    // The loop counter must be uint64_t: num_loops is uint64_t, so an int
    // counter would overflow for loop counts above INT_MAX.
    uint64_t num_thread_blocks = args.size / num_bytes_in_thread_block;
    auto start = std::chrono::steady_clock::now();
    for (uint64_t i = 0; i < args.num_loops; i++) {
        SMCopyKernel<<<num_thread_blocks, NUM_THREADS_IN_BLOCK, 0, stream>>>(reinterpret_cast<ulong2 *>(tgt_buf),
                                                                             reinterpret_cast<ulong2 *>(src_buf));
    }
    cuda_err = cudaStreamSynchronize(stream);
    auto end = std::chrono::steady_clock::now();
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "BenchSMCopyKernel::cudaStreamSynchronize error: %d\n", cuda_err);
        // Bug fix: the stream was previously leaked on this error path.
        cudaStreamDestroy(stream);
        return -1;
    }

    // Destroy stream
    cuda_err = cudaStreamDestroy(stream);
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "BenchSMCopyKernel::cudaStreamDestroy error: %d\n", cuda_err);
        return -1;
    }

    // Calculate and display bandwidth if no problem
    double time_in_sec = std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count();
    printf("Bandwidth (GB/s): %g\n", args.size * args.num_loops / time_in_sec / 1e9);

    return 0;
}
// Entry point: parse arguments, set the CUDA device, run the SM copy
// benchmark, validate the transferred data, and clean up. Returns 0 on
// success, -1 on any failure.
int main(int argc, char **argv) {
    int ret = 0;
    int destroy_buf_ret = 0;
    cudaError_t cuda_err = cudaSuccess;

    Args args;
    Buffers buffers;

    // Parse arguments; argc must match the fixed positional layout.
    if (argc != static_cast<int>(ArgIdx::kNumArgs)) {
        PrintUsage();
        return -1;
    }
    args.gpu_id = std::stoi(argv[static_cast<int>(ArgIdx::kGpuId)]);
    args.copy_direction = argv[static_cast<int>(ArgIdx::kCopyDirection)];
    args.size = std::stoul(argv[static_cast<int>(ArgIdx::kSize)]);
    args.num_loops = std::stoul(argv[static_cast<int>(ArgIdx::kNumLoops)]);

    // Set device context
    cuda_err = cudaSetDevice(args.gpu_id);
    if (cuda_err != cudaSuccess) {
        fprintf(stderr, "cudaSetDevice error: %d\n", cuda_err);
        // Bug fix: previously ret was left at 0 here, so a cudaSetDevice
        // failure made the process exit with a success status.
        ret = -1;
        goto destroy_buf;
    }

    // Prepare data buffers
    ret = PrepareBuf(args, &buffers);
    if (ret != 0) {
        goto destroy_buf;
    }

    // Run benchmark
    ret = BenchSMCopyKernel(args, &buffers);
    if (ret != 0) {
        goto destroy_buf;
    }

    // Validate data
    ret = CheckBuf(args, &buffers);

destroy_buf:
    // Destroy buffers; preserve the first failure code if one already occurred.
    destroy_buf_ret = DestroyBuf(&buffers);
    if (ret == 0) {
        ret = destroy_buf_ret;
    }

    return ret;
}
......@@ -70,17 +70,19 @@ superbench:
parallel: no
parameters:
block_devices: []
gpu-sm-copy-bw:
gpu-copy-bw:
enable: true
modes:
- name: local
proc_num: 32
prefix: HIP_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
parallel: no
parameters:
mem_type:
- dtoh
- htod
- dtoh
- dtod
copy_type:
- sm
- dma
ort-models:
enable: false
modes:
......
......@@ -71,17 +71,19 @@ superbench:
parallel: no
parameters:
block_devices: []
gpu-sm-copy-bw:
gpu-copy-bw:
enable: true
modes:
- name: local
proc_num: 32
prefix: HIP_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
parallel: no
parameters:
mem_type:
- dtoh
- htod
- dtoh
- dtod
copy_type:
- sm
- dma
ort-models:
enable: false
modes:
......
......@@ -69,17 +69,19 @@ superbench:
parallel: no
parameters:
block_devices: []
gpu-sm-copy-bw:
gpu-copy-bw:
enable: true
modes:
- name: local
proc_num: 32
prefix: CUDA_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
parallel: no
parameters:
mem_type:
- dtoh
- htod
- dtoh
- dtod
copy_type:
- sm
- dma
cudnn-function:
<<: *default_local_mode
cublas-function:
......
......@@ -63,17 +63,19 @@ superbench:
proc_num: 8
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -c $(({proc_rank}/2))
parallel: yes
gpu-sm-copy-bw:
gpu-copy-bw:
enable: true
modes:
- name: local
proc_num: 32
prefix: CUDA_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
parallel: no
parameters:
mem_type:
- dtoh
- htod
- dtoh
- dtod
copy_type:
- sm
- dma
kernel-launch:
<<: *default_local_mode
gemm-flops:
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for disk-performance benchmark."""
from pathlib import Path
import numbers
import os
import unittest
from tests.helper import decorator
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
class GpuCopyBwBenchmarkTest(unittest.TestCase):
    """Test class for gpu-copy-bw benchmark."""
    def setUp(self):
        """Method called to prepare the test fixture."""
        # Create fake binary file just for testing. An empty executable file
        # is enough for the benchmark's _preprocess() binary lookup to pass.
        os.environ['SB_MICRO_PATH'] = '/tmp/superbench/'
        binary_path = Path(os.getenv('SB_MICRO_PATH'), 'bin')
        binary_path.mkdir(parents=True, exist_ok=True)
        self.__binary_file = binary_path / 'gpu_copy'
        self.__binary_file.touch(mode=0o755, exist_ok=True)

    def tearDown(self):
        """Method called after the test method has been called and the result recorded."""
        self.__binary_file.unlink()

    def _test_gpu_copy_bw_performance_command_generation(self, platform):
        """Test gpu-copy benchmark command generation.

        Args:
            platform (Platform): platform (CUDA or ROCm) to select the benchmark for.
        """
        benchmark_name = 'gpu-copy-bw'
        # Private name-mangled registry lookup; returns the benchmark class
        # registered for this platform.
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, platform)
        assert (benchmark_class)

        size = 1048576
        num_loops = 10000
        mem_types = ['htod', 'dtoh', 'dtod']
        copy_types = ['sm', 'dma']

        parameters = '--mem_type %s --copy_type %s --size %d --num_loops %d' % \
            (' '.join(mem_types), ' '.join(copy_types), size, num_loops)
        benchmark = benchmark_class(benchmark_name, parameters=parameters)

        # Check basic information
        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == benchmark_name)
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check parameters specified in BenchmarkContext.
        assert (benchmark._args.mem_type == mem_types)
        assert (benchmark._args.copy_type == copy_types)
        assert (benchmark._args.size == size)
        assert (benchmark._args.num_loops == num_loops)

        # Check command: a single invocation carrying one flag per mem/copy type.
        assert (1 == len(benchmark._commands))
        assert (benchmark._commands[0].startswith(benchmark._GpuCopyBwBenchmark__bin_path))
        for mem_type in mem_types:
            assert ('--%s' % mem_type in benchmark._commands[0])
        for copy_type in copy_types:
            assert ('--%s_copy' % copy_type in benchmark._commands[0])
        assert ('--size %d' % size in benchmark._commands[0])
        assert ('--num_loops %d' % num_loops in benchmark._commands[0])

    @decorator.cuda_test
    def test_gpu_copy_bw_performance_command_generation_cuda(self):
        """Test gpu-copy benchmark command generation, CUDA case."""
        self._test_gpu_copy_bw_performance_command_generation(Platform.CUDA)

    @decorator.rocm_test
    def test_gpu_copy_bw_performance_command_generation_rocm(self):
        """Test gpu-copy benchmark command generation, ROCm case."""
        self._test_gpu_copy_bw_performance_command_generation(Platform.ROCM)

    def _test_gpu_copy_bw_performance_result_parsing(self, platform):
        """Test gpu-copy benchmark result parsing.

        Args:
            platform (Platform): platform (CUDA or ROCm) to select the benchmark for.
        """
        benchmark_name = 'gpu-copy-bw'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, platform)
        assert (benchmark_class)
        benchmark = benchmark_class(benchmark_name, parameters='')

        assert (benchmark)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == 'gpu-copy-bw')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Positive case - valid raw output: one "<tag> <bandwidth>" pair per line.
        test_raw_output = """
cpu_to_gpu0_by_gpu0_using_sm_under_numa0 26.1755
cpu_to_gpu0_by_gpu0_using_dma_under_numa0 26.1894
gpu0_to_cpu_by_gpu0_using_sm_under_numa0 5.72584
gpu0_to_cpu_by_gpu0_using_dma_under_numa0 26.2623
gpu0_to_gpu0_by_gpu0_using_sm_under_numa0 659.275
gpu0_to_gpu0_by_gpu0_using_dma_under_numa0 636.401
cpu_to_gpu0_by_gpu0_using_sm_under_numa1 26.1589
cpu_to_gpu0_by_gpu0_using_dma_under_numa1 26.18
gpu0_to_cpu_by_gpu0_using_sm_under_numa1 5.07597
gpu0_to_cpu_by_gpu0_using_dma_under_numa1 25.2851
gpu0_to_gpu0_by_gpu0_using_sm_under_numa1 656.825
gpu0_to_gpu0_by_gpu0_using_dma_under_numa1 634.203
"""
        assert (benchmark._process_raw_result(0, test_raw_output))
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (1 == len(benchmark.raw_data))
        # NOTE(review): leftover debug print — consider removing.
        print(test_raw_output.splitlines())
        # Every parsed metric must match the corresponding raw-output line.
        test_raw_output_dict = {x.split()[0]: float(x.split()[1]) for x in test_raw_output.strip().splitlines()}
        assert (len(test_raw_output_dict) == len(benchmark.result))
        for output_key in benchmark.result:
            assert (len(benchmark.result[output_key]) == 1)
            assert (isinstance(benchmark.result[output_key][0], numbers.Number))
            assert (output_key in test_raw_output_dict)
            assert (test_raw_output_dict[output_key] == benchmark.result[output_key][0])

        # Negative case - invalid raw output.
        assert (benchmark._process_raw_result(1, 'Invalid raw output') is False)
        assert (benchmark.return_code == ReturnCode.MICROBENCHMARK_RESULT_PARSING_FAILURE)

    @decorator.cuda_test
    def test_gpu_copy_bw_performance_result_parsing_cuda(self):
        """Test gpu-copy benchmark result parsing, CUDA case."""
        self._test_gpu_copy_bw_performance_result_parsing(Platform.CUDA)

    @decorator.rocm_test
    def test_gpu_copy_bw_performance_result_parsing_rocm(self):
        """Test gpu-copy benchmark result parsing, ROCm case."""
        self._test_gpu_copy_bw_performance_result_parsing(Platform.ROCM)
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for disk-performance benchmark."""
import numbers
from tests.helper import decorator
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
def _test_gpu_sm_copy_bw_performance_impl(platform):
    """Test gpu-sm-copy-bw benchmark.

    Builds the benchmark for the given platform, checks command generation,
    then actually runs the benchmark binary and validates its results.

    Args:
        platform (Platform): platform (CUDA or ROCm) to select the benchmark for.
    """
    benchmark_name = 'gpu-sm-copy-bw'
    # Private name-mangled registry lookup; returns the benchmark class
    # registered for this platform.
    (benchmark_class,
     predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, platform)
    assert (benchmark_class)

    size = 1048576
    num_loops = 10000
    mem_types = ['dtoh', 'htod']

    parameters = '--mem_type %s --size %d --num_loops %d' % \
        (' '.join(mem_types), size, num_loops)
    benchmark = benchmark_class(benchmark_name, parameters=parameters)

    # Check basic information
    assert (benchmark)
    ret = benchmark._preprocess()
    assert (ret is True)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    assert (benchmark.name == benchmark_name)
    assert (benchmark.type == BenchmarkType.MICRO)

    # Check parameters specified in BenchmarkContext.
    assert (benchmark._args.mem_type == mem_types)
    assert (benchmark._args.size == size)
    assert (benchmark._args.num_loops == num_loops)

    # Check and revise command list: one command per memory type, each using
    # the fixed positional argument layout "<bin> 0 <mem_type> <size> <loops>".
    assert (len(mem_types) == len(benchmark._commands))
    for idx, mem_type in enumerate(mem_types):
        assert (
            benchmark._commands[idx] == '%s 0 %s %d %d' %
            (benchmark._GpuSmCopyBwBenchmark__bin_path, mem_type, size, num_loops)
        )

    # Run benchmark (executes the real binary; requires a GPU environment).
    assert (benchmark._benchmark())

    # Check results and metrics.
    assert (benchmark.run_count == 1)
    assert (benchmark.return_code == ReturnCode.SUCCESS)
    for idx, mem_type in enumerate(mem_types):
        raw_output_key = 'raw_output_%d' % idx
        assert (raw_output_key in benchmark.raw_data)
        assert (len(benchmark.raw_data[raw_output_key]) == 1)
        assert (isinstance(benchmark.raw_data[raw_output_key][0], str))

        # Each memory type yields a single numeric bandwidth result.
        output_key = mem_type
        assert (output_key in benchmark.result)
        assert (len(benchmark.result[output_key]) == 1)
        assert (isinstance(benchmark.result[output_key][0], numbers.Number))
@decorator.cuda_test
def test_gpu_sm_copy_bw_performance_cuda():
    """Test gpu-sm-copy-bw benchmark, CUDA case."""
    _test_gpu_sm_copy_bw_performance_impl(Platform.CUDA)
@decorator.rocm_test
def test_gpu_sm_copy_bw_performance_rocm():
    """Test gpu-sm-copy-bw benchmark, ROCm case."""
    _test_gpu_sm_copy_bw_performance_impl(Platform.ROCM)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment