Unverified Commit 85389055 authored by Ziyue Yang's avatar Ziyue Yang Committed by GitHub
Browse files

Benchmarks: Revise Code - Reduce result variance in gpu_copy benchmark (#298)

**Description**
This commit reduces result variance in the gpu_copy benchmark as follows:
1) Add a warm-up phase to the gpu_copy benchmark to avoid timing instability caused by first-time CUDA kernel launch overhead;
2) Use CUDA events for timing instead of CPU timestamps;
3) Make data checking optional, since enabling it is not recommended in performance tests;
4) Enlarge the message size in the performance benchmark.
parent 28195be6
...@@ -48,11 +48,19 @@ def add_parser_arguments(self): ...@@ -48,11 +48,19 @@ def add_parser_arguments(self):
self._parser.add_argument( self._parser.add_argument(
'--size', '--size',
type=int, type=int,
default=64 * 1024**2, default=256 * 1024**2,
required=False, required=False,
help='Size of data buffer in bytes.', help='Size of data buffer in bytes.',
) )
self._parser.add_argument(
'--num_warm_up',
type=int,
default=20,
required=False,
help='Number of warm up rounds',
)
self._parser.add_argument( self._parser.add_argument(
'--num_loops', '--num_loops',
type=int, type=int,
...@@ -78,7 +86,9 @@ def _preprocess(self): ...@@ -78,7 +86,9 @@ def _preprocess(self):
self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name) self.__bin_path = os.path.join(self._args.bin_dir, self._bin_name)
args = '--size %d --num_loops %d' % (self._args.size, self._args.num_loops) args = '--size %d --num_warm_up %d --num_loops %d' % (
self._args.size, self._args.num_warm_up, self._args.num_loops
)
for mem_type in self._args.mem_type: for mem_type in self._args.mem_type:
args += ' --%s' % mem_type args += ' --%s' % mem_type
for copy_type in self._args.copy_type: for copy_type in self._args.copy_type:
......
...@@ -3,7 +3,6 @@ ...@@ -3,7 +3,6 @@
// GPU copy benchmark tests dtoh/htod/dtod data transfer bandwidth by GPU SM/DMA. // GPU copy benchmark tests dtoh/htod/dtod data transfer bandwidth by GPU SM/DMA.
#include <chrono>
#include <cstdio> #include <cstdio>
#include <cstring> #include <cstring>
#include <string> #include <string>
...@@ -52,6 +51,12 @@ struct SubBenchArgs { ...@@ -52,6 +51,12 @@ struct SubBenchArgs {
// CUDA stream to be used. // CUDA stream to be used.
cudaStream_t stream; cudaStream_t stream;
// CUDA event to record start time.
cudaEvent_t start_event;
// CUDA event to record end time.
cudaEvent_t end_event;
}; };
// Arguments for each benchmark run. // Arguments for each benchmark run.
...@@ -69,6 +74,9 @@ struct BenchArgs { ...@@ -69,6 +74,9 @@ struct BenchArgs {
// Data buffer size used. // Data buffer size used.
uint64_t size = 0; uint64_t size = 0;
// Number of warm up rounds to run.
uint64_t num_warm_up = 0;
// Number of loops to run. // Number of loops to run.
uint64_t num_loops = 0; uint64_t num_loops = 0;
...@@ -82,10 +90,13 @@ struct BenchArgs { ...@@ -82,10 +90,13 @@ struct BenchArgs {
// Options accepted by this program. // Options accepted by this program.
struct Opts { struct Opts {
// Data buffer size for copy benchmark. // Data buffer size for copy benchmark.
uint64_t size; uint64_t size = 0;
// Data buffer size for copy benchmark. // Number of warm up rounds to run.
uint64_t num_loops; uint64_t num_warm_up = 0;
// Number of loops to run.
uint64_t num_loops = 0;
// Whether GPU SM copy needs to be evaluated. // Whether GPU SM copy needs to be evaluated.
bool sm_copy_enabled = false; bool sm_copy_enabled = false;
...@@ -110,6 +121,7 @@ struct Opts { ...@@ -110,6 +121,7 @@ struct Opts {
void PrintUsage() { void PrintUsage() {
printf("Usage: gpu_copy " printf("Usage: gpu_copy "
"--size <size> " "--size <size> "
"--num_warm_up <num_warm_up> "
"--num_loops <num_loops> " "--num_loops <num_loops> "
"[--sm_copy] " "[--sm_copy] "
"[--dma_copy] " "[--dma_copy] "
...@@ -123,7 +135,8 @@ void PrintUsage() { ...@@ -123,7 +135,8 @@ void PrintUsage() {
int ParseOpts(int argc, char **argv, Opts *opts) { int ParseOpts(int argc, char **argv, Opts *opts) {
enum class OptIdx { enum class OptIdx {
kSize, kSize,
kNumIters, kNumWarmUp,
kNumLoops,
kEnableSmCopy, kEnableSmCopy,
kEnableDmaCopy, kEnableDmaCopy,
kEnableHToD, kEnableHToD,
...@@ -133,7 +146,8 @@ int ParseOpts(int argc, char **argv, Opts *opts) { ...@@ -133,7 +146,8 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
}; };
const struct option options[] = { const struct option options[] = {
{"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)}, {"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)},
{"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumIters)}, {"num_warm_up", required_argument, nullptr, static_cast<int>(OptIdx::kNumWarmUp)},
{"num_loops", required_argument, nullptr, static_cast<int>(OptIdx::kNumLoops)},
{"sm_copy", no_argument, nullptr, static_cast<int>(OptIdx::kEnableSmCopy)}, {"sm_copy", no_argument, nullptr, static_cast<int>(OptIdx::kEnableSmCopy)},
{"dma_copy", no_argument, nullptr, static_cast<int>(OptIdx::kEnableDmaCopy)}, {"dma_copy", no_argument, nullptr, static_cast<int>(OptIdx::kEnableDmaCopy)},
{"htod", no_argument, nullptr, static_cast<int>(OptIdx::kEnableHToD)}, {"htod", no_argument, nullptr, static_cast<int>(OptIdx::kEnableHToD)},
...@@ -143,12 +157,13 @@ int ParseOpts(int argc, char **argv, Opts *opts) { ...@@ -143,12 +157,13 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
int getopt_ret = 0; int getopt_ret = 0;
int opt_idx = 0; int opt_idx = 0;
bool size_specified = false; bool size_specified = false;
bool num_warm_up_specified = false;
bool num_loops_specified = false; bool num_loops_specified = false;
bool parse_err = false; bool parse_err = false;
while (true) { while (true) {
getopt_ret = getopt_long(argc, argv, "", options, &opt_idx); getopt_ret = getopt_long(argc, argv, "", options, &opt_idx);
if (getopt_ret == -1) { if (getopt_ret == -1) {
if (!size_specified || !num_loops_specified) { if (!size_specified || !num_warm_up_specified || !num_loops_specified) {
parse_err = true; parse_err = true;
} }
break; break;
...@@ -165,7 +180,15 @@ int ParseOpts(int argc, char **argv, Opts *opts) { ...@@ -165,7 +180,15 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
size_specified = true; size_specified = true;
} }
break; break;
case static_cast<int>(OptIdx::kNumIters): case static_cast<int>(OptIdx::kNumWarmUp):
if (1 != sscanf(optarg, "%lu", &(opts->num_warm_up))) {
fprintf(stderr, "Invalid num_warm_up: %s\n", optarg);
parse_err = true;
} else {
num_warm_up_specified = true;
}
break;
case static_cast<int>(OptIdx::kNumLoops):
if (1 != sscanf(optarg, "%lu", &(opts->num_loops))) { if (1 != sscanf(optarg, "%lu", &(opts->num_loops))) {
fprintf(stderr, "Invalid num_loops: %s\n", optarg); fprintf(stderr, "Invalid num_loops: %s\n", optarg);
parse_err = true; parse_err = true;
...@@ -306,6 +329,28 @@ int PrepareBufAndStream(BenchArgs *args) { ...@@ -306,6 +329,28 @@ int PrepareBufAndStream(BenchArgs *args) {
return 0; return 0;
} }
// Create the start/end timing events of every sub-benchmark.
// For each entry in args->subs, the worker GPU is selected through SetGpu()
// and then start_event and end_event are created, in that order.
// Returns 0 on success, or -1 as soon as SetGpu() or cudaEventCreate() fails.
int PrepareEvent(BenchArgs *args) {
    for (int sub_idx = 0; sub_idx < args->num_subs; sub_idx++) {
        SubBenchArgs &cur = args->subs[sub_idx];
        if (SetGpu(cur.worker_gpu_id)) {
            return -1;
        }
        // Both events are created identically, so handle them in one loop.
        cudaEvent_t *const events_to_create[] = {&(cur.start_event), &(cur.end_event)};
        for (cudaEvent_t *event : events_to_create) {
            const cudaError_t status = cudaEventCreate(event);
            if (status != cudaSuccess) {
                fprintf(stderr, "PrepareEvent::cudaEventCreate error: %d\n", status);
                return -1;
            }
        }
    }
    return 0;
}
// Validate the result of data transfer. // Validate the result of data transfer.
int CheckBuf(BenchArgs *args) { int CheckBuf(BenchArgs *args) {
cudaError_t cuda_err = cudaSuccess; cudaError_t cuda_err = cudaSuccess;
...@@ -399,7 +444,7 @@ int DestroyBufAndStream(BenchArgs *args) { ...@@ -399,7 +444,7 @@ int DestroyBufAndStream(BenchArgs *args) {
} }
cuda_err = cudaStreamDestroy(sub.stream); cuda_err = cudaStreamDestroy(sub.stream);
if (cuda_err != cudaSuccess) { if (cuda_err != cudaSuccess) {
fprintf(stderr, "DestoryBufAndStream::cudaStreamDestroy error: %d\n", cuda_err); fprintf(stderr, "DestroyBufAndStream::cudaStreamDestroy error: %d\n", cuda_err);
return -1; return -1;
} }
} }
...@@ -407,6 +452,28 @@ int DestroyBufAndStream(BenchArgs *args) { ...@@ -407,6 +452,28 @@ int DestroyBufAndStream(BenchArgs *args) {
return ret; return ret;
} }
// Destroy the start/end timing events of every sub-benchmark.
// For each entry in args->subs, the worker GPU is selected through SetGpu()
// and then start_event and end_event are destroyed, in that order.
// Returns 0 on success, or -1 as soon as SetGpu() or cudaEventDestroy() fails.
int DestroyEvent(BenchArgs *args) {
    for (int sub_idx = 0; sub_idx < args->num_subs; sub_idx++) {
        SubBenchArgs &cur = args->subs[sub_idx];
        if (SetGpu(cur.worker_gpu_id)) {
            return -1;
        }
        // Both events are released identically, so handle them in one loop.
        cudaEvent_t *const events_to_destroy[] = {&(cur.start_event), &(cur.end_event)};
        for (cudaEvent_t *event : events_to_destroy) {
            const cudaError_t status = cudaEventDestroy(*event);
            if (status != cudaSuccess) {
                fprintf(stderr, "DestroyEvent::cudaEventDestroy error: %d\n", status);
                return -1;
            }
        }
    }
    return 0;
}
// Unroll depth in SM copy kernel // Unroll depth in SM copy kernel
#define NUM_LOOP_UNROLL 2 #define NUM_LOOP_UNROLL 2
...@@ -502,20 +569,37 @@ int RunCopy(BenchArgs *args) { ...@@ -502,20 +569,37 @@ int RunCopy(BenchArgs *args) {
} }
// Launch jobs and collect running time // Launch jobs and collect running time
auto start = std::chrono::steady_clock::now(); for (int i = 0; i < args->num_loops + args->num_warm_up; i++) {
for (int i = 0; i < args->num_loops; i++) {
for (int j = 0; j < args->num_subs; j++) { for (int j = 0; j < args->num_subs; j++) {
SubBenchArgs &sub = args->subs[j]; SubBenchArgs &sub = args->subs[j];
if (SetGpu(sub.worker_gpu_id)) { if (SetGpu(sub.worker_gpu_id)) {
return -1; return -1;
} }
if (i == args->num_warm_up) {
cuda_err = cudaEventRecord(sub.start_event, sub.stream);
if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunCopy::cudaEventRecord error: %d\n", cuda_err);
return -1;
}
}
if (args->is_sm_copy) { if (args->is_sm_copy) {
SMCopyKernel<<<num_thread_blocks, NUM_THREADS_IN_BLOCK, 0, sub.stream>>>( SMCopyKernel<<<num_thread_blocks, NUM_THREADS_IN_BLOCK, 0, sub.stream>>>(
reinterpret_cast<ulong2 *>(sub.dst_dev_gpu_buf_ptr), reinterpret_cast<ulong2 *>(sub.dst_dev_gpu_buf_ptr),
reinterpret_cast<ulong2 *>(sub.src_dev_gpu_buf_ptr)); reinterpret_cast<ulong2 *>(sub.src_dev_gpu_buf_ptr));
} else { } else {
cudaMemcpyAsync(sub.dst_dev_gpu_buf_ptr, sub.src_dev_gpu_buf_ptr, args->size, cudaMemcpyDefault, cuda_err = cudaMemcpyAsync(sub.dst_dev_gpu_buf_ptr, sub.src_dev_gpu_buf_ptr, args->size,
sub.stream); cudaMemcpyDefault, sub.stream);
if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunCopy::cudaMemcpyAsync error: %d\n", cuda_err);
return -1;
}
}
if (i + 1 == args->num_loops + args->num_warm_up) {
cuda_err = cudaEventRecord(sub.end_event, sub.stream);
if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunCopy::cudaEventRecord error: %d\n", cuda_err);
return -1;
}
} }
} }
} }
...@@ -527,13 +611,22 @@ int RunCopy(BenchArgs *args) { ...@@ -527,13 +611,22 @@ int RunCopy(BenchArgs *args) {
return -1; return -1;
} }
} }
auto end = std::chrono::steady_clock::now();
// Calculate and display bandwidth if no problem // Calculate and display bandwidth if no problem
double time_in_sec = std::chrono::duration_cast<std::chrono::duration<double>>(end - start).count(); float max_time_in_ms = 0;
for (int i = 0; i < args->num_subs; i++) {
SubBenchArgs &sub = args->subs[i];
float time_in_ms = 0;
cuda_err = cudaEventElapsedTime(&time_in_ms, sub.start_event, sub.end_event);
if (cuda_err != cudaSuccess) {
fprintf(stderr, "RunCopy::cudaEventElapsedTime error: %d\n", cuda_err);
return -1;
}
max_time_in_ms = time_in_ms > max_time_in_ms ? time_in_ms : max_time_in_ms;
}
PrintResultTag(*args); PrintResultTag(*args);
printf(" %g\n", args->size * args->num_loops * args->num_subs / time_in_sec / 1e9); printf(" %g\n", args->size * args->num_loops * args->num_subs / max_time_in_ms / 1e6);
return 0; return 0;
} }
...@@ -565,17 +658,28 @@ int EnablePeerAccess(int src_gpu_id, int dst_gpu_id, int *can_access) { ...@@ -565,17 +658,28 @@ int EnablePeerAccess(int src_gpu_id, int dst_gpu_id, int *can_access) {
int RunBench(BenchArgs *args) { int RunBench(BenchArgs *args) {
int ret = 0; int ret = 0;
int destroy_buf_ret = 0; int destroy_ret = 0;
ret = PrepareBufAndStream(args); ret = PrepareBufAndStream(args);
if (ret != 0) {
goto destroy_buf;
}
ret = PrepareEvent(args);
if (ret != 0) {
goto destroy_event;
}
ret = RunCopy(args);
if (ret == 0) { if (ret == 0) {
ret = RunCopy(args); ret = CheckBuf(args);
if (ret == 0) { }
ret = CheckBuf(args); destroy_event:
} destroy_ret = DestroyEvent(args);
if (ret == 0) {
ret = destroy_ret;
} }
destroy_buf_ret = DestroyBufAndStream(args); destroy_buf:
destroy_ret = DestroyBufAndStream(args);
if (ret == 0) { if (ret == 0) {
ret = destroy_buf_ret; ret = destroy_ret;
} }
return ret; return ret;
} }
...@@ -643,6 +747,7 @@ int main(int argc, char **argv) { ...@@ -643,6 +747,7 @@ int main(int argc, char **argv) {
if (ret != 0) { if (ret != 0) {
return ret; return ret;
} }
args.num_warm_up = opts.num_warm_up;
args.num_loops = opts.num_loops; args.num_loops = opts.num_loops;
args.size = opts.size; args.size = opts.size;
......
...@@ -28,12 +28,14 @@ def _test_gpu_copy_bw_performance_command_generation(self, platform): ...@@ -28,12 +28,14 @@ def _test_gpu_copy_bw_performance_command_generation(self, platform):
assert (benchmark_class) assert (benchmark_class)
size = 1048576 size = 1048576
num_warm_up = 20
num_loops = 10000 num_loops = 10000
mem_types = ['htod', 'dtoh', 'dtod'] mem_types = ['htod', 'dtoh', 'dtod']
copy_types = ['sm', 'dma'] copy_types = ['sm', 'dma']
parameters = '--mem_type %s --copy_type %s --size %d --num_loops %d --bidirectional' % \ parameters = '--mem_type %s --copy_type %s --size %d ' \
(' '.join(mem_types), ' '.join(copy_types), size, num_loops) '--num_warm_up %d --num_loops %d --bidirectional' % \
(' '.join(mem_types), ' '.join(copy_types), size, num_warm_up, num_loops)
benchmark = benchmark_class(benchmark_name, parameters=parameters) benchmark = benchmark_class(benchmark_name, parameters=parameters)
# Check basic information # Check basic information
...@@ -48,6 +50,7 @@ def _test_gpu_copy_bw_performance_command_generation(self, platform): ...@@ -48,6 +50,7 @@ def _test_gpu_copy_bw_performance_command_generation(self, platform):
assert (benchmark._args.mem_type == mem_types) assert (benchmark._args.mem_type == mem_types)
assert (benchmark._args.copy_type == copy_types) assert (benchmark._args.copy_type == copy_types)
assert (benchmark._args.size == size) assert (benchmark._args.size == size)
assert (benchmark._args.num_warm_up == num_warm_up)
assert (benchmark._args.num_loops == num_loops) assert (benchmark._args.num_loops == num_loops)
assert (benchmark._args.bidirectional) assert (benchmark._args.bidirectional)
...@@ -59,6 +62,7 @@ def _test_gpu_copy_bw_performance_command_generation(self, platform): ...@@ -59,6 +62,7 @@ def _test_gpu_copy_bw_performance_command_generation(self, platform):
for copy_type in copy_types: for copy_type in copy_types:
assert ('--%s_copy' % copy_type in benchmark._commands[0]) assert ('--%s_copy' % copy_type in benchmark._commands[0])
assert ('--size %d' % size in benchmark._commands[0]) assert ('--size %d' % size in benchmark._commands[0])
assert ('--num_warm_up %d' % num_warm_up in benchmark._commands[0])
assert ('--num_loops %d' % num_loops in benchmark._commands[0]) assert ('--num_loops %d' % num_loops in benchmark._commands[0])
assert ('--bidirectional' in benchmark._commands[0]) assert ('--bidirectional' in benchmark._commands[0])
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment