Unverified Commit 682b2c12 authored by Ziyue Yang's avatar Ziyue Yang Committed by GitHub
Browse files

Benchmarks: Revise Code - Make data checking in gpu_copy optional (#301)

This commit makes data checking in gpu_copy optional, because checking can take too long when the message size is large.
parent 85389055
......@@ -20,6 +20,7 @@
# )
# For bidirectional test, please specify parameters as the following.
# parameters='--mem_type htod dtod --copy_type sm dma --bidirectional'
# To enable data checking, please add '--check_data'.
benchmark = BenchmarkRegistry.launch_benchmark(context)
if benchmark:
......
......@@ -75,6 +75,12 @@ def add_parser_arguments(self):
help='Enable bidirectional test',
)
self._parser.add_argument(
'--check_data',
action='store_true',
help='Enable data checking',
)
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
......@@ -97,6 +103,9 @@ def _preprocess(self):
if self._args.bidirectional:
args += ' --bidirectional'
if self._args.check_data:
args += ' --check_data'
self._commands = ['%s %s' % (self.__bin_path, args)]
return True
......
......@@ -83,6 +83,9 @@ struct BenchArgs {
// Uses SM copy, otherwise DMA copy.
bool is_sm_copy = false;
// Whether to check data after copy.
bool check_data = false;
// Sub-benchmarks in parallel.
SubBenchArgs subs[kMaxNumSubs];
};
......@@ -115,6 +118,9 @@ struct Opts {
// Whether bidirectional transfer is enabled.
bool bidirectional_enabled = false;
// Whether to check data after copy.
bool check_data = false;
};
// Print usage of this program.
......@@ -128,7 +134,8 @@ void PrintUsage() {
"[--htod] "
"[--dtoh] "
"[--dtod] "
"[--bidirectional]\n");
"[--bidirectional] "
"[--check_data]\n");
}
// Parse options of this program.
......@@ -142,7 +149,8 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
kEnableHToD,
kEnableDToH,
kEnableDToD,
kEnableBidirectional
kEnableBidirectional,
kEnableCheckData
};
const struct option options[] = {
{"size", required_argument, nullptr, static_cast<int>(OptIdx::kSize)},
......@@ -153,7 +161,8 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
{"htod", no_argument, nullptr, static_cast<int>(OptIdx::kEnableHToD)},
{"dtoh", no_argument, nullptr, static_cast<int>(OptIdx::kEnableDToH)},
{"dtod", no_argument, nullptr, static_cast<int>(OptIdx::kEnableDToD)},
{"bidirectional", no_argument, nullptr, static_cast<int>(OptIdx::kEnableBidirectional)}};
{"bidirectional", no_argument, nullptr, static_cast<int>(OptIdx::kEnableBidirectional)},
{"check_data", no_argument, nullptr, static_cast<int>(OptIdx::kEnableCheckData)}};
int getopt_ret = 0;
int opt_idx = 0;
bool size_specified = false;
......@@ -214,6 +223,9 @@ int ParseOpts(int argc, char **argv, Opts *opts) {
case static_cast<int>(OptIdx::kEnableBidirectional):
opts->bidirectional_enabled = true;
break;
case static_cast<int>(OptIdx::kEnableCheckData):
opts->check_data = true;
break;
default:
parse_err = true;
}
......@@ -258,12 +270,14 @@ int PrepareBufAndStream(BenchArgs *args) {
// Generate data to copy
sub.data_buf = static_cast<uint8_t *>(numa_alloc_onnode(args->size, args->numa_id));
for (int j = 0; j < args->size; j++) {
sub.data_buf[j] = static_cast<uint8_t>(j % uint8_mod);
}
// Allocate check buffer
sub.check_buf = static_cast<uint8_t *>(numa_alloc_onnode(args->size, args->numa_id));
if (args->check_data) {
for (int j = 0; j < args->size; j++) {
sub.data_buf[j] = static_cast<uint8_t>(j % uint8_mod);
}
// Allocate check buffer
sub.check_buf = static_cast<uint8_t *>(numa_alloc_onnode(args->size, args->numa_id));
}
// Allocate buffers for src/dst devices
constexpr int num_devices = 2;
......@@ -668,7 +682,7 @@ int RunBench(BenchArgs *args) {
goto destroy_event;
}
ret = RunCopy(args);
if (ret == 0) {
if (ret == 0 && args->check_data) {
ret = CheckBuf(args);
}
destroy_event:
......@@ -750,6 +764,7 @@ int main(int argc, char **argv) {
args.num_warm_up = opts.num_warm_up;
args.num_loops = opts.num_loops;
args.size = opts.size;
args.check_data = opts.check_data;
// Get number of NUMA nodes
if (numa_available()) {
......
......@@ -86,7 +86,24 @@ superbench:
parallel: no
parameters:
block_devices: []
gpu-copy-bw:
gpu-copy-bw:correctness:
enable: true
modes:
- name: local
parallel: no
parameters:
mem_type:
- htod
- dtoh
- dtod
copy_type:
- sm
- dma
size: 4096
num_warm_up: 0
num_loops: 1
check_data: true
gpu-copy-bw:perf:
enable: true
modes:
- name: local
......
......@@ -87,7 +87,24 @@ superbench:
parallel: no
parameters:
block_devices: []
gpu-copy-bw:
gpu-copy-bw:correctness:
enable: true
modes:
- name: local
parallel: no
parameters:
mem_type:
- htod
- dtoh
- dtod
copy_type:
- sm
- dma
size: 4096
num_warm_up: 0
num_loops: 1
check_data: true
gpu-copy-bw:perf:
enable: true
modes:
- name: local
......
......@@ -102,7 +102,24 @@ superbench:
rand_read_runtime: 60
rand_write_runtime: 60
rand_readwrite_runtime: 60
gpu-copy-bw:
gpu-copy-bw:correctness:
enable: true
modes:
- name: local
parallel: no
parameters:
mem_type:
- htod
- dtoh
- dtod
copy_type:
- sm
- dma
size: 4096
num_warm_up: 0
num_loops: 1
check_data: true
gpu-copy-bw:perf:
enable: true
modes:
- name: local
......
......@@ -98,7 +98,24 @@ superbench:
rand_read_runtime: 60
rand_write_runtime: 60
rand_readwrite_runtime: 60
gpu-copy-bw:
gpu-copy-bw:correctness:
enable: true
modes:
- name: local
parallel: no
parameters:
mem_type:
- htod
- dtoh
- dtod
copy_type:
- sm
- dma
size: 4096
num_warm_up: 0
num_loops: 1
check_data: true
gpu-copy-bw:perf:
enable: true
modes:
- name: local
......
......@@ -79,7 +79,24 @@ superbench:
proc_num: 8
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
parallel: no
gpu-copy-bw:
gpu-copy-bw:correctness:
enable: true
modes:
- name: local
parallel: no
parameters:
mem_type:
- htod
- dtoh
- dtod
copy_type:
- sm
- dma
size: 4096
num_warm_up: 0
num_loops: 1
check_data: true
gpu-copy-bw:perf:
enable: true
modes:
- name: local
......
......@@ -34,7 +34,7 @@ def _test_gpu_copy_bw_performance_command_generation(self, platform):
copy_types = ['sm', 'dma']
parameters = '--mem_type %s --copy_type %s --size %d ' \
'--num_warm_up %d --num_loops %d --bidirectional' % \
'--num_warm_up %d --num_loops %d --bidirectional --check_data' % \
(' '.join(mem_types), ' '.join(copy_types), size, num_warm_up, num_loops)
benchmark = benchmark_class(benchmark_name, parameters=parameters)
......@@ -53,6 +53,7 @@ def _test_gpu_copy_bw_performance_command_generation(self, platform):
assert (benchmark._args.num_warm_up == num_warm_up)
assert (benchmark._args.num_loops == num_loops)
assert (benchmark._args.bidirectional)
assert (benchmark._args.check_data)
# Check command
assert (1 == len(benchmark._commands))
......@@ -65,6 +66,7 @@ def _test_gpu_copy_bw_performance_command_generation(self, platform):
assert ('--num_warm_up %d' % num_warm_up in benchmark._commands[0])
assert ('--num_loops %d' % num_loops in benchmark._commands[0])
assert ('--bidirectional' in benchmark._commands[0])
assert ('--check_data' in benchmark._commands[0])
@decorator.cuda_test
def test_gpu_copy_bw_performance_command_generation_cuda(self):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment