Benchmarks: Add Benchmark - Add memory bandwidth benchmark for cuda. (#114)

Add a microbenchmark, example, test, and config for CUDA memory bandwidth performance; add cuda-samples (checked out at the tag matching the CUDA version) as a git submodule; and update the related Makefile.

Benchmarks: Add Benchmark - Add memory bandwidth benchmark for cuda. (#114)
Add a microbenchmark, example, test, and config for CUDA memory bandwidth performance; add cuda-samples (checked out at the tag matching the CUDA version) as a git submodule; and update the related Makefile.
f9550bd6 · Yuting Jiang · GitHub · 71c1617b · f9550bd6 · f9550bd6
Unverified Commit f9550bd6 authored Jul 13, 2021 by Yuting Jiang Committed by GitHub Jul 13, 2021
7 changed files
--- a/.gitmodules
+++ b/.gitmodules
@@ -2,3 +2,6 @@
 	path = third_party/cutlass
 	url = https://github.com/NVIDIA/cutlass.git
 	branch = v2.4.0
+[submodule "third_party/cuda-samples"]
+	path = third_party/cuda-samples
+	url = https://github.com/NVIDIA/cuda-samples.git
--- a/examples/benchmarks/cuda_memory_bw_performance.py
+++ b/examples/benchmarks/cuda_memory_bw_performance.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+"""Micro benchmark example for device memory bandwidth performance.
+Commands to run:
+  python3 examples/benchmarks/cuda_memory_bw_performance.py
+"""
+from superbench.benchmarks import BenchmarkRegistry, Platform
+from superbench.common.utils import logger
+if __name__ == '__main__':
+    # Build the run context for the benchmark registered as 'mem-bw' on the CUDA platform.
+    context = BenchmarkRegistry.create_benchmark_context('mem-bw', platform=Platform.CUDA)
+    # Launch it; the summary is logged only when launch_benchmark() returns a truthy object.
+    benchmark = BenchmarkRegistry.launch_benchmark(context)
+    if benchmark:
+        summary = 'benchmark: {}, return code: {}, result: {}'.format(
+            benchmark.name, benchmark.return_code, benchmark.result
+        )
+        logger.info(summary)
--- a/superbench/benchmarks/micro_benchmarks/__init__.py
+++ b/superbench/benchmarks/micro_benchmarks/__init__.py
@@ -10,8 +10,9 @@
 from superbench.benchmarks.micro_benchmarks.cublas_function import CublasBenchmark
 from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
 from superbench.benchmarks.micro_benchmarks.gemm_flops_performance import GemmFlopsCuda
+from superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance import CudaMemBwBenchmark
 __all__ = [
    'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
-    'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda'
+    'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda', 'CudaMemBwBenchmark'
 ]
--- a/superbench/benchmarks/micro_benchmarks/cuda_memory_bw_performance.py
+++ b/superbench/benchmarks/micro_benchmarks/cuda_memory_bw_performance.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT license.
+"""Module of the Cuda memory performance benchmarks."""
+import os
+import re
+from superbench.common.utils import logger
+from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
+from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
+class CudaMemBwBenchmark(MicroBenchmarkWithInvoke):
+    """The Cuda memory performance benchmark class.
+    Builds one 'bandwidthTest' command line per requested memory copy type and
+    extracts the bandwidth value from the '--csv' output of each command.
+    """
+    def __init__(self, name, parameters=''):
+        """Constructor.
+        Args:
+            name (str): benchmark name.
+            parameters (str): benchmark parameters.
+        """
+        super().__init__(name, parameters)
+        self._bin_name = 'bandwidthTest'
+        # Values accepted by --mem_type; each one is passed to the binary as '--<mem_type>'.
+        self.__mem_types = ['htod', 'dtoh', 'dtod']
+        # Values accepted by --memory; passed to the binary as 'memory=<value>'.
+        self.__memory = ['pageable', 'pinned']
+    def add_parser_arguments(self):
+        """Add the specified arguments.
+        Registers --mem_type (one or more values), --shmoo_mode (flag) and --memory
+        on top of the arguments added by the parent class.
+        """
+        super().add_parser_arguments()
+        self._parser.add_argument(
+            '--mem_type',
+            type=str,
+            nargs='+',
+            default=self.__mem_types,
+            help='Memory types to benchmark. E.g. {}.'.format(' '.join(self.__mem_types)),
+        )
+        self._parser.add_argument(
+            '--shmoo_mode',
+            action='store_true',
+            default=False,
+            help='Enable shmoo mode for bandwidthtest.',
+        )
+        self._parser.add_argument(
+            '--memory',
+            type=str,
+            default=None,
+            help='Memory argument for bandwidthtest. E.g. {}.'.format(' '.join(self.__memory)),
+        )
+    def _preprocess(self):
+        """Preprocess/preparation operations before the benchmarking.
+        Normalizes --mem_type to a lower-cased list, validates --mem_type and --memory,
+        and appends one command per memory type to self._commands. On an unsupported
+        value the return code is set to ReturnCode.INVALID_ARGUMENT and an error is logged.
+        Return:
+            True if _preprocess() succeed.
+        """
+        if not super()._preprocess():
+            return False
+        # Format the arguments
+        if not isinstance(self._args.mem_type, list):
+            self._args.mem_type = [self._args.mem_type]
+        self._args.mem_type = [p.lower() for p in self._args.mem_type]
+        # Check the arguments and generate the commands
+        for mem_type in self._args.mem_type:
+            if mem_type not in self.__mem_types:
+                self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
+                logger.error(
+                    'Unsupported mem_type of bandwidth test - benchmark: {}, mem_type: {}, expected: {}.'.format(
+                        self._name, mem_type, ' '.join(self.__mem_types)
+                    )
+                )
+                return False
+            command = os.path.join(self._args.bin_dir, self._bin_name)
+            command += ' --' + mem_type
+            if self._args.shmoo_mode:
+                command += ' mode=shmoo'
+            if self._args.memory:
+                if self._args.memory not in self.__memory:
+                    self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
+                    logger.error(
+                        'Unsupported memory argument of bandwidth test - benchmark: {}, memory: {}, expected: {}.'.
+                        format(self._name, self._args.memory, ' '.join(self.__memory))
+                    )
+                    return False
+                command += ' memory=' + self._args.memory
+            # The result parser relies on the comma-separated output produced by --csv.
+            command += ' --csv'
+            self._commands.append(command)
+        return True
+    def _process_raw_result(self, cmd_idx, raw_output):
+        """Function to parse raw results and save the summarized results.
+          self._result.add_raw_data() and self._result.add_result() need to be called to save the results.
+        Lines containing 'H2D', 'D2H' or 'D2D' select the metric name; the number found in the
+        second comma-separated field of such a line is the bandwidth, and the maximum over all
+        matching lines is reported.
+        Args:
+            cmd_idx (int): the index of command corresponding with the raw_output.
+            raw_output (str): raw output string of the micro-benchmark.
+        Return:
+            True if the raw output string is valid and result can be extracted.
+            False if parsing failed or no bandwidth value was found.
+        """
+        self._result.add_raw_data('raw_output_' + self._args.mem_type[cmd_idx], raw_output)
+        mem_bw = -1
+        metric = ''
+        valid = True
+        try:
+            for line in raw_output.splitlines():
+                if 'H2D' in line:
+                    metric = 'H2D_Mem_BW'
+                elif 'D2H' in line:
+                    metric = 'D2H_Mem_BW'
+                elif 'D2D' in line:
+                    metric = 'D2D_Mem_BW'
+                else:
+                    continue
+                # Match an integer or decimal number; the dot is escaped so it only matches a literal '.'.
+                value = re.search(r'\d+(?:\.\d+)?', line.split(',')[1])
+                if value:
+                    mem_bw = max(mem_bw, float(value.group(0)))
+        except Exception:
+            # E.g. a matching line without a second comma-separated field (IndexError).
+            # Only Exception is caught so that KeyboardInterrupt/SystemExit still propagate.
+            valid = False
+        if not valid or mem_bw == -1:
+            logger.error(
+                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
+                    self._curr_run_index, self._name, raw_output
+                )
+            )
+            return False
+        self._result.add_result(metric, mem_bw)
+        return True
+BenchmarkRegistry.register_benchmark('mem-bw', CudaMemBwBenchmark, platform=Platform.CUDA)
--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@@ -28,6 +28,13 @@ superbench:
      model_action:
        - train
  benchmarks:
+    # Memory bandwidth micro-benchmark, run in local mode with 8 processes.
+    # Each process gets CUDA_VISIBLE_DEVICES={proc_rank} and is bound with numactl -c to node {proc_rank}/2
+    # (presumably assumes two GPUs per NUMA node - confirm for the target machine).
+    mem-bw:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -c $(({proc_rank}/2))
+          parallel: true
    kernel-launch:
      <<: *default_local_mode
    gemm-flops:

--- a/tests/benchmarks/micro_benchmarks/test_cuda_memory_bw_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_cuda_memory_bw_performance.py
--- a/third_party/Makefile
+++ b/third_party/Makefile
@@ -4,11 +4,15 @@
 SB_MICRO_PATH ?= "/usr/local"
-.PHONY: all cutlass
+# None of these targets produce a file of the same name, so all of them are declared phony.
+.PHONY: all cutlass bandwidthTest sb_micro_path
 # Build all targets.
-all: cutlass
+all: cutlass bandwidthTest
+# Ensure $(SB_MICRO_PATH)/bin and $(SB_MICRO_PATH)/lib exist; -p creates missing parents and does not fail on existing directories.
+sb_micro_path:
+	mkdir -p $(SB_MICRO_PATH)/bin $(SB_MICRO_PATH)/lib
 # Build cutlass.
 cutlass:
 ifneq (,$(wildcard cutlass/CMakeLists.txt))
@@ -16,3 +20,13 @@ ifneq (,$(wildcard cutlass/CMakeLists.txt))
 		-DCUTLASS_NVCC_ARCHS='70;80' -DCUTLASS_ENABLE_EXAMPLES=OFF -DCUTLASS_ENABLE_TESTS=OFF -S ./cutlass -B ./cutlass/build
 	cmake --build ./cutlass/build -j 8 --target install
 endif
+# Build cuda-samples/Samples/bandwidthTest.
+# cuda-samples is released together with CUDA and uses the same version numbers, e.g. v10.0, v11.1.
+# The tag checked out below is derived from the `nvcc --version` output, so it matches the CUDA version in the environment or docker image.
+# The Makefile of bandwidthTest does not have an 'install' target, so the binary is copied to $(SB_MICRO_PATH)/bin/, which the sb_micro_path prerequisite creates if it does not exist.
+# The sub-builds are invoked through $(MAKE) so that they share the jobserver and honour flags such as -n.
+bandwidthTest: sb_micro_path
+ifneq (,$(wildcard cuda-samples/Samples/bandwidthTest/Makefile))
+	cd cuda-samples && git checkout v$(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
+	cd ./cuda-samples/Samples/bandwidthTest && $(MAKE) clean && $(MAKE) TARGET_ARCH=x86_64 SMS="70 75 80 86"
+	cp -v ./cuda-samples/Samples/bandwidthTest/bandwidthTest $(SB_MICRO_PATH)/bin/
+endif