Unverified Commit f9550bd6 authored by Yuting Jiang's avatar Yuting Jiang Committed by GitHub
Browse files

Benchmarks: Add Benchmark - Add memory bandwidth benchmark for cuda. (#114)

Add the micro-benchmark, example, test, and config for CUDA memory bandwidth performance; add cuda-samples (checked out at the tag matching the CUDA version) as a git submodule and update the related Makefile.
parent 71c1617b
...@@ -2,3 +2,6 @@ ...@@ -2,3 +2,6 @@
path = third_party/cutlass path = third_party/cutlass
url = https://github.com/NVIDIA/cutlass.git url = https://github.com/NVIDIA/cutlass.git
branch = v2.4.0 branch = v2.4.0
[submodule "third_party/cuda-samples"]
path = third_party/cuda-samples
url = https://github.com/NVIDIA/cuda-samples.git
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Micro benchmark example for device memory bandwidth performance.
Commands to run:
python3 examples/benchmarks/cuda_memory_bw_performance.py
"""
from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger
if __name__ == '__main__':
    # Build a context for the 'mem-bw' benchmark on the CUDA platform and launch it.
    bench_context = BenchmarkRegistry.create_benchmark_context('mem-bw', platform=Platform.CUDA)
    launched = BenchmarkRegistry.launch_benchmark(bench_context)

    # launch_benchmark() may hand back a falsy value; only report when a benchmark object came back.
    if launched:
        summary = 'benchmark: {}, return code: {}, result: {}'.format(
            launched.name, launched.return_code, launched.result
        )
        logger.info(summary)
...@@ -10,8 +10,9 @@ ...@@ -10,8 +10,9 @@
from superbench.benchmarks.micro_benchmarks.cublas_function import CublasBenchmark from superbench.benchmarks.micro_benchmarks.cublas_function import CublasBenchmark
from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark from superbench.benchmarks.micro_benchmarks.cudnn_function import CudnnBenchmark
from superbench.benchmarks.micro_benchmarks.gemm_flops_performance import GemmFlopsCuda from superbench.benchmarks.micro_benchmarks.gemm_flops_performance import GemmFlopsCuda
from superbench.benchmarks.micro_benchmarks.cuda_memory_bw_performance import CudaMemBwBenchmark
__all__ = [ __all__ = [
'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch', 'MicroBenchmark', 'MicroBenchmarkWithInvoke', 'ShardingMatmul', 'ComputationCommunicationOverlap', 'KernelLaunch',
'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda' 'CublasBenchmark', 'CudnnBenchmark', 'GemmFlopsCuda', 'CudaMemBwBenchmark'
] ]
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the Cuda memory performance benchmarks."""
import os
import re
from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
class CudaMemBwBenchmark(MicroBenchmarkWithInvoke):
    """The Cuda memory performance benchmark class.

    Wraps the 'bandwidthTest' binary: one command line is generated per requested
    memory copy type, and the highest bandwidth value found in each command's
    output is stored as the result.
    """
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'bandwidthTest'
        # Accepted values for --mem_type; each one is passed to the binary as '--<value>'.
        self.__mem_types = ['htod', 'dtoh', 'dtod']
        # Accepted values for --memory; passed to the binary as 'memory=<value>'.
        self.__memory = ['pageable', 'pinned']

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--mem_type',
            type=str,
            nargs='+',
            default=self.__mem_types,
            help='Memory types to benchmark. E.g. {}.'.format(' '.join(self.__mem_types)),
        )
        self._parser.add_argument(
            '--shmoo_mode',
            action='store_true',
            default=False,
            help='Enable shmoo mode for bandwidthtest.',
        )
        self._parser.add_argument(
            '--memory',
            type=str,
            default=None,
            help='Memory argument for bandwidthtest. E.g. {}.'.format(' '.join(self.__memory)),
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Normalizes and validates the parsed arguments, then appends one command
        line per requested memory type to self._commands.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        # Format the arguments.
        if not isinstance(self._args.mem_type, list):
            self._args.mem_type = [self._args.mem_type]
        self._args.mem_type = [p.lower() for p in self._args.mem_type]

        # Check every argument before generating any command, so a rejected
        # argument never leaves a partially filled command list behind.
        for mem_type in self._args.mem_type:
            if mem_type not in self.__mem_types:
                self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
                logger.error(
                    'Unsupported mem_type of bandwidth test - benchmark: {}, mem_type: {}, expected: {}.'.format(
                        self._name, mem_type, ' '.join(self.__mem_types)
                    )
                )
                return False

        # The memory argument is the same for all commands, so it is checked once.
        if self._args.memory and self._args.memory not in self.__memory:
            self._result.set_return_code(ReturnCode.INVALID_ARGUMENT)
            logger.error(
                'Unsupported memory argument of bandwidth test - benchmark: {}, memory: {}, expected: {}.'.
                format(self._name, self._args.memory, ' '.join(self.__memory))
            )
            return False

        # Generate the commands.
        bin_path = os.path.join(self._args.bin_dir, self._bin_name)
        for mem_type in self._args.mem_type:
            command = bin_path + ' --' + mem_type
            if self._args.shmoo_mode:
                command += ' mode=shmoo'
            if self._args.memory:
                command += ' memory=' + self._args.memory
            command += ' --csv'
            self._commands.append(command)

        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Only lines containing 'H2D', 'D2H' or 'D2D' are considered. The number is
        taken from the second comma-separated field of such a line, and the
        maximum over all matching lines is reported.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        self._result.add_raw_data('raw_output_' + self._args.mem_type[cmd_idx], raw_output)

        mem_bw = -1
        metric = ''
        valid = True
        try:
            for line in raw_output.splitlines():
                if 'H2D' in line:
                    metric = 'H2D_Mem_BW'
                elif 'D2H' in line:
                    metric = 'D2H_Mem_BW'
                elif 'D2D' in line:
                    metric = 'D2D_Mem_BW'
                else:
                    continue
                # The dot is matched literally so only a real number is accepted
                # (an unescaped '.' would accept any separator character).
                value = re.search(r'\d+(?:\.\d+)?', line.split(',')[1])
                if value:
                    mem_bw = max(mem_bw, float(value.group(0)))
        except Exception:
            # Malformed output (e.g. a result line without a second field) is reported
            # below; KeyboardInterrupt/SystemExit are deliberately left to propagate.
            valid = False

        if not valid or mem_bw == -1:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                    self._curr_run_index, self._name, raw_output
                )
            )
            return False

        self._result.add_result(metric, mem_bw)
        return True
# Register the benchmark class under the name 'mem-bw' for the CUDA platform.
BenchmarkRegistry.register_benchmark('mem-bw', CudaMemBwBenchmark, platform=Platform.CUDA)
...@@ -28,6 +28,13 @@ superbench: ...@@ -28,6 +28,13 @@ superbench:
model_action: model_action:
- train - train
benchmarks: benchmarks:
# Configuration entry for the 'mem-bw' benchmark.
# NOTE(review): the nesting indentation of this entry is not visible in this view.
mem-bw:
enable: true
modes:
- name: local
proc_num: 8
# CUDA_VISIBLE_DEVICES is set from {proc_rank}; the value given to `numactl -c` is {proc_rank}/2
# (shell integer arithmetic), so two consecutive ranks get the same value.
# Presumably {proc_rank} is substituted with the process index (0..proc_num-1) - confirm against the runner.
prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -c $(({proc_rank}/2))
parallel: yes
kernel-launch: kernel-launch:
<<: *default_local_mode <<: *default_local_mode
gemm-flops: gemm-flops:
......
...@@ -4,11 +4,15 @@ ...@@ -4,11 +4,15 @@
SB_MICRO_PATH ?= "/usr/local" SB_MICRO_PATH ?= "/usr/local"
.PHONY: all cutlass .PHONY: all cutlass bandwidthTest
# Build all targets. # Build all targets.
all: cutlass all: cutlass bandwidthTest
# Create $(SB_MICRO_PATH)/bin and $(SB_MICRO_PATH)/lib, no error if existing, make parent directories as needed.
# This target never produces a file called 'sb_micro_path', so it is declared phony here:
# a stray file with that name must not make the rule look up to date.
.PHONY: sb_micro_path
sb_micro_path:
	mkdir -p $(SB_MICRO_PATH)/bin
	mkdir -p $(SB_MICRO_PATH)/lib
# Build cutlass. # Build cutlass.
cutlass: cutlass:
ifneq (,$(wildcard cutlass/CMakeLists.txt)) ifneq (,$(wildcard cutlass/CMakeLists.txt))
...@@ -16,3 +20,13 @@ ifneq (,$(wildcard cutlass/CMakeLists.txt)) ...@@ -16,3 +20,13 @@ ifneq (,$(wildcard cutlass/CMakeLists.txt))
-DCUTLASS_NVCC_ARCHS='70;80' -DCUTLASS_ENABLE_EXAMPLES=OFF -DCUTLASS_ENABLE_TESTS=OFF -S ./cutlass -B ./cutlass/build -DCUTLASS_NVCC_ARCHS='70;80' -DCUTLASS_ENABLE_EXAMPLES=OFF -DCUTLASS_ENABLE_TESTS=OFF -S ./cutlass -B ./cutlass/build
cmake --build ./cutlass/build -j 8 --target install cmake --build ./cutlass/build -j 8 --target install
endif endif
# Build cuda-samples/Samples/bandwidthTest.
# cuda-samples is released together with CUDA and uses the same version numbers (v10.0, v11.1 and so on).
# The tag checked out below is derived from the 'release' line of `nvcc --version`, so it matches the CUDA
# version of the current environment or docker image.
# The Makefile of bandwidthTest has no 'install' target, so the binary is copied to $(SB_MICRO_PATH)/bin/,
# which the sb_micro_path prerequisite creates if it does not exist.
# The whole recipe is skipped when the cuda-samples submodule has not been checked out.
bandwidthTest: sb_micro_path
ifneq (,$(wildcard cuda-samples/Samples/bandwidthTest/Makefile))
	cd cuda-samples && git checkout v$(shell nvcc --version | grep 'release' | awk '{print $$6}' | cut -c2- | cut -d '.' -f1-2)
	# Use $(MAKE) rather than a literal 'make' so that -n, -j and the jobserver are passed on to the sub-make.
	cd ./cuda-samples/Samples/bandwidthTest && $(MAKE) clean && $(MAKE) TARGET_ARCH=x86_64 SMS="70 75 80 86"
	cp -v ./cuda-samples/Samples/bandwidthTest/bandwidthTest $(SB_MICRO_PATH)/bin/
endif
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment