Unverified Commit ff51a3ce authored by rafsalas19's avatar rafsalas19 Committed by GitHub
Browse files

Benchmarks: Add Feature - Add GPU-Burn as microbenchmark (#324)

**Description**
Modifications adding GPU-Burn to SuperBench.
- added third party submodule
- modified Makefile to make gpu-burn binary
- added/modified microbenchmarks to add gpu-burn python scripts
- modified default and azure_ndv4 configs to add gpu-burn
parent 84359fd8
...@@ -18,3 +18,6 @@ ...@@ -18,3 +18,6 @@
[submodule "third_party/GPCNET"] [submodule "third_party/GPCNET"]
path = third_party/GPCNET path = third_party/GPCNET
url = https://github.com/netbench/GPCNET.git url = https://github.com/netbench/GPCNET.git
[submodule "third_party/gpu-burn"]
path = third_party/gpu-burn
url = https://github.com/wilicc/gpu-burn.git
...@@ -142,6 +142,20 @@ The supported percentiles are 50, 90, 95, 99, and 99.9. ...@@ -142,6 +142,20 @@ The supported percentiles are 50, 90, 95, 99, and 99.9.
| ort-inference/{precision}_{model}_time | time (ms) | The mean latency to execute one batch of inference. | | ort-inference/{precision}_{model}_time | time (ms) | The mean latency to execute one batch of inference. |
| ort-inference/{precision}_{model}_time_{percentile} | time (ms) | The {percentile}th percentile latency to execute one batch of inference. | | ort-inference/{precision}_{model}_time_{percentile} | time (ms) | The {percentile}th percentile latency to execute one batch of inference. |
### `gpu-burn`
#### Introduction
Multi-GPU CUDA stress test for GPU compute and memory utilization, performed by [gpu-burn](https://github.com/wilicc/gpu-burn).
Supports using double-precision data types and tensor cores.
#### Metrics
| Name | Unit | Description |
|--------------------------|------------|-------------------------------------------------------------------------------------|
| gpu-burn/time | time (s) | The runtime for gpu-burn test. |
| gpu-burn/gpu_[0-9]_pass | yes/no | The result of the gpu-burn test for each GPU (1: yes, 0: no). |
| gpu-burn/abort | yes/no | Whether the GPU-Burn test aborted before returning per-GPU results (1: yes, 0: no). |
## Communication Benchmarks ## Communication Benchmarks
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Micro benchmark example for GPU-Burn.
Commands to run:
python3 examples/benchmarks/gpu_burn_test.py
"""
from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger
if __name__ == '__main__':
    # Run gpu-burn on CUDA with doubles and tensor cores enabled for 10 seconds.
    params = '--doubles --tensor_core --time 10'
    context = BenchmarkRegistry.create_benchmark_context('gpu-burn', platform=Platform.CUDA, parameters=params)

    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        message = 'benchmark: {}, return code: {}, result: {}'.format(
            benchmark.name, benchmark.return_code, benchmark.result
        )
        logger.info(message)
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
from superbench.benchmarks.micro_benchmarks.cpu_memory_bw_latency_performance import CpuMemBwLatencyBenchmark from superbench.benchmarks.micro_benchmarks.cpu_memory_bw_latency_performance import CpuMemBwLatencyBenchmark
from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_burn_test import GpuBurnBenchmark
from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoopbackBenchmark from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoopbackBenchmark
from superbench.benchmarks.micro_benchmarks.ib_validation_performance import IBBenchmark from superbench.benchmarks.micro_benchmarks.ib_validation_performance import IBBenchmark
from superbench.benchmarks.micro_benchmarks.kernel_launch_overhead import KernelLaunch from superbench.benchmarks.micro_benchmarks.kernel_launch_overhead import KernelLaunch
...@@ -39,6 +40,7 @@ ...@@ -39,6 +40,7 @@
'GPCNetBenchmark', 'GPCNetBenchmark',
'GemmFlopsBenchmark', 'GemmFlopsBenchmark',
'GpuCopyBwBenchmark', 'GpuCopyBwBenchmark',
'GpuBurnBenchmark',
'IBBenchmark', 'IBBenchmark',
'IBLoopbackBenchmark', 'IBLoopbackBenchmark',
'KernelLaunch', 'KernelLaunch',
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the GPU-Burn Test."""
import os
from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
class GpuBurnBenchmark(MicroBenchmarkWithInvoke):
    """The GPU Burn Test benchmark class.

    Wraps the third-party gpu_burn stress tool: builds the invocation command,
    runs it, and parses the per-GPU OK/fault summary into benchmark results.
    """
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'gpu_burn'

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--doubles',
            action='store_true',
            default=False,
            help='Use doubles for the data type used in GPU-Burn',
        )
        self._parser.add_argument(
            '--tensor_core',
            action='store_true',
            default=False,
            help='Use tensor cores in GPU-Burn',
        )
        self._parser.add_argument(
            '--time',
            type=int,
            default=10,
            help='Length of time to run GPU-Burn for (in seconds)',
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        if not self._set_binary_path():
            return False

        command = os.path.join(self._args.bin_dir, self._bin_name)
        if self._args.doubles:
            command += ' -d'
        if self._args.tensor_core:
            command += ' -tc'
        command += ' {} '.format(self._args.time)
        # gpu_burn loads compare.ptx from the current working directory,
        # so copy it next to the run and remove it again afterwards.
        compare_copy = 'cp ' + self._args.bin_dir + '/compare.ptx ./'
        compare_rm = 'rm ' + 'compare.ptx'
        self._commands.append(compare_copy + ' && ' + command + ' && ' + compare_rm)

        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        content = raw_output.splitlines()
        gpu_res = []
        abort = False
        failure_msg = 'unknown failure'
        index = -1
        try:
            for idx, line in enumerate(content):
                # Known fatal messages emitted by gpu_burn before any results.
                if ('No clients are alive!' in line or "Couldn't init a GPU" in line
                        or 'Failure during compute' in line or 'Low mem for result' in line):
                    abort = True
                    failure_msg = line
                    break
                if 'done' in line:
                    index = idx
                    break

            if abort:
                self._result.add_raw_data('GPU Burn Failure: ', failure_msg)
                self._result.add_result('abort', 1)
                return False

            if index < 0:
                # No 'done' marker and no known failure message: output is malformed
                # (also covers completely empty output). Raise a real exception —
                # the original code raised a str, which is a TypeError in Python 3.
                raise ValueError('The result format invalid')

            # The per-GPU summary starts two lines after the 'done' marker.
            for line in content[index + 2:]:
                if 'Tested' in line:
                    continue
                if 'GPU' in line:
                    gpu_res.append(line.strip('\n').strip('\t'))

            self._result.add_result('time', self._args.time)
            for res in gpu_res:
                # Lines look like 'GPU 0: OK' / 'GPU 0: FAULTY'; metric name becomes gpu_N_pass.
                passed = 1 if 'OK' in res else 0
                self._result.add_result(res.split(':')[0].replace(' ', '_').lower() + '_pass', passed)
                self._result.add_raw_data('GPU-Burn_result', res)
        except Exception as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                    self._curr_run_index, self._name, raw_output, str(e)
                )
            )
            self._result.add_result('abort', 1)
            return False

        self._result.add_result('abort', 0)
        return True


BenchmarkRegistry.register_benchmark('gpu-burn', GpuBurnBenchmark, platform=Platform.CUDA)
...@@ -39,6 +39,16 @@ superbench: ...@@ -39,6 +39,16 @@ superbench:
<<: *default_local_mode <<: *default_local_mode
gemm-flops: gemm-flops:
<<: *default_local_mode <<: *default_local_mode
gpu-burn:
enable: false
modes:
- name: local
proc_num: 1
parallel: no
parameters:
time: 900
doubles: true
tensor_core: true
nccl-bw:default: nccl-bw:default:
enable: true enable: true
modes: modes:
......
...@@ -33,6 +33,16 @@ superbench: ...@@ -33,6 +33,16 @@ superbench:
model_action: model_action:
- train - train
benchmarks: benchmarks:
gpu-burn:
enable: true
modes:
- name: local
proc_num: 1
parallel: no
parameters:
time: 300
doubles: true
tensor_core: true
nccl-bw:default: nccl-bw:default:
enable: true enable: true
modes: modes:
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for gpu-burn benchmark."""
import unittest
from tests.helper import decorator
from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
class GpuBurnBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
    """Test class for gpu-burn benchmark."""
    @classmethod
    def setUpClass(cls):
        """Hook method for setting up class fixture before running tests in the class."""
        super().setUpClass()
        cls.createMockEnvs(cls)
        cls.createMockFiles(cls, ['bin/gpu_burn'])

    @decorator.load_data('tests/data/gpu_burn.log')
    def test_gpu_burn(self, results):
        """Test gpu-burn benchmark command generation."""
        benchmark_name = 'gpu-burn'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
        assert (benchmark_class)

        run_seconds = 10
        benchmark = benchmark_class(benchmark_name, parameters='--doubles --tensor_core --time ' + str(run_seconds))
        assert (benchmark)

        # Preprocessing must succeed and leave the benchmark in a healthy state.
        assert (benchmark._preprocess() is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == benchmark_name)
        assert (benchmark.type == BenchmarkType.MICRO)

        # Parameters specified in BenchmarkContext must be parsed back correctly.
        assert (benchmark._args.time == run_seconds)
        assert (benchmark._args.doubles)
        assert (benchmark._args.tensor_core)

        # The generated command should copy compare.ptx in, run with the
        # requested flags, then remove compare.ptx again.
        assert (1 == len(benchmark._commands))
        command = benchmark._commands[0]
        assert (command.startswith('cp ' + benchmark._args.bin_dir + '/compare.ptx ./'))
        for fragment in ('-d', '-tc', str(run_seconds), 'rm ' + 'compare.ptx'):
            assert (fragment in command)

        # Parsing the canned log should report all 8 GPUs passing and no abort.
        assert (benchmark._process_raw_result(0, results))
        assert (benchmark.result['return_code'][0] == 0)
        assert (benchmark.result['time'][0] == run_seconds)
        for device in range(8):
            assert (benchmark.result['gpu_{}_pass'.format(device)][0] == 1)
        assert (benchmark.result['abort'][0] == 0)
This diff is collapsed.
...@@ -10,11 +10,11 @@ RCCL_HOME ?= /opt/rocm/rccl ...@@ -10,11 +10,11 @@ RCCL_HOME ?= /opt/rocm/rccl
ROCM_VERSION ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3) ROCM_VERSION ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
HPCX_HOME ?= /opt/hpcx HPCX_HOME ?= /opt/hpcx
.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet .PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn
# Build all targets. # Build all targets.
all: cuda rocm all: cuda rocm
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest
common: fio common: fio
...@@ -98,3 +98,11 @@ gpcnet: sb_micro_path ...@@ -98,3 +98,11 @@ gpcnet: sb_micro_path
bash -c "source ${HPCX_HOME}/hpcx-init.sh && hpcx_load && make CC=mpicc -C GPCNET all && hpcx_unload" bash -c "source ${HPCX_HOME}/hpcx-init.sh && hpcx_load && make CC=mpicc -C GPCNET all && hpcx_unload"
cp -v ./GPCNET/network_test $(SB_MICRO_PATH)/bin/ cp -v ./GPCNET/network_test $(SB_MICRO_PATH)/bin/
cp -v ./GPCNET/network_load_test $(SB_MICRO_PATH)/bin/ cp -v ./GPCNET/network_load_test $(SB_MICRO_PATH)/bin/
# Build gpu-burn from the submodule's main branch (the only branch upstream
# publishes) and install both the binary and the compare.ptx kernel it loads
# at runtime. Skipped when the submodule is not checked out.
cuda_gpuburn: sb_micro_path
ifneq (,$(wildcard gpu-burn/Makefile))
	cd ./gpu-burn && make
	cp -v ./gpu-burn/gpu_burn $(SB_MICRO_PATH)/bin/
	cp -v ./gpu-burn/compare.ptx $(SB_MICRO_PATH)/bin/
endif
Subproject commit cab8221b1147e83dd1fea3e42c3fe255254236ff
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment