Unverified Commit ff51a3ce authored by rafsalas19's avatar rafsalas19 Committed by GitHub
Browse files

Benchmarks: Add Feature - Add GPU-Burn as microbenchmark (#324)

**Description**
Modifications adding GPU-Burn to SuperBench.
- added third party submodule
- modified Makefile to make gpu-burn binary
- added/modified microbenchmarks to add gpu-burn python scripts
- modified default and azure_ndv4 configs to add gpu-burn
parent 84359fd8
...@@ -18,3 +18,6 @@ ...@@ -18,3 +18,6 @@
[submodule "third_party/GPCNET"] [submodule "third_party/GPCNET"]
path = third_party/GPCNET path = third_party/GPCNET
url = https://github.com/netbench/GPCNET.git url = https://github.com/netbench/GPCNET.git
[submodule "third_party/gpu-burn"]
path = third_party/gpu-burn
url = https://github.com/wilicc/gpu-burn.git
...@@ -142,6 +142,20 @@ The supported percentiles are 50, 90, 95, 99, and 99.9. ...@@ -142,6 +142,20 @@ The supported percentiles are 50, 90, 95, 99, and 99.9.
| ort-inference/{precision}_{model}_time | time (ms) | The mean latency to execute one batch of inference. | | ort-inference/{precision}_{model}_time | time (ms) | The mean latency to execute one batch of inference. |
| ort-inference/{precision}_{model}_time_{percentile} | time (ms) | The {percentile}th percentile latency to execute one batch of inference. | | ort-inference/{precision}_{model}_time_{percentile} | time (ms) | The {percentile}th percentile latency to execute one batch of inference. |
### `gpu-burn`
#### Introduction
Multi-GPU CUDA stress test for GPU compute and memory utilization, performed by [gpu-burn](https://github.com/wilicc/gpu-burn).
Supports using double-precision data types and tensor cores.
#### Metrics
| Name | Unit | Description |
|--------------------------|------------|-------------------------------------------------------------------------------------|
| gpu-burn/time | time (s) | The runtime for gpu-burn test. |
| gpu-burn/gpu_[0-9]_pass | yes/no | The result of the gpu-burn test for each GPU (1: yes, 0: no). |
| gpu-burn/abort | yes/no | Whether the GPU-Burn test aborted before returning per-GPU results (1: yes, 0: no). |
## Communication Benchmarks ## Communication Benchmarks
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Micro benchmark example for GPU-Burn.
Commands to run:
python3 examples/benchmarks/gpu_burn_test.py
"""
from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.common.utils import logger
if __name__ == '__main__':
    # Run gpu-burn on CUDA with doubles and tensor cores enabled for 10 seconds.
    params = '--doubles --tensor_core --time 10'
    context = BenchmarkRegistry.create_benchmark_context('gpu-burn', platform=Platform.CUDA, parameters=params)

    benchmark = BenchmarkRegistry.launch_benchmark(context)
    if benchmark:
        message = 'benchmark: {}, return code: {}, result: {}'.format(
            benchmark.name, benchmark.return_code, benchmark.result
        )
        logger.info(message)
...@@ -17,6 +17,7 @@ ...@@ -17,6 +17,7 @@
from superbench.benchmarks.micro_benchmarks.cpu_memory_bw_latency_performance import CpuMemBwLatencyBenchmark from superbench.benchmarks.micro_benchmarks.cpu_memory_bw_latency_performance import CpuMemBwLatencyBenchmark
from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark from superbench.benchmarks.micro_benchmarks.gpcnet_performance import GPCNetBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark from superbench.benchmarks.micro_benchmarks.gpu_copy_bw_performance import GpuCopyBwBenchmark
from superbench.benchmarks.micro_benchmarks.gpu_burn_test import GpuBurnBenchmark
from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoopbackBenchmark from superbench.benchmarks.micro_benchmarks.ib_loopback_performance import IBLoopbackBenchmark
from superbench.benchmarks.micro_benchmarks.ib_validation_performance import IBBenchmark from superbench.benchmarks.micro_benchmarks.ib_validation_performance import IBBenchmark
from superbench.benchmarks.micro_benchmarks.kernel_launch_overhead import KernelLaunch from superbench.benchmarks.micro_benchmarks.kernel_launch_overhead import KernelLaunch
...@@ -39,6 +40,7 @@ ...@@ -39,6 +40,7 @@
'GPCNetBenchmark', 'GPCNetBenchmark',
'GemmFlopsBenchmark', 'GemmFlopsBenchmark',
'GpuCopyBwBenchmark', 'GpuCopyBwBenchmark',
'GpuBurnBenchmark',
'IBBenchmark', 'IBBenchmark',
'IBLoopbackBenchmark', 'IBLoopbackBenchmark',
'KernelLaunch', 'KernelLaunch',
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the GPU-Burn Test."""
import os
from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
class GpuBurnBenchmark(MicroBenchmarkWithInvoke):
    """The GPU Burn Test benchmark class.

    Wraps the third-party gpu_burn stress tool: builds the invocation command,
    runs it, and parses the per-GPU OK/fault summary into benchmark results.
    """
    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'gpu_burn'

    def add_parser_arguments(self):
        """Add the specified arguments."""
        super().add_parser_arguments()

        self._parser.add_argument(
            '--doubles',
            action='store_true',
            default=False,
            help='Use doubles for the data type used in GPU-Burn',
        )
        self._parser.add_argument(
            '--tensor_core',
            action='store_true',
            default=False,
            help='Use tensor cores in GPU-Burn',
        )
        self._parser.add_argument(
            '--time',
            type=int,
            default=10,
            help='Length of time to run GPU-Burn for (in seconds)',
        )

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        if not self._set_binary_path():
            return False

        command = os.path.join(self._args.bin_dir, self._bin_name)
        if self._args.doubles:
            command += ' -d'
        if self._args.tensor_core:
            command += ' -tc'
        command += ' {} '.format(self._args.time)
        # gpu_burn loads compare.ptx from the current working directory,
        # so copy it next to the run and remove it again afterwards.
        compare_copy = 'cp ' + self._args.bin_dir + '/compare.ptx ./'
        compare_rm = 'rm ' + 'compare.ptx'
        self._commands.append(compare_copy + ' && ' + command + ' && ' + compare_rm)

        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        content = raw_output.splitlines()
        gpu_res = []
        abort = False
        failure_msg = 'unknown failure'
        index = -1
        try:
            for idx, line in enumerate(content):
                # Known fatal messages emitted by gpu_burn before any results.
                if ('No clients are alive!' in line or "Couldn't init a GPU" in line
                        or 'Failure during compute' in line or 'Low mem for result' in line):
                    abort = True
                    failure_msg = line
                    break
                if 'done' in line:
                    index = idx
                    break

            if abort:
                self._result.add_raw_data('GPU Burn Failure: ', failure_msg)
                self._result.add_result('abort', 1)
                return False

            if index < 0:
                # No 'done' marker and no known failure message: output is malformed
                # (also covers completely empty output). Raise a real exception —
                # the original code raised a str, which is a TypeError in Python 3.
                raise ValueError('The result format invalid')

            # The per-GPU summary starts two lines after the 'done' marker.
            for line in content[index + 2:]:
                if 'Tested' in line:
                    continue
                if 'GPU' in line:
                    gpu_res.append(line.strip('\n').strip('\t'))

            self._result.add_result('time', self._args.time)
            for res in gpu_res:
                # Lines look like 'GPU 0: OK' / 'GPU 0: FAULTY'; metric name becomes gpu_N_pass.
                passed = 1 if 'OK' in res else 0
                self._result.add_result(res.split(':')[0].replace(' ', '_').lower() + '_pass', passed)
                self._result.add_raw_data('GPU-Burn_result', res)
        except Exception as e:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}, message: {}.'.format(
                    self._curr_run_index, self._name, raw_output, str(e)
                )
            )
            self._result.add_result('abort', 1)
            return False

        self._result.add_result('abort', 0)
        return True


BenchmarkRegistry.register_benchmark('gpu-burn', GpuBurnBenchmark, platform=Platform.CUDA)
...@@ -39,6 +39,16 @@ superbench: ...@@ -39,6 +39,16 @@ superbench:
<<: *default_local_mode <<: *default_local_mode
gemm-flops: gemm-flops:
<<: *default_local_mode <<: *default_local_mode
gpu-burn:
enable: false
modes:
- name: local
proc_num: 1
parallel: no
parameters:
time: 900
doubles: true
tensor_core: true
nccl-bw:default: nccl-bw:default:
enable: true enable: true
modes: modes:
......
...@@ -33,6 +33,16 @@ superbench: ...@@ -33,6 +33,16 @@ superbench:
model_action: model_action:
- train - train
benchmarks: benchmarks:
gpu-burn:
enable: true
modes:
- name: local
proc_num: 1
parallel: no
parameters:
time: 300
doubles: true
tensor_core: true
nccl-bw:default: nccl-bw:default:
enable: true enable: true
modes: modes:
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for gpu-burn benchmark."""
import unittest
from tests.helper import decorator
from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
class GpuBurnBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
    """Test class for gpu-burn benchmark."""
    @classmethod
    def setUpClass(cls):
        """Hook method for setting up class fixture before running tests in the class."""
        super().setUpClass()
        cls.createMockEnvs(cls)
        cls.createMockFiles(cls, ['bin/gpu_burn'])

    @decorator.load_data('tests/data/gpu_burn.log')
    def test_gpu_burn(self, results):
        """Test gpu-burn benchmark command generation."""
        benchmark_name = 'gpu-burn'
        (benchmark_class,
         predefine_params) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.CUDA)
        assert (benchmark_class)

        run_seconds = 10
        benchmark = benchmark_class(benchmark_name, parameters='--doubles --tensor_core --time ' + str(run_seconds))
        assert (benchmark)

        # Preprocessing must succeed and leave the benchmark in a healthy state.
        assert (benchmark._preprocess() is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        assert (benchmark.name == benchmark_name)
        assert (benchmark.type == BenchmarkType.MICRO)

        # Parameters specified in BenchmarkContext must be parsed back correctly.
        assert (benchmark._args.time == run_seconds)
        assert (benchmark._args.doubles)
        assert (benchmark._args.tensor_core)

        # The generated command should copy compare.ptx in, run with the
        # requested flags, then remove compare.ptx again.
        assert (1 == len(benchmark._commands))
        command = benchmark._commands[0]
        assert (command.startswith('cp ' + benchmark._args.bin_dir + '/compare.ptx ./'))
        for fragment in ('-d', '-tc', str(run_seconds), 'rm ' + 'compare.ptx'):
            assert (fragment in command)

        # Parsing the canned log should report all 8 GPUs passing and no abort.
        assert (benchmark._process_raw_result(0, results))
        assert (benchmark.result['return_code'][0] == 0)
        assert (benchmark.result['time'][0] == run_seconds)
        for device in range(8):
            assert (benchmark.result['gpu_{}_pass'.format(device)][0] == 1)
        assert (benchmark.result['abort'][0] == 0)
This diff is collapsed.
...@@ -10,11 +10,11 @@ RCCL_HOME ?= /opt/rocm/rccl ...@@ -10,11 +10,11 @@ RCCL_HOME ?= /opt/rocm/rccl
ROCM_VERSION ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3) ROCM_VERSION ?= rocm-$(shell dpkg -l | grep 'rocm-dev ' | awk '{print $$3}' | cut -d '.' -f1-3)
HPCX_HOME ?= /opt/hpcx HPCX_HOME ?= /opt/hpcx
.PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet .PHONY: all cuda rocm common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest rocm_perftest fio rocm_rccl_tests rocm_rocblas rocm_bandwidthTest gpcnet cuda_gpuburn
# Build all targets. # Build all targets.
all: cuda rocm all: cuda rocm
cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda: common cuda_cutlass cuda_bandwidthTest cuda_nccl_tests cuda_perftest gpcnet cuda_gpuburn
rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest rocm: common rocm_perftest rocm_rccl_tests rocm_rocblas rocm_bandwidthTest
common: fio common: fio
...@@ -98,3 +98,11 @@ gpcnet: sb_micro_path ...@@ -98,3 +98,11 @@ gpcnet: sb_micro_path
bash -c "source ${HPCX_HOME}/hpcx-init.sh && hpcx_load && make CC=mpicc -C GPCNET all && hpcx_unload" bash -c "source ${HPCX_HOME}/hpcx-init.sh && hpcx_load && make CC=mpicc -C GPCNET all && hpcx_unload"
cp -v ./GPCNET/network_test $(SB_MICRO_PATH)/bin/ cp -v ./GPCNET/network_test $(SB_MICRO_PATH)/bin/
cp -v ./GPCNET/network_load_test $(SB_MICRO_PATH)/bin/ cp -v ./GPCNET/network_load_test $(SB_MICRO_PATH)/bin/
# Build gpu-burn from the submodule's main branch (the only branch upstream
# publishes) and install both the binary and the compare.ptx kernel it loads
# at runtime. Skipped when the submodule is not checked out.
cuda_gpuburn: sb_micro_path
ifneq (,$(wildcard gpu-burn/Makefile))
	cd ./gpu-burn && make
	cp -v ./gpu-burn/gpu_burn $(SB_MICRO_PATH)/bin/
	cp -v ./gpu-burn/compare.ptx $(SB_MICRO_PATH)/bin/
endif
Subproject commit cab8221b1147e83dd1fea3e42c3fe255254236ff
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment