Unverified Commit 800b962a authored by one's avatar one Committed by GitHub
Browse files

Update mem-bw to use BandwidthTest (#5)

* Update mem-bw to use BandwidthTest

* Update config and format code
parent 9ca5e7a9
......@@ -14,6 +14,7 @@
from superbench.benchmarks.micro_benchmarks.cublaslt_function import CublasLtBenchmark
from superbench.benchmarks.micro_benchmarks.rocm_hipblaslt_function import RocmHipBlasLtBenchmark
from superbench.benchmarks.micro_benchmarks.dtk_hipblaslt_function import DtkHipBlasLtBenchmark
from superbench.benchmarks.micro_benchmarks.dtk_memory_bw_performance import DtkMemBwBenchmark
from superbench.benchmarks.micro_benchmarks.dtk_gemm_flops_performance import DtkGemmFlopsBenchmark
from superbench.benchmarks.micro_benchmarks.dtk_hpcg_performance import DtkHpcgBenchmark
from superbench.benchmarks.micro_benchmarks.cuda_gemm_flops_performance import CudaGemmFlopsBenchmark
......@@ -61,6 +62,7 @@
'DtkGemmFlopsBenchmark',
'RocmHipBlasLtBenchmark',
'DtkHipBlasLtBenchmark',
'DtkMemBwBenchmark',
'GPCNetBenchmark',
'GemmFlopsBenchmark',
'GpuBurnBenchmark',
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the DTK memory performance benchmarks."""
import os
import re
from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, Platform
from superbench.benchmarks.micro_benchmarks import MemBwBenchmark
class DtkMemBwBenchmark(MemBwBenchmark):
    """The DTK memory performance benchmark class.

    Builds one ``BandwidthTest`` command per requested memory type and, for each
    command, reports the largest value found in the second numeric column of the
    tool's result table.
    """

    # Numeric token: optional sign, digits, optional fraction, optional exponent.
    _NUMBER = r'[-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?'
    # One result row: "<size> <unit>" followed by exactly six numeric columns.
    # Compiled once at class creation instead of on every parsed output.
    _ROW_PATTERN = re.compile(
        r'^\s*\d+(?:\.\d+)?\s*(?:B|KB|MB|GB)\s+'
        r'({number})\s+({number})\s+({number})\s+({number})\s+({number})\s+({number})\s*$'.format(number=_NUMBER),
        re.IGNORECASE,
    )

    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'BandwidthTest'
        # Values passed to the binary's --type and --mode options.
        self._type_map = {'htod': 0, 'dtoh': 1, 'dtod': 2}
        self._mode_map = {'pinned': 0, 'unpinned': 1}

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Appends one command to self._commands for every entry of self._args.mem_type.

        Return:
            True if _preprocess() succeed.
        """
        if not super()._preprocess():
            return False

        # SuperBench runs one process per visible GPU. Select index 0 inside that visibility mask.
        for mem_type in self._args.mem_type:
            command = os.path.join(self._args.bin_dir, self._bin_name)
            command += ' --type {} --index 0'.format(self._type_map[mem_type])
            # --mode is only added when the transfer is not device-to-device.
            if mem_type != 'dtod':
                command += ' --mode {}'.format(self._mode_map[self._args.memory])
            self._commands.append(command)

        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        self._result.add_raw_data('raw_output_' + self._args.mem_type[cmd_idx], raw_output, self._args.log_raw_data)

        mem_bw = -1
        metric = None
        valid = True
        try:
            metric = self._metrics[self._mem_types.index(self._args.mem_type[cmd_idx])]
            for line in raw_output.splitlines():
                match = self._ROW_PATTERN.match(line)
                if match:
                    # Group 2 is the second numeric column of the row; keep the largest one seen.
                    mem_bw = max(mem_bw, float(match.group(2)))
        except Exception:
            # Any ordinary failure means the output cannot be used. KeyboardInterrupt and
            # SystemExit are deliberately not caught so that they keep propagating.
            valid = False

        # mem_bw is still -1 when no line matched the row pattern.
        if not valid or mem_bw == -1:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                    self._curr_run_index, self._name, raw_output
                )
            )
            return False

        self._result.add_result(metric, mem_bw)
        return True
# Register DtkMemBwBenchmark as the 'mem-bw' benchmark implementation for the DTK platform.
BenchmarkRegistry.register_benchmark('mem-bw', DtkMemBwBenchmark, platform=Platform.DTK)
......@@ -91,4 +91,3 @@ def _process_raw_result(self, cmd_idx, raw_output):
BenchmarkRegistry.register_benchmark('mem-bw', RocmMemBwBenchmark, platform=Platform.ROCM)
BenchmarkRegistry.register_benchmark('mem-bw', RocmMemBwBenchmark, platform=Platform.DTK)
......@@ -282,7 +282,7 @@ superbench:
modes:
- name: local
proc_num: 8
prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/4))
prefix: HIP_VISIBLE_DEVICES={proc_rank}
parallel: no
ib-loopback:
enable: false
......
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for DTK mem-bw benchmark."""
import numbers
import unittest
from tests.helper import decorator
from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, ReturnCode, Platform
class DtkMemBwTest(BenchmarkTestCase, unittest.TestCase):
    """Test class for DTK mem-bw benchmark."""
    @classmethod
    def setUpClass(cls):
        """Hook method for setting up class fixture before running tests in the class."""
        super().setUpClass()
        cls.createMockEnvs(cls)
        cls.createMockFiles(cls, ['bin/BandwidthTest'])

    def _create_benchmark(self, parameters=''):
        """Select the DTK mem-bw benchmark, instantiate it and run its preprocess step.

        Args:
            parameters (str): benchmark parameters passed to the constructor.

        Return:
            The benchmark instance after a successful _preprocess().
        """
        benchmark_name = 'mem-bw'
        (benchmark_class, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.DTK)
        assert (benchmark_class)

        benchmark = benchmark_class(benchmark_name, parameters=parameters)
        assert (benchmark._preprocess() is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)
        return benchmark

    def _assert_commands(self, benchmark, expected_commands):
        """Check the generated commands, ignoring the directory prefix of the binary.

        Args:
            benchmark: benchmark instance whose _commands list is checked.
            expected_commands (list): expected commands, each starting at the binary name.
        """
        # Compare lengths first so that unexpected extra or missing commands fail the test.
        assert (len(benchmark._commands) == len(expected_commands))
        for full_command, expected in zip(benchmark._commands, expected_commands):
            command = benchmark._bin_name + full_command.split(benchmark._bin_name)[1]
            assert (command == expected)

    @decorator.load_data('tests/data/dtk_memory_h2d_bw.log')
    @decorator.load_data('tests/data/dtk_memory_d2h_bw.log')
    @decorator.load_data('tests/data/dtk_memory_d2d_bw.log')
    def test_dtk_memory_bw_performance(self, raw_output_h2d, raw_output_d2h, raw_output_d2d):
        """Test DTK mem-bw benchmark."""
        benchmark = self._create_benchmark()

        # Check basic information.
        assert (benchmark)
        assert (benchmark.name == 'mem-bw')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Check command list.
        self._assert_commands(
            benchmark,
            [
                'BandwidthTest --type 0 --index 0 --mode 0',
                'BandwidthTest --type 1 --index 0 --mode 0',
                'BandwidthTest --type 2 --index 0',
            ],
        )

        # Check results and metrics.
        raw_output = [raw_output_h2d, raw_output_d2h, raw_output_d2d]
        for i, metric in enumerate(['h2d_bw', 'd2h_bw', 'd2d_bw']):
            assert (benchmark._process_raw_result(i, raw_output[i]))
            assert (metric in benchmark.result)
            assert (len(benchmark.result[metric]) == 1)
            assert (isinstance(benchmark.result[metric][0], numbers.Number))

        assert (benchmark.result['h2d_bw'][0] == 30.274)
        assert (benchmark.result['d2h_bw'][0] == 32.058)
        assert (benchmark.result['d2d_bw'][0] == 1431.655)

    def test_dtk_memory_bw_performance_unpinned_command(self):
        """Test DTK mem-bw unpinned command generation."""
        benchmark = self._create_benchmark(parameters='--memory unpinned')

        self._assert_commands(
            benchmark,
            [
                'BandwidthTest --type 0 --index 0 --mode 1',
                'BandwidthTest --type 1 --index 0 --mode 1',
                'BandwidthTest --type 2 --index 0',
            ],
        )
Using event timing to calculate bandwidth
Use the following device for testing
Device: deviceId 0, PciBusID 159 Name=BW Mem=64.0GB #CUs=80 Freq=1500Mhz
===================== HIP Bandwidth Test Type: D2D =====================
Data Size Avg Time(us) Avg BW(GB/s) Min Time(us) Peak BW(GB/s) Max Time(us) Min BW(GB/s)
16 B 6.100 0.003 6.000 0.003 6.200 0.003
1 KB 6.400 0.160 6.100 0.168 6.900 0.148
1 MB 8.100 129.454 8.000 131.072 8.400 124.830
512 MB 380.000 1412.913 370.000 1451.025 390.000 1376.602
1 GB 750.000 1431.655 740.000 1451.002 760.000 1412.817
Using event timing to calculate bandwidth
Use the following device for testing
Device: deviceId 0, PciBusID 159 Name=BW Mem=64.0GB #CUs=80 Freq=1500Mhz
===================== HIP Bandwidth Test Type: D2H with host pinned =====================
Data Size Avg Time(us) Avg BW(GB/s) Min Time(us) Peak BW(GB/s) Max Time(us) Min BW(GB/s)
16 B 10.120 0.002 10.080 0.002 10.240 0.002
1 KB 10.500 0.098 10.240 0.100 10.880 0.094
1 MB 42.000 24.967 41.500 25.267 42.900 24.444
512 MB 16800.000 31.958 16790.000 31.977 16820.000 31.920
1 GB 33500.000 32.058 33490.000 32.068 33520.000 32.039
Using event timing to calculate bandwidth
Use the following device for testing
Device: deviceId 0, PciBusID 159 Name=BW Mem=64.0GB #CUs=80 Freq=1500Mhz
===================== HIP Bandwidth Test Type: H2D with host pinned =====================
Data Size Avg Time(us) Avg BW(GB/s) Min Time(us) Peak BW(GB/s) Max Time(us) Min BW(GB/s)
16 B 11.876 0.001 11.200 0.001 13.120 0.001
1 KB 11.591 0.088 11.040 0.093 12.960 0.079
1 MB 46.044 22.773 45.440 23.076 47.040 22.291
512 MB 17745.799 30.253 17742.546 30.259 17749.426 30.247
1 GB 35467.245 30.274 35446.537 30.292 35485.413 30.259
4 GB 142138.429 30.217 142122.604 30.220 142188.065 30.206
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment