Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
800b962a
Unverified
Commit
800b962a
authored
Apr 20, 2026
by
one
Committed by
GitHub
Apr 20, 2026
Browse files
Update mem-bw to use BandwidthTest (#5)
* Update mem-bw to use BandwidthTest * Update config and format code
parent
9ca5e7a9
Changes
8
Hide whitespace changes
Inline
Side-by-side
Showing
8 changed files
with
215 additions
and
2 deletions
+215
-2
superbench/benchmarks/micro_benchmarks/__init__.py
superbench/benchmarks/micro_benchmarks/__init__.py
+2
-0
superbench/benchmarks/micro_benchmarks/dtk_memory_bw_performance.py
.../benchmarks/micro_benchmarks/dtk_memory_bw_performance.py
+92
-0
superbench/benchmarks/micro_benchmarks/rocm_memory_bw_performance.py
...benchmarks/micro_benchmarks/rocm_memory_bw_performance.py
+0
-1
superbench/config/hygon_bw1000.yaml
superbench/config/hygon_bw1000.yaml
+1
-1
tests/benchmarks/micro_benchmarks/test_dtk_memory_bw_performance.py
...hmarks/micro_benchmarks/test_dtk_memory_bw_performance.py
+86
-0
tests/data/dtk_memory_d2d_bw.log
tests/data/dtk_memory_d2d_bw.log
+11
-0
tests/data/dtk_memory_d2h_bw.log
tests/data/dtk_memory_d2h_bw.log
+11
-0
tests/data/dtk_memory_h2d_bw.log
tests/data/dtk_memory_h2d_bw.log
+12
-0
No files found.
superbench/benchmarks/micro_benchmarks/__init__.py
View file @
800b962a
...
...
@@ -14,6 +14,7 @@
from
superbench.benchmarks.micro_benchmarks.cublaslt_function
import
CublasLtBenchmark
from
superbench.benchmarks.micro_benchmarks.rocm_hipblaslt_function
import
RocmHipBlasLtBenchmark
from
superbench.benchmarks.micro_benchmarks.dtk_hipblaslt_function
import
DtkHipBlasLtBenchmark
from
superbench.benchmarks.micro_benchmarks.dtk_memory_bw_performance
import
DtkMemBwBenchmark
from
superbench.benchmarks.micro_benchmarks.dtk_gemm_flops_performance
import
DtkGemmFlopsBenchmark
from
superbench.benchmarks.micro_benchmarks.dtk_hpcg_performance
import
DtkHpcgBenchmark
from
superbench.benchmarks.micro_benchmarks.cuda_gemm_flops_performance
import
CudaGemmFlopsBenchmark
...
...
@@ -61,6 +62,7 @@
'DtkGemmFlopsBenchmark'
,
'RocmHipBlasLtBenchmark'
,
'DtkHipBlasLtBenchmark'
,
'DtkMemBwBenchmark'
,
'GPCNetBenchmark'
,
'GemmFlopsBenchmark'
,
'GpuBurnBenchmark'
,
...
...
superbench/benchmarks/micro_benchmarks/dtk_memory_bw_performance.py
0 → 100644
View file @
800b962a
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT license.
"""Module of the DTK memory performance benchmarks."""
import
os
import
re
from
superbench.common.utils
import
logger
from
superbench.benchmarks
import
BenchmarkRegistry
,
Platform
from
superbench.benchmarks.micro_benchmarks
import
MemBwBenchmark
class DtkMemBwBenchmark(MemBwBenchmark):
    """The DTK memory performance benchmark class.

    Builds one ``BandwidthTest`` command per requested memory copy type and
    parses the table printed by the tool into a single bandwidth value per type.
    """

    # Numeric token accepted in a result row (optional sign, fraction and exponent).
    _NUMBER_PATTERN = r'[-+]?\d+(?:\.\d+)?(?:[eE][-+]?\d+)?'
    # A result row is "<size> <unit>" followed by six numeric columns.
    # Compiled once at class creation instead of on every _process_raw_result() call.
    _ROW_PATTERN = re.compile(
        r'^\s*\d+(?:\.\d+)?\s*(?:B|KB|MB|GB)\s+'
        r'({number})\s+({number})\s+({number})\s+({number})\s+({number})\s+({number})\s*$'.format(
            number=_NUMBER_PATTERN
        ),
        re.IGNORECASE,
    )

    def __init__(self, name, parameters=''):
        """Constructor.

        Args:
            name (str): benchmark name.
            parameters (str): benchmark parameters.
        """
        super().__init__(name, parameters)

        self._bin_name = 'BandwidthTest'
        # Values passed to the '--type' option for each memory copy type.
        self._type_map = {'htod': 0, 'dtoh': 1, 'dtod': 2}
        # Values passed to the '--mode' option for each host memory kind.
        self._mode_map = {'pinned': 0, 'unpinned': 1}

    def _preprocess(self):
        """Preprocess/preparation operations before the benchmarking.

        Appends one command line to self._commands for every entry in self._args.mem_type.

        Return:
            True if _preprocess() succeeds.
        """
        if not super()._preprocess():
            return False

        # SuperBench runs one process per visible GPU. Select index 0 inside that visibility mask.
        for mem_type in self._args.mem_type:
            command = os.path.join(self._args.bin_dir, self._bin_name)
            command += ' --type {} --index 0'.format(self._type_map[mem_type])
            # The '--mode' option is only appended for copies that involve host memory.
            if mem_type != 'dtod':
                command += ' --mode {}'.format(self._mode_map[self._args.memory])
            self._commands.append(command)

        return True

    def _process_raw_result(self, cmd_idx, raw_output):
        """Function to parse raw results and save the summarized results.

        self._result.add_raw_data() and self._result.add_result() need to be called to save the results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
            True if the raw output string is valid and result can be extracted.
        """
        self._result.add_raw_data('raw_output_' + self._args.mem_type[cmd_idx], raw_output, self._args.log_raw_data)

        mem_bw = -1
        valid = True
        metric = None
        try:
            metric = self._metrics[self._mem_types.index(self._args.mem_type[cmd_idx])]
            for line in raw_output.splitlines():
                match = self._ROW_PATTERN.match(line)
                if match:
                    # Group 2 is the second numeric column of the row; keep the largest value seen.
                    mem_bw = max(mem_bw, float(match.group(2)))
        except Exception:
            # Only ordinary errors mark the output as invalid. KeyboardInterrupt and
            # SystemExit are deliberately not caught so the run can still be aborted.
            valid = False

        # mem_bw is still -1 when no line of the output matched the row pattern.
        if not valid or mem_bw == -1:
            logger.error(
                'The result format is invalid - round: {}, benchmark: {}, raw output: {}.'.format(
                    self._curr_run_index, self._name, raw_output
                )
            )
            return False

        self._result.add_result(metric, mem_bw)
        return True
# Register DtkMemBwBenchmark as the 'mem-bw' benchmark implementation for the DTK platform.
BenchmarkRegistry.register_benchmark('mem-bw', DtkMemBwBenchmark, platform=Platform.DTK)
superbench/benchmarks/micro_benchmarks/rocm_memory_bw_performance.py
View file @
800b962a
...
...
@@ -91,4 +91,3 @@ def _process_raw_result(self, cmd_idx, raw_output):
BenchmarkRegistry
.
register_benchmark
(
'mem-bw'
,
RocmMemBwBenchmark
,
platform
=
Platform
.
ROCM
)
BenchmarkRegistry
.
register_benchmark
(
'mem-bw'
,
RocmMemBwBenchmark
,
platform
=
Platform
.
DTK
)
superbench/config/hygon_bw1000.yaml
View file @
800b962a
...
...
@@ -282,7 +282,7 @@ superbench:
modes
:
-
name
:
local
proc_num
:
8
prefix
:
HIP_VISIBLE_DEVICES={proc_rank}
numactl -N $(({proc_rank}/4))
prefix
:
HIP_VISIBLE_DEVICES={proc_rank}
parallel
:
no
ib-loopback
:
enable
:
false
...
...
tests/benchmarks/micro_benchmarks/test_dtk_memory_bw_performance.py
0 → 100644
View file @
800b962a
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for DTK mem-bw benchmark."""
import
numbers
import
unittest
from
tests.helper
import
decorator
from
tests.helper.testcase
import
BenchmarkTestCase
from
superbench.benchmarks
import
BenchmarkRegistry
,
BenchmarkType
,
ReturnCode
,
Platform
class DtkMemBwTest(BenchmarkTestCase, unittest.TestCase):
    """Unit tests for the DTK variant of the mem-bw micro-benchmark."""

    @classmethod
    def setUpClass(cls):
        """Create the mocked environment and the mocked BandwidthTest binary for this class."""
        super().setUpClass()
        cls.createMockEnvs(cls)
        cls.createMockFiles(cls, ['bin/BandwidthTest'])

    @staticmethod
    def _assert_commands(benchmark, expected_command):
        """Compare generated commands with the expected ones, ignoring the binary directory prefix.

        Args:
            benchmark: benchmark instance whose _commands list is checked.
            expected_command (list): expected command strings starting at the binary name.
        """
        bin_name = benchmark._bin_name
        for idx, expected in enumerate(expected_command):
            # Drop everything before the binary name so the check does not depend on bin_dir.
            arguments = benchmark._commands[idx].split(bin_name)[1]
            command = bin_name + arguments
            assert (command == expected)

    @decorator.load_data('tests/data/dtk_memory_h2d_bw.log')
    @decorator.load_data('tests/data/dtk_memory_d2h_bw.log')
    @decorator.load_data('tests/data/dtk_memory_d2d_bw.log')
    def test_dtk_memory_bw_performance(self, raw_output_h2d, raw_output_d2h, raw_output_d2d):
        """Check command generation and result parsing with default parameters."""
        benchmark_name = 'mem-bw'
        selected = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.DTK)
        (benchmark_class, predefine_params) = selected
        assert (benchmark_class)

        benchmark = benchmark_class(benchmark_name)
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)

        # Basic information of the benchmark instance.
        assert (benchmark)
        assert (benchmark.name == 'mem-bw')
        assert (benchmark.type == BenchmarkType.MICRO)

        # Generated command list.
        self._assert_commands(
            benchmark,
            [
                'BandwidthTest --type 0 --index 0 --mode 0',
                'BandwidthTest --type 1 --index 0 --mode 0',
                'BandwidthTest --type 2 --index 0',
            ],
        )

        # Parsed results and metrics, one raw output per command index.
        metrics = ['h2d_bw', 'd2h_bw', 'd2d_bw']
        raw_output = [raw_output_h2d, raw_output_d2h, raw_output_d2d]
        for idx, (metric, output) in enumerate(zip(metrics, raw_output)):
            assert (benchmark._process_raw_result(idx, output))
            assert (metric in benchmark.result)
            assert (len(benchmark.result[metric]) == 1)
            assert (isinstance(benchmark.result[metric][0], numbers.Number))

        assert (benchmark.result['h2d_bw'][0] == 30.274)
        assert (benchmark.result['d2h_bw'][0] == 32.058)
        assert (benchmark.result['d2d_bw'][0] == 1431.655)

    def test_dtk_memory_bw_performance_unpinned_command(self):
        """Check command generation when '--memory unpinned' is requested."""
        benchmark_name = 'mem-bw'
        selected = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(benchmark_name, Platform.DTK)
        (benchmark_class, predefine_params) = selected
        assert (benchmark_class)

        benchmark = benchmark_class(benchmark_name, parameters='--memory unpinned')
        ret = benchmark._preprocess()
        assert (ret is True)
        assert (benchmark.return_code == ReturnCode.SUCCESS)

        self._assert_commands(
            benchmark,
            [
                'BandwidthTest --type 0 --index 0 --mode 1',
                'BandwidthTest --type 1 --index 0 --mode 1',
                'BandwidthTest --type 2 --index 0',
            ],
        )
tests/data/dtk_memory_d2d_bw.log
0 → 100644
View file @
800b962a
Using event timing to calculate bandwidth
Use the following device for testing
Device: deviceId 0, PciBusID 159 Name=BW Mem=64.0GB #CUs=80 Freq=1500Mhz
===================== HIP Bandwidth Test Type: D2D =====================
Data Size Avg Time(us) Avg BW(GB/s) Min Time(us) Peak BW(GB/s) Max Time(us) Min BW(GB/s)
16 B 6.100 0.003 6.000 0.003 6.200 0.003
1 KB 6.400 0.160 6.100 0.168 6.900 0.148
1 MB 8.100 129.454 8.000 131.072 8.400 124.830
512 MB 380.000 1412.913 370.000 1451.025 390.000 1376.602
1 GB 750.000 1431.655 740.000 1451.002 760.000 1412.817
tests/data/dtk_memory_d2h_bw.log
0 → 100644
View file @
800b962a
Using event timing to calculate bandwidth
Use the following device for testing
Device: deviceId 0, PciBusID 159 Name=BW Mem=64.0GB #CUs=80 Freq=1500Mhz
===================== HIP Bandwidth Test Type: D2H with host pinned =====================
Data Size Avg Time(us) Avg BW(GB/s) Min Time(us) Peak BW(GB/s) Max Time(us) Min BW(GB/s)
16 B 10.120 0.002 10.080 0.002 10.240 0.002
1 KB 10.500 0.098 10.240 0.100 10.880 0.094
1 MB 42.000 24.967 41.500 25.267 42.900 24.444
512 MB 16800.000 31.958 16790.000 31.977 16820.000 31.920
1 GB 33500.000 32.058 33490.000 32.068 33520.000 32.039
tests/data/dtk_memory_h2d_bw.log
0 → 100644
View file @
800b962a
Using event timing to calculate bandwidth
Use the following device for testing
Device: deviceId 0, PciBusID 159 Name=BW Mem=64.0GB #CUs=80 Freq=1500Mhz
===================== HIP Bandwidth Test Type: H2D with host pinned =====================
Data Size Avg Time(us) Avg BW(GB/s) Min Time(us) Peak BW(GB/s) Max Time(us) Min BW(GB/s)
16 B 11.876 0.001 11.200 0.001 13.120 0.001
1 KB 11.591 0.088 11.040 0.093 12.960 0.079
1 MB 46.044 22.773 45.440 23.076 47.040 22.291
512 MB 17745.799 30.253 17742.546 30.259 17749.426 30.247
1 GB 35467.245 30.274 35446.537 30.292 35485.413 30.259
4 GB 142138.429 30.217 142122.604 30.220 142188.065 30.206
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment