Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
2056d7fa
Commit
2056d7fa
authored
Apr 01, 2026
by
one
Browse files
Add gpu-hpcg metrics
parent
4f69c7de
Changes
3
Show whitespace changes
Inline
Side-by-side
Showing
3 changed files
with
300 additions
and
2 deletions
+300
-2
docs/user-tutorial/benchmarks/micro-benchmarks.md
docs/user-tutorial/benchmarks/micro-benchmarks.md
+49
-0
superbench/benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
.../benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
+109
-2
tests/benchmarks/micro_benchmarks/test_dtk_hpcg_performance.py
.../benchmarks/micro_benchmarks/test_dtk_hpcg_performance.py
+142
-0
No files found.
docs/user-tutorial/benchmarks/micro-benchmarks.md
View file @
2056d7fa
...
...
@@ -187,6 +187,55 @@ Performed by [High-Performance Linpack Benchmark for Distributed-Memory Computer
| cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth. |
| cpu-hpl/time | time (s) | Time elapsed during HPL run. |
### `gpu-hpcg`
#### Introduction
Measure GPU HPCG performance using
`run_rochpcg`
, which wraps
`rocHPCG`
execution and emits rocHPCG-native summary output.
When collecting multiple MPI scales or problem sizes, use separate benchmark section names such as
`gpu-hpcg:ranks8`
and
`gpu-hpcg:ranks4`
instead of placing multiple
`modes`
under one
`gpu-hpcg`
section.
#### Metrics
| Name | Unit | Description |
|---------------------------------------|--------------------|------------------------------------------------------|
| gpu-hpcg/final_gflops | FLOPS (GFLOPS) | Final rocHPCG score. |
| gpu-hpcg/final_gflops_per_process | FLOPS (GFLOPS) | Final rocHPCG score per process. |
| gpu-hpcg/ddot_gflops | FLOPS (GFLOPS) | DDOT throughput. |
| gpu-hpcg/ddot_bandwidth | bandwidth (GB/s) | DDOT bandwidth. |
| gpu-hpcg/ddot_gflops_per_process | FLOPS (GFLOPS) | DDOT throughput per process. |
| gpu-hpcg/ddot_bandwidth_per_process | bandwidth (GB/s) | DDOT bandwidth per process. |
| gpu-hpcg/waxpby_gflops | FLOPS (GFLOPS) | WAXPBY throughput. |
| gpu-hpcg/waxpby_bandwidth | bandwidth (GB/s) | WAXPBY bandwidth. |
| gpu-hpcg/waxpby_gflops_per_process | FLOPS (GFLOPS) | WAXPBY throughput per process. |
| gpu-hpcg/waxpby_bandwidth_per_process | bandwidth (GB/s) | WAXPBY bandwidth per process. |
| gpu-hpcg/spmv_gflops | FLOPS (GFLOPS) | SpMV throughput. |
| gpu-hpcg/spmv_bandwidth | bandwidth (GB/s) | SpMV bandwidth. |
| gpu-hpcg/spmv_gflops_per_process | FLOPS (GFLOPS) | SpMV throughput per process. |
| gpu-hpcg/spmv_bandwidth_per_process | bandwidth (GB/s) | SpMV bandwidth per process. |
| gpu-hpcg/mg_gflops | FLOPS (GFLOPS) | MG throughput. |
| gpu-hpcg/mg_bandwidth | bandwidth (GB/s) | MG bandwidth. |
| gpu-hpcg/mg_gflops_per_process | FLOPS (GFLOPS) | MG throughput per process. |
| gpu-hpcg/mg_bandwidth_per_process | bandwidth (GB/s) | MG bandwidth per process. |
| gpu-hpcg/total_gflops | FLOPS (GFLOPS) | Aggregate rocHPCG throughput. |
| gpu-hpcg/total_bandwidth | bandwidth (GB/s) | Aggregate rocHPCG bandwidth. |
| gpu-hpcg/total_gflops_per_process | FLOPS (GFLOPS) | Aggregate rocHPCG throughput per process. |
| gpu-hpcg/total_bandwidth_per_process | bandwidth (GB/s) | Aggregate rocHPCG bandwidth per process. |
| gpu-hpcg/setup_time | time (s) | Setup phase duration. |
| gpu-hpcg/optimization_time | time (s) | Optimization phase duration. |
| gpu-hpcg/total_time | time (s) | Total runtime. |
| gpu-hpcg/is_valid | | Run validity: 1 if no rocHPCG invalid-run warning appears in the output, 0 otherwise. |
| gpu-hpcg/local_domain_x | | Local domain size in x dimension. |
| gpu-hpcg/local_domain_y | | Local domain size in y dimension. |
| gpu-hpcg/local_domain_z | | Local domain size in z dimension. |
| gpu-hpcg/global_domain_x | | Global domain size in x dimension. |
| gpu-hpcg/global_domain_y | | Global domain size in y dimension. |
| gpu-hpcg/global_domain_z | | Global domain size in z dimension. |
| gpu-hpcg/process_domain_x | | Process topology in x dimension. |
| gpu-hpcg/process_domain_y | | Process topology in y dimension. |
| gpu-hpcg/process_domain_z | | Process topology in z dimension. |
### `cpu-stream`
#### Introduction
...
...
superbench/benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
View file @
2056d7fa
...
...
@@ -4,12 +4,32 @@
"""Module of the GPU HPCG benchmark base class."""
import
os
import
re
from
superbench.common.utils
import
logger
from
superbench.benchmarks.micro_benchmarks
import
MicroBenchmarkWithInvoke
class
GpuHpcgBenchmark
(
MicroBenchmarkWithInvoke
):
"""The GPU HPCG benchmark base class."""
# Operation label as printed in the rocHPCG summary -> metric-name prefix.
_operation_metric_map = {
    'DDOT': 'ddot',
    'WAXPBY': 'waxpby',
    'SpMV': 'spmv',
    'MG': 'mg',
    'Total': 'total',
    'Final': 'final',
}

# Summary line of the form
#   "<op> = <GFlop/s> GFlop/s (<GB/s> GB/s) <GFlop/s> GFlop/s per process ( <GB/s> GB/s per process)"
# Groups: 1 = operation label, 2 = GFlop/s, 3 = GB/s, 4 = GFlop/s per process, 5 = GB/s per process.
_operation_pattern = re.compile(
    r'^(DDOT|WAXPBY|SpMV|MG|Total|Final)\s*=\s*'
    r'([0-9]+(?:\.[0-9]+)?)\s+GFlop/s\s+\(([0-9]+(?:\.[0-9]+)?)\s+GB/s\)\s+'
    r'([0-9]+(?:\.[0-9]+)?)\s+GFlop/s per process\s+\(\s*([0-9]+(?:\.[0-9]+)?)\s+GB/s per process\)$'
)

# Timing line, e.g. "Total Time: 7.55 sec". Groups: 1 = label, 2 = seconds.
_time_pattern = re.compile(r'^(Total Time|Setup Time|Optimization Time):\s*([0-9]+(?:\.[0-9]+)?)\s+sec$')

# Domain line, e.g. "Local domain: 560 x 280 x 280". Groups: 1 = kind, 2-4 = x, y, z.
_domain_pattern = re.compile(r'^(Local|Global|Process) domain:\s*([0-9]+)\s+x\s+([0-9]+)\s+x\s+([0-9]+)$')

# Substrings whose presence anywhere in the output marks the run as invalid.
_invalid_markers = [
    '*** WARNING *** INVALID RUN',
    '*** WARNING *** THIS IS NOT A VALID RUN ***',
]
def
__init__
(
self
,
name
,
parameters
=
''
):
"""Constructor.
...
...
@@ -131,14 +151,101 @@ def _preprocess(self):
return
True
def _process_raw_result(self, cmd_idx, raw_output):
    """Parse rocHPCG stdout and save summarized results.

    The raw output is always stored first. Each non-empty, stripped line is then
    matched against the operation, time and domain patterns. Results are only
    added when every required metric has been found.

    Args:
        cmd_idx (int): the index of command corresponding with the raw_output.
        raw_output (str): raw output string of the micro-benchmark.

    Return:
        bool: True if rocHPCG summary metrics are extracted successfully.
    """
    self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)

    # Every metric that must be present for the output to count as well-formed.
    # 'is_valid' is derived below and therefore not listed here.
    expected_metrics = {
        'final_gflops', 'final_gflops_per_process',
        'ddot_gflops', 'ddot_bandwidth', 'ddot_gflops_per_process', 'ddot_bandwidth_per_process',
        'waxpby_gflops', 'waxpby_bandwidth', 'waxpby_gflops_per_process', 'waxpby_bandwidth_per_process',
        'spmv_gflops', 'spmv_bandwidth', 'spmv_gflops_per_process', 'spmv_bandwidth_per_process',
        'mg_gflops', 'mg_bandwidth', 'mg_gflops_per_process', 'mg_bandwidth_per_process',
        'total_gflops', 'total_bandwidth', 'total_gflops_per_process', 'total_bandwidth_per_process',
        'setup_time', 'optimization_time', 'total_time',
        'local_domain_x', 'local_domain_y', 'local_domain_z',
        'global_domain_x', 'global_domain_y', 'global_domain_z',
        'process_domain_x', 'process_domain_y', 'process_domain_z',
    }

    metrics = {}
    for text in map(str.strip, raw_output.splitlines()):
        if not text:
            continue

        found = self._operation_pattern.match(text)
        if found is not None:
            label, *numbers = found.groups()
            gflops, bandwidth, gflops_pp, bandwidth_pp = map(float, numbers)
            name = self._operation_metric_map[label]
            metrics[f'{name}_gflops'] = gflops
            metrics[f'{name}_gflops_per_process'] = gflops_pp
            # The 'Final' line also prints bandwidth figures, but they are not reported.
            if name != 'final':
                metrics[f'{name}_bandwidth'] = bandwidth
                metrics[f'{name}_bandwidth_per_process'] = bandwidth_pp
            continue

        found = self._time_pattern.match(text)
        if found is not None:
            label, seconds = found.groups()
            # "Total Time" -> "total_time", etc.
            metrics[label.lower().replace(' ', '_')] = float(seconds)
            continue

        found = self._domain_pattern.match(text)
        if found is not None:
            kind = found.group(1).lower()
            for axis, extent in zip('xyz', found.groups()[1:]):
                metrics[f'{kind}_domain_{axis}'] = int(extent)

    # Validity is inferred from the whole output, not from any single line.
    run_is_invalid = any(marker in raw_output for marker in self._invalid_markers)
    metrics['is_valid'] = 0 if run_is_invalid else 1

    absent = sorted(expected_metrics.difference(metrics))
    if absent:
        logger.error(
            'The result format is invalid - round: %s, benchmark: %s, missing metrics: %s.',
            self._curr_run_index,
            self._name,
            ', '.join(absent),
        )
        return False

    for metric, value in metrics.items():
        self._result.add_result(metric, value)
    return True
tests/benchmarks/micro_benchmarks/test_dtk_hpcg_performance.py
0 → 100644
View file @
2056d7fa
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for DTK gpu-hpcg benchmark."""
import
unittest
from
types
import
SimpleNamespace
from
tests.helper.testcase
import
BenchmarkTestCase
from
superbench.benchmarks
import
BenchmarkRegistry
,
BenchmarkType
,
Platform
,
ReturnCode
from
superbench.benchmarks.result
import
BenchmarkResult
class DtkHpcgBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
    """Tests for DTK gpu-hpcg benchmark."""

    # Captured rocHPCG output, including progress lines the parser must ignore
    # and both invalid-run warning markers.
    example_raw_output = """
rocHPCG version: 0.8.8-62f1830-dirty (based on hpcg-3.1)
Setup Phase took 0.12 sec
Starting Reference CG Phase ...
Optimization Phase took 0.25 sec
Validation Testing Phase ...
Optimized CG Setup ...
HIP Initial Residual = 2.668768e+04
Total device memory usage: 19550 MByte (29152 MByte)
Starting Benchmarking Phase ...
Performing (at least) 2 CG sets in 1.0 seconds ...
CG set 1 / 2 6881.2186 GFlop/s (215.0381 GFlop/s per process) 50% 0.0 sec left
CG set 2 / 2 6904.9453 GFlop/s (215.7795 GFlop/s per process) 100% 0.0 sec left
Local domain: 560 x 280 x 280
Global domain: 2240 x 1120 x 560
Process domain: 4 x 4 x 2
Total Time: 7.55 sec
Setup Time: 0.12 sec
Optimization Time: 0.25 sec
*** WARNING *** INVALID RUN
DDOT = 5849.4 GFlop/s (46794.9 GB/s) 182.8 GFlop/s per process ( 1462.3 GB/s per process)
WAXPBY = 3052.0 GFlop/s (36623.8 GB/s) 95.4 GFlop/s per process ( 1144.5 GB/s per process)
SpMV = 5473.9 GFlop/s (34468.8 GB/s) 171.1 GFlop/s per process ( 1077.1 GB/s per process)
MG = 7716.9 GFlop/s (59557.1 GB/s) 241.2 GFlop/s per process ( 1861.2 GB/s per process)
Total = 6971.0 GFlop/s (52859.9 GB/s) 217.8 GFlop/s per process ( 1651.9 GB/s per process)
Final = 6904.9 GFlop/s (52359.0 GB/s) 215.8 GFlop/s per process ( 1636.2 GB/s per process)
*** WARNING *** THIS IS NOT A VALID RUN ***
"""

    @classmethod
    def setUpClass(cls):
        """Hook method for setting up class fixture before running tests in the class."""
        super().setUpClass()
        cls.benchmark_name = 'gpu-hpcg'
        cls.createMockEnvs(cls)
        cls.createMockFiles(cls, ['bin/run_rochpcg'])

    def get_benchmark(self):
        """Get a DTK gpu-hpcg benchmark instance prepared for direct result parsing.

        The attributes read by `_process_raw_result` (`_args`, `_curr_run_index`,
        `_result`) are set by hand so the parser can be called without running
        the benchmark.
        """
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.DTK)
        benchmark = benchmark_cls(self.benchmark_name, parameters='')
        benchmark._args = SimpleNamespace(log_raw_data=False)
        benchmark._curr_run_index = 0
        benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)
        return benchmark

    def test_dtk_hpcg_cls(self):
        """Test DTK gpu-hpcg benchmark class."""
        for platform in Platform:
            with self.subTest(platform=platform):
                (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(
                    self.benchmark_name, platform
                )
                if platform is Platform.DTK:
                    self.assertIsNotNone(benchmark_cls)
                else:
                    self.assertIsNone(benchmark_cls)

    def test_dtk_hpcg_result_parsing_with_wrapper_noise(self):
        """Test DTK gpu-hpcg result parsing with wrapper noise."""
        benchmark = self.get_benchmark()

        self.assertTrue(benchmark._process_raw_result(0, self.example_raw_output))
        self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)

        expected = {
            'final_gflops': 6904.9,
            'final_gflops_per_process': 215.8,
            'ddot_gflops': 5849.4,
            'ddot_bandwidth': 46794.9,
            'ddot_gflops_per_process': 182.8,
            'ddot_bandwidth_per_process': 1462.3,
            'waxpby_gflops': 3052.0,
            'waxpby_bandwidth': 36623.8,
            'waxpby_gflops_per_process': 95.4,
            'waxpby_bandwidth_per_process': 1144.5,
            'spmv_gflops': 5473.9,
            'spmv_bandwidth': 34468.8,
            'spmv_gflops_per_process': 171.1,
            'spmv_bandwidth_per_process': 1077.1,
            'mg_gflops': 7716.9,
            'mg_bandwidth': 59557.1,
            'mg_gflops_per_process': 241.2,
            'mg_bandwidth_per_process': 1861.2,
            'total_gflops': 6971.0,
            'total_bandwidth': 52859.9,
            'total_gflops_per_process': 217.8,
            'total_bandwidth_per_process': 1651.9,
            'setup_time': 0.12,
            'optimization_time': 0.25,
            'total_time': 7.55,
            # The example output carries both invalid-run markers.
            'is_valid': 0,
            'local_domain_x': 560,
            'local_domain_y': 280,
            'local_domain_z': 280,
            'global_domain_x': 2240,
            'global_domain_y': 1120,
            'global_domain_z': 560,
            'process_domain_x': 4,
            'process_domain_y': 4,
            'process_domain_z': 2,
        }
        for metric, value in expected.items():
            with self.subTest(metric=metric):
                self.assertEqual(value, benchmark.result[metric][0])

        # The parser skips bandwidth figures on the 'Final' line.
        self.assertNotIn('final_bandwidth', benchmark.result)
        self.assertIn('raw_output_0', benchmark.raw_data)

    def test_dtk_hpcg_result_parsing_valid_by_absence_of_invalid_markers(self):
        """Test DTK gpu-hpcg valid detection by absence of invalid markers."""
        benchmark = self.get_benchmark()
        valid_output = self.example_raw_output.replace('*** WARNING *** INVALID RUN', '')
        valid_output = valid_output.replace('*** WARNING *** THIS IS NOT A VALID RUN ***', '')

        self.assertTrue(benchmark._process_raw_result(0, valid_output))
        self.assertEqual(1, benchmark.result['is_valid'][0])

    def test_dtk_hpcg_result_parsing_failure_when_required_summary_is_missing(self):
        """Test DTK gpu-hpcg parsing failure when required summary is missing."""
        benchmark = self.get_benchmark()
        # Remove the line itself rather than the line plus trailing newlines, so the
        # test does not depend on how many blank lines follow it in the fixture.
        invalid_output = self.example_raw_output.replace('Process domain: 4 x 4 x 2', '')
        # Guard against the replacement silently becoming a no-op.
        self.assertNotIn('Process domain', invalid_output)

        self.assertFalse(benchmark._process_raw_result(0, invalid_output))
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment