Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
742f203d
Commit
742f203d
authored
Apr 01, 2026
by
one
Browse files
Fix rocHPCG metric extraction
parent
b623c7e9
Changes
2
Show whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
113 additions
and
65 deletions
+113
-65
superbench/benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
.../benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
+76
-38
tests/benchmarks/micro_benchmarks/test_dtk_hpcg_performance.py
.../benchmarks/micro_benchmarks/test_dtk_hpcg_performance.py
+37
-27
No files found.
superbench/benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
View file @
742f203d
...
...
@@ -13,21 +13,15 @@
class
GpuHpcgBenchmark
(
MicroBenchmarkWithInvoke
):
"""The GPU HPCG benchmark base class."""
_operation_metric_map
=
{
'DDOT'
:
'ddot'
,
'WAXPBY'
:
'waxpby'
,
'SpMV'
:
'spmv'
,
'MG'
:
'mg'
,
'Total'
:
'total'
,
'Final'
:
'final'
,
}
_operation_pattern
=
re
.
compile
(
r
'^(DDOT|WAXPBY|SpMV|MG|Total|Final)\s*=\s*'
r
'([0-9]+(?:\.[0-9]+)?)\s+GFlop/s\s+\(([0-9]+(?:\.[0-9]+)?)\s+GB/s\)\s+'
r
'([0-9]+(?:\.[0-9]+)?)\s+GFlop/s per process\s+\(\s*([0-9]+(?:\.[0-9]+)?)\s+GB/s per process\)$'
)
_time_pattern
=
re
.
compile
(
r
'^(Total Time|Setup Time|Optimization Time):\s*([0-9]+(?:\.[0-9]+)?)\s+sec$'
)
_domain_pattern
=
re
.
compile
(
r
'^(Local|Global|Process) domain:\s*([0-9]+)\s+x\s+([0-9]+)\s+x\s+([0-9]+)$'
)
_mpi_output_prefix_pattern
=
re
.
compile
(
r
'^\[\d+,\d+\]<(?:stdout|stderr)>:\s*'
)
_operation_metric_map
=
{
'DDOT'
:
'ddot'
,
'WAXPBY'
:
'waxpby'
,
'SpMV'
:
'spmv'
,
'MG'
:
'mg'
,
'Total'
:
'total'
,
'Final'
:
'final'
}
_time_metric_map
=
{
'Total Time'
:
'total_time'
,
'Setup Time'
:
'setup_time'
,
'Optimization Time'
:
'optimization_time'
}
_domain_metric_map
=
{
'Local domain'
:
'local_domain'
,
'Global domain'
:
'global_domain'
,
'Process domain'
:
'process_domain'
}
_float_pattern
=
re
.
compile
(
r
'([0-9]+(?:\.[0-9]+)?)\s+(GFlop/s|GB/s)'
)
_dimension_pattern
=
re
.
compile
(
r
'([0-9]+)\s*x\s*([0-9]+)\s*x\s*([0-9]+)'
)
_time_value_pattern
=
re
.
compile
(
r
'([0-9]+(?:\.[0-9]+)?)\s+sec'
)
_invalid_markers
=
[
'*** WARNING *** INVALID RUN'
,
'*** WARNING *** THIS IS NOT A VALID RUN ***'
]
def
__init__
(
self
,
name
,
parameters
=
''
):
...
...
@@ -162,6 +156,11 @@ def _process_raw_result(self, cmd_idx, raw_output):
"""
self
.
_result
.
add_raw_data
(
'raw_output_'
+
str
(
cmd_idx
),
raw_output
,
self
.
_args
.
log_raw_data
)
# Under MPI only rank 0 emits the complete rocHPCG summary.
rank
=
int
(
os
.
getenv
(
'OMPI_COMM_WORLD_RANK'
,
'0'
))
if
rank
>
0
:
return
True
parsed_results
=
{}
required_metrics
=
{
'final_gflops'
,
...
...
@@ -202,36 +201,17 @@ def _process_raw_result(self, cmd_idx, raw_output):
for
raw_line
in
raw_output
.
splitlines
():
line
=
raw_line
.
strip
()
line
=
self
.
_mpi_output_prefix_pattern
.
sub
(
''
,
line
)
if
not
line
:
continue
operation_match
=
self
.
_operation_pattern
.
match
(
line
)
if
operation_match
:
prefix
=
self
.
_operation_metric_map
[
operation_match
.
group
(
1
)]
total_gflops
=
float
(
operation_match
.
group
(
2
))
total_bandwidth
=
float
(
operation_match
.
group
(
3
))
per_process_gflops
=
float
(
operation_match
.
group
(
4
))
per_process_bandwidth
=
float
(
operation_match
.
group
(
5
))
parsed_results
[
f
'
{
prefix
}
_gflops'
]
=
total_gflops
parsed_results
[
f
'
{
prefix
}
_gflops_per_process'
]
=
per_process_gflops
if
prefix
!=
'final'
:
parsed_results
[
f
'
{
prefix
}
_bandwidth'
]
=
total_bandwidth
parsed_results
[
f
'
{
prefix
}
_bandwidth_per_process'
]
=
per_process_bandwidth
if
self
.
_parse_operation_line
(
line
,
parsed_results
):
continue
time_match
=
self
.
_time_pattern
.
match
(
line
)
if
time_match
:
metric_prefix
=
time_match
.
group
(
1
).
lower
().
replace
(
' '
,
'_'
)
parsed_results
[
metric_prefix
]
=
float
(
time_match
.
group
(
2
))
if
self
.
_parse_time_line
(
line
,
parsed_results
):
continue
domain_match
=
self
.
_domain_pattern
.
match
(
line
)
if
domain_match
:
domain_prefix
=
domain_match
.
group
(
1
).
lower
()
parsed_results
[
f
'
{
domain_prefix
}
_domain_x'
]
=
int
(
domain_match
.
group
(
2
))
parsed_results
[
f
'
{
domain_prefix
}
_domain_y'
]
=
int
(
domain_match
.
group
(
3
))
parsed_results
[
f
'
{
domain_prefix
}
_domain_z'
]
=
int
(
domain_match
.
group
(
4
))
self
.
_parse_domain_line
(
line
,
parsed_results
)
parsed_results
[
'is_valid'
]
=
0
if
any
(
marker
in
raw_output
for
marker
in
self
.
_invalid_markers
)
else
1
...
...
@@ -249,3 +229,61 @@ def _process_raw_result(self, cmd_idx, raw_output):
self
.
_result
.
add_result
(
metric
,
value
)
return
True
def _parse_operation_line(self, line, parsed_results):
    """Parse one rocHPCG operation summary line.

    The text before the first ``=`` must be exactly one of the labels in
    ``self._operation_metric_map`` (surrounding whitespace is ignored), so a
    line that merely begins with an operation name, e.g. ``Totals = ...``,
    is rejected instead of being recorded under that operation.

    Args:
        line (str): One line of benchmark output.
        parsed_results (dict): Metric dictionary, updated in place on success.

    Returns:
        bool: True if the line carried a known operation label together with
            at least two GFlop/s and two GB/s values, False otherwise.
    """
    label, separator, values = line.partition('=')
    prefix = self._operation_metric_map.get(label.strip())
    if not separator or prefix is None:
        return False

    # Split the "<number> <unit>" pairs by unit, keeping their order on the line.
    gflops_values = []
    bandwidth_values = []
    for value, unit in self._float_pattern.findall(values):
        if unit == 'GFlop/s':
            gflops_values.append(float(value))
        elif unit == 'GB/s':
            bandwidth_values.append(float(value))

    # The first value of each unit is stored as the aggregate figure and the
    # second as the per-process figure; both must be present.
    if len(gflops_values) < 2 or len(bandwidth_values) < 2:
        return False

    parsed_results[f'{prefix}_gflops'] = gflops_values[0]
    parsed_results[f'{prefix}_gflops_per_process'] = gflops_values[1]
    # Bandwidth metrics are not recorded for the 'final' prefix.
    if prefix != 'final':
        parsed_results[f'{prefix}_bandwidth'] = bandwidth_values[0]
        parsed_results[f'{prefix}_bandwidth_per_process'] = bandwidth_values[1]
    return True
def _parse_time_line(self, line, parsed_results):
    """Extract a timing metric from one rocHPCG time summary line.

    Args:
        line (str): One line of benchmark output.
        parsed_results (dict): Metric dictionary, updated in place on success.

    Returns:
        bool: True if the line starts with ``<label>:`` for a label in
            ``self._time_metric_map`` and ``self._time_value_pattern`` finds a
            value in it, False otherwise.
    """
    # Pick the first known label that opens the line, if any.
    matched_label = next(
        (name for name in self._time_metric_map if line.startswith(name + ':')),
        None,
    )
    if matched_label is None:
        return False

    found = self._time_value_pattern.search(line)
    if found is None:
        return False

    parsed_results[self._time_metric_map[matched_label]] = float(found.group(1))
    return True
def _parse_domain_line(self, line, parsed_results):
    """Extract the three dimensions from one rocHPCG domain summary line.

    Args:
        line (str): One line of benchmark output.
        parsed_results (dict): Metric dictionary, updated in place on success.

    Returns:
        bool: True if the line starts with ``<label>:`` for a label in
            ``self._domain_metric_map`` and ``self._dimension_pattern`` finds
            the dimensions in it, False otherwise.
    """
    # Pick the first known label that opens the line, if any.
    matched_label = next(
        (name for name in self._domain_metric_map if line.startswith(name + ':')),
        None,
    )
    if matched_label is None:
        return False

    dims = self._dimension_pattern.search(line)
    if dims is None:
        return False

    # Capture groups 1..3 become the _x, _y and _z metrics, in that order.
    metric_prefix = self._domain_metric_map[matched_label]
    for index, axis in enumerate('xyz', start=1):
        parsed_results[f'{metric_prefix}_{axis}'] = int(dims.group(index))
    return True
tests/benchmarks/micro_benchmarks/test_dtk_hpcg_performance.py
View file @
742f203d
...
...
@@ -3,8 +3,10 @@
"""Tests for DTK gpu-hpcg benchmark."""
import
os
import
unittest
from
types
import
SimpleNamespace
from
unittest.mock
import
patch
from
tests.helper.testcase
import
BenchmarkTestCase
from
superbench.benchmarks
import
BenchmarkRegistry
,
BenchmarkType
,
Platform
,
ReturnCode
...
...
@@ -15,47 +17,47 @@ class DtkHpcgBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
"""Tests for DTK gpu-hpcg benchmark."""
example_raw_output
=
"""
rocHPCG version: 0.8.8-62f1830-dirty (based on hpcg-3.1)
[1,0]<stdout>:
rocHPCG version: 0.8.8-62f1830-dirty (based on hpcg-3.1)
Setup Phase took 0.12 sec
[1,0]<stdout>:
Setup Phase took 0.12 sec
Starting Reference CG Phase ...
[1,0]<stdout>:
Starting Reference CG Phase ...
Optimization Phase took 0.25 sec
[1,0]<stdout>:
Optimization Phase took 0.25 sec
Validation Testing Phase ...
[1,0]<stdout>:
Validation Testing Phase ...
Optimized CG Setup ...
[1,0]<stdout>:
Optimized CG Setup ...
HIP Initial Residual = 2.668768e+04
[1,0]<stdout>:
HIP Initial Residual = 2.668768e+04
Total device memory usage: 19550 MByte (29152 MByte)
[1,0]<stdout>:
Total device memory usage: 19550 MByte (29152 MByte)
Starting Benchmarking Phase ...
[1,0]<stdout>:
Starting Benchmarking Phase ...
Performing (at least) 2 CG sets in 1.0 seconds ...
CG set 1 / 2 6881.2186 GFlop/s (215.0381 GFlop/s per process) 50% 0.0 sec left
CG set 2 / 2 6904.9453 GFlop/s (215.7795 GFlop/s per process) 100% 0.0 sec left
[1,0]<stdout>:
Performing (at least) 2 CG sets in 1.0 seconds ...
[1,0]<stdout>:
CG set 1 / 2 6881.2186 GFlop/s (215.0381 GFlop/s per process) 50% 0.0 sec left
[1,0]<stdout>:
CG set 2 / 2 6904.9453 GFlop/s (215.7795 GFlop/s per process) 100% 0.0 sec left
Local domain: 560 x 280 x 280
Global domain: 2240 x 1120 x 560
Process domain: 4 x 4 x 2
[1,0]<stdout>:
Local domain: 560 x 280 x 280
[1,0]<stdout>:
Global domain: 2240 x 1120 x 560
[1,0]<stdout>:
Process domain: 4 x 4 x 2
Total Time: 7.55 sec
Setup Time: 0.12 sec
Optimization Time: 0.25 sec
[1,0]<stdout>:
Total Time: 7.55 sec
[1,0]<stdout>:
Setup Time: 0.12 sec
[1,0]<stdout>:
Optimization Time: 0.25 sec
*** WARNING *** INVALID RUN
[1,0]<stdout>:
*** WARNING *** INVALID RUN
DDOT = 5849.4 GFlop/s (46794.9 GB/s) 182.8 GFlop/s per process ( 1462.3 GB/s per process)
WAXPBY = 3052.0 GFlop/s (36623.8 GB/s) 95.4 GFlop/s per process ( 1144.5 GB/s per process)
SpMV = 5473.9 GFlop/s (34468.8 GB/s) 171.1 GFlop/s per process ( 1077.1 GB/s per process)
MG = 7716.9 GFlop/s (59557.1 GB/s) 241.2 GFlop/s per process ( 1861.2 GB/s per process)
Total = 6971.0 GFlop/s (52859.9 GB/s) 217.8 GFlop/s per process ( 1651.9 GB/s per process)
Final = 6904.9 GFlop/s (52359.0 GB/s) 215.8 GFlop/s per process ( 1636.2 GB/s per process)
[1,0]<stdout>:
DDOT = 5849.4 GFlop/s (
46794.9 GB/s) 182.8 GFlop/s per process ( 1462.3 GB/s per process)
[1,0]<stdout>:
WAXPBY = 3052.0 GFlop/s (
36623.8 GB/s) 95.4 GFlop/s per process ( 1144.5 GB/s per process)
[1,0]<stdout>:
SpMV = 5473.9 GFlop/s (
34468.8 GB/s) 171.1 GFlop/s per process ( 1077.1 GB/s per process)
[1,0]<stdout>:
MG = 7716.9 GFlop/s (
59557.1 GB/s) 241.2 GFlop/s per process ( 1861.2 GB/s per process)
[1,0]<stdout>:
Total = 6971.0 GFlop/s (
52859.9 GB/s) 217.8 GFlop/s per process ( 1651.9 GB/s per process)
[1,0]<stdout>:
Final = 6904.9 GFlop/s (
52359.0 GB/s) 215.8 GFlop/s per process ( 1636.2 GB/s per process)
*** WARNING *** THIS IS NOT A VALID RUN ***
[1,0]<stdout>:
*** WARNING *** THIS IS NOT A VALID RUN ***
"""
@
classmethod
...
...
@@ -135,8 +137,16 @@ def test_dtk_hpcg_result_parsing_failure_when_required_summary_is_missing(self):
"""Test DTK gpu-hpcg parsing failure when required summary is missing."""
benchmark
=
self
.
get_benchmark
()
invalid_output
=
self
.
example_raw_output
.
replace
(
'Process domain: 4 x 4 x 2
\n\n
'
,
'
[1,0]<stdout>:
Process domain: 4 x 4 x 2
\n\n
'
,
''
,
)
self
.
assertFalse
(
benchmark
.
_process_raw_result
(
0
,
invalid_output
))
def test_dtk_hpcg_result_parsing_ignores_non_root_mpi_rank(self):
    """Check that result processing succeeds on a non-root MPI rank.

    The output handed to the parser holds a single node-binding line and no
    summary; OMPI_COMM_WORLD_RANK is patched to '2' for the call.
    """
    benchmark = self.get_benchmark()
    non_root_env = {'OMPI_COMM_WORLD_RANK': '2'}
    non_root_output = '[1,2]<stdout>: [2]: Node Binding: Process 2 GPU: 2, NUMA: 0'

    # Only the parser call needs the patched environment.
    with patch.dict(os.environ, non_root_env):
        processed = benchmark._process_raw_result(0, non_root_output)

    self.assertTrue(processed)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment