Skip to content
GitLab
Menu
Projects
Groups
Snippets
Loading...
Help
Help
Support
Community forum
Keyboard shortcuts
?
Submit feedback
Contribute to GitLab
Sign in / Register
Toggle navigation
Menu
Open sidebar
tsoc
superbenchmark
Commits
742f203d
Commit
742f203d
authored
Apr 01, 2026
by
one
Browse files
Fix rocHPCG metric extraction
parent
b623c7e9
Changes
2
Hide whitespace changes
Inline
Side-by-side
Showing
2 changed files
with
113 additions
and
65 deletions
+113
-65
superbench/benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
.../benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
+76
-38
tests/benchmarks/micro_benchmarks/test_dtk_hpcg_performance.py
.../benchmarks/micro_benchmarks/test_dtk_hpcg_performance.py
+37
-27
No files found.
superbench/benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
View file @
742f203d
...
@@ -13,21 +13,15 @@
...
@@ -13,21 +13,15 @@
class
GpuHpcgBenchmark
(
MicroBenchmarkWithInvoke
):
class
GpuHpcgBenchmark
(
MicroBenchmarkWithInvoke
):
"""The GPU HPCG benchmark base class."""
"""The GPU HPCG benchmark base class."""
_operation_metric_map
=
{
_mpi_output_prefix_pattern
=
re
.
compile
(
r
'^\[\d+,\d+\]<(?:stdout|stderr)>:\s*'
)
'DDOT'
:
'ddot'
,
_operation_metric_map
=
{
'DDOT'
:
'ddot'
,
'WAXPBY'
:
'waxpby'
,
'SpMV'
:
'spmv'
,
'MG'
:
'mg'
,
'Total'
:
'total'
,
'WAXPBY'
:
'waxpby'
,
'Final'
:
'final'
}
'SpMV'
:
'spmv'
,
_time_metric_map
=
{
'Total Time'
:
'total_time'
,
'Setup Time'
:
'setup_time'
,
'Optimization Time'
:
'optimization_time'
}
'MG'
:
'mg'
,
_domain_metric_map
=
{
'Local domain'
:
'local_domain'
,
'Global domain'
:
'global_domain'
,
'Total'
:
'total'
,
'Process domain'
:
'process_domain'
}
'Final'
:
'final'
,
_float_pattern
=
re
.
compile
(
r
'([0-9]+(?:\.[0-9]+)?)\s+(GFlop/s|GB/s)'
)
}
_dimension_pattern
=
re
.
compile
(
r
'([0-9]+)\s*x\s*([0-9]+)\s*x\s*([0-9]+)'
)
_operation_pattern
=
re
.
compile
(
_time_value_pattern
=
re
.
compile
(
r
'([0-9]+(?:\.[0-9]+)?)\s+sec'
)
r
'^(DDOT|WAXPBY|SpMV|MG|Total|Final)\s*=\s*'
r
'([0-9]+(?:\.[0-9]+)?)\s+GFlop/s\s+\(([0-9]+(?:\.[0-9]+)?)\s+GB/s\)\s+'
r
'([0-9]+(?:\.[0-9]+)?)\s+GFlop/s per process\s+\(\s*([0-9]+(?:\.[0-9]+)?)\s+GB/s per process\)$'
)
_time_pattern
=
re
.
compile
(
r
'^(Total Time|Setup Time|Optimization Time):\s*([0-9]+(?:\.[0-9]+)?)\s+sec$'
)
_domain_pattern
=
re
.
compile
(
r
'^(Local|Global|Process) domain:\s*([0-9]+)\s+x\s+([0-9]+)\s+x\s+([0-9]+)$'
)
_invalid_markers
=
[
'*** WARNING *** INVALID RUN'
,
'*** WARNING *** THIS IS NOT A VALID RUN ***'
]
_invalid_markers
=
[
'*** WARNING *** INVALID RUN'
,
'*** WARNING *** THIS IS NOT A VALID RUN ***'
]
def
__init__
(
self
,
name
,
parameters
=
''
):
def
__init__
(
self
,
name
,
parameters
=
''
):
...
@@ -162,6 +156,11 @@ def _process_raw_result(self, cmd_idx, raw_output):
...
@@ -162,6 +156,11 @@ def _process_raw_result(self, cmd_idx, raw_output):
"""
"""
self
.
_result
.
add_raw_data
(
'raw_output_'
+
str
(
cmd_idx
),
raw_output
,
self
.
_args
.
log_raw_data
)
self
.
_result
.
add_raw_data
(
'raw_output_'
+
str
(
cmd_idx
),
raw_output
,
self
.
_args
.
log_raw_data
)
# Under MPI only rank 0 emits the complete rocHPCG summary.
rank
=
int
(
os
.
getenv
(
'OMPI_COMM_WORLD_RANK'
,
'0'
))
if
rank
>
0
:
return
True
parsed_results
=
{}
parsed_results
=
{}
required_metrics
=
{
required_metrics
=
{
'final_gflops'
,
'final_gflops'
,
...
@@ -202,36 +201,17 @@ def _process_raw_result(self, cmd_idx, raw_output):
...
@@ -202,36 +201,17 @@ def _process_raw_result(self, cmd_idx, raw_output):
for
raw_line
in
raw_output
.
splitlines
():
for
raw_line
in
raw_output
.
splitlines
():
line
=
raw_line
.
strip
()
line
=
raw_line
.
strip
()
line
=
self
.
_mpi_output_prefix_pattern
.
sub
(
''
,
line
)
if
not
line
:
if
not
line
:
continue
continue
operation_match
=
self
.
_operation_pattern
.
match
(
line
)
if
self
.
_parse_operation_line
(
line
,
parsed_results
):
if
operation_match
:
prefix
=
self
.
_operation_metric_map
[
operation_match
.
group
(
1
)]
total_gflops
=
float
(
operation_match
.
group
(
2
))
total_bandwidth
=
float
(
operation_match
.
group
(
3
))
per_process_gflops
=
float
(
operation_match
.
group
(
4
))
per_process_bandwidth
=
float
(
operation_match
.
group
(
5
))
parsed_results
[
f
'
{
prefix
}
_gflops'
]
=
total_gflops
parsed_results
[
f
'
{
prefix
}
_gflops_per_process'
]
=
per_process_gflops
if
prefix
!=
'final'
:
parsed_results
[
f
'
{
prefix
}
_bandwidth'
]
=
total_bandwidth
parsed_results
[
f
'
{
prefix
}
_bandwidth_per_process'
]
=
per_process_bandwidth
continue
continue
time_match
=
self
.
_time_pattern
.
match
(
line
)
if
self
.
_parse_time_line
(
line
,
parsed_results
):
if
time_match
:
metric_prefix
=
time_match
.
group
(
1
).
lower
().
replace
(
' '
,
'_'
)
parsed_results
[
metric_prefix
]
=
float
(
time_match
.
group
(
2
))
continue
continue
domain_match
=
self
.
_domain_pattern
.
match
(
line
)
self
.
_parse_domain_line
(
line
,
parsed_results
)
if
domain_match
:
domain_prefix
=
domain_match
.
group
(
1
).
lower
()
parsed_results
[
f
'
{
domain_prefix
}
_domain_x'
]
=
int
(
domain_match
.
group
(
2
))
parsed_results
[
f
'
{
domain_prefix
}
_domain_y'
]
=
int
(
domain_match
.
group
(
3
))
parsed_results
[
f
'
{
domain_prefix
}
_domain_z'
]
=
int
(
domain_match
.
group
(
4
))
parsed_results
[
'is_valid'
]
=
0
if
any
(
marker
in
raw_output
for
marker
in
self
.
_invalid_markers
)
else
1
parsed_results
[
'is_valid'
]
=
0
if
any
(
marker
in
raw_output
for
marker
in
self
.
_invalid_markers
)
else
1
...
@@ -249,3 +229,61 @@ def _process_raw_result(self, cmd_idx, raw_output):
...
@@ -249,3 +229,61 @@ def _process_raw_result(self, cmd_idx, raw_output):
self
.
_result
.
add_result
(
metric
,
value
)
self
.
_result
.
add_result
(
metric
,
value
)
return
True
return
True
def
_parse_operation_line
(
self
,
line
,
parsed_results
):
"""Parse one rocHPCG operation summary line."""
operation_key
=
None
for
candidate
in
self
.
_operation_metric_map
:
if
line
.
startswith
(
candidate
)
and
'='
in
line
:
operation_key
=
candidate
break
if
operation_key
is
None
:
return
False
matches
=
self
.
_float_pattern
.
findall
(
line
)
if
len
(
matches
)
<
4
:
return
False
prefix
=
self
.
_operation_metric_map
[
operation_key
]
gflops_values
=
[
float
(
value
)
for
value
,
unit
in
matches
if
unit
==
'GFlop/s'
]
bandwidth_values
=
[
float
(
value
)
for
value
,
unit
in
matches
if
unit
==
'GB/s'
]
if
len
(
gflops_values
)
<
2
or
len
(
bandwidth_values
)
<
2
:
return
False
parsed_results
[
f
'
{
prefix
}
_gflops'
]
=
gflops_values
[
0
]
parsed_results
[
f
'
{
prefix
}
_gflops_per_process'
]
=
gflops_values
[
1
]
if
prefix
!=
'final'
:
parsed_results
[
f
'
{
prefix
}
_bandwidth'
]
=
bandwidth_values
[
0
]
parsed_results
[
f
'
{
prefix
}
_bandwidth_per_process'
]
=
bandwidth_values
[
1
]
return
True
def
_parse_time_line
(
self
,
line
,
parsed_results
):
"""Parse one rocHPCG time summary line."""
for
label
,
metric
in
self
.
_time_metric_map
.
items
():
if
not
line
.
startswith
(
label
+
':'
):
continue
match
=
self
.
_time_value_pattern
.
search
(
line
)
if
match
:
parsed_results
[
metric
]
=
float
(
match
.
group
(
1
))
return
True
return
False
def
_parse_domain_line
(
self
,
line
,
parsed_results
):
"""Parse one rocHPCG domain summary line."""
for
label
,
metric_prefix
in
self
.
_domain_metric_map
.
items
():
if
not
line
.
startswith
(
label
+
':'
):
continue
match
=
self
.
_dimension_pattern
.
search
(
line
)
if
not
match
:
return
False
parsed_results
[
f
'
{
metric_prefix
}
_x'
]
=
int
(
match
.
group
(
1
))
parsed_results
[
f
'
{
metric_prefix
}
_y'
]
=
int
(
match
.
group
(
2
))
parsed_results
[
f
'
{
metric_prefix
}
_z'
]
=
int
(
match
.
group
(
3
))
return
True
return
False
tests/benchmarks/micro_benchmarks/test_dtk_hpcg_performance.py
View file @
742f203d
...
@@ -3,8 +3,10 @@
...
@@ -3,8 +3,10 @@
"""Tests for DTK gpu-hpcg benchmark."""
"""Tests for DTK gpu-hpcg benchmark."""
import
os
import
unittest
import
unittest
from
types
import
SimpleNamespace
from
types
import
SimpleNamespace
from
unittest.mock
import
patch
from
tests.helper.testcase
import
BenchmarkTestCase
from
tests.helper.testcase
import
BenchmarkTestCase
from
superbench.benchmarks
import
BenchmarkRegistry
,
BenchmarkType
,
Platform
,
ReturnCode
from
superbench.benchmarks
import
BenchmarkRegistry
,
BenchmarkType
,
Platform
,
ReturnCode
...
@@ -15,47 +17,47 @@ class DtkHpcgBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
...
@@ -15,47 +17,47 @@ class DtkHpcgBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
"""Tests for DTK gpu-hpcg benchmark."""
"""Tests for DTK gpu-hpcg benchmark."""
example_raw_output
=
"""
example_raw_output
=
"""
rocHPCG version: 0.8.8-62f1830-dirty (based on hpcg-3.1)
[1,0]<stdout>:
rocHPCG version: 0.8.8-62f1830-dirty (based on hpcg-3.1)
Setup Phase took 0.12 sec
[1,0]<stdout>:
Setup Phase took 0.12 sec
Starting Reference CG Phase ...
[1,0]<stdout>:
Starting Reference CG Phase ...
Optimization Phase took 0.25 sec
[1,0]<stdout>:
Optimization Phase took 0.25 sec
Validation Testing Phase ...
[1,0]<stdout>:
Validation Testing Phase ...
Optimized CG Setup ...
[1,0]<stdout>:
Optimized CG Setup ...
HIP Initial Residual = 2.668768e+04
[1,0]<stdout>:
HIP Initial Residual = 2.668768e+04
Total device memory usage: 19550 MByte (29152 MByte)
[1,0]<stdout>:
Total device memory usage: 19550 MByte (29152 MByte)
Starting Benchmarking Phase ...
[1,0]<stdout>:
Starting Benchmarking Phase ...
Performing (at least) 2 CG sets in 1.0 seconds ...
[1,0]<stdout>:
Performing (at least) 2 CG sets in 1.0 seconds ...
CG set 1 / 2 6881.2186 GFlop/s (215.0381 GFlop/s per process) 50% 0.0 sec left
[1,0]<stdout>:
CG set 1 / 2 6881.2186 GFlop/s (215.0381 GFlop/s per process) 50% 0.0 sec left
CG set 2 / 2 6904.9453 GFlop/s (215.7795 GFlop/s per process) 100% 0.0 sec left
[1,0]<stdout>:
CG set 2 / 2 6904.9453 GFlop/s (215.7795 GFlop/s per process) 100% 0.0 sec left
Local domain: 560 x 280 x 280
[1,0]<stdout>:
Local domain: 560 x 280 x 280
Global domain: 2240 x 1120 x 560
[1,0]<stdout>:
Global domain: 2240 x 1120 x 560
Process domain: 4 x 4 x 2
[1,0]<stdout>:
Process domain: 4 x 4 x 2
Total Time: 7.55 sec
[1,0]<stdout>:
Total Time: 7.55 sec
Setup Time: 0.12 sec
[1,0]<stdout>:
Setup Time: 0.12 sec
Optimization Time: 0.25 sec
[1,0]<stdout>:
Optimization Time: 0.25 sec
*** WARNING *** INVALID RUN
[1,0]<stdout>:
*** WARNING *** INVALID RUN
DDOT = 5849.4 GFlop/s (46794.9 GB/s) 182.8 GFlop/s per process ( 1462.3 GB/s per process)
[1,0]<stdout>:
DDOT = 5849.4 GFlop/s (
46794.9 GB/s) 182.8 GFlop/s per process ( 1462.3 GB/s per process)
WAXPBY = 3052.0 GFlop/s (36623.8 GB/s) 95.4 GFlop/s per process ( 1144.5 GB/s per process)
[1,0]<stdout>:
WAXPBY = 3052.0 GFlop/s (
36623.8 GB/s) 95.4 GFlop/s per process ( 1144.5 GB/s per process)
SpMV = 5473.9 GFlop/s (34468.8 GB/s) 171.1 GFlop/s per process ( 1077.1 GB/s per process)
[1,0]<stdout>:
SpMV = 5473.9 GFlop/s (
34468.8 GB/s) 171.1 GFlop/s per process ( 1077.1 GB/s per process)
MG = 7716.9 GFlop/s (59557.1 GB/s) 241.2 GFlop/s per process ( 1861.2 GB/s per process)
[1,0]<stdout>:
MG = 7716.9 GFlop/s (
59557.1 GB/s) 241.2 GFlop/s per process ( 1861.2 GB/s per process)
Total = 6971.0 GFlop/s (52859.9 GB/s) 217.8 GFlop/s per process ( 1651.9 GB/s per process)
[1,0]<stdout>:
Total = 6971.0 GFlop/s (
52859.9 GB/s) 217.8 GFlop/s per process ( 1651.9 GB/s per process)
Final = 6904.9 GFlop/s (52359.0 GB/s) 215.8 GFlop/s per process ( 1636.2 GB/s per process)
[1,0]<stdout>:
Final = 6904.9 GFlop/s (
52359.0 GB/s) 215.8 GFlop/s per process ( 1636.2 GB/s per process)
*** WARNING *** THIS IS NOT A VALID RUN ***
[1,0]<stdout>:
*** WARNING *** THIS IS NOT A VALID RUN ***
"""
"""
@
classmethod
@
classmethod
...
@@ -135,8 +137,16 @@ def test_dtk_hpcg_result_parsing_failure_when_required_summary_is_missing(self):
...
@@ -135,8 +137,16 @@ def test_dtk_hpcg_result_parsing_failure_when_required_summary_is_missing(self):
"""Test DTK gpu-hpcg parsing failure when required summary is missing."""
"""Test DTK gpu-hpcg parsing failure when required summary is missing."""
benchmark
=
self
.
get_benchmark
()
benchmark
=
self
.
get_benchmark
()
invalid_output
=
self
.
example_raw_output
.
replace
(
invalid_output
=
self
.
example_raw_output
.
replace
(
'Process domain: 4 x 4 x 2
\n\n
'
,
'
[1,0]<stdout>:
Process domain: 4 x 4 x 2
\n\n
'
,
''
,
''
,
)
)
self
.
assertFalse
(
benchmark
.
_process_raw_result
(
0
,
invalid_output
))
self
.
assertFalse
(
benchmark
.
_process_raw_result
(
0
,
invalid_output
))
def test_dtk_hpcg_result_parsing_ignores_non_root_mpi_rank(self):
    """Check that the DTK gpu-hpcg parser accepts non-root MPI rank output lacking a summary."""
    # Output holding a single node-binding line and no rocHPCG summary.
    non_root_output = '[1,2]<stdout>: [2]: Node Binding: Process 2 GPU: 2, NUMA: 0'
    benchmark = self.get_benchmark()

    # Pretend to run as MPI rank 2 for the duration of the parse only.
    with patch.dict(os.environ, {'OMPI_COMM_WORLD_RANK': '2'}):
        parsed = benchmark._process_raw_result(0, non_root_output)

    self.assertTrue(parsed)
Write
Preview
Markdown
is supported
0%
Try again
or
attach a new file
.
Attach a file
Cancel
You are about to add
0
people
to the discussion. Proceed with caution.
Finish editing this message first!
Cancel
Please
register
or
sign in
to comment