Commit 742f203d authored by one
Browse files

Fix rocHPCG metric extraction

parent b623c7e9
...@@ -13,21 +13,15 @@ ...@@ -13,21 +13,15 @@
class GpuHpcgBenchmark(MicroBenchmarkWithInvoke): class GpuHpcgBenchmark(MicroBenchmarkWithInvoke):
"""The GPU HPCG benchmark base class.""" """The GPU HPCG benchmark base class."""
_operation_metric_map = { _mpi_output_prefix_pattern = re.compile(r'^\[\d+,\d+\]<(?:stdout|stderr)>:\s*')
'DDOT': 'ddot', _operation_metric_map = {'DDOT': 'ddot', 'WAXPBY': 'waxpby', 'SpMV': 'spmv', 'MG': 'mg', 'Total': 'total',
'WAXPBY': 'waxpby', 'Final': 'final'}
'SpMV': 'spmv', _time_metric_map = {'Total Time': 'total_time', 'Setup Time': 'setup_time', 'Optimization Time': 'optimization_time'}
'MG': 'mg', _domain_metric_map = {'Local domain': 'local_domain', 'Global domain': 'global_domain',
'Total': 'total', 'Process domain': 'process_domain'}
'Final': 'final', _float_pattern = re.compile(r'([0-9]+(?:\.[0-9]+)?)\s+(GFlop/s|GB/s)')
} _dimension_pattern = re.compile(r'([0-9]+)\s*x\s*([0-9]+)\s*x\s*([0-9]+)')
_operation_pattern = re.compile( _time_value_pattern = re.compile(r'([0-9]+(?:\.[0-9]+)?)\s+sec')
r'^(DDOT|WAXPBY|SpMV|MG|Total|Final)\s*=\s*'
r'([0-9]+(?:\.[0-9]+)?)\s+GFlop/s\s+\(([0-9]+(?:\.[0-9]+)?)\s+GB/s\)\s+'
r'([0-9]+(?:\.[0-9]+)?)\s+GFlop/s per process\s+\(\s*([0-9]+(?:\.[0-9]+)?)\s+GB/s per process\)$'
)
_time_pattern = re.compile(r'^(Total Time|Setup Time|Optimization Time):\s*([0-9]+(?:\.[0-9]+)?)\s+sec$')
_domain_pattern = re.compile(r'^(Local|Global|Process) domain:\s*([0-9]+)\s+x\s+([0-9]+)\s+x\s+([0-9]+)$')
_invalid_markers = ['*** WARNING *** INVALID RUN', '*** WARNING *** THIS IS NOT A VALID RUN ***'] _invalid_markers = ['*** WARNING *** INVALID RUN', '*** WARNING *** THIS IS NOT A VALID RUN ***']
def __init__(self, name, parameters=''): def __init__(self, name, parameters=''):
...@@ -162,6 +156,11 @@ def _process_raw_result(self, cmd_idx, raw_output): ...@@ -162,6 +156,11 @@ def _process_raw_result(self, cmd_idx, raw_output):
""" """
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data) self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
# Under MPI only rank 0 emits the complete rocHPCG summary.
rank = int(os.getenv('OMPI_COMM_WORLD_RANK', '0'))
if rank > 0:
return True
parsed_results = {} parsed_results = {}
required_metrics = { required_metrics = {
'final_gflops', 'final_gflops',
...@@ -202,36 +201,17 @@ def _process_raw_result(self, cmd_idx, raw_output): ...@@ -202,36 +201,17 @@ def _process_raw_result(self, cmd_idx, raw_output):
for raw_line in raw_output.splitlines(): for raw_line in raw_output.splitlines():
line = raw_line.strip() line = raw_line.strip()
line = self._mpi_output_prefix_pattern.sub('', line)
if not line: if not line:
continue continue
operation_match = self._operation_pattern.match(line) if self._parse_operation_line(line, parsed_results):
if operation_match:
prefix = self._operation_metric_map[operation_match.group(1)]
total_gflops = float(operation_match.group(2))
total_bandwidth = float(operation_match.group(3))
per_process_gflops = float(operation_match.group(4))
per_process_bandwidth = float(operation_match.group(5))
parsed_results[f'{prefix}_gflops'] = total_gflops
parsed_results[f'{prefix}_gflops_per_process'] = per_process_gflops
if prefix != 'final':
parsed_results[f'{prefix}_bandwidth'] = total_bandwidth
parsed_results[f'{prefix}_bandwidth_per_process'] = per_process_bandwidth
continue continue
time_match = self._time_pattern.match(line) if self._parse_time_line(line, parsed_results):
if time_match:
metric_prefix = time_match.group(1).lower().replace(' ', '_')
parsed_results[metric_prefix] = float(time_match.group(2))
continue continue
domain_match = self._domain_pattern.match(line) self._parse_domain_line(line, parsed_results)
if domain_match:
domain_prefix = domain_match.group(1).lower()
parsed_results[f'{domain_prefix}_domain_x'] = int(domain_match.group(2))
parsed_results[f'{domain_prefix}_domain_y'] = int(domain_match.group(3))
parsed_results[f'{domain_prefix}_domain_z'] = int(domain_match.group(4))
parsed_results['is_valid'] = 0 if any(marker in raw_output for marker in self._invalid_markers) else 1 parsed_results['is_valid'] = 0 if any(marker in raw_output for marker in self._invalid_markers) else 1
...@@ -249,3 +229,61 @@ def _process_raw_result(self, cmd_idx, raw_output): ...@@ -249,3 +229,61 @@ def _process_raw_result(self, cmd_idx, raw_output):
self._result.add_result(metric, value) self._result.add_result(metric, value)
return True return True
def _parse_operation_line(self, line, parsed_results):
    """Parse one rocHPCG operation summary line.

    A summary line has the shape
    ``DDOT = 5849.4 GFlop/s (46794.9 GB/s) 182.8 GFlop/s per process (1462.3 GB/s per process)``.
    The text before the first ``=`` must be exactly one of the labels in
    ``_operation_metric_map``; a line that merely starts with a label (for
    example ``Totals = ...``) is not treated as that operation.

    Args:
        line (str): One output line, already stripped of surrounding whitespace.
        parsed_results (dict): Mapping updated in place with the parsed metrics.
            It is left untouched when the line is rejected.

    Returns:
        bool: True if the line was recognized and its metrics were stored, False otherwise.
    """
    label, separator, values = line.partition('=')
    prefix = self._operation_metric_map.get(label.strip())
    if not separator or prefix is None:
        return False

    # Values are told apart by their unit, so spacing inside the parentheses does not matter.
    matches = self._float_pattern.findall(values)
    gflops_values = [float(value) for value, unit in matches if unit == 'GFlop/s']
    bandwidth_values = [float(value) for value, unit in matches if unit == 'GB/s']
    # For each unit the first value is stored as the aggregate and the second as per-process.
    if len(gflops_values) < 2 or len(bandwidth_values) < 2:
        return False

    parsed_results[f'{prefix}_gflops'] = gflops_values[0]
    parsed_results[f'{prefix}_gflops_per_process'] = gflops_values[1]
    # No bandwidth metrics are recorded for the 'final' operation.
    if prefix != 'final':
        parsed_results[f'{prefix}_bandwidth'] = bandwidth_values[0]
        parsed_results[f'{prefix}_bandwidth_per_process'] = bandwidth_values[1]
    return True
def _parse_time_line(self, line, parsed_results):
    """Parse one rocHPCG time summary line.

    Args:
        line (str): One output line, e.g. ``Total Time: 7.55 sec``.
        parsed_results (dict): Mapping updated in place with the parsed value.

    Returns:
        bool: True if the line starts with a known ``<label>:`` and a
            ``<number> sec`` value was stored, False otherwise.
    """
    # Pick the metric of the first label that prefixes the line, if any.
    metric_name = next(
        (metric for label, metric in self._time_metric_map.items() if line.startswith(label + ':')),
        None,
    )
    if metric_name is None:
        return False

    seconds = self._time_value_pattern.search(line)
    if seconds is None:
        return False

    parsed_results[metric_name] = float(seconds.group(1))
    return True
def _parse_domain_line(self, line, parsed_results):
    """Parse one rocHPCG domain summary line.

    Args:
        line (str): One output line, e.g. ``Local domain: 560 x 280 x 280``.
        parsed_results (dict): Mapping updated in place with the ``_x``,
            ``_y`` and ``_z`` integer dimensions of the matched domain.

    Returns:
        bool: True if the line starts with a known ``<label>:`` and an
            ``A x B x C`` triple was stored, False otherwise.
    """
    # Pick the metric prefix of the first label that prefixes the line, if any.
    domain_key = next(
        (prefix for label, prefix in self._domain_metric_map.items() if line.startswith(label + ':')),
        None,
    )
    if domain_key is None:
        return False

    dims = self._dimension_pattern.search(line)
    if dims is None:
        return False

    for axis, group_index in (('x', 1), ('y', 2), ('z', 3)):
        parsed_results[f'{domain_key}_{axis}'] = int(dims.group(group_index))
    return True
...@@ -3,8 +3,10 @@ ...@@ -3,8 +3,10 @@
"""Tests for DTK gpu-hpcg benchmark.""" """Tests for DTK gpu-hpcg benchmark."""
import os
import unittest import unittest
from types import SimpleNamespace from types import SimpleNamespace
from unittest.mock import patch
from tests.helper.testcase import BenchmarkTestCase from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, Platform, ReturnCode from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, Platform, ReturnCode
...@@ -15,47 +17,47 @@ class DtkHpcgBenchmarkTest(BenchmarkTestCase, unittest.TestCase): ...@@ -15,47 +17,47 @@ class DtkHpcgBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
"""Tests for DTK gpu-hpcg benchmark.""" """Tests for DTK gpu-hpcg benchmark."""
example_raw_output = """ example_raw_output = """
rocHPCG version: 0.8.8-62f1830-dirty (based on hpcg-3.1) [1,0]<stdout>: rocHPCG version: 0.8.8-62f1830-dirty (based on hpcg-3.1)
Setup Phase took 0.12 sec [1,0]<stdout>: Setup Phase took 0.12 sec
Starting Reference CG Phase ... [1,0]<stdout>: Starting Reference CG Phase ...
Optimization Phase took 0.25 sec [1,0]<stdout>: Optimization Phase took 0.25 sec
Validation Testing Phase ... [1,0]<stdout>: Validation Testing Phase ...
Optimized CG Setup ... [1,0]<stdout>: Optimized CG Setup ...
HIP Initial Residual = 2.668768e+04 [1,0]<stdout>: HIP Initial Residual = 2.668768e+04
Total device memory usage: 19550 MByte (29152 MByte) [1,0]<stdout>: Total device memory usage: 19550 MByte (29152 MByte)
Starting Benchmarking Phase ... [1,0]<stdout>: Starting Benchmarking Phase ...
Performing (at least) 2 CG sets in 1.0 seconds ... [1,0]<stdout>: Performing (at least) 2 CG sets in 1.0 seconds ...
CG set 1 / 2 6881.2186 GFlop/s (215.0381 GFlop/s per process) 50% 0.0 sec left [1,0]<stdout>: CG set 1 / 2 6881.2186 GFlop/s (215.0381 GFlop/s per process) 50% 0.0 sec left
CG set 2 / 2 6904.9453 GFlop/s (215.7795 GFlop/s per process) 100% 0.0 sec left [1,0]<stdout>: CG set 2 / 2 6904.9453 GFlop/s (215.7795 GFlop/s per process) 100% 0.0 sec left
Local domain: 560 x 280 x 280 [1,0]<stdout>: Local domain: 560 x 280 x 280
Global domain: 2240 x 1120 x 560 [1,0]<stdout>: Global domain: 2240 x 1120 x 560
Process domain: 4 x 4 x 2 [1,0]<stdout>: Process domain: 4 x 4 x 2
Total Time: 7.55 sec [1,0]<stdout>: Total Time: 7.55 sec
Setup Time: 0.12 sec [1,0]<stdout>: Setup Time: 0.12 sec
Optimization Time: 0.25 sec [1,0]<stdout>: Optimization Time: 0.25 sec
*** WARNING *** INVALID RUN [1,0]<stdout>: *** WARNING *** INVALID RUN
DDOT = 5849.4 GFlop/s (46794.9 GB/s) 182.8 GFlop/s per process ( 1462.3 GB/s per process) [1,0]<stdout>: DDOT = 5849.4 GFlop/s ( 46794.9 GB/s) 182.8 GFlop/s per process ( 1462.3 GB/s per process)
WAXPBY = 3052.0 GFlop/s (36623.8 GB/s) 95.4 GFlop/s per process ( 1144.5 GB/s per process) [1,0]<stdout>: WAXPBY = 3052.0 GFlop/s ( 36623.8 GB/s) 95.4 GFlop/s per process ( 1144.5 GB/s per process)
SpMV = 5473.9 GFlop/s (34468.8 GB/s) 171.1 GFlop/s per process ( 1077.1 GB/s per process) [1,0]<stdout>: SpMV = 5473.9 GFlop/s ( 34468.8 GB/s) 171.1 GFlop/s per process ( 1077.1 GB/s per process)
MG = 7716.9 GFlop/s (59557.1 GB/s) 241.2 GFlop/s per process ( 1861.2 GB/s per process) [1,0]<stdout>: MG = 7716.9 GFlop/s ( 59557.1 GB/s) 241.2 GFlop/s per process ( 1861.2 GB/s per process)
Total = 6971.0 GFlop/s (52859.9 GB/s) 217.8 GFlop/s per process ( 1651.9 GB/s per process) [1,0]<stdout>: Total = 6971.0 GFlop/s ( 52859.9 GB/s) 217.8 GFlop/s per process ( 1651.9 GB/s per process)
Final = 6904.9 GFlop/s (52359.0 GB/s) 215.8 GFlop/s per process ( 1636.2 GB/s per process) [1,0]<stdout>: Final = 6904.9 GFlop/s ( 52359.0 GB/s) 215.8 GFlop/s per process ( 1636.2 GB/s per process)
*** WARNING *** THIS IS NOT A VALID RUN *** [1,0]<stdout>: *** WARNING *** THIS IS NOT A VALID RUN ***
""" """
@classmethod @classmethod
...@@ -135,8 +137,16 @@ def test_dtk_hpcg_result_parsing_failure_when_required_summary_is_missing(self): ...@@ -135,8 +137,16 @@ def test_dtk_hpcg_result_parsing_failure_when_required_summary_is_missing(self):
"""Test DTK gpu-hpcg parsing failure when required summary is missing.""" """Test DTK gpu-hpcg parsing failure when required summary is missing."""
benchmark = self.get_benchmark() benchmark = self.get_benchmark()
invalid_output = self.example_raw_output.replace( invalid_output = self.example_raw_output.replace(
'Process domain: 4 x 4 x 2\n\n', '[1,0]<stdout>: Process domain: 4 x 4 x 2\n\n',
'', '',
) )
self.assertFalse(benchmark._process_raw_result(0, invalid_output)) self.assertFalse(benchmark._process_raw_result(0, invalid_output))
def test_dtk_hpcg_result_parsing_ignores_non_root_mpi_rank(self):
    """Check that result processing succeeds on a non-root MPI rank whose output has no summary."""
    parser = self.get_benchmark()
    # Output holding only a node-binding line, with none of the summary metrics.
    binding_only_line = '[1,2]<stdout>: [2]: Node Binding: Process 2 GPU: 2, NUMA: 0'
    non_root_env = {'OMPI_COMM_WORLD_RANK': '2'}
    with patch.dict(os.environ, non_root_env):
        outcome = parser._process_raw_result(0, binding_only_line)
        self.assertTrue(outcome)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment