Commit 742f203d authored by one
Browse files

Fix rocHPCG metric extraction

parent b623c7e9
......@@ -13,21 +13,15 @@
class GpuHpcgBenchmark(MicroBenchmarkWithInvoke):
"""The GPU HPCG benchmark base class."""
_operation_metric_map = {
'DDOT': 'ddot',
'WAXPBY': 'waxpby',
'SpMV': 'spmv',
'MG': 'mg',
'Total': 'total',
'Final': 'final',
}
_operation_pattern = re.compile(
r'^(DDOT|WAXPBY|SpMV|MG|Total|Final)\s*=\s*'
r'([0-9]+(?:\.[0-9]+)?)\s+GFlop/s\s+\(([0-9]+(?:\.[0-9]+)?)\s+GB/s\)\s+'
r'([0-9]+(?:\.[0-9]+)?)\s+GFlop/s per process\s+\(\s*([0-9]+(?:\.[0-9]+)?)\s+GB/s per process\)$'
)
_time_pattern = re.compile(r'^(Total Time|Setup Time|Optimization Time):\s*([0-9]+(?:\.[0-9]+)?)\s+sec$')
_domain_pattern = re.compile(r'^(Local|Global|Process) domain:\s*([0-9]+)\s+x\s+([0-9]+)\s+x\s+([0-9]+)$')
_mpi_output_prefix_pattern = re.compile(r'^\[\d+,\d+\]<(?:stdout|stderr)>:\s*')
_operation_metric_map = {'DDOT': 'ddot', 'WAXPBY': 'waxpby', 'SpMV': 'spmv', 'MG': 'mg', 'Total': 'total',
'Final': 'final'}
_time_metric_map = {'Total Time': 'total_time', 'Setup Time': 'setup_time', 'Optimization Time': 'optimization_time'}
_domain_metric_map = {'Local domain': 'local_domain', 'Global domain': 'global_domain',
'Process domain': 'process_domain'}
_float_pattern = re.compile(r'([0-9]+(?:\.[0-9]+)?)\s+(GFlop/s|GB/s)')
_dimension_pattern = re.compile(r'([0-9]+)\s*x\s*([0-9]+)\s*x\s*([0-9]+)')
_time_value_pattern = re.compile(r'([0-9]+(?:\.[0-9]+)?)\s+sec')
_invalid_markers = ['*** WARNING *** INVALID RUN', '*** WARNING *** THIS IS NOT A VALID RUN ***']
def __init__(self, name, parameters=''):
......@@ -162,6 +156,11 @@ def _process_raw_result(self, cmd_idx, raw_output):
"""
self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
# Under MPI only rank 0 emits the complete rocHPCG summary.
rank = int(os.getenv('OMPI_COMM_WORLD_RANK', '0'))
if rank > 0:
return True
parsed_results = {}
required_metrics = {
'final_gflops',
......@@ -202,36 +201,17 @@ def _process_raw_result(self, cmd_idx, raw_output):
for raw_line in raw_output.splitlines():
line = raw_line.strip()
line = self._mpi_output_prefix_pattern.sub('', line)
if not line:
continue
operation_match = self._operation_pattern.match(line)
if operation_match:
prefix = self._operation_metric_map[operation_match.group(1)]
total_gflops = float(operation_match.group(2))
total_bandwidth = float(operation_match.group(3))
per_process_gflops = float(operation_match.group(4))
per_process_bandwidth = float(operation_match.group(5))
parsed_results[f'{prefix}_gflops'] = total_gflops
parsed_results[f'{prefix}_gflops_per_process'] = per_process_gflops
if prefix != 'final':
parsed_results[f'{prefix}_bandwidth'] = total_bandwidth
parsed_results[f'{prefix}_bandwidth_per_process'] = per_process_bandwidth
if self._parse_operation_line(line, parsed_results):
continue
time_match = self._time_pattern.match(line)
if time_match:
metric_prefix = time_match.group(1).lower().replace(' ', '_')
parsed_results[metric_prefix] = float(time_match.group(2))
if self._parse_time_line(line, parsed_results):
continue
domain_match = self._domain_pattern.match(line)
if domain_match:
domain_prefix = domain_match.group(1).lower()
parsed_results[f'{domain_prefix}_domain_x'] = int(domain_match.group(2))
parsed_results[f'{domain_prefix}_domain_y'] = int(domain_match.group(3))
parsed_results[f'{domain_prefix}_domain_z'] = int(domain_match.group(4))
self._parse_domain_line(line, parsed_results)
parsed_results['is_valid'] = 0 if any(marker in raw_output for marker in self._invalid_markers) else 1
......@@ -249,3 +229,61 @@ def _process_raw_result(self, cmd_idx, raw_output):
self._result.add_result(metric, value)
return True
def _parse_operation_line(self, line, parsed_results):
"""Parse one rocHPCG operation summary line."""
operation_key = None
for candidate in self._operation_metric_map:
if line.startswith(candidate) and '=' in line:
operation_key = candidate
break
if operation_key is None:
return False
matches = self._float_pattern.findall(line)
if len(matches) < 4:
return False
prefix = self._operation_metric_map[operation_key]
gflops_values = [float(value) for value, unit in matches if unit == 'GFlop/s']
bandwidth_values = [float(value) for value, unit in matches if unit == 'GB/s']
if len(gflops_values) < 2 or len(bandwidth_values) < 2:
return False
parsed_results[f'{prefix}_gflops'] = gflops_values[0]
parsed_results[f'{prefix}_gflops_per_process'] = gflops_values[1]
if prefix != 'final':
parsed_results[f'{prefix}_bandwidth'] = bandwidth_values[0]
parsed_results[f'{prefix}_bandwidth_per_process'] = bandwidth_values[1]
return True
def _parse_time_line(self, line, parsed_results):
"""Parse one rocHPCG time summary line."""
for label, metric in self._time_metric_map.items():
if not line.startswith(label + ':'):
continue
match = self._time_value_pattern.search(line)
if match:
parsed_results[metric] = float(match.group(1))
return True
return False
def _parse_domain_line(self, line, parsed_results):
"""Parse one rocHPCG domain summary line."""
for label, metric_prefix in self._domain_metric_map.items():
if not line.startswith(label + ':'):
continue
match = self._dimension_pattern.search(line)
if not match:
return False
parsed_results[f'{metric_prefix}_x'] = int(match.group(1))
parsed_results[f'{metric_prefix}_y'] = int(match.group(2))
parsed_results[f'{metric_prefix}_z'] = int(match.group(3))
return True
return False
......@@ -3,8 +3,10 @@
"""Tests for DTK gpu-hpcg benchmark."""
import os
import unittest
from types import SimpleNamespace
from unittest.mock import patch
from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, Platform, ReturnCode
......@@ -15,47 +17,47 @@ class DtkHpcgBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
"""Tests for DTK gpu-hpcg benchmark."""
example_raw_output = """
rocHPCG version: 0.8.8-62f1830-dirty (based on hpcg-3.1)
[1,0]<stdout>: rocHPCG version: 0.8.8-62f1830-dirty (based on hpcg-3.1)
Setup Phase took 0.12 sec
[1,0]<stdout>: Setup Phase took 0.12 sec
Starting Reference CG Phase ...
[1,0]<stdout>: Starting Reference CG Phase ...
Optimization Phase took 0.25 sec
[1,0]<stdout>: Optimization Phase took 0.25 sec
Validation Testing Phase ...
[1,0]<stdout>: Validation Testing Phase ...
Optimized CG Setup ...
[1,0]<stdout>: Optimized CG Setup ...
HIP Initial Residual = 2.668768e+04
[1,0]<stdout>: HIP Initial Residual = 2.668768e+04
Total device memory usage: 19550 MByte (29152 MByte)
[1,0]<stdout>: Total device memory usage: 19550 MByte (29152 MByte)
Starting Benchmarking Phase ...
[1,0]<stdout>: Starting Benchmarking Phase ...
Performing (at least) 2 CG sets in 1.0 seconds ...
CG set 1 / 2 6881.2186 GFlop/s (215.0381 GFlop/s per process) 50% 0.0 sec left
CG set 2 / 2 6904.9453 GFlop/s (215.7795 GFlop/s per process) 100% 0.0 sec left
[1,0]<stdout>: Performing (at least) 2 CG sets in 1.0 seconds ...
[1,0]<stdout>: CG set 1 / 2 6881.2186 GFlop/s (215.0381 GFlop/s per process) 50% 0.0 sec left
[1,0]<stdout>: CG set 2 / 2 6904.9453 GFlop/s (215.7795 GFlop/s per process) 100% 0.0 sec left
Local domain: 560 x 280 x 280
Global domain: 2240 x 1120 x 560
Process domain: 4 x 4 x 2
[1,0]<stdout>: Local domain: 560 x 280 x 280
[1,0]<stdout>: Global domain: 2240 x 1120 x 560
[1,0]<stdout>: Process domain: 4 x 4 x 2
Total Time: 7.55 sec
Setup Time: 0.12 sec
Optimization Time: 0.25 sec
[1,0]<stdout>: Total Time: 7.55 sec
[1,0]<stdout>: Setup Time: 0.12 sec
[1,0]<stdout>: Optimization Time: 0.25 sec
*** WARNING *** INVALID RUN
[1,0]<stdout>: *** WARNING *** INVALID RUN
DDOT = 5849.4 GFlop/s (46794.9 GB/s) 182.8 GFlop/s per process ( 1462.3 GB/s per process)
WAXPBY = 3052.0 GFlop/s (36623.8 GB/s) 95.4 GFlop/s per process ( 1144.5 GB/s per process)
SpMV = 5473.9 GFlop/s (34468.8 GB/s) 171.1 GFlop/s per process ( 1077.1 GB/s per process)
MG = 7716.9 GFlop/s (59557.1 GB/s) 241.2 GFlop/s per process ( 1861.2 GB/s per process)
Total = 6971.0 GFlop/s (52859.9 GB/s) 217.8 GFlop/s per process ( 1651.9 GB/s per process)
Final = 6904.9 GFlop/s (52359.0 GB/s) 215.8 GFlop/s per process ( 1636.2 GB/s per process)
[1,0]<stdout>: DDOT = 5849.4 GFlop/s ( 46794.9 GB/s) 182.8 GFlop/s per process ( 1462.3 GB/s per process)
[1,0]<stdout>: WAXPBY = 3052.0 GFlop/s ( 36623.8 GB/s) 95.4 GFlop/s per process ( 1144.5 GB/s per process)
[1,0]<stdout>: SpMV = 5473.9 GFlop/s ( 34468.8 GB/s) 171.1 GFlop/s per process ( 1077.1 GB/s per process)
[1,0]<stdout>: MG = 7716.9 GFlop/s ( 59557.1 GB/s) 241.2 GFlop/s per process ( 1861.2 GB/s per process)
[1,0]<stdout>: Total = 6971.0 GFlop/s ( 52859.9 GB/s) 217.8 GFlop/s per process ( 1651.9 GB/s per process)
[1,0]<stdout>: Final = 6904.9 GFlop/s ( 52359.0 GB/s) 215.8 GFlop/s per process ( 1636.2 GB/s per process)
*** WARNING *** THIS IS NOT A VALID RUN ***
[1,0]<stdout>: *** WARNING *** THIS IS NOT A VALID RUN ***
"""
@classmethod
......@@ -135,8 +137,16 @@ def test_dtk_hpcg_result_parsing_failure_when_required_summary_is_missing(self):
"""Test DTK gpu-hpcg parsing failure when required summary is missing."""
benchmark = self.get_benchmark()
invalid_output = self.example_raw_output.replace(
'Process domain: 4 x 4 x 2\n\n',
'[1,0]<stdout>: Process domain: 4 x 4 x 2\n\n',
'',
)
self.assertFalse(benchmark._process_raw_result(0, invalid_output))
def test_dtk_hpcg_result_parsing_ignores_non_root_mpi_rank(self):
    """Check that result processing succeeds on a non-root MPI rank lacking the summary."""
    bench = self.get_benchmark()
    # Output as seen from rank 2: only a binding notice, no summary lines.
    non_root_output = '[1,2]<stdout>: [2]: Node Binding: Process 2 GPU: 2, NUMA: 0'
    rank_env = {'OMPI_COMM_WORLD_RANK': '2'}
    with patch.dict(os.environ, rank_env):
        processed = bench._process_raw_result(0, non_root_output)
    self.assertTrue(processed)
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment