Fix rocHPCG metric extraction

742f203d · one · b623c7e9 · 742f203d · 742f203d
Commit 742f203d authored Apr 01, 2026 by one
2 changed files
--- a/superbench/benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
@@ -13,21 +13,15 @@
 class GpuHpcgBenchmark(MicroBenchmarkWithInvoke):
    """The GPU HPCG benchmark base class."""
-    _operation_metric_map = {
+    _mpi_output_prefix_pattern = re.compile(r'^\[\d+,\d+\]<(?:stdout|stderr)>:\s*')
-        'DDOT': 'ddot',
+    _operation_metric_map = {'DDOT': 'ddot', 'WAXPBY': 'waxpby', 'SpMV': 'spmv', 'MG': 'mg', 'Total': 'total',
-        'WAXPBY': 'waxpby',
+                             'Final': 'final'}
-        'SpMV': 'spmv',
+    _time_metric_map = {'Total Time': 'total_time', 'Setup Time': 'setup_time', 'Optimization Time': 'optimization_time'}
-        'MG': 'mg',
+    _domain_metric_map = {'Local domain': 'local_domain', 'Global domain': 'global_domain',
-        'Total': 'total',
+                          'Process domain': 'process_domain'}
-        'Final': 'final',
+    _float_pattern = re.compile(r'([0-9]+(?:\.[0-9]+)?)\s+(GFlop/s|GB/s)')
-    }
+    _dimension_pattern = re.compile(r'([0-9]+)\s*x\s*([0-9]+)\s*x\s*([0-9]+)')
-    _operation_pattern = re.compile(
+    _time_value_pattern = re.compile(r'([0-9]+(?:\.[0-9]+)?)\s+sec')
-        r'^(DDOT|WAXPBY|SpMV|MG|Total|Final)\s*=\s*'
-        r'([0-9]+(?:\.[0-9]+)?)\s+GFlop/s\s+\(([0-9]+(?:\.[0-9]+)?)\s+GB/s\)\s+'
-        r'([0-9]+(?:\.[0-9]+)?)\s+GFlop/s per process\s+\(\s*([0-9]+(?:\.[0-9]+)?)\s+GB/s per process\)$'
-    )
-    _time_pattern = re.compile(r'^(Total Time|Setup Time|Optimization Time):\s*([0-9]+(?:\.[0-9]+)?)\s+sec$')
-    _domain_pattern = re.compile(r'^(Local|Global|Process) domain:\s*([0-9]+)\s+x\s+([0-9]+)\s+x\s+([0-9]+)$')
    _invalid_markers = ['*** WARNING *** INVALID RUN', '*** WARNING *** THIS IS NOT A VALID RUN ***']
    def __init__(self, name, parameters=''):
@@ -162,6 +156,11 @@ def _process_raw_result(self, cmd_idx, raw_output):
        """
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
+        # Under MPI only rank 0 emits the complete rocHPCG summary.
+        rank = int(os.getenv('OMPI_COMM_WORLD_RANK', '0'))
+        if rank > 0:
+            return True
        parsed_results = {}
        required_metrics = {
            'final_gflops',
@@ -202,36 +201,17 @@ def _process_raw_result(self, cmd_idx, raw_output):
        for raw_line in raw_output.splitlines():
            line = raw_line.strip()
+            line = self._mpi_output_prefix_pattern.sub('', line)
            if not line:
                continue
-            operation_match = self._operation_pattern.match(line)
+            if self._parse_operation_line(line, parsed_results):
-            if operation_match:
-                prefix = self._operation_metric_map[operation_match.group(1)]
-                total_gflops = float(operation_match.group(2))
-                total_bandwidth = float(operation_match.group(3))
-                per_process_gflops = float(operation_match.group(4))
-                per_process_bandwidth = float(operation_match.group(5))
-                parsed_results[f'{prefix}_gflops'] = total_gflops
-                parsed_results[f'{prefix}_gflops_per_process'] = per_process_gflops
-                if prefix != 'final':
-                    parsed_results[f'{prefix}_bandwidth'] = total_bandwidth
-                    parsed_results[f'{prefix}_bandwidth_per_process'] = per_process_bandwidth
                continue
-            time_match = self._time_pattern.match(line)
+            if self._parse_time_line(line, parsed_results):
-            if time_match:
-                metric_prefix = time_match.group(1).lower().replace(' ', '_')
-                parsed_results[metric_prefix] = float(time_match.group(2))
                continue
-            domain_match = self._domain_pattern.match(line)
+            self._parse_domain_line(line, parsed_results)
-            if domain_match:
-                domain_prefix = domain_match.group(1).lower()
-                parsed_results[f'{domain_prefix}_domain_x'] = int(domain_match.group(2))
-                parsed_results[f'{domain_prefix}_domain_y'] = int(domain_match.group(3))
-                parsed_results[f'{domain_prefix}_domain_z'] = int(domain_match.group(4))
        parsed_results['is_valid'] = 0 if any(marker in raw_output for marker in self._invalid_markers) else 1
@@ -249,3 +229,61 @@ def _process_raw_result(self, cmd_idx, raw_output):
            self._result.add_result(metric, value)
        return True
+    def _parse_operation_line(self, line, parsed_results):
+        """Parse one rocHPCG operation summary line.
+
+        A summary line has the form ``<label> = <total> GFlop/s (<total> GB/s) <per-process> GFlop/s per
+        process (<per-process> GB/s per process)``, where ``<label>`` is a key of ``_operation_metric_map``.
+
+        Args:
+            line (str): One line of benchmark output.
+            parsed_results (dict): Metric dictionary, updated in place when the line is recognized.
+
+        Returns:
+            bool: True if the line was an operation summary and its metrics were stored, False otherwise.
+        """
+        # Require the text before the first '=' to be exactly a known label, so look-alike
+        # lines that merely start with one (e.g. 'Totals = ...') are not misattributed.
+        label, separator, values = line.partition('=')
+        prefix = self._operation_metric_map.get(label.strip())
+        if not separator or prefix is None:
+            return False
+        matches = self._float_pattern.findall(values)
+        gflops_values = [float(value) for value, unit in matches if unit == 'GFlop/s']
+        bandwidth_values = [float(value) for value, unit in matches if unit == 'GB/s']
+        # A complete line carries a total and a per-process figure for each unit.
+        if len(gflops_values) < 2 or len(bandwidth_values) < 2:
+            return False
+        parsed_results[f'{prefix}_gflops'] = gflops_values[0]
+        parsed_results[f'{prefix}_gflops_per_process'] = gflops_values[1]
+        # Bandwidth figures are not recorded for the 'Final' line.
+        if prefix != 'final':
+            parsed_results[f'{prefix}_bandwidth'] = bandwidth_values[0]
+            parsed_results[f'{prefix}_bandwidth_per_process'] = bandwidth_values[1]
+        return True
+    def _parse_time_line(self, line, parsed_results):
+        """Parse one rocHPCG time summary line.
+
+        Args:
+            line (str): One line of benchmark output.
+            parsed_results (dict): Metric dictionary, updated in place when the line is recognized.
+
+        Returns:
+            bool: True if a labelled time value in seconds was found and stored, False otherwise.
+        """
+        for label, metric in self._time_metric_map.items():
+            if line.startswith(f'{label}:'):
+                found = self._time_value_pattern.search(line)
+                if found is not None:
+                    parsed_results[metric] = float(found.group(1))
+                    return True
+        return False
+    def _parse_domain_line(self, line, parsed_results):
+        """Parse one rocHPCG domain summary line.
+
+        Args:
+            line (str): One line of benchmark output.
+            parsed_results (dict): Metric dictionary, updated in place when the line is recognized.
+
+        Returns:
+            bool: True if a labelled ``X x Y x Z`` triple was found and stored, False otherwise.
+        """
+        for label, metric_prefix in self._domain_metric_map.items():
+            if line.startswith(f'{label}:'):
+                dimensions = self._dimension_pattern.search(line)
+                if dimensions is None:
+                    return False
+                # The pattern captures exactly three groups, in x, y, z order.
+                for axis, size in zip('xyz', dimensions.groups()):
+                    parsed_results[f'{metric_prefix}_{axis}'] = int(size)
+                return True
+        return False
--- a/tests/benchmarks/micro_benchmarks/test_dtk_hpcg_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_dtk_hpcg_performance.py
@@ -3,8 +3,10 @@
 """Tests for DTK gpu-hpcg benchmark."""
+import os
 import unittest
 from types import SimpleNamespace
+from unittest.mock import patch
 from tests.helper.testcase import BenchmarkTestCase
 from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, Platform, ReturnCode
@@ -15,47 +17,47 @@ class DtkHpcgBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
    """Tests for DTK gpu-hpcg benchmark."""
    example_raw_output = """
-rocHPCG version: 0.8.8-62f1830-dirty (based on hpcg-3.1)
+[1,0]<stdout>: rocHPCG version: 0.8.8-62f1830-dirty (based on hpcg-3.1)
-Setup Phase took 0.12 sec
+[1,0]<stdout>: Setup Phase took 0.12 sec
-Starting Reference CG Phase ...
+[1,0]<stdout>: Starting Reference CG Phase ...
-Optimization Phase took 0.25 sec
+[1,0]<stdout>: Optimization Phase took 0.25 sec
-Validation Testing Phase ...
+[1,0]<stdout>: Validation Testing Phase ...
-Optimized CG Setup ...
+[1,0]<stdout>: Optimized CG Setup ...
-HIP Initial Residual = 2.668768e+04
+[1,0]<stdout>: HIP Initial Residual = 2.668768e+04
-Total device memory usage: 19550 MByte (29152 MByte)
+[1,0]<stdout>: Total device memory usage: 19550 MByte (29152 MByte)
-Starting Benchmarking Phase ...
+[1,0]<stdout>: Starting Benchmarking Phase ...
-Performing (at least) 2 CG sets in 1.0 seconds ...
+[1,0]<stdout>: Performing (at least) 2 CG sets in 1.0 seconds ...
-CG set 1 / 2    6881.2186 GFlop/s     (215.0381 GFlop/s per process)    50%    0.0 sec left
+[1,0]<stdout>: CG set 1 / 2    6881.2186 GFlop/s     (215.0381 GFlop/s per process)    50%    0.0 sec left
-CG set 2 / 2    6904.9453 GFlop/s     (215.7795 GFlop/s per process)    100%    0.0 sec left
+[1,0]<stdout>: CG set 2 / 2    6904.9453 GFlop/s     (215.7795 GFlop/s per process)    100%    0.0 sec left
-Local domain: 560 x 280 x 280
+[1,0]<stdout>: Local domain: 560 x 280 x 280
-Global domain: 2240 x 1120 x 560
+[1,0]<stdout>: Global domain: 2240 x 1120 x 560
-Process domain: 4 x 4 x 2
+[1,0]<stdout>: Process domain: 4 x 4 x 2
-Total Time: 7.55 sec
+[1,0]<stdout>: Total Time: 7.55 sec
-Setup Time: 0.12 sec
+[1,0]<stdout>: Setup Time: 0.12 sec
-Optimization Time: 0.25 sec
+[1,0]<stdout>: Optimization Time: 0.25 sec
-*** WARNING *** INVALID RUN
+[1,0]<stdout>: *** WARNING *** INVALID RUN
-DDOT   =  5849.4 GFlop/s (46794.9 GB/s)     182.8 GFlop/s per process ( 1462.3 GB/s per process)
+[1,0]<stdout>: DDOT   =  5849.4 GFlop/s ( 46794.9 GB/s)     182.8 GFlop/s per process ( 1462.3 GB/s per process)
-WAXPBY =  3052.0 GFlop/s (36623.8 GB/s)      95.4 GFlop/s per process ( 1144.5 GB/s per process)
+[1,0]<stdout>: WAXPBY =  3052.0 GFlop/s ( 36623.8 GB/s)      95.4 GFlop/s per process ( 1144.5 GB/s per process)
-SpMV   =  5473.9 GFlop/s (34468.8 GB/s)     171.1 GFlop/s per process ( 1077.1 GB/s per process)
+[1,0]<stdout>: SpMV   =  5473.9 GFlop/s ( 34468.8 GB/s)     171.1 GFlop/s per process ( 1077.1 GB/s per process)
-MG     =  7716.9 GFlop/s (59557.1 GB/s)     241.2 GFlop/s per process ( 1861.2 GB/s per process)
+[1,0]<stdout>: MG     =  7716.9 GFlop/s ( 59557.1 GB/s)     241.2 GFlop/s per process ( 1861.2 GB/s per process)
-Total  =  6971.0 GFlop/s (52859.9 GB/s)     217.8 GFlop/s per process ( 1651.9 GB/s per process)
+[1,0]<stdout>: Total  =  6971.0 GFlop/s ( 52859.9 GB/s)     217.8 GFlop/s per process ( 1651.9 GB/s per process)
-Final  =  6904.9 GFlop/s (52359.0 GB/s)     215.8 GFlop/s per process ( 1636.2 GB/s per process)
+[1,0]<stdout>: Final  =  6904.9 GFlop/s ( 52359.0 GB/s)     215.8 GFlop/s per process ( 1636.2 GB/s per process)
-*** WARNING *** THIS IS NOT A VALID RUN ***
+[1,0]<stdout>: *** WARNING *** THIS IS NOT A VALID RUN ***
 """
    @classmethod
@@ -135,8 +137,16 @@ def test_dtk_hpcg_result_parsing_failure_when_required_summary_is_missing(self):
        """Test DTK gpu-hpcg parsing failure when required summary is missing."""
        benchmark = self.get_benchmark()
        invalid_output = self.example_raw_output.replace(
-            'Process domain: 4 x 4 x 2\n\n',
+            '[1,0]<stdout>: Process domain: 4 x 4 x 2\n\n',
            '',
        )
        self.assertFalse(benchmark._process_raw_result(0, invalid_output))
+    def test_dtk_hpcg_result_parsing_ignores_non_root_mpi_rank(self):
+        """Test DTK gpu-hpcg parser accepts output from a non-root MPI rank that has no summary."""
+        non_root_env = {'OMPI_COMM_WORLD_RANK': '2'}
+        binding_only_output = '[1,2]<stdout>: [2]: Node Binding: Process 2 GPU: 2, NUMA: 0'
+        benchmark = self.get_benchmark()
+        # Only the parsing call needs to see the patched rank.
+        with patch.dict(os.environ, non_root_env):
+            succeeded = benchmark._process_raw_result(0, binding_only_output)
+        self.assertTrue(succeeded)