Add gpu-hpcg metrics

2056d7fa · one · 4f69c7de · 2056d7fa · 2056d7fa · 2056d7fa
Commit 2056d7fa authored Apr 01, 2026 by one
3 changed files
--- a/docs/user-tutorial/benchmarks/micro-benchmarks.md
+++ b/docs/user-tutorial/benchmarks/micro-benchmarks.md
@@ -187,6 +187,55 @@ Performed by [High-Performance Linpack Benchmark for Distributed-Memory Computer
 | cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth.                                                        |
 | cpu-hpl/time       | time (s)           | Time elapsed during HPL run.                                              |

+### `gpu-hpcg`
+
+#### Introduction
+
+Measure GPU HPCG performance using `run_rochpcg`, which wraps `rocHPCG` execution and emits rocHPCG-native summary output.
+
+When collecting multiple MPI scales or problem sizes, use separate benchmark section names such as
+`gpu-hpcg:ranks8` and `gpu-hpcg:ranks4` instead of placing multiple `modes` under one `gpu-hpcg` section.
+
+#### Metrics
+
+| Name                                  | Unit               | Description                                          |
+|---------------------------------------|--------------------|------------------------------------------------------|
+| gpu-hpcg/final_gflops                 | FLOPS (GFLOPS)     | Final rocHPCG score.                                 |
+| gpu-hpcg/final_gflops_per_process     | FLOPS (GFLOPS)     | Final rocHPCG score per process.                     |
+| gpu-hpcg/ddot_gflops                  | FLOPS (GFLOPS)     | DDOT throughput.                                     |
+| gpu-hpcg/ddot_bandwidth               | bandwidth (GB/s)   | DDOT bandwidth.                                      |
+| gpu-hpcg/ddot_gflops_per_process      | FLOPS (GFLOPS)     | DDOT throughput per process.                         |
+| gpu-hpcg/ddot_bandwidth_per_process   | bandwidth (GB/s)   | DDOT bandwidth per process.                          |
+| gpu-hpcg/waxpby_gflops                | FLOPS (GFLOPS)     | WAXPBY throughput.                                   |
+| gpu-hpcg/waxpby_bandwidth             | bandwidth (GB/s)   | WAXPBY bandwidth.                                    |
+| gpu-hpcg/waxpby_gflops_per_process    | FLOPS (GFLOPS)     | WAXPBY throughput per process.                       |
+| gpu-hpcg/waxpby_bandwidth_per_process | bandwidth (GB/s)   | WAXPBY bandwidth per process.                        |
+| gpu-hpcg/spmv_gflops                  | FLOPS (GFLOPS)     | SpMV throughput.                                     |
+| gpu-hpcg/spmv_bandwidth               | bandwidth (GB/s)   | SpMV bandwidth.                                      |
+| gpu-hpcg/spmv_gflops_per_process      | FLOPS (GFLOPS)     | SpMV throughput per process.                         |
+| gpu-hpcg/spmv_bandwidth_per_process   | bandwidth (GB/s)   | SpMV bandwidth per process.                          |
+| gpu-hpcg/mg_gflops                    | FLOPS (GFLOPS)     | MG throughput.                                       |
+| gpu-hpcg/mg_bandwidth                 | bandwidth (GB/s)   | MG bandwidth.                                        |
+| gpu-hpcg/mg_gflops_per_process        | FLOPS (GFLOPS)     | MG throughput per process.                           |
+| gpu-hpcg/mg_bandwidth_per_process     | bandwidth (GB/s)   | MG bandwidth per process.                            |
+| gpu-hpcg/total_gflops                 | FLOPS (GFLOPS)     | Aggregate rocHPCG throughput.                        |
+| gpu-hpcg/total_bandwidth              | bandwidth (GB/s)   | Aggregate rocHPCG bandwidth.                         |
+| gpu-hpcg/total_gflops_per_process     | FLOPS (GFLOPS)     | Aggregate rocHPCG throughput per process.            |
+| gpu-hpcg/total_bandwidth_per_process  | bandwidth (GB/s)   | Aggregate rocHPCG bandwidth per process.             |
+| gpu-hpcg/setup_time                   | time (s)           | Setup phase duration.                                |
+| gpu-hpcg/optimization_time            | time (s)           | Optimization phase duration.                         |
+| gpu-hpcg/total_time                   | time (s)           | Total runtime.                                       |
+| gpu-hpcg/is_valid                     |                    | 1 if no rocHPCG invalid marker is found, else 0.     |
+| gpu-hpcg/local_domain_x               |                    | Local domain size in x dimension.                    |
+| gpu-hpcg/local_domain_y               |                    | Local domain size in y dimension.                    |
+| gpu-hpcg/local_domain_z               |                    | Local domain size in z dimension.                    |
+| gpu-hpcg/global_domain_x              |                    | Global domain size in x dimension.                   |
+| gpu-hpcg/global_domain_y              |                    | Global domain size in y dimension.                   |
+| gpu-hpcg/global_domain_z              |                    | Global domain size in z dimension.                   |
+| gpu-hpcg/process_domain_x             |                    | Process topology in x dimension.                     |
+| gpu-hpcg/process_domain_y             |                    | Process topology in y dimension.                     |
+| gpu-hpcg/process_domain_z             |                    | Process topology in z dimension.                     |
+
 ### `cpu-stream`

 #### Introduction

--- a/superbench/benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
+++ b/superbench/benchmarks/micro_benchmarks/gpu_hpcg_performance_base.py
@@ -4,12 +4,32 @@
 """Module of the GPU HPCG benchmark base class."""

 import os
+import re
+
+from superbench.common.utils import logger

 from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke


 class GpuHpcgBenchmark(MicroBenchmarkWithInvoke):
    """The GPU HPCG benchmark base class."""
+    _operation_metric_map = {  # rocHPCG summary label -> metric name prefix
+        'DDOT': 'ddot',
+        'WAXPBY': 'waxpby',
+        'SpMV': 'spmv',
+        'MG': 'mg',
+        'Total': 'total',
+        'Final': 'final',
+    }
+    _operation_pattern = re.compile(  # groups: label, GFlop/s, GB/s, GFlop/s per process, GB/s per process
+        r'^(DDOT|WAXPBY|SpMV|MG|Total|Final)\s*=\s*'
+        r'([0-9]+(?:\.[0-9]+)?)\s+GFlop/s\s+\(([0-9]+(?:\.[0-9]+)?)\s+GB/s\)\s+'
+        r'([0-9]+(?:\.[0-9]+)?)\s+GFlop/s per process\s+\(\s*([0-9]+(?:\.[0-9]+)?)\s+GB/s per process\)$'
+    )
+    _time_pattern = re.compile(r'^(Total Time|Setup Time|Optimization Time):\s*([0-9]+(?:\.[0-9]+)?)\s+sec$')
+    _domain_pattern = re.compile(r'^(Local|Global|Process) domain:\s*([0-9]+)\s+x\s+([0-9]+)\s+x\s+([0-9]+)$')
+    _invalid_markers = ['*** WARNING *** INVALID RUN', '*** WARNING *** THIS IS NOT A VALID RUN ***']
+
    def __init__(self, name, parameters=''):
        """Constructor.

@@ -131,14 +151,101 @@ class GpuHpcgBenchmark(MicroBenchmarkWithInvoke):
        return True

    def _process_raw_result(self, cmd_idx, raw_output):
-        """Save raw output for later parser refinement.
+        """Parse rocHPCG stdout and save summarized results.

        Args:
            cmd_idx (int): the index of command corresponding with the raw_output.
            raw_output (str): raw output string of the micro-benchmark.

        Return:
-            bool: Always True for now.
+            bool: True if all required metrics were parsed and saved, False if any required metric is missing.
        """
        self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)
+
+        parsed_results = {}  # metric name -> parsed value
+        required_metrics = {  # is_valid is always set below, so it is not listed here.
+            'final_gflops',
+            'final_gflops_per_process',
+            'ddot_gflops',
+            'ddot_bandwidth',
+            'ddot_gflops_per_process',
+            'ddot_bandwidth_per_process',
+            'waxpby_gflops',
+            'waxpby_bandwidth',
+            'waxpby_gflops_per_process',
+            'waxpby_bandwidth_per_process',
+            'spmv_gflops',
+            'spmv_bandwidth',
+            'spmv_gflops_per_process',
+            'spmv_bandwidth_per_process',
+            'mg_gflops',
+            'mg_bandwidth',
+            'mg_gflops_per_process',
+            'mg_bandwidth_per_process',
+            'total_gflops',
+            'total_bandwidth',
+            'total_gflops_per_process',
+            'total_bandwidth_per_process',
+            'setup_time',
+            'optimization_time',
+            'total_time',
+            'local_domain_x',
+            'local_domain_y',
+            'local_domain_z',
+            'global_domain_x',
+            'global_domain_y',
+            'global_domain_z',
+            'process_domain_x',
+            'process_domain_y',
+            'process_domain_z',
+        }
+
+        for raw_line in raw_output.splitlines():  # Each non-empty line is tried against the patterns in order.
+            line = raw_line.strip()
+            if not line:
+                continue
+
+            operation_match = self._operation_pattern.match(line)
+            if operation_match:
+                prefix = self._operation_metric_map[operation_match.group(1)]
+                total_gflops = float(operation_match.group(2))
+                total_bandwidth = float(operation_match.group(3))
+                per_process_gflops = float(operation_match.group(4))
+                per_process_bandwidth = float(operation_match.group(5))
+
+                parsed_results[f'{prefix}_gflops'] = total_gflops
+                parsed_results[f'{prefix}_gflops_per_process'] = per_process_gflops
+                if prefix != 'final':  # Bandwidth on the Final line is matched but not stored.
+                    parsed_results[f'{prefix}_bandwidth'] = total_bandwidth
+                    parsed_results[f'{prefix}_bandwidth_per_process'] = per_process_bandwidth
+                continue
+
+            time_match = self._time_pattern.match(line)
+            if time_match:
+                metric_prefix = time_match.group(1).lower().replace(' ', '_')  # e.g. 'Total Time' -> 'total_time'
+                parsed_results[metric_prefix] = float(time_match.group(2))
+                continue
+
+            domain_match = self._domain_pattern.match(line)
+            if domain_match:
+                domain_prefix = domain_match.group(1).lower()
+                parsed_results[f'{domain_prefix}_domain_x'] = int(domain_match.group(2))
+                parsed_results[f'{domain_prefix}_domain_y'] = int(domain_match.group(3))
+                parsed_results[f'{domain_prefix}_domain_z'] = int(domain_match.group(4))
+
+        parsed_results['is_valid'] = 0 if any(marker in raw_output for marker in self._invalid_markers) else 1
+
+        missing_metrics = sorted(metric for metric in required_metrics if metric not in parsed_results)
+        if missing_metrics:
+            logger.error(
+                'The result format is invalid - round: %s, benchmark: %s, missing metrics: %s.',
+                self._curr_run_index,
+                self._name,
+                ', '.join(missing_metrics),
+            )
+            return False  # No metric is recorded for an incomplete summary.
+
+        for metric, value in parsed_results.items():
+            self._result.add_result(metric, value)
+
        return True
--- a/tests/benchmarks/micro_benchmarks/test_dtk_hpcg_performance.py
+++ b/tests/benchmarks/micro_benchmarks/test_dtk_hpcg_performance.py
+# Copyright (c) Microsoft Corporation.
+# Licensed under the MIT License.
+
+"""Tests for DTK gpu-hpcg benchmark."""
+
+import unittest
+from types import SimpleNamespace
+
+from tests.helper.testcase import BenchmarkTestCase
+from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, Platform, ReturnCode
+from superbench.benchmarks.result import BenchmarkResult
+
+
+class DtkHpcgBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
+    """Tests for DTK gpu-hpcg benchmark registration and rocHPCG output parsing."""

+    example_raw_output = """
+rocHPCG version: 0.8.8-62f1830-dirty (based on hpcg-3.1)

+Setup Phase took 0.12 sec

+Starting Reference CG Phase ...


+Optimization Phase took 0.25 sec

+Validation Testing Phase ...

+Optimized CG Setup ...

+HIP Initial Residual = 2.668768e+04

+Total device memory usage: 19550 MByte (29152 MByte)

+Starting Benchmarking Phase ...

+Performing (at least) 2 CG sets in 1.0 seconds ...
+CG set 1 / 2    6881.2186 GFlop/s     (215.0381 GFlop/s per process)    50%    0.0 sec left
+CG set 2 / 2    6904.9453 GFlop/s     (215.7795 GFlop/s per process)    100%    0.0 sec left

+Local domain: 560 x 280 x 280
+Global domain: 2240 x 1120 x 560
+Process domain: 4 x 4 x 2

+Total Time: 7.55 sec
+Setup Time: 0.12 sec
+Optimization Time: 0.25 sec

+*** WARNING *** INVALID RUN

+DDOT   =  5849.4 GFlop/s (46794.9 GB/s)     182.8 GFlop/s per process ( 1462.3 GB/s per process)
+WAXPBY =  3052.0 GFlop/s (36623.8 GB/s)      95.4 GFlop/s per process ( 1144.5 GB/s per process)
+SpMV   =  5473.9 GFlop/s (34468.8 GB/s)     171.1 GFlop/s per process ( 1077.1 GB/s per process)
+MG     =  7716.9 GFlop/s (59557.1 GB/s)     241.2 GFlop/s per process ( 1861.2 GB/s per process)
+Total  =  6971.0 GFlop/s (52859.9 GB/s)     217.8 GFlop/s per process ( 1651.9 GB/s per process)
+Final  =  6904.9 GFlop/s (52359.0 GB/s)     215.8 GFlop/s per process ( 1636.2 GB/s per process)

+*** WARNING *** THIS IS NOT A VALID RUN ***
+"""

+    @classmethod
+    def setUpClass(cls):
+        """Hook method for setting up class fixture before running tests in the class."""
+        super().setUpClass()
+        cls.benchmark_name = 'gpu-hpcg'
+        cls.createMockEnvs(cls)
+        cls.createMockFiles(cls, ['bin/run_rochpcg'])

+    def get_benchmark(self):
+        """Create a DTK gpu-hpcg benchmark with _args, _curr_run_index and _result set for parsing tests."""
+        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.DTK)
+        benchmark = benchmark_cls(self.benchmark_name, parameters='')
+        benchmark._args = SimpleNamespace(log_raw_data=False)
+        benchmark._curr_run_index = 0
+        benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)
+        return benchmark

+    def test_dtk_hpcg_cls(self):
+        """Test that gpu-hpcg resolves to a benchmark class on DTK and to None on other platforms."""
+        for platform in Platform:
+            (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, platform)
+            if platform is Platform.DTK:
+                self.assertIsNotNone(benchmark_cls)
+            else:
+                self.assertIsNone(benchmark_cls)

+    def test_dtk_hpcg_result_parsing_with_wrapper_noise(self):
+        """Test DTK gpu-hpcg result parsing with wrapper noise (non-summary lines around the summary)."""
+        benchmark = self.get_benchmark()

+        self.assertTrue(benchmark._process_raw_result(0, self.example_raw_output))
+        self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)

+        self.assertEqual(6904.9, benchmark.result['final_gflops'][0])
+        self.assertEqual(215.8, benchmark.result['final_gflops_per_process'][0])
+        self.assertEqual(5849.4, benchmark.result['ddot_gflops'][0])
+        self.assertEqual(46794.9, benchmark.result['ddot_bandwidth'][0])
+        self.assertEqual(182.8, benchmark.result['ddot_gflops_per_process'][0])
+        self.assertEqual(1462.3, benchmark.result['ddot_bandwidth_per_process'][0])
+        self.assertEqual(3052.0, benchmark.result['waxpby_gflops'][0])
+        self.assertEqual(36623.8, benchmark.result['waxpby_bandwidth'][0])
+        self.assertEqual(5473.9, benchmark.result['spmv_gflops'][0])
+        self.assertEqual(34468.8, benchmark.result['spmv_bandwidth'][0])
+        self.assertEqual(7716.9, benchmark.result['mg_gflops'][0])
+        self.assertEqual(59557.1, benchmark.result['mg_bandwidth'][0])
+        self.assertEqual(6971.0, benchmark.result['total_gflops'][0])
+        self.assertEqual(52859.9, benchmark.result['total_bandwidth'][0])
+        self.assertEqual(217.8, benchmark.result['total_gflops_per_process'][0])
+        self.assertEqual(1651.9, benchmark.result['total_bandwidth_per_process'][0])
+        self.assertEqual(0.12, benchmark.result['setup_time'][0])
+        self.assertEqual(0.25, benchmark.result['optimization_time'][0])
+        self.assertEqual(7.55, benchmark.result['total_time'][0])
+        self.assertEqual(0, benchmark.result['is_valid'][0])  # sample log contains both invalid-run markers
+        self.assertEqual(560, benchmark.result['local_domain_x'][0])
+        self.assertEqual(280, benchmark.result['local_domain_y'][0])
+        self.assertEqual(280, benchmark.result['local_domain_z'][0])
+        self.assertEqual(2240, benchmark.result['global_domain_x'][0])
+        self.assertEqual(1120, benchmark.result['global_domain_y'][0])
+        self.assertEqual(560, benchmark.result['global_domain_z'][0])
+        self.assertEqual(4, benchmark.result['process_domain_x'][0])
+        self.assertEqual(4, benchmark.result['process_domain_y'][0])
+        self.assertEqual(2, benchmark.result['process_domain_z'][0])
+        self.assertIn('raw_output_0', benchmark.raw_data)

+    def test_dtk_hpcg_result_parsing_valid_by_absence_of_invalid_markers(self):
+        """Test DTK gpu-hpcg valid detection by absence of invalid markers."""
+        benchmark = self.get_benchmark()
+        valid_output = self.example_raw_output.replace('*** WARNING *** INVALID RUN', '')
+        valid_output = valid_output.replace('*** WARNING *** THIS IS NOT A VALID RUN ***', '')

+        self.assertTrue(benchmark._process_raw_result(0, valid_output))
+        self.assertEqual(1, benchmark.result['is_valid'][0])

+    def test_dtk_hpcg_result_parsing_failure_when_required_summary_is_missing(self):
+        """Test DTK gpu-hpcg parsing failure when required summary is missing."""
+        benchmark = self.get_benchmark()
+        invalid_output = self.example_raw_output.replace(
+            'Process domain: 4 x 4 x 2\n\n',  # removes the only "Process domain" line
+            '',
+        )

+        self.assertFalse(benchmark._process_raw_result(0, invalid_output))