Commit 2056d7fa authored by one's avatar one
Browse files

Add gpu-hpcg metrics

parent 4f69c7de
......@@ -187,6 +187,55 @@ Performed by [High-Performance Linpack Benchmark for Distributed-Memory Computer
| cpu-hpl/throughput | bandwidth (GFlops) | Compute bandwidth. |
| cpu-hpl/time | time (s) | Time elapsed during HPL run. |
### `gpu-hpcg`
#### Introduction
Measure GPU HPCG performance using `run_rochpcg`, which wraps `rocHPCG` execution and emits rocHPCG-native summary output.
When collecting multiple MPI scales or problem sizes, use separate benchmark section names such as
`gpu-hpcg:ranks8` and `gpu-hpcg:ranks4` instead of placing multiple `modes` under one `gpu-hpcg` section.
#### Metrics
| Name | Unit | Description |
|---------------------------------------|--------------------|------------------------------------------------------|
| gpu-hpcg/final_gflops | FLOPS (GFLOPS) | Final rocHPCG score. |
| gpu-hpcg/final_gflops_per_process | FLOPS (GFLOPS) | Final rocHPCG score per process. |
| gpu-hpcg/ddot_gflops | FLOPS (GFLOPS) | DDOT throughput. |
| gpu-hpcg/ddot_bandwidth | bandwidth (GB/s) | DDOT bandwidth. |
| gpu-hpcg/ddot_gflops_per_process | FLOPS (GFLOPS) | DDOT throughput per process. |
| gpu-hpcg/ddot_bandwidth_per_process | bandwidth (GB/s) | DDOT bandwidth per process. |
| gpu-hpcg/waxpby_gflops | FLOPS (GFLOPS) | WAXPBY throughput. |
| gpu-hpcg/waxpby_bandwidth | bandwidth (GB/s) | WAXPBY bandwidth. |
| gpu-hpcg/waxpby_gflops_per_process | FLOPS (GFLOPS) | WAXPBY throughput per process. |
| gpu-hpcg/waxpby_bandwidth_per_process | bandwidth (GB/s) | WAXPBY bandwidth per process. |
| gpu-hpcg/spmv_gflops | FLOPS (GFLOPS) | SpMV throughput. |
| gpu-hpcg/spmv_bandwidth | bandwidth (GB/s) | SpMV bandwidth. |
| gpu-hpcg/spmv_gflops_per_process | FLOPS (GFLOPS) | SpMV throughput per process. |
| gpu-hpcg/spmv_bandwidth_per_process | bandwidth (GB/s) | SpMV bandwidth per process. |
| gpu-hpcg/mg_gflops | FLOPS (GFLOPS) | MG throughput. |
| gpu-hpcg/mg_bandwidth | bandwidth (GB/s) | MG bandwidth. |
| gpu-hpcg/mg_gflops_per_process | FLOPS (GFLOPS) | MG throughput per process. |
| gpu-hpcg/mg_bandwidth_per_process | bandwidth (GB/s) | MG bandwidth per process. |
| gpu-hpcg/total_gflops | FLOPS (GFLOPS) | Aggregate rocHPCG throughput. |
| gpu-hpcg/total_bandwidth | bandwidth (GB/s) | Aggregate rocHPCG bandwidth. |
| gpu-hpcg/total_gflops_per_process | FLOPS (GFLOPS) | Aggregate rocHPCG throughput per process. |
| gpu-hpcg/total_bandwidth_per_process | bandwidth (GB/s) | Aggregate rocHPCG bandwidth per process. |
| gpu-hpcg/setup_time | time (s) | Setup phase duration. |
| gpu-hpcg/optimization_time | time (s) | Optimization phase duration. |
| gpu-hpcg/total_time | time (s) | Total runtime. |
| gpu-hpcg/is_valid | | 1 if the output contains no rocHPCG invalid-run marker, 0 otherwise. |
| gpu-hpcg/local_domain_x | | Local domain size in x dimension. |
| gpu-hpcg/local_domain_y | | Local domain size in y dimension. |
| gpu-hpcg/local_domain_z | | Local domain size in z dimension. |
| gpu-hpcg/global_domain_x | | Global domain size in x dimension. |
| gpu-hpcg/global_domain_y | | Global domain size in y dimension. |
| gpu-hpcg/global_domain_z | | Global domain size in z dimension. |
| gpu-hpcg/process_domain_x | | Process topology in x dimension. |
| gpu-hpcg/process_domain_y | | Process topology in y dimension. |
| gpu-hpcg/process_domain_z | | Process topology in z dimension. |
### `cpu-stream`
#### Introduction
......
......@@ -4,12 +4,32 @@
"""Module of the GPU HPCG benchmark base class."""
import os
import re
from superbench.common.utils import logger
from superbench.benchmarks.micro_benchmarks import MicroBenchmarkWithInvoke
class GpuHpcgBenchmark(MicroBenchmarkWithInvoke):
"""The GPU HPCG benchmark base class."""
# Maps the operation label captured from a rocHPCG summary line to the
# lowercase prefix used when building metric names (e.g. 'SpMV' -> 'spmv').
_operation_metric_map = {
'DDOT': 'ddot',
'WAXPBY': 'waxpby',
'SpMV': 'spmv',
'MG': 'mg',
'Total': 'total',
'Final': 'final',
}
# Matches one whole summary line of the shape
#   LABEL = A GFlop/s (B GB/s) C GFlop/s per process ( D GB/s per process)
# Groups: 1 = label, 2 = GFlop/s, 3 = GB/s, 4 = GFlop/s per process,
# 5 = GB/s per process. Numbers are plain decimals (no sign, no exponent).
_operation_pattern = re.compile(
r'^(DDOT|WAXPBY|SpMV|MG|Total|Final)\s*=\s*'
r'([0-9]+(?:\.[0-9]+)?)\s+GFlop/s\s+\(([0-9]+(?:\.[0-9]+)?)\s+GB/s\)\s+'
r'([0-9]+(?:\.[0-9]+)?)\s+GFlop/s per process\s+\(\s*([0-9]+(?:\.[0-9]+)?)\s+GB/s per process\)$'
)
# Matches a whole "Total Time|Setup Time|Optimization Time: N sec" line.
# Group 1 is the label, group 2 the duration in seconds.
_time_pattern = re.compile(r'^(Total Time|Setup Time|Optimization Time):\s*([0-9]+(?:\.[0-9]+)?)\s+sec$')
# Matches a whole "Local|Global|Process domain: X x Y x Z" line.
# Group 1 is the domain kind, groups 2-4 the integer x, y and z sizes.
_domain_pattern = re.compile(r'^(Local|Global|Process) domain:\s*([0-9]+)\s+x\s+([0-9]+)\s+x\s+([0-9]+)$')
# Substrings searched for in the raw output; if any is present the run is
# reported with is_valid = 0.
_invalid_markers = ['*** WARNING *** INVALID RUN', '*** WARNING *** THIS IS NOT A VALID RUN ***']
def __init__(self, name, parameters=''):
"""Constructor.
......@@ -131,14 +151,101 @@ def _preprocess(self):
return True
def _process_raw_result(self, cmd_idx, raw_output):
    """Parse rocHPCG stdout and save summarized results.

    Each non-empty line is stripped and tried against the operation, time and
    domain patterns in that order; lines matching none of them are ignored.
    ``is_valid`` is stored as 0 when any invalid-run marker occurs anywhere in
    the raw output and as 1 otherwise. Nothing is added to the result unless
    every required metric was found.

    Args:
        cmd_idx (int): the index of command corresponding with the raw_output.
        raw_output (str): raw output string of the micro-benchmark.

    Return:
        bool: True if rocHPCG summary metrics are extracted successfully.
    """
    self._result.add_raw_data('raw_output_' + str(cmd_idx), raw_output, self._args.log_raw_data)

    # Metrics that must all be present for the output to count as well formed.
    # 'final' contributes only the two GFLOPS metrics: its bandwidth figures
    # are deliberately not recorded by the parser below.
    required_metrics = {
        'final_gflops',
        'final_gflops_per_process',
        'setup_time',
        'optimization_time',
        'total_time',
    }
    for operation in ('ddot', 'waxpby', 'spmv', 'mg', 'total'):
        for suffix in ('gflops', 'bandwidth', 'gflops_per_process', 'bandwidth_per_process'):
            required_metrics.add(f'{operation}_{suffix}')
    for domain in ('local', 'global', 'process'):
        for axis in 'xyz':
            required_metrics.add(f'{domain}_domain_{axis}')

    parsed_results = {}
    for raw_line in raw_output.splitlines():
        line = raw_line.strip()
        if not line:
            continue

        operation_match = self._operation_pattern.match(line)
        if operation_match:
            label, gflops, bandwidth, gflops_per_process, bandwidth_per_process = operation_match.groups()
            prefix = self._operation_metric_map[label]
            parsed_results[f'{prefix}_gflops'] = float(gflops)
            parsed_results[f'{prefix}_gflops_per_process'] = float(gflops_per_process)
            if prefix != 'final':
                parsed_results[f'{prefix}_bandwidth'] = float(bandwidth)
                parsed_results[f'{prefix}_bandwidth_per_process'] = float(bandwidth_per_process)
            continue

        time_match = self._time_pattern.match(line)
        if time_match:
            # 'Setup Time' -> 'setup_time', and likewise for the other labels.
            metric_name = time_match.group(1).lower().replace(' ', '_')
            parsed_results[metric_name] = float(time_match.group(2))
            continue

        domain_match = self._domain_pattern.match(line)
        if domain_match:
            domain_prefix = domain_match.group(1).lower()
            for axis, size in zip('xyz', domain_match.group(2, 3, 4)):
                parsed_results[f'{domain_prefix}_domain_{axis}'] = int(size)

    # Validity is decided on the whole output, not line by line.
    parsed_results['is_valid'] = 0 if any(marker in raw_output for marker in self._invalid_markers) else 1

    missing_metrics = sorted(required_metrics.difference(parsed_results))
    if missing_metrics:
        logger.error(
            'The result format is invalid - round: %s, benchmark: %s, missing metrics: %s.',
            self._curr_run_index,
            self._name,
            ', '.join(missing_metrics),
        )
        return False

    for metric, value in parsed_results.items():
        self._result.add_result(metric, value)
    return True
# Copyright (c) Microsoft Corporation.
# Licensed under the MIT License.
"""Tests for DTK gpu-hpcg benchmark."""
import unittest
from types import SimpleNamespace
from tests.helper.testcase import BenchmarkTestCase
from superbench.benchmarks import BenchmarkRegistry, BenchmarkType, Platform, ReturnCode
from superbench.benchmarks.result import BenchmarkResult
class DtkHpcgBenchmarkTest(BenchmarkTestCase, unittest.TestCase):
    """Tests for DTK gpu-hpcg benchmark."""

    # Sample rocHPCG stdout used by the parsing tests. Besides the summary
    # lines it contains progress lines that match none of the parser patterns
    # and both invalid-run warning markers.
    example_raw_output = """
rocHPCG version: 0.8.8-62f1830-dirty (based on hpcg-3.1)
Setup Phase took 0.12 sec
Starting Reference CG Phase ...
Optimization Phase took 0.25 sec
Validation Testing Phase ...
Optimized CG Setup ...
HIP Initial Residual = 2.668768e+04
Total device memory usage: 19550 MByte (29152 MByte)
Starting Benchmarking Phase ...
Performing (at least) 2 CG sets in 1.0 seconds ...
CG set 1 / 2 6881.2186 GFlop/s (215.0381 GFlop/s per process) 50% 0.0 sec left
CG set 2 / 2 6904.9453 GFlop/s (215.7795 GFlop/s per process) 100% 0.0 sec left
Local domain: 560 x 280 x 280
Global domain: 2240 x 1120 x 560
Process domain: 4 x 4 x 2
Total Time: 7.55 sec
Setup Time: 0.12 sec
Optimization Time: 0.25 sec
*** WARNING *** INVALID RUN
DDOT = 5849.4 GFlop/s (46794.9 GB/s) 182.8 GFlop/s per process ( 1462.3 GB/s per process)
WAXPBY = 3052.0 GFlop/s (36623.8 GB/s) 95.4 GFlop/s per process ( 1144.5 GB/s per process)
SpMV = 5473.9 GFlop/s (34468.8 GB/s) 171.1 GFlop/s per process ( 1077.1 GB/s per process)
MG = 7716.9 GFlop/s (59557.1 GB/s) 241.2 GFlop/s per process ( 1861.2 GB/s per process)
Total = 6971.0 GFlop/s (52859.9 GB/s) 217.8 GFlop/s per process ( 1651.9 GB/s per process)
Final = 6904.9 GFlop/s (52359.0 GB/s) 215.8 GFlop/s per process ( 1636.2 GB/s per process)
*** WARNING *** THIS IS NOT A VALID RUN ***
"""

    @classmethod
    def setUpClass(cls):
        """Hook method for setting up class fixture before running tests in the class."""
        super().setUpClass()
        cls.benchmark_name = 'gpu-hpcg'
        cls.createMockEnvs(cls)
        cls.createMockFiles(cls, ['bin/run_rochpcg'])

    def get_benchmark(self):
        """Get a DTK gpu-hpcg benchmark with the state `_process_raw_result` reads set by hand."""
        (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.DTK)
        benchmark = benchmark_cls(self.benchmark_name, parameters='')
        benchmark._args = SimpleNamespace(log_raw_data=False)
        benchmark._curr_run_index = 0
        benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)
        return benchmark

    def test_dtk_hpcg_cls(self):
        """Test DTK gpu-hpcg benchmark class."""
        for platform in Platform:
            (benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, platform)
            if platform is Platform.DTK:
                self.assertIsNotNone(benchmark_cls)
            else:
                self.assertIsNone(benchmark_cls)

    def test_dtk_hpcg_result_parsing_with_wrapper_noise(self):
        """Test DTK gpu-hpcg result parsing with wrapper noise."""
        benchmark = self.get_benchmark()
        self.assertTrue(benchmark._process_raw_result(0, self.example_raw_output))
        self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)

        self.assertEqual(6904.9, benchmark.result['final_gflops'][0])
        self.assertEqual(215.8, benchmark.result['final_gflops_per_process'][0])
        self.assertEqual(5849.4, benchmark.result['ddot_gflops'][0])
        self.assertEqual(46794.9, benchmark.result['ddot_bandwidth'][0])
        self.assertEqual(182.8, benchmark.result['ddot_gflops_per_process'][0])
        self.assertEqual(1462.3, benchmark.result['ddot_bandwidth_per_process'][0])
        self.assertEqual(3052.0, benchmark.result['waxpby_gflops'][0])
        self.assertEqual(36623.8, benchmark.result['waxpby_bandwidth'][0])
        self.assertEqual(95.4, benchmark.result['waxpby_gflops_per_process'][0])
        self.assertEqual(1144.5, benchmark.result['waxpby_bandwidth_per_process'][0])
        self.assertEqual(5473.9, benchmark.result['spmv_gflops'][0])
        self.assertEqual(34468.8, benchmark.result['spmv_bandwidth'][0])
        self.assertEqual(171.1, benchmark.result['spmv_gflops_per_process'][0])
        self.assertEqual(1077.1, benchmark.result['spmv_bandwidth_per_process'][0])
        self.assertEqual(7716.9, benchmark.result['mg_gflops'][0])
        self.assertEqual(59557.1, benchmark.result['mg_bandwidth'][0])
        self.assertEqual(241.2, benchmark.result['mg_gflops_per_process'][0])
        self.assertEqual(1861.2, benchmark.result['mg_bandwidth_per_process'][0])
        self.assertEqual(6971.0, benchmark.result['total_gflops'][0])
        self.assertEqual(52859.9, benchmark.result['total_bandwidth'][0])
        self.assertEqual(217.8, benchmark.result['total_gflops_per_process'][0])
        self.assertEqual(1651.9, benchmark.result['total_bandwidth_per_process'][0])

        self.assertEqual(0.12, benchmark.result['setup_time'][0])
        self.assertEqual(0.25, benchmark.result['optimization_time'][0])
        self.assertEqual(7.55, benchmark.result['total_time'][0])
        # Both invalid-run markers are present in the fixture.
        self.assertEqual(0, benchmark.result['is_valid'][0])

        self.assertEqual(560, benchmark.result['local_domain_x'][0])
        self.assertEqual(280, benchmark.result['local_domain_y'][0])
        self.assertEqual(280, benchmark.result['local_domain_z'][0])
        self.assertEqual(2240, benchmark.result['global_domain_x'][0])
        self.assertEqual(1120, benchmark.result['global_domain_y'][0])
        self.assertEqual(560, benchmark.result['global_domain_z'][0])
        self.assertEqual(4, benchmark.result['process_domain_x'][0])
        self.assertEqual(4, benchmark.result['process_domain_y'][0])
        self.assertEqual(2, benchmark.result['process_domain_z'][0])
        self.assertIn('raw_output_0', benchmark.raw_data)

    def test_dtk_hpcg_result_parsing_valid_by_absence_of_invalid_markers(self):
        """Test DTK gpu-hpcg valid detection by absence of invalid markers."""
        benchmark = self.get_benchmark()
        valid_output = self.example_raw_output.replace('*** WARNING *** INVALID RUN', '')
        valid_output = valid_output.replace('*** WARNING *** THIS IS NOT A VALID RUN ***', '')
        self.assertTrue(benchmark._process_raw_result(0, valid_output))
        self.assertEqual(1, benchmark.result['is_valid'][0])

    def test_dtk_hpcg_result_parsing_failure_when_required_summary_is_missing(self):
        """Test DTK gpu-hpcg parsing failure when required summary is missing."""
        benchmark = self.get_benchmark()
        # Remove only the line itself so the edit does not depend on whether a
        # blank line follows it, and guard against the replace being a no-op.
        process_domain_line = 'Process domain: 4 x 4 x 2\n'
        self.assertIn(process_domain_line, self.example_raw_output)
        invalid_output = self.example_raw_output.replace(process_domain_line, '')
        self.assertFalse(benchmark._process_raw_result(0, invalid_output))
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment