Unverified Commit 0a1a15ea authored by one's avatar one Committed by GitHub
Browse files

Benchmarks: Update gpu-hpcg metrics to encode process and problem shape (#8)

* Update gpu-hpcg metrics to encode process and problem shape

* Fix tests
parent d7a56e0b
......@@ -198,21 +198,20 @@ Performed by [rocHPCG](https://github.com/ROCm/rocHPCG).
#### Metrics
rocHPCG reports operation-level metrics for `final`, `ddot`, `waxpby`, `spmv`, `mg`, and `total`.
| Name | Unit | Description |
|-----------------------------------------------------|------------------|---------------------------------------------------------|
| gpu-hpcg/${operation}\_gflops | FLOPS (GFLOPS) | Throughput for the specified rocHPCG operation. |
| gpu-hpcg/${operation}\_bandwidth | bandwidth (GB/s) | Bandwidth for the specified rocHPCG operation. |
| gpu-hpcg/${operation}\_gflops_per_process | FLOPS (GFLOPS) | Per-process throughput for the specified operation. |
| gpu-hpcg/${operation}\_bandwidth_per_process | bandwidth (GB/s) | Per-process bandwidth for the specified operation. |
| gpu-hpcg/setup_time | time (s) | Setup phase duration. |
| gpu-hpcg/optimization_time | time (s) | Optimization phase duration. |
| gpu-hpcg/total_time | time (s) | Total runtime. |
| gpu-hpcg/is_valid | | Run validity inferred from rocHPCG invalid markers. |
| gpu-hpcg/local_domain_[x\|y\|z] | | Local domain size for each dimension. |
| gpu-hpcg/global_domain_[x\|y\|z] | | Global domain size for each dimension. |
| gpu-hpcg/process_domain_[x\|y\|z] | | Process topology for each dimension. |
rocHPCG reports performance and time metrics.
Performance metrics are reported for `final`, `ddot`, `waxpby`, `spmv`, `mg`, and `total`.
The metric key includes the configured process domain and local problem size:
`p${npx}x${npy}x${npz}_n${nx}x${ny}x${nz}`.
| Name | Unit | Description |
|--------------------------------------------------------------------------------------------------|------------------|---------------------------------------------------------|
| `gpu-hpcg/${operation}_p${npx}x${npy}x${npz}_n${nx}x${ny}x${nz}_gflops` | FLOPS (GFLOPS) | Throughput for the specified rocHPCG operation. |
| `gpu-hpcg/${operation}_p${npx}x${npy}x${npz}_n${nx}x${ny}x${nz}_bandwidth` | bandwidth (GB/s) | Bandwidth for the specified rocHPCG operation. |
| `gpu-hpcg/${operation}_p${npx}x${npy}x${npz}_n${nx}x${ny}x${nz}_gflops_per_process` | FLOPS (GFLOPS) | Per-process throughput for the specified operation. |
| `gpu-hpcg/${operation}_p${npx}x${npy}x${npz}_n${nx}x${ny}x${nz}_bandwidth_per_process` | bandwidth (GB/s) | Per-process bandwidth for the specified operation. |
| `gpu-hpcg/setup_time_p${npx}x${npy}x${npz}_n${nx}x${ny}x${nz}` | time (s) | Setup phase duration. |
| `gpu-hpcg/optimization_time_p${npx}x${npy}x${npz}_n${nx}x${ny}x${nz}` | time (s) | Optimization phase duration. |
| `gpu-hpcg/total_time_p${npx}x${npy}x${npz}_n${nx}x${ny}x${nz}` | time (s) | Total runtime. |
### `cpu-stream`
......
......@@ -12,45 +12,35 @@
# Matches 'rccl-bw[:variant]/<op>_<size>_<suffix>[:rank]' metric names.
_RCCL_PATTERN = re.compile(r'^(?P<bench>rccl-bw(?::[^/]+)?)/(?P<op>[^_]+)_(?P<size>\d+)_(?P<suffix>.+?)(?::\d+)?$')
# Matches 'gpu-hpcg[:variant]/<metric>[:rank]'; the <metric> part is parsed further by the patterns below.
_HPCG_PATTERN = re.compile(r'^(?P<bench>gpu-hpcg(?::[^/]+)?)/(?P<metric>.+?)(?::\d+)?$')
# Performance metrics: '<subject>_p<npx>x<npy>x<npz>_n<nx>x<ny>x<nz>_<type>',
# where the p-triple is the process domain and the n-triple the local problem size.
_HPCG_WORKLOAD_PATTERN = re.compile(
    r'^(?P<subject>final|ddot|waxpby|spmv|mg|total)_'
    r'p(?P<npx>\d+)x(?P<npy>\d+)x(?P<npz>\d+)_'
    r'n(?P<nx>\d+)x(?P<ny>\d+)x(?P<nz>\d+)_'
    r'(?P<type>gflops|bandwidth|gflops_per_process|bandwidth_per_process)$'
)
# Time metrics: '<subject>_p<npx>x<npy>x<npz>_n<nx>x<ny>x<nz>' (no trailing type token).
_HPCG_TIME_PATTERN = re.compile(
    r'^(?P<subject>setup_time|optimization_time|total_time)_'
    r'p(?P<npx>\d+)x(?P<npy>\d+)x(?P<npz>\d+)_'
    r'n(?P<nx>\d+)x(?P<ny>\d+)x(?P<nz>\d+)$'
)
_HPCG_METRIC_ORDER = {
'local_domain_x': 0,
'local_domain_y': 1,
'local_domain_z': 2,
'global_domain_x': 3,
'global_domain_y': 4,
'global_domain_z': 5,
'process_domain_x': 6,
'process_domain_y': 7,
'process_domain_z': 8,
'total_time': 9,
'setup_time': 10,
'optimization_time': 11,
'ddot_gflops': 12,
'ddot_bandwidth': 13,
'ddot_gflops_per_process': 14,
'ddot_bandwidth_per_process': 15,
'waxpby_gflops': 16,
'waxpby_bandwidth': 17,
'waxpby_gflops_per_process': 18,
'waxpby_bandwidth_per_process': 19,
'spmv_gflops': 20,
'spmv_bandwidth': 21,
'spmv_gflops_per_process': 22,
'spmv_bandwidth_per_process': 23,
'mg_gflops': 24,
'mg_bandwidth': 25,
'mg_gflops_per_process': 26,
'mg_bandwidth_per_process': 27,
'total_gflops': 28,
'total_bandwidth': 29,
'total_gflops_per_process': 30,
'total_bandwidth_per_process': 31,
'final_gflops': 32,
'final_bandwidth': 33,
'final_gflops_per_process': 34,
'final_bandwidth_per_process': 35,
'is_valid': 36,
# Relative ordering of HPCG metric subjects used when sorting metric names;
# unknown subjects fall back to 999 via .get() at the call sites.
_HPCG_SUBJECT_ORDER = {
    'setup_time': 0,
    'optimization_time': 1,
    'total_time': 2,
    'ddot': 3,
    'waxpby': 4,
    'spmv': 5,
    'mg': 6,
    'total': 7,
    'final': 8,
}
# Relative ordering of performance metric types within a single subject.
_HPCG_PERF_TYPE_ORDER = {
    'gflops': 0,
    'bandwidth': 1,
    'gflops_per_process': 2,
    'bandwidth_per_process': 3,
}
......@@ -70,6 +60,18 @@ def _rccl_sort_key(metric_name):
)
def _hpcg_workload_key(match):
"""Return a numeric sort key for the HPCG process domain and local problem size."""
return (
int(match.group('npx')),
int(match.group('npy')),
int(match.group('npz')),
int(match.group('nx')),
int(match.group('ny')),
int(match.group('nz')),
)
def _hpcg_sort_key(metric_name):
"""Sort HPCG metrics roughly in the order they appear in rocHPCG logs."""
match = _HPCG_PATTERN.match(metric_name)
......@@ -77,10 +79,34 @@ def _hpcg_sort_key(metric_name):
return None
metric = match.group('metric')
time_match = _HPCG_TIME_PATTERN.match(metric)
if time_match:
return (
1,
match.group('bench'),
_HPCG_SUBJECT_ORDER.get(time_match.group('subject'), 999),
0,
*_hpcg_workload_key(time_match),
metric_name,
)
workload_match = _HPCG_WORKLOAD_PATTERN.match(metric)
if workload_match:
subject = workload_match.group('subject')
metric_type = workload_match.group('type')
return (
1,
match.group('bench'),
_HPCG_SUBJECT_ORDER.get(subject, 999),
_HPCG_PERF_TYPE_ORDER.get(metric_type, 999),
*_hpcg_workload_key(workload_match),
metric_name,
)
return (
1,
match.group('bench'),
_HPCG_METRIC_ORDER.get(metric, 999),
_HPCG_SUBJECT_ORDER.get(metric, 999),
metric,
metric_name,
)
......
......@@ -27,15 +27,8 @@ class GpuHpcgBenchmark(MicroBenchmarkWithInvoke):
'Setup Time': 'setup_time',
'Optimization Time': 'optimization_time'
}
# Maps rocHPCG domain summary labels to metric-name prefixes.
_domain_metric_map = {
    'Local domain': 'local_domain',
    'Global domain': 'global_domain',
    'Process domain': 'process_domain'
}
# Captures a numeric value followed by its throughput/bandwidth unit.
_float_pattern = re.compile(r'([0-9]+(?:\.[0-9]+)?)\s+(GFlop/s|GB/s)')
# Captures 'X x Y x Z' dimension triples from domain summary lines.
_dimension_pattern = re.compile(r'([0-9]+)\s*x\s*([0-9]+)\s*x\s*([0-9]+)')
# Captures a duration in seconds, e.g. '7.55 sec'.
_time_value_pattern = re.compile(r'([0-9]+(?:\.[0-9]+)?)\s+sec')
# Substrings rocHPCG prints when a run is not valid.
_invalid_markers = ['*** WARNING *** INVALID RUN', '*** WARNING *** THIS IS NOT A VALID RUN ***']
def __init__(self, name, parameters=''):
"""Constructor.
......@@ -203,15 +196,6 @@ def _process_raw_result(self, cmd_idx, raw_output):
'setup_time',
'optimization_time',
'total_time',
'local_domain_x',
'local_domain_y',
'local_domain_z',
'global_domain_x',
'global_domain_y',
'global_domain_z',
'process_domain_x',
'process_domain_y',
'process_domain_z',
}
for raw_line in raw_output.splitlines():
......@@ -226,10 +210,6 @@ def _process_raw_result(self, cmd_idx, raw_output):
if self._parse_time_line(line, parsed_results):
continue
self._parse_domain_line(line, parsed_results)
parsed_results['is_valid'] = 0 if any(marker in raw_output for marker in self._invalid_markers) else 1
missing_metrics = sorted(metric for metric in required_metrics if metric not in parsed_results)
if missing_metrics:
logger.error(
......@@ -241,10 +221,32 @@ def _process_raw_result(self, cmd_idx, raw_output):
return False
for metric, value in parsed_results.items():
self._result.add_result(metric, value)
self._result.add_result(self._format_metric_name(metric), value)
return True
def _format_metric_name(self, metric):
"""Format a rocHPCG metric with the configured process domain and local problem size."""
metric_suffixes = (
'gflops_per_process',
'bandwidth_per_process',
'gflops',
'bandwidth',
)
workload = (
f'p{self._args.npx}x{self._args.npy}x{self._args.npz}_'
f'n{self._args.nx}x{self._args.ny}x{self._args.nz}'
)
if metric in self._time_metric_map.values():
return f'{metric}_{workload}'
for suffix in metric_suffixes:
suffix_token = f'_{suffix}'
if metric.endswith(suffix_token):
return f'{metric[:-len(suffix_token)]}_{workload}_{suffix}'
return metric
def _parse_operation_line(self, line, parsed_results):
"""Parse one rocHPCG operation summary line."""
operation_key = None
......@@ -284,20 +286,3 @@ def _parse_time_line(self, line, parsed_results):
return True
return False
def _parse_domain_line(self, line, parsed_results):
"""Parse one rocHPCG domain summary line."""
for label, metric_prefix in self._domain_metric_map.items():
if not line.startswith(label + ':'):
continue
match = self._dimension_pattern.search(line)
if not match:
return False
parsed_results[f'{metric_prefix}_x'] = int(match.group(1))
parsed_results[f'{metric_prefix}_y'] = int(match.group(2))
parsed_results[f'{metric_prefix}_z'] = int(match.group(3))
return True
return False
......@@ -90,214 +90,40 @@ superbench:
statistics: mean
categories: HPCG gpu-hpcg:r1
metrics:
- gpu-hpcg:r1/is_valid
- gpu-hpcg:r1/final_gflops
- gpu-hpcg:r1/final_bandwidth
- gpu-hpcg:r1/final_gflops_per_process
- gpu-hpcg:r1/final_bandwidth_per_process
- gpu-hpcg:r1/ddot_gflops
- gpu-hpcg:r1/ddot_bandwidth
- gpu-hpcg:r1/ddot_gflops_per_process
- gpu-hpcg:r1/ddot_bandwidth_per_process
- gpu-hpcg:r1/waxpby_gflops
- gpu-hpcg:r1/waxpby_bandwidth
- gpu-hpcg:r1/waxpby_gflops_per_process
- gpu-hpcg:r1/waxpby_bandwidth_per_process
- gpu-hpcg:r1/spmv_gflops
- gpu-hpcg:r1/spmv_bandwidth
- gpu-hpcg:r1/spmv_gflops_per_process
- gpu-hpcg:r1/spmv_bandwidth_per_process
- gpu-hpcg:r1/mg_gflops
- gpu-hpcg:r1/mg_bandwidth
- gpu-hpcg:r1/mg_gflops_per_process
- gpu-hpcg:r1/mg_bandwidth_per_process
- gpu-hpcg:r1/total_gflops
- gpu-hpcg:r1/total_bandwidth
- gpu-hpcg:r1/total_gflops_per_process
- gpu-hpcg:r1/total_bandwidth_per_process
- gpu-hpcg:r1/local_domain_x
- gpu-hpcg:r1/local_domain_y
- gpu-hpcg:r1/local_domain_z
- gpu-hpcg:r1/process_domain_x
- gpu-hpcg:r1/process_domain_y
- gpu-hpcg:r1/process_domain_z
- gpu-hpcg:r1/(setup_time|optimization_time|total_time)_p1x1x1_n560x280x280
- gpu-hpcg:r1/(ddot|waxpby|spmv|mg|total|final)_p1x1x1_n560x280x280_(gflops_per_process|bandwidth_per_process|gflops|bandwidth)
gpu_hpcg_r2:
statistics: mean
categories: HPCG gpu-hpcg:r2
metrics:
- gpu-hpcg:r2/is_valid
- gpu-hpcg:r2/final_gflops
- gpu-hpcg:r2/final_bandwidth
- gpu-hpcg:r2/final_gflops_per_process
- gpu-hpcg:r2/final_bandwidth_per_process
- gpu-hpcg:r2/ddot_gflops
- gpu-hpcg:r2/ddot_bandwidth
- gpu-hpcg:r2/ddot_gflops_per_process
- gpu-hpcg:r2/ddot_bandwidth_per_process
- gpu-hpcg:r2/waxpby_gflops
- gpu-hpcg:r2/waxpby_bandwidth
- gpu-hpcg:r2/waxpby_gflops_per_process
- gpu-hpcg:r2/waxpby_bandwidth_per_process
- gpu-hpcg:r2/spmv_gflops
- gpu-hpcg:r2/spmv_bandwidth
- gpu-hpcg:r2/spmv_gflops_per_process
- gpu-hpcg:r2/spmv_bandwidth_per_process
- gpu-hpcg:r2/mg_gflops
- gpu-hpcg:r2/mg_bandwidth
- gpu-hpcg:r2/mg_gflops_per_process
- gpu-hpcg:r2/mg_bandwidth_per_process
- gpu-hpcg:r2/total_gflops
- gpu-hpcg:r2/total_bandwidth
- gpu-hpcg:r2/total_gflops_per_process
- gpu-hpcg:r2/total_bandwidth_per_process
- gpu-hpcg:r2/local_domain_x
- gpu-hpcg:r2/local_domain_y
- gpu-hpcg:r2/local_domain_z
- gpu-hpcg:r2/process_domain_x
- gpu-hpcg:r2/process_domain_y
- gpu-hpcg:r2/process_domain_z
- gpu-hpcg:r2/(setup_time|optimization_time|total_time)_p2x1x1_n560x280x280
- gpu-hpcg:r2/(ddot|waxpby|spmv|mg|total|final)_p2x1x1_n560x280x280_(gflops_per_process|bandwidth_per_process|gflops|bandwidth)
gpu_hpcg_r4:
statistics: mean
categories: HPCG gpu-hpcg:r4
metrics:
- gpu-hpcg:r4/is_valid
- gpu-hpcg:r4/final_gflops
- gpu-hpcg:r4/final_bandwidth
- gpu-hpcg:r4/final_gflops_per_process
- gpu-hpcg:r4/final_bandwidth_per_process
- gpu-hpcg:r4/ddot_gflops
- gpu-hpcg:r4/ddot_bandwidth
- gpu-hpcg:r4/ddot_gflops_per_process
- gpu-hpcg:r4/ddot_bandwidth_per_process
- gpu-hpcg:r4/waxpby_gflops
- gpu-hpcg:r4/waxpby_bandwidth
- gpu-hpcg:r4/waxpby_gflops_per_process
- gpu-hpcg:r4/waxpby_bandwidth_per_process
- gpu-hpcg:r4/spmv_gflops
- gpu-hpcg:r4/spmv_bandwidth
- gpu-hpcg:r4/spmv_gflops_per_process
- gpu-hpcg:r4/spmv_bandwidth_per_process
- gpu-hpcg:r4/mg_gflops
- gpu-hpcg:r4/mg_bandwidth
- gpu-hpcg:r4/mg_gflops_per_process
- gpu-hpcg:r4/mg_bandwidth_per_process
- gpu-hpcg:r4/total_gflops
- gpu-hpcg:r4/total_bandwidth
- gpu-hpcg:r4/total_gflops_per_process
- gpu-hpcg:r4/total_bandwidth_per_process
- gpu-hpcg:r4/local_domain_x
- gpu-hpcg:r4/local_domain_y
- gpu-hpcg:r4/local_domain_z
- gpu-hpcg:r4/process_domain_x
- gpu-hpcg:r4/process_domain_y
- gpu-hpcg:r4/process_domain_z
- gpu-hpcg:r4/(setup_time|optimization_time|total_time)_p2x2x1_n560x280x280
- gpu-hpcg:r4/(ddot|waxpby|spmv|mg|total|final)_p2x2x1_n560x280x280_(gflops_per_process|bandwidth_per_process|gflops|bandwidth)
gpu_hpcg_r8:
statistics: mean
categories: HPCG gpu-hpcg:r8
metrics:
- gpu-hpcg:r8/is_valid
- gpu-hpcg:r8/final_gflops
- gpu-hpcg:r8/final_bandwidth
- gpu-hpcg:r8/final_gflops_per_process
- gpu-hpcg:r8/final_bandwidth_per_process
- gpu-hpcg:r8/ddot_gflops
- gpu-hpcg:r8/ddot_bandwidth
- gpu-hpcg:r8/ddot_gflops_per_process
- gpu-hpcg:r8/ddot_bandwidth_per_process
- gpu-hpcg:r8/waxpby_gflops
- gpu-hpcg:r8/waxpby_bandwidth
- gpu-hpcg:r8/waxpby_gflops_per_process
- gpu-hpcg:r8/waxpby_bandwidth_per_process
- gpu-hpcg:r8/spmv_gflops
- gpu-hpcg:r8/spmv_bandwidth
- gpu-hpcg:r8/spmv_gflops_per_process
- gpu-hpcg:r8/spmv_bandwidth_per_process
- gpu-hpcg:r8/mg_gflops
- gpu-hpcg:r8/mg_bandwidth
- gpu-hpcg:r8/mg_gflops_per_process
- gpu-hpcg:r8/mg_bandwidth_per_process
- gpu-hpcg:r8/total_gflops
- gpu-hpcg:r8/total_bandwidth
- gpu-hpcg:r8/total_gflops_per_process
- gpu-hpcg:r8/total_bandwidth_per_process
- gpu-hpcg:r8/local_domain_x
- gpu-hpcg:r8/local_domain_y
- gpu-hpcg:r8/local_domain_z
- gpu-hpcg:r8/process_domain_x
- gpu-hpcg:r8/process_domain_y
- gpu-hpcg:r8/process_domain_z
- gpu-hpcg:r8/(setup_time|optimization_time|total_time)_p2x2x2_n560x280x280
- gpu-hpcg:r8/(ddot|waxpby|spmv|mg|total|final)_p2x2x2_n560x280x280_(gflops_per_process|bandwidth_per_process|gflops|bandwidth)
gpu_hpcg_r16:
statistics: mean
categories: HPCG gpu-hpcg:r16
metrics:
- gpu-hpcg:r16/is_valid
- gpu-hpcg:r16/final_gflops
- gpu-hpcg:r16/final_bandwidth
- gpu-hpcg:r16/final_gflops_per_process
- gpu-hpcg:r16/final_bandwidth_per_process
- gpu-hpcg:r16/ddot_gflops
- gpu-hpcg:r16/ddot_bandwidth
- gpu-hpcg:r16/ddot_gflops_per_process
- gpu-hpcg:r16/ddot_bandwidth_per_process
- gpu-hpcg:r16/waxpby_gflops
- gpu-hpcg:r16/waxpby_bandwidth
- gpu-hpcg:r16/waxpby_gflops_per_process
- gpu-hpcg:r16/waxpby_bandwidth_per_process
- gpu-hpcg:r16/spmv_gflops
- gpu-hpcg:r16/spmv_bandwidth
- gpu-hpcg:r16/spmv_gflops_per_process
- gpu-hpcg:r16/spmv_bandwidth_per_process
- gpu-hpcg:r16/mg_gflops
- gpu-hpcg:r16/mg_bandwidth
- gpu-hpcg:r16/mg_gflops_per_process
- gpu-hpcg:r16/mg_bandwidth_per_process
- gpu-hpcg:r16/total_gflops
- gpu-hpcg:r16/total_bandwidth
- gpu-hpcg:r16/total_gflops_per_process
- gpu-hpcg:r16/total_bandwidth_per_process
- gpu-hpcg:r16/local_domain_x
- gpu-hpcg:r16/local_domain_y
- gpu-hpcg:r16/local_domain_z
- gpu-hpcg:r16/process_domain_x
- gpu-hpcg:r16/process_domain_y
- gpu-hpcg:r16/process_domain_z
- gpu-hpcg:r16/(setup_time|optimization_time|total_time)_p4x2x2_n560x280x280
- gpu-hpcg:r16/(ddot|waxpby|spmv|mg|total|final)_p4x2x2_n560x280x280_(gflops_per_process|bandwidth_per_process|gflops|bandwidth)
gpu_hpcg_r32:
statistics: mean
categories: HPCG gpu-hpcg:r32
metrics:
- gpu-hpcg:r32/is_valid
- gpu-hpcg:r32/final_gflops
- gpu-hpcg:r32/final_bandwidth
- gpu-hpcg:r32/final_gflops_per_process
- gpu-hpcg:r32/final_bandwidth_per_process
- gpu-hpcg:r32/ddot_gflops
- gpu-hpcg:r32/ddot_bandwidth
- gpu-hpcg:r32/ddot_gflops_per_process
- gpu-hpcg:r32/ddot_bandwidth_per_process
- gpu-hpcg:r32/waxpby_gflops
- gpu-hpcg:r32/waxpby_bandwidth
- gpu-hpcg:r32/waxpby_gflops_per_process
- gpu-hpcg:r32/waxpby_bandwidth_per_process
- gpu-hpcg:r32/spmv_gflops
- gpu-hpcg:r32/spmv_bandwidth
- gpu-hpcg:r32/spmv_gflops_per_process
- gpu-hpcg:r32/spmv_bandwidth_per_process
- gpu-hpcg:r32/mg_gflops
- gpu-hpcg:r32/mg_bandwidth
- gpu-hpcg:r32/mg_gflops_per_process
- gpu-hpcg:r32/mg_bandwidth_per_process
- gpu-hpcg:r32/total_gflops
- gpu-hpcg:r32/total_bandwidth
- gpu-hpcg:r32/total_gflops_per_process
- gpu-hpcg:r32/total_bandwidth_per_process
- gpu-hpcg:r32/local_domain_x
- gpu-hpcg:r32/local_domain_y
- gpu-hpcg:r32/local_domain_z
- gpu-hpcg:r32/process_domain_x
- gpu-hpcg:r32/process_domain_y
- gpu-hpcg:r32/process_domain_z
- gpu-hpcg:r32/(setup_time|optimization_time|total_time)_p4x4x2_n560x280x280
- gpu-hpcg:r32/(ddot|waxpby|spmv|mg|total|final)_p4x4x2_n560x280x280_(gflops_per_process|bandwidth_per_process|gflops|bandwidth)
......@@ -72,7 +72,15 @@ def get_benchmark(self):
"""Get benchmark."""
(benchmark_cls, _) = BenchmarkRegistry._BenchmarkRegistry__select_benchmark(self.benchmark_name, Platform.DTK)
benchmark = benchmark_cls(self.benchmark_name, parameters='')
benchmark._args = SimpleNamespace(log_raw_data=False)
benchmark._args = SimpleNamespace(
log_raw_data=False,
npx=4,
npy=4,
npz=2,
nx=560,
ny=280,
nz=280,
)
benchmark._curr_run_index = 0
benchmark._result = BenchmarkResult(self.benchmark_name, BenchmarkType.MICRO, ReturnCode.SUCCESS, run_count=1)
return benchmark
......@@ -93,51 +101,59 @@ def test_dtk_hpcg_result_parsing_with_wrapper_noise(self):
self.assertTrue(benchmark._process_raw_result(0, self.example_raw_output))
self.assertEqual(ReturnCode.SUCCESS, benchmark.return_code)
self.assertEqual(6904.9, benchmark.result['final_gflops'][0])
self.assertEqual(215.8, benchmark.result['final_gflops_per_process'][0])
self.assertEqual(5849.4, benchmark.result['ddot_gflops'][0])
self.assertEqual(46794.9, benchmark.result['ddot_bandwidth'][0])
self.assertEqual(182.8, benchmark.result['ddot_gflops_per_process'][0])
self.assertEqual(1462.3, benchmark.result['ddot_bandwidth_per_process'][0])
self.assertEqual(3052.0, benchmark.result['waxpby_gflops'][0])
self.assertEqual(36623.8, benchmark.result['waxpby_bandwidth'][0])
self.assertEqual(5473.9, benchmark.result['spmv_gflops'][0])
self.assertEqual(34468.8, benchmark.result['spmv_bandwidth'][0])
self.assertEqual(7716.9, benchmark.result['mg_gflops'][0])
self.assertEqual(59557.1, benchmark.result['mg_bandwidth'][0])
self.assertEqual(6971.0, benchmark.result['total_gflops'][0])
self.assertEqual(52859.9, benchmark.result['total_bandwidth'][0])
self.assertEqual(217.8, benchmark.result['total_gflops_per_process'][0])
self.assertEqual(1651.9, benchmark.result['total_bandwidth_per_process'][0])
self.assertEqual(0.12, benchmark.result['setup_time'][0])
self.assertEqual(0.25, benchmark.result['optimization_time'][0])
self.assertEqual(7.55, benchmark.result['total_time'][0])
self.assertEqual(0, benchmark.result['is_valid'][0])
self.assertEqual(560, benchmark.result['local_domain_x'][0])
self.assertEqual(280, benchmark.result['local_domain_y'][0])
self.assertEqual(280, benchmark.result['local_domain_z'][0])
self.assertEqual(2240, benchmark.result['global_domain_x'][0])
self.assertEqual(1120, benchmark.result['global_domain_y'][0])
self.assertEqual(560, benchmark.result['global_domain_z'][0])
self.assertEqual(4, benchmark.result['process_domain_x'][0])
self.assertEqual(4, benchmark.result['process_domain_y'][0])
self.assertEqual(2, benchmark.result['process_domain_z'][0])
workload = 'p4x4x2_n560x280x280'
expected_results = {
f'final_{workload}_gflops': 6904.9,
f'final_{workload}_gflops_per_process': 215.8,
f'final_{workload}_bandwidth': 52359.0,
f'final_{workload}_bandwidth_per_process': 1636.2,
f'ddot_{workload}_gflops': 5849.4,
f'ddot_{workload}_bandwidth': 46794.9,
f'ddot_{workload}_gflops_per_process': 182.8,
f'ddot_{workload}_bandwidth_per_process': 1462.3,
f'waxpby_{workload}_gflops': 3052.0,
f'waxpby_{workload}_bandwidth': 36623.8,
f'waxpby_{workload}_gflops_per_process': 95.4,
f'waxpby_{workload}_bandwidth_per_process': 1144.5,
f'spmv_{workload}_gflops': 5473.9,
f'spmv_{workload}_bandwidth': 34468.8,
f'spmv_{workload}_gflops_per_process': 171.1,
f'spmv_{workload}_bandwidth_per_process': 1077.1,
f'mg_{workload}_gflops': 7716.9,
f'mg_{workload}_bandwidth': 59557.1,
f'mg_{workload}_gflops_per_process': 241.2,
f'mg_{workload}_bandwidth_per_process': 1861.2,
f'total_{workload}_gflops': 6971.0,
f'total_{workload}_bandwidth': 52859.9,
f'total_{workload}_gflops_per_process': 217.8,
f'total_{workload}_bandwidth_per_process': 1651.9,
f'setup_time_{workload}': 0.12,
f'optimization_time_{workload}': 0.25,
f'total_time_{workload}': 7.55,
}
self.assertEqual(len(expected_results), len(benchmark.result) - benchmark.default_metric_count)
for metric, value in expected_results.items():
self.assertIn(metric, benchmark.result)
self.assertEqual(value, benchmark.result[metric][0])
for metric in benchmark.result:
self.assertNotIn('valid', metric)
self.assertNotIn('domain', metric)
self.assertIn('raw_output_0', benchmark.raw_data)
def test_dtk_hpcg_result_parsing_valid_by_absence_of_invalid_markers(self):
"""Test DTK gpu-hpcg valid detection by absence of invalid markers."""
def test_dtk_hpcg_result_parsing_ignores_invalid_markers(self):
"""Test DTK gpu-hpcg does not emit validity metrics."""
benchmark = self.get_benchmark()
valid_output = self.example_raw_output.replace('*** WARNING *** INVALID RUN', '')
valid_output = valid_output.replace('*** WARNING *** THIS IS NOT A VALID RUN ***', '')
self.assertTrue(benchmark._process_raw_result(0, valid_output))
self.assertEqual(1, benchmark.result['is_valid'][0])
self.assertTrue(benchmark._process_raw_result(0, self.example_raw_output))
self.assertFalse(any('valid' in metric for metric in benchmark.result))
def test_dtk_hpcg_result_parsing_failure_when_required_summary_is_missing(self):
"""Test DTK gpu-hpcg parsing failure when required summary is missing."""
benchmark = self.get_benchmark()
invalid_output = self.example_raw_output.replace(
'[1,0]<stdout>: Process domain: 4 x 4 x 2\n\n',
'[1,0]<stdout>: Final = 6904.9 GFlop/s ( 52359.0 GB/s) '
'215.8 GFlop/s per process ( 1636.2 GB/s per process)\n',
'',
)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment