Unverified Commit 511807b7 authored by one's avatar one Committed by GitHub
Browse files

Config: Update config files (#7)

- Add BW150 config
- Update BW1000 config
- Merge summary rules
parent 0993db75
......@@ -6,7 +6,7 @@ superbench:
enable: false
var:
default_local_mode: &default_local_mode
enable: false
enable: true
modes:
- name: local
proc_num: 8
......@@ -44,23 +44,23 @@ superbench:
n: 8192
k: 8192
hipblaslt-gemm:
enable: false
enable: true
modes:
- name: local
proc_num: 8
prefix: HIP_VISIBLE_DEVICES={proc_rank}
parallel: yes
- name: local
proc_num: 8
prefix: HIP_VISIBLE_DEVICES={proc_rank}
parallel: yes
parameters:
in_types: ["fp32", "fp16", "bf16"]
in_types: [ "fp32", "fp16", "bf16" ]
tolerant_fail: yes
num_warmup: 100
num_steps: 1000
shapes:
- 4096,4096,4096
- 8192,8192,8192
- 16384,16384,16384
- 4096,4096,4096
- 8192,8192,8192
- 16384,16384,16384
gpu-stream:
enable: false
enable: true
modes:
- name: local
proc_num: 8
......@@ -71,7 +71,7 @@ superbench:
num_loops: 100
precision: double
rccl-bw:allreduce-r16:
enable: true
enable: false
modes:
- name: mpi
proc_num: 8
......@@ -92,57 +92,37 @@ superbench:
maxbytes: 16G
ngpus: 1
operation: allreduce
rccl-bw:allreduce-r8-pcie:
rccl-bw:allreduce-r8:
enable: true
modes:
- name: mpi
proc_num: 8
node_num: 1
bind_to: none
mca:
pml: ucx
btl: ^openib
btl_tcp_if_exclude: lo,docker0
coll_hcoll_enable: 0
env:
ROCM_PATH: /opt/dtk
HSA_FORCE_FINE_GRAIN_PCIE: 1
NCCL_SOCKET_IFNAME: p14p2
NCCL_NET_GDR_LEVEL: PHB
NCCL_NET_GDR_READ: 1
NCCL_BUFFSIZE: 4194304
NCCL_SIMPLE_CHANNELS: 32
RCCL_P2P_XHCL_CHANNEL_NUM: 31
RCCL_COLL_XHCL_CHANNEL_NUM: 28
parameters:
maxbytes: 16G
ngpus: 1
operation: allreduce
rccl-bw:allreduce-r8:
enable: false
rccl-bw:allreduce-r8-pcie:
enable: true
modes:
- name: mpi
proc_num: 8
node_num: 1
bind_to: none
mca:
pml: ucx
btl: ^openib
btl_tcp_if_exclude: lo,docker0
coll_hcoll_enable: 0
env:
ROCM_PATH: /opt/dtk
HSA_FORCE_FINE_GRAIN_PCIE: 1
NCCL_SOCKET_IFNAME: p14p2
NCCL_NET_GDR_LEVEL: PHB
NCCL_NET_GDR_READ: 1
NCCL_BUFFSIZE: 4194304
NCCL_SIMPLE_CHANNELS: 32
RCCL_P2P_XHCL_CHANNEL_NUM: 31
RCCL_COLL_XHCL_CHANNEL_NUM: 28
parameters:
maxbytes: 16G
ngpus: 1
operation: allreduce
rccl-bw:alltoall-r16:
enable: true
enable: false
modes:
- name: mpi
proc_num: 8
......@@ -163,6 +143,17 @@ superbench:
maxbytes: 16G
ngpus: 1
operation: alltoall
rccl-bw:alltoall-r8:
enable: true
modes:
- name: mpi
proc_num: 8
node_num: 1
bind_to: none
parameters:
maxbytes: 16G
ngpus: 1
operation: alltoall
gpu-hpcg:r32:
enable: false
modes:
......@@ -182,12 +173,12 @@ superbench:
nx: 560
ny: 280
nz: 280
rt: 60
rt: 10
npx: 4
npy: 4
npz: 2
gpu-hpcg:r16:
enable: true
enable: false
modes:
- name: mpi
proc_num: 8
......@@ -205,7 +196,7 @@ superbench:
nx: 560
ny: 280
nz: 280
rt: 0
rt: 10
npx: 4
npy: 2
npz: 2
......@@ -220,7 +211,7 @@ superbench:
nx: 560
ny: 280
nz: 280
rt: 60
rt: 10
npx: 2
npy: 2
npz: 2
......@@ -235,7 +226,7 @@ superbench:
nx: 560
ny: 280
nz: 280
rt: 60
rt: 10
npx: 2
npy: 2
npz: 1
......@@ -250,7 +241,7 @@ superbench:
nx: 560
ny: 280
nz: 280
rt: 60
rt: 10
npx: 2
npy: 1
npz: 1
......@@ -265,7 +256,7 @@ superbench:
nx: 560
ny: 280
nz: 280
rt: 60
rt: 10
npx: 1
npy: 1
npz: 1
......@@ -281,7 +272,7 @@ superbench:
- latency_matrix
- max_bandwidth
mem-bw:
enable: false
enable: true
modes:
- name: local
proc_num: 8
......@@ -290,10 +281,10 @@ superbench:
ib-loopback:
enable: false
modes:
- name: local
proc_num: 16
prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8))
parallel: no
- name: local
proc_num: 16
prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8))
parallel: no
parameters:
msg_size: 8388608
disk-benchmark:
......@@ -305,7 +296,7 @@ superbench:
parameters:
block_devices: []
gpu-copy-bw:correctness:
enable: false
enable: true
modes:
- name: local
parallel: no
......@@ -317,7 +308,7 @@ superbench:
num_loops: 1
check_data: true
gpu-copy-bw:perf:
enable: false
enable: true
modes:
- name: local
parallel: no
......
# SuperBench summary rules for hygon_bw1000.yaml
#
# Usage:
#   sb result summary \
#     --data-file <output-dir>/results-summary.jsonl \
#     --rule-file superbench/config/hygon_bw1000_summary.yaml \
#     --output-file-format md \
#     --output-dir <summary-output-dir>
#
# Notes:
# - This file focuses on RCCL benchmarks defined in
#   superbench/config/hygon_bw1000.yaml.
# - Unmatched benchmark sections are allowed. If a benchmark was not run,
#   the corresponding category in the summary will be empty.
---
version: v0.12
superbench:
  rules:
    rccl_bw_allreduce_r16_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r16
      metrics:
        # Message sizes (bytes) selected for reporting: 8 MiB .. 16 GiB.
        - rccl-bw:allreduce-r16/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r16/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_allreduce_r8_pcie_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r8-pcie
      metrics:
        - rccl-bw:allreduce-r8-pcie/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r8-pcie/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_allreduce_r8_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r8
      metrics:
        - rccl-bw:allreduce-r8/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r8/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_alltoall_r16_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:alltoall-r16
      metrics:
        - rccl-bw:alltoall-r16/alltoall_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:alltoall-r16/alltoall_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
# SuperBench Config
---
version: v0.12
superbench:
  enable: null
  monitor:
    enable: false
  var:
    # Shared anchor: one process per GPU, pinned via HIP_VISIBLE_DEVICES.
    default_local_mode: &default_local_mode
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: true
    default_pytorch_mode: &default_pytorch_mode
      enable: true
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch
    # Anchor is currently unreferenced in this file; kept for model benchmarks.
    common_model_config: &common_model_config
      model_ddp_parameter: &model_ddp_param
        duration: 0
        num_warmup: 128
        num_steps: 512
        sample_count: 8192
        batch_size: 128
        precision: [float32, float16]
        model_action: [train]
        pin_memory: true
        num_workers: 0
  benchmarks:
    kernel-launch:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
      parameters:
        m: 7680
        n: 8192
        k: 8192
    hipblaslt-gemm:
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: true
      parameters:
        in_types: ["fp32", "fp16", "bf16"]
        tolerant_fail: true
        num_warmup: 100
        num_steps: 1000
        shapes:
          - 4096,4096,4096
          - 8192,8192,8192
          - 16384,16384,16384
    gpu-stream:
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: true
      parameters:
        array_size: 268435456
        num_loops: 100
        precision: double
    rccl-bw:allreduce-r8:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:allreduce-r4:
      enable: true
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
          env:
            NCCL_BUFFSIZE: 4194304
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:allreduce-r4-graph:
      enable: true
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
          env:
            NCCL_BUFFSIZE: 4194304
            NCCL_RINGS: "0 1 2 3|0 3 2 1|0 1 3 2|0 2 3 1|0 2 1 3|0 3 1 2"
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:allreduce-r4-pcie:
      enable: true
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
          env:
            NCCL_BUFFSIZE: 4194304
            NCCL_SIMPLE_CHANNELS: 20
            RCCL_P2P_XHCL_CHANNEL_NUM: 16
            RCCL_COLL_XHCL_CHANNEL_NUM: 16
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:allreduce-r4-graph-pcie:
      enable: true
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
          env:
            NCCL_BUFFSIZE: 4194304
            NCCL_SIMPLE_CHANNELS: 20
            RCCL_P2P_XHCL_CHANNEL_NUM: 16
            RCCL_COLL_XHCL_CHANNEL_NUM: 16
            NCCL_RINGS: "0 1 2 3|0 3 2 1|0 1 3 2|0 2 3 1|0 2 1 3|0 3 1 2"
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:alltoall-r8:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: alltoall
    rccl-bw:alltoall-r4:
      enable: true
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: alltoall
    gpu-hpcg:r32:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 4
          host_list: [node01, node02, node03, node04]
          bind_to: none
          mca:
            pml: ob1
            btl: ^openib
            btl_tcp_if_include: p14p2
            coll_hcoll_enable: 0
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 10
        npx: 4
        npy: 4
        npz: 2
    gpu-hpcg:r16:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          host_list: [node01, node02]
          bind_to: none
          mca:
            pml: ob1
            btl: ^openib
            btl_tcp_if_include: p14p2
            coll_hcoll_enable: 0
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 10
        npx: 4
        npy: 2
        npz: 2
    gpu-hpcg:r8:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 10
        npx: 2
        npy: 2
        npz: 2
    gpu-hpcg:r4:
      enable: false
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 10
        npx: 2
        npy: 2
        npz: 1
    gpu-hpcg:r2:
      enable: false
      modes:
        - name: mpi
          proc_num: 2
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 10
        npx: 2
        npy: 1
        npz: 1
    gpu-hpcg:r1:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 10
        npx: 1
        npy: 1
        npz: 1
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: false
    ib-loopback:
      enable: false
      modes:
        - name: local
          proc_num: 16
          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8))
          parallel: false
      parameters:
        msg_size: 8388608
    disk-benchmark:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        block_devices: []
    gpu-copy-bw:correctness:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
        size: 4096
        num_warm_up: 0
        num_loops: 1
        check_data: true
    gpu-copy-bw:perf:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
    # dist-inference:
    #   modes:
    #     - name: mpi
    #       proc_num: 8
    #       node_num: 1
    #       mca:
    #         pml: ob1
    #         btl: ^openib
    #         btl_tcp_if_exclude: lo,docker0
    #         coll_hcoll_enable: 0
    #   frameworks:
    #     - pytorch
    #   parameters:
    #     num_layers: 50
    #     num_warmup: 20
    #     num_steps: 100
    #     use_cuda_graph: true
    #     precision: float16
    #     hidden_size: 128
    #     input_size: 128
    #     batch_size: 1024
# SuperBench summary rules for Hygon BW configurations
#
# Usage:
#   sb result summary \
#     --data-file <output-dir>/results-summary.jsonl \
#     --rule-file superbench/config/hygon_bw_summary.yaml \
#     --output-file-format md \
#     --output-dir <summary-output-dir>
#
# Notes:
# - This file covers common RCCL and GPU HPCG benchmark metrics used by
#   Hygon BW configuration files.
# - Unmatched benchmark sections are allowed. If a benchmark was not run,
#   the corresponding category in the summary will be empty.
---
version: v0.12
superbench:
  rules:
    rccl_bw_allreduce_r16_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r16
      metrics:
        # Message sizes (bytes) selected for reporting: 8 MiB .. 16 GiB.
        - rccl-bw:allreduce-r16/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r16/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_allreduce_r8_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r8
      metrics:
        - rccl-bw:allreduce-r8/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r8/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_allreduce_r8_pcie_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r8-pcie
      metrics:
        - rccl-bw:allreduce-r8-pcie/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r8-pcie/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_allreduce_r4_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r4
      metrics:
        - rccl-bw:allreduce-r4/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r4/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_allreduce_r4_graph_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r4-graph
      metrics:
        - rccl-bw:allreduce-r4-graph/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r4-graph/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_allreduce_r4_pcie_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r4-pcie
      metrics:
        - rccl-bw:allreduce-r4-pcie/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r4-pcie/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_allreduce_r4_graph_pcie_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r4-graph-pcie
      metrics:
        - rccl-bw:allreduce-r4-graph-pcie/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r4-graph-pcie/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_alltoall_r8_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:alltoall-r8
      metrics:
        - rccl-bw:alltoall-r8/alltoall_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:alltoall-r8/alltoall_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_alltoall_r16_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:alltoall-r16
      metrics:
        - rccl-bw:alltoall-r16/alltoall_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:alltoall-r16/alltoall_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_alltoall_r4_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:alltoall-r4
      metrics:
        - rccl-bw:alltoall-r4/alltoall_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:alltoall-r4/alltoall_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    gpu_hpcg_r1:
      statistics: mean
      categories: HPCG gpu-hpcg:r1
      metrics:
        - gpu-hpcg:r1/is_valid
        - gpu-hpcg:r1/final_gflops
        - gpu-hpcg:r1/final_bandwidth
        - gpu-hpcg:r1/final_gflops_per_process
        - gpu-hpcg:r1/final_bandwidth_per_process
        - gpu-hpcg:r1/ddot_gflops
        - gpu-hpcg:r1/ddot_bandwidth
        - gpu-hpcg:r1/ddot_gflops_per_process
        - gpu-hpcg:r1/ddot_bandwidth_per_process
        - gpu-hpcg:r1/waxpby_gflops
        - gpu-hpcg:r1/waxpby_bandwidth
        - gpu-hpcg:r1/waxpby_gflops_per_process
        - gpu-hpcg:r1/waxpby_bandwidth_per_process
        - gpu-hpcg:r1/spmv_gflops
        - gpu-hpcg:r1/spmv_bandwidth
        - gpu-hpcg:r1/spmv_gflops_per_process
        - gpu-hpcg:r1/spmv_bandwidth_per_process
        - gpu-hpcg:r1/mg_gflops
        - gpu-hpcg:r1/mg_bandwidth
        - gpu-hpcg:r1/mg_gflops_per_process
        - gpu-hpcg:r1/mg_bandwidth_per_process
        - gpu-hpcg:r1/total_gflops
        - gpu-hpcg:r1/total_bandwidth
        - gpu-hpcg:r1/total_gflops_per_process
        - gpu-hpcg:r1/total_bandwidth_per_process
        - gpu-hpcg:r1/local_domain_x
        - gpu-hpcg:r1/local_domain_y
        - gpu-hpcg:r1/local_domain_z
        - gpu-hpcg:r1/process_domain_x
        - gpu-hpcg:r1/process_domain_y
        - gpu-hpcg:r1/process_domain_z
    gpu_hpcg_r2:
      statistics: mean
      categories: HPCG gpu-hpcg:r2
      metrics:
        - gpu-hpcg:r2/is_valid
        - gpu-hpcg:r2/final_gflops
        - gpu-hpcg:r2/final_bandwidth
        - gpu-hpcg:r2/final_gflops_per_process
        - gpu-hpcg:r2/final_bandwidth_per_process
        - gpu-hpcg:r2/ddot_gflops
        - gpu-hpcg:r2/ddot_bandwidth
        - gpu-hpcg:r2/ddot_gflops_per_process
        - gpu-hpcg:r2/ddot_bandwidth_per_process
        - gpu-hpcg:r2/waxpby_gflops
        - gpu-hpcg:r2/waxpby_bandwidth
        - gpu-hpcg:r2/waxpby_gflops_per_process
        - gpu-hpcg:r2/waxpby_bandwidth_per_process
        - gpu-hpcg:r2/spmv_gflops
        - gpu-hpcg:r2/spmv_bandwidth
        - gpu-hpcg:r2/spmv_gflops_per_process
        - gpu-hpcg:r2/spmv_bandwidth_per_process
        - gpu-hpcg:r2/mg_gflops
        - gpu-hpcg:r2/mg_bandwidth
        - gpu-hpcg:r2/mg_gflops_per_process
        - gpu-hpcg:r2/mg_bandwidth_per_process
        - gpu-hpcg:r2/total_gflops
        - gpu-hpcg:r2/total_bandwidth
        - gpu-hpcg:r2/total_gflops_per_process
        - gpu-hpcg:r2/total_bandwidth_per_process
        - gpu-hpcg:r2/local_domain_x
        - gpu-hpcg:r2/local_domain_y
        - gpu-hpcg:r2/local_domain_z
        - gpu-hpcg:r2/process_domain_x
        - gpu-hpcg:r2/process_domain_y
        - gpu-hpcg:r2/process_domain_z
    gpu_hpcg_r4:
      statistics: mean
      categories: HPCG gpu-hpcg:r4
      metrics:
        - gpu-hpcg:r4/is_valid
        - gpu-hpcg:r4/final_gflops
        - gpu-hpcg:r4/final_bandwidth
        - gpu-hpcg:r4/final_gflops_per_process
        - gpu-hpcg:r4/final_bandwidth_per_process
        - gpu-hpcg:r4/ddot_gflops
        - gpu-hpcg:r4/ddot_bandwidth
        - gpu-hpcg:r4/ddot_gflops_per_process
        - gpu-hpcg:r4/ddot_bandwidth_per_process
        - gpu-hpcg:r4/waxpby_gflops
        - gpu-hpcg:r4/waxpby_bandwidth
        - gpu-hpcg:r4/waxpby_gflops_per_process
        - gpu-hpcg:r4/waxpby_bandwidth_per_process
        - gpu-hpcg:r4/spmv_gflops
        - gpu-hpcg:r4/spmv_bandwidth
        - gpu-hpcg:r4/spmv_gflops_per_process
        - gpu-hpcg:r4/spmv_bandwidth_per_process
        - gpu-hpcg:r4/mg_gflops
        - gpu-hpcg:r4/mg_bandwidth
        - gpu-hpcg:r4/mg_gflops_per_process
        - gpu-hpcg:r4/mg_bandwidth_per_process
        - gpu-hpcg:r4/total_gflops
        - gpu-hpcg:r4/total_bandwidth
        - gpu-hpcg:r4/total_gflops_per_process
        - gpu-hpcg:r4/total_bandwidth_per_process
        - gpu-hpcg:r4/local_domain_x
        - gpu-hpcg:r4/local_domain_y
        - gpu-hpcg:r4/local_domain_z
        - gpu-hpcg:r4/process_domain_x
        - gpu-hpcg:r4/process_domain_y
        - gpu-hpcg:r4/process_domain_z
    gpu_hpcg_r8:
      statistics: mean
      categories: HPCG gpu-hpcg:r8
      metrics:
        - gpu-hpcg:r8/is_valid
        - gpu-hpcg:r8/final_gflops
        - gpu-hpcg:r8/final_bandwidth
        - gpu-hpcg:r8/final_gflops_per_process
        - gpu-hpcg:r8/final_bandwidth_per_process
        - gpu-hpcg:r8/ddot_gflops
        - gpu-hpcg:r8/ddot_bandwidth
        - gpu-hpcg:r8/ddot_gflops_per_process
        - gpu-hpcg:r8/ddot_bandwidth_per_process
        - gpu-hpcg:r8/waxpby_gflops
        - gpu-hpcg:r8/waxpby_bandwidth
        - gpu-hpcg:r8/waxpby_gflops_per_process
        - gpu-hpcg:r8/waxpby_bandwidth_per_process
        - gpu-hpcg:r8/spmv_gflops
        - gpu-hpcg:r8/spmv_bandwidth
        - gpu-hpcg:r8/spmv_gflops_per_process
        - gpu-hpcg:r8/spmv_bandwidth_per_process
        - gpu-hpcg:r8/mg_gflops
        - gpu-hpcg:r8/mg_bandwidth
        - gpu-hpcg:r8/mg_gflops_per_process
        - gpu-hpcg:r8/mg_bandwidth_per_process
        - gpu-hpcg:r8/total_gflops
        - gpu-hpcg:r8/total_bandwidth
        - gpu-hpcg:r8/total_gflops_per_process
        - gpu-hpcg:r8/total_bandwidth_per_process
        - gpu-hpcg:r8/local_domain_x
        - gpu-hpcg:r8/local_domain_y
        - gpu-hpcg:r8/local_domain_z
        - gpu-hpcg:r8/process_domain_x
        - gpu-hpcg:r8/process_domain_y
        - gpu-hpcg:r8/process_domain_z
    gpu_hpcg_r16:
      statistics: mean
      categories: HPCG gpu-hpcg:r16
      metrics:
        - gpu-hpcg:r16/is_valid
        - gpu-hpcg:r16/final_gflops
        - gpu-hpcg:r16/final_bandwidth
        - gpu-hpcg:r16/final_gflops_per_process
        - gpu-hpcg:r16/final_bandwidth_per_process
        - gpu-hpcg:r16/ddot_gflops
        - gpu-hpcg:r16/ddot_bandwidth
        - gpu-hpcg:r16/ddot_gflops_per_process
        - gpu-hpcg:r16/ddot_bandwidth_per_process
        - gpu-hpcg:r16/waxpby_gflops
        - gpu-hpcg:r16/waxpby_bandwidth
        - gpu-hpcg:r16/waxpby_gflops_per_process
        - gpu-hpcg:r16/waxpby_bandwidth_per_process
        - gpu-hpcg:r16/spmv_gflops
        - gpu-hpcg:r16/spmv_bandwidth
        - gpu-hpcg:r16/spmv_gflops_per_process
        - gpu-hpcg:r16/spmv_bandwidth_per_process
        - gpu-hpcg:r16/mg_gflops
        - gpu-hpcg:r16/mg_bandwidth
        - gpu-hpcg:r16/mg_gflops_per_process
        - gpu-hpcg:r16/mg_bandwidth_per_process
        - gpu-hpcg:r16/total_gflops
        - gpu-hpcg:r16/total_bandwidth
        - gpu-hpcg:r16/total_gflops_per_process
        - gpu-hpcg:r16/total_bandwidth_per_process
        - gpu-hpcg:r16/local_domain_x
        - gpu-hpcg:r16/local_domain_y
        - gpu-hpcg:r16/local_domain_z
        - gpu-hpcg:r16/process_domain_x
        - gpu-hpcg:r16/process_domain_y
        - gpu-hpcg:r16/process_domain_z
    gpu_hpcg_r32:
      statistics: mean
      categories: HPCG gpu-hpcg:r32
      metrics:
        - gpu-hpcg:r32/is_valid
        - gpu-hpcg:r32/final_gflops
        - gpu-hpcg:r32/final_bandwidth
        - gpu-hpcg:r32/final_gflops_per_process
        - gpu-hpcg:r32/final_bandwidth_per_process
        - gpu-hpcg:r32/ddot_gflops
        - gpu-hpcg:r32/ddot_bandwidth
        - gpu-hpcg:r32/ddot_gflops_per_process
        - gpu-hpcg:r32/ddot_bandwidth_per_process
        - gpu-hpcg:r32/waxpby_gflops
        - gpu-hpcg:r32/waxpby_bandwidth
        - gpu-hpcg:r32/waxpby_gflops_per_process
        - gpu-hpcg:r32/waxpby_bandwidth_per_process
        - gpu-hpcg:r32/spmv_gflops
        - gpu-hpcg:r32/spmv_bandwidth
        - gpu-hpcg:r32/spmv_gflops_per_process
        - gpu-hpcg:r32/spmv_bandwidth_per_process
        - gpu-hpcg:r32/mg_gflops
        - gpu-hpcg:r32/mg_bandwidth
        - gpu-hpcg:r32/mg_gflops_per_process
        - gpu-hpcg:r32/mg_bandwidth_per_process
        - gpu-hpcg:r32/total_gflops
        - gpu-hpcg:r32/total_bandwidth
        - gpu-hpcg:r32/total_gflops_per_process
        - gpu-hpcg:r32/total_bandwidth_per_process
        - gpu-hpcg:r32/local_domain_x
        - gpu-hpcg:r32/local_domain_y
        - gpu-hpcg:r32/local_domain_z
        - gpu-hpcg:r32/process_domain_x
        - gpu-hpcg:r32/process_domain_y
        - gpu-hpcg:r32/process_domain_z
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment