Commit 49a4389b authored by one's avatar one
Browse files

Add bw1000 config files (beta)

parent 53e0e494
# SuperBench Config
# Benchmark configuration for Hygon BW1000 (8 GPUs/node, RCCL over p14p2).
# Structure reconstructed to the standard SuperBench schema:
#   superbench.var holds reusable mode anchors; superbench.benchmarks lists
#   each benchmark with its run mode(s) and parameters.
version: v0.12
superbench:
  enable: null
  monitor:
    enable: false
  var:
    # Per-GPU local mode: one process per GPU, pinned via HIP_VISIBLE_DEVICES.
    default_local_mode: &default_local_mode
      enable: false
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: true
    # Single-node torch.distributed mode for model benchmarks.
    default_pytorch_mode: &default_pytorch_mode
      enable: false
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch
    # Shared DDP model-benchmark parameters (referenced via *model_ddp_param).
    common_model_config: &common_model_config
      model_ddp_parameter: &model_ddp_param
        duration: 0
        num_warmup: 128
        num_steps: 512
        sample_count: 8192
        batch_size: 128
        precision: [float32, float16]
        model_action: [train]
        pin_memory: true
        num_workers: 0
  benchmarks:
    kernel-launch:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
      parameters:
        m: 7680
        n: 8192
        k: 8192
    hipblaslt-gemm:
      enable: false
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: true
      parameters:
        in_types: ['fp32', 'fp16', 'bf16']
        tolerant_fail: true
        num_warmup: 100
        num_steps: 1000
        shapes:
          - 4096,4096,4096
          - 8192,8192,8192
          - 16384,16384,16384
    gpu-stream:
      enable: false
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: true
      parameters:
        array_size: 268435456
        num_loops: 100
        precision: double
    # 2-node (16-rank) allreduce over the NIC (p14p2).
    rccl-bw:allreduce-r16:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: '1'
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    # Single-node allreduce tuned for PCIe (channel counts via RCCL XHCL knobs).
    rccl-bw:allreduce-r8-pcie:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: '1'
            NCCL_BUFFSIZE: '4194304'
            NCCL_SIMPLE_CHANNELS: '32'
            RCCL_P2P_XHCL_CHANNEL_NUM: '31'
            RCCL_COLL_XHCL_CHANNEL_NUM: '28'
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:allreduce-r8:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: '1'
            NCCL_BUFFSIZE: '4194304'
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:alltoall-r16:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: '1'
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: alltoall
    # HPCG at 4/2/1-node scale; npx*npy*npz must equal total rank count.
    gpu-hpcg:r32:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 4
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 4
        npy: 4
        npz: 2
    gpu-hpcg:r16:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
      parameters:
        nx: 560
        ny: 280
        nz: 280
        # NOTE(review): rt is 0 here but 60 in every other gpu-hpcg entry —
        # confirm this is intentional and not a typo.
        rt: 0
        npx: 4
        npy: 2
        npz: 2
    gpu-hpcg:r8:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 2
        npy: 2
        npz: 2
    gpu-hpcg:r4:
      enable: false
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 2
        npy: 2
        npz: 1
    gpu-hpcg:r2:
      enable: false
      modes:
        - name: mpi
          proc_num: 2
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 2
        npy: 1
        npz: 1
    gpu-hpcg:r1:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 1
        npy: 1
        npz: 1
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: false
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/4))
          parallel: false
    ib-loopback:
      enable: false
      modes:
        - name: local
          proc_num: 16
          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8))
          parallel: false
      parameters:
        msg_size: 8388608
    disk-benchmark:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        # Intentionally empty: fill in target devices (e.g. /dev/nvme0n1)
        # before enabling this benchmark.
        block_devices: []
    gpu-copy-bw:correctness:
      enable: false
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
        size: 4096
        num_warm_up: 0
        num_loops: 1
        check_data: true
    gpu-copy-bw:perf:
      enable: false
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
    ib-traffic:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
      parameters:
        command:
          - ib_write_bw
          - ib_write_lat
        direction:
          - gpu-to-gpu
        pattern: one-to-one
        msg_size: 8388608
        iters: 5000
        timeout: 120
        # Per-rank device selection is resolved at runtime in the shell.
        gpu_dev: $LOCAL_RANK
        ib_dev: '"$(case $LOCAL_RANK in 0) echo mlx5_1 ;; 1) echo mlx5_2 ;; 2) echo mlx5_3 ;; 3) echo mlx5_4 ;; 4) echo mlx5_7 ;; 5) echo mlx5_8 ;; 6) echo mlx5_9 ;; 7) echo mlx5_10 ;; esac)"'
        numa_dev: '"$(case $LOCAL_RANK in 0) echo 3 ;; 1) echo 1 ;; 2) echo 1 ;; 3) echo 0 ;; 4) echo 7 ;; 5) echo 5 ;; 6) echo 5 ;; 7) echo 4 ;; esac)"'
        bidirectional: false
    # dist-inference:
    #   modes:
    #     - name: mpi
    #       proc_num: 8
    #       node_num: 1
    #       mca:
    #         pml: ob1
    #         btl: ^openib
    #         btl_tcp_if_exclude: lo,docker0
    #         coll_hcoll_enable: 0
    #   frameworks:
    #     - pytorch
    #   parameters:
    #     num_layers: 50
    #     num_warmup: 20
    #     num_steps: 100
    #     use_cuda_graph: true
    #     precision: float16
    #     hidden_size: 128
    #     input_size: 128
    #     batch_size: 1024
---
# SuperBench summary rules for hygon_bw1000.yaml
#
# Usage:
#   sb result summary \
#     --data-file <output-dir>/results-summary.jsonl \
#     --rule-file superbench/config/hygon_bw1000_summary.yaml \
#     --output-file-format md \
#     --output-dir <summary-output-dir>
#
# Notes:
#   - This file focuses on RCCL benchmarks defined in
#     superbench/config/hygon_bw1000.yaml.
#   - Unmatched benchmark sections are allowed. If a benchmark was not run,
#     the corresponding category in the summary will be empty.
#   - Each metrics entry is a regex matched against result metric names; the
#     alternation lists the message sizes (bytes) to aggregate.
version: v0.12
superbench:
  rules:
    rccl_bw_allreduce_r16_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r16
      metrics:
        - rccl-bw:allreduce-r16/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r16/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_allreduce_r8_pcie_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r8-pcie
      metrics:
        - rccl-bw:allreduce-r8-pcie/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r8-pcie/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_allreduce_r8_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:allreduce-r8
      metrics:
        - rccl-bw:allreduce-r8/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:allreduce-r8/allreduce_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
    rccl_bw_alltoall_r16_bandwidth:
      statistics: mean
      categories: RCCL rccl-bw:alltoall-r16
      metrics:
        - rccl-bw:alltoall-r16/alltoall_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_busbw
        - rccl-bw:alltoall-r16/alltoall_(8388608|67108864|1073741824|4294967296|8589934592|17179869184)_algbw
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment