# SuperBench Config
# Restored to canonical block YAML (the file had been collapsed onto a few
# physical lines, which makes it unparsable: a leading '#' comments out the
# whole physical line). Boolean-like scalars normalized to true/false so the
# parse is identical under both YAML 1.1 and YAML 1.2 loaders.
version: v0.12
superbench:
  enable: null
  monitor:
    enable: false
  # Reusable mode/parameter templates, referenced below via merge keys.
  var:
    default_local_mode: &default_local_mode
      enable: false
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          numactl:
            cpunodebind: gpu_affinity
            membind: gpu_affinity
          parallel: true
    default_pytorch_mode: &default_pytorch_mode
      enable: false
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch
    common_model_config: &common_model_config
      model_ddp_parameter: &model_ddp_param
        duration: 0
        num_warmup: 128
        num_steps: 512
        sample_count: 8192
        batch_size: 128
        precision: [float32, float16]
        model_action: [train]
        pin_memory: true
        num_workers: 0
  benchmarks:
    kernel-launch:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
      parameters:
        m: 7680
        n: 8192
        k: 8192
    hipblaslt-gemm:
      enable: false
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: true
      parameters:
        in_types: ["fp32", "fp16", "bf16"]
        tolerant_fail: true
        num_warmup: 100
        num_steps: 1000
        shapes:
          - 4096,4096,4096
          - 8192,8192,8192
          - 16384,16384,16384
    gpu-stream:
      enable: false
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: true
      parameters:
        array_size: 268435456
        num_loops: 100
        precision: double
    # RCCL collective benchmarks; the :rNN suffix encodes the rank count.
    rccl-bw:allreduce-r16:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: 1
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: 1
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:allreduce-r8-pcie:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: 1
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: 1
            NCCL_BUFFSIZE: 4194304
            NCCL_SIMPLE_CHANNELS: 32
            RCCL_P2P_XHCL_CHANNEL_NUM: 31
            RCCL_COLL_XHCL_CHANNEL_NUM: 28
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:allreduce-r8:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: 1
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: 1
            NCCL_BUFFSIZE: 4194304
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:alltoall-r16:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: 1
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: 1
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: alltoall
    # HPCG scaling ladder; nx/ny/nz are the local grid, np{x,y,z} the process grid.
    gpu-hpcg:r32:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 4
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: 1
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 4
        npy: 4
        npz: 2
    gpu-hpcg:r16:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: 1
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 0
        npx: 4
        npy: 2
        npz: 2
    gpu-hpcg:r8:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 2
        npy: 2
        npz: 2
    gpu-hpcg:r4:
      enable: false
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 2
        npy: 2
        npz: 1
    gpu-hpcg:r2:
      enable: false
      modes:
        - name: mpi
          proc_num: 2
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 2
        npy: 1
        npz: 1
    gpu-hpcg:r1:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 60
        npx: 1
        npy: 1
        npz: 1
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: false
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: false
    ib-loopback:
      enable: false
      modes:
        - name: local
          proc_num: 16
          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8))
          parallel: false
      parameters:
        msg_size: 8388608
    disk-benchmark:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        block_devices: []
    gpu-copy-bw:correctness:
      enable: false
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
        size: 4096
        num_warm_up: 0
        num_loops: 1
        check_data: true
    gpu-copy-bw:perf:
      enable: false
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
    ib-traffic:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
      parameters:
        command:
          - ib_write_bw
          - ib_write_lat
        direction:
          - gpu-to-gpu
        pattern: one-to-one
        msg_size: 8388608
        iters: 5000
        timeout: 120
        gpu_dev: $LOCAL_RANK
        # Per-rank device selection is deferred to the shell at run time;
        # single quotes keep the embedded double quotes and $() literal.
        ib_dev: '"$(case $LOCAL_RANK in 0) echo mlx5_1 ;; 1) echo mlx5_2 ;; 2) echo mlx5_3 ;; 3) echo mlx5_4 ;; 4) echo mlx5_7 ;; 5) echo mlx5_8 ;; 6) echo mlx5_9 ;; 7) echo mlx5_10 ;; esac)"'
        numa_dev: '"$(case $LOCAL_RANK in 0) echo 3 ;; 1) echo 1 ;; 2) echo 1 ;; 3) echo 0 ;; 4) echo 7 ;; 5) echo 5 ;; 6) echo 5 ;; 7) echo 4 ;; esac)"'
        bidirectional: false
    # dist-inference:
    #   modes:
    #     - name: mpi
    #       proc_num: 8
    #       node_num: 1
    #       mca:
    #         pml: ob1
    #         btl: ^openib
    #         btl_tcp_if_exclude: lo,docker0
    #         coll_hcoll_enable: 0
    #   frameworks:
    #     - pytorch
    #   parameters:
    #     num_layers: 50
    #     num_warmup: 20
    #     num_steps: 100
    #     use_cuda_graph: true
    #     precision: float16
    #     hidden_size: 128
    #     input_size: 128
    #     batch_size: 1024