# SuperBench Config (ROCm/AMD GPU platform, 8 GPUs per node).
# NOTE(review): reconstructed from a whitespace-mangled single-line paste;
# all keys/values preserved, only formatting and scalar canonicalization changed:
#   - yes/no -> true/false (YAML 1.1 vs 1.2 truthy divergence)
#   - env var values quoted as strings (env is string-typed; 'n' would
#     otherwise parse as boolean false under YAML 1.1 loaders)
version: v0.12
superbench:
  enable: null
  monitor:
    enable: false
  # Reusable mode templates, merged into benchmarks via `<<:` below.
  var:
    default_local_mode: &default_local_mode
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          numactl:
            cpunodebind: gpu_affinity
            membind: gpu_affinity
          parallel: true
    default_pytorch_mode: &default_pytorch_mode
      enable: false
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch
    common_model_config: &common_model_config
      model_ddp_parameter: &model_ddp_param
        duration: 0
        num_warmup: 128
        num_steps: 512
        sample_count: 8192
        batch_size: 128
        precision: [float32, float16]
        model_action: [train]
        pin_memory: true
        num_workers: 0
  benchmarks:
    kernel-launch:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
      parameters:
        m: 7680
        n: 8192
        k: 8192
    hipblaslt-gemm:
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: true
      parameters:
        in_types: ["fp32", "fp16", "bf16"]
        tolerant_fail: true
        num_warmup: 100
        num_steps: 1000
        shapes:
          - 4096,4096,4096
          - 8192,8192,8192
          - 16384,16384,16384
    gpu-stream:
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: true
      parameters:
        array_size: 268435456
        num_loops: 100
        precision: double
    # rccl-bw variants: rN suffix = total ranks; r16 runs span 2 nodes.
    rccl-bw:allreduce-r16:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: '1'
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:allreduce-r8:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
          env:
            NCCL_BUFFSIZE: '4194304'
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:allreduce-r8-pcie:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
          env:
            NCCL_BUFFSIZE: '4194304'
            NCCL_SIMPLE_CHANNELS: '32'
            RCCL_P2P_XHCL_CHANNEL_NUM: '31'
            RCCL_COLL_XHCL_CHANNEL_NUM: '28'
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:alltoall-r16:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
            NCCL_SOCKET_IFNAME: p14p2
            NCCL_NET_GDR_LEVEL: PHB
            NCCL_NET_GDR_READ: '1'
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: alltoall
    rccl-bw:alltoall-r8:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: alltoall
    gpu-hpl:r32:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 4
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
            UCX_RNDV_SCHEME: put_zcopy
            UCX_RNDV_FRAG_MEM_TYPE: rocm
            # Quoted: bare `n` is boolean false under YAML 1.1 resolvers.
            UCX_MEMTYPE_CACHE: 'n'
      parameters:
        p: 8
        q: 4
        n: 512000
        nb: 512
        bcast: 5
        warmup: 1
        iterations: 5
    gpu-hpl:r16:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
            UCX_RNDV_SCHEME: put_zcopy
            UCX_RNDV_FRAG_MEM_TYPE: rocm
            UCX_MEMTYPE_CACHE: 'n'
      parameters:
        p: 8
        q: 2
        n: 360448
        nb: 512
        bcast: 1
        warmup: 1
        iterations: 5
    gpu-hpl:r8:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
      parameters:
        p: 4
        q: 2
        n: 254976
        nb: 512
        bcast: 1
        warmup: 1
        iterations: 5
    gpu-hpl:r4:
      enable: false
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
      parameters:
        p: 4
        q: 1
        n: 180224
        nb: 512
        bcast: 1
        warmup: 1
        iterations: 5
    gpu-hpl:r2:
      enable: false
      modes:
        - name: mpi
          proc_num: 2
          node_num: 1
          bind_to: none
      parameters:
        p: 2
        q: 1
        n: 128000
        nb: 512
        bcast: 1
        warmup: 1
        iterations: 5
    gpu-hpl:r1:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          node_num: 1
          bind_to: none
      parameters:
        p: 1
        q: 1
        n: 90624
        nb: 512
        nbmin: 16
        bcast: 1
        warmup: 1
        iterations: 5
    gpu-hpl-mxp:r8:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
      parameters:
        p: 4
        q: 2
        n: 344064
        nb: 4096
        bcast: 1
        warmup: 1
        iterations: 5
    gpu-hpl-mxp:r4:
      enable: false
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
      parameters:
        p: 4
        q: 1
        n: 245760
        nb: 4096
        bcast: 1
        warmup: 1
        iterations: 5
    gpu-hpl-mxp:r2:
      enable: false
      modes:
        - name: mpi
          proc_num: 2
          node_num: 1
          bind_to: none
      parameters:
        p: 2
        q: 1
        n: 172032
        nb: 4096
        bcast: 1
        warmup: 1
        iterations: 5
    gpu-hpl-mxp:r1:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          node_num: 1
          bind_to: none
      parameters:
        p: 1
        q: 1
        n: 122880
        nb: 4096
        bcast: 1
        warmup: 1
        iterations: 5
    gpu-hpcg:r32:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 4
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 10
        npx: 4
        npy: 4
        npz: 2
    gpu-hpcg:r16:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 2
          bind_to: none
          mca:
            pml: ucx
            btl: ^openib
            btl_tcp_if_exclude: lo,docker0
            coll_hcoll_enable: 0
          env:
            ROCM_PATH: /opt/dtk
            HSA_FORCE_FINE_GRAIN_PCIE: '1'
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 10
        npx: 4
        npy: 2
        npz: 2
    gpu-hpcg:r8:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 10
        npx: 2
        npy: 2
        npz: 2
    gpu-hpcg:r4:
      enable: false
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 10
        npx: 2
        npy: 2
        npz: 1
    gpu-hpcg:r2:
      enable: false
      modes:
        - name: mpi
          proc_num: 2
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 10
        npx: 2
        npy: 1
        npz: 1
    gpu-hpcg:r1:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 10
        npx: 1
        npy: 1
        npz: 1
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: false
    ib-loopback:
      enable: false
      modes:
        - name: local
          proc_num: 16
          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8))
          parallel: false
      parameters:
        msg_size: 8388608
    disk-benchmark:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        # Empty list on purpose: fill in target devices before enabling.
        block_devices: []
    gpu-copy-bw:correctness:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
        size: 4096
        num_warm_up: 0
        num_loops: 1
        check_data: true
    gpu-copy-bw:perf:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
    ib-traffic:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
      parameters:
        command:
          - ib_write_bw
          - ib_write_lat
        direction:
          - gpu-to-gpu
        pattern: one-to-one
        msg_size: 8388608
        iters: 5000
        timeout: 120
        gpu_dev: $LOCAL_RANK
        # Shell snippets evaluated at run time to map local rank -> HCA / NUMA node.
        ib_dev: '"$(case $LOCAL_RANK in 0) echo mlx5_1 ;; 1) echo mlx5_2 ;; 2) echo mlx5_3 ;; 3) echo mlx5_4 ;; 4) echo mlx5_7 ;; 5) echo mlx5_8 ;; 6) echo mlx5_9 ;; 7) echo mlx5_10 ;; esac)"'
        numa_dev: '"$(case $LOCAL_RANK in 0) echo 3 ;; 1) echo 1 ;; 2) echo 1 ;; 3) echo 0 ;; 4) echo 7 ;; 5) echo 5 ;; 6) echo 5 ;; 7) echo 4 ;; esac)"'
        bidirectional: false
    dist-inference:
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch
      parameters:
        num_layers: 20
        num_warmup: 20
        num_steps: 100
        use_cuda_graph: true
        precision: float16
        hidden_size: 4096
        input_size: 4096
        batch_size: 1024
    ort-inference:
      <<: *default_local_mode
      parameters:
        execution_provider: rocm
        pytorch_models:
          - resnet50
          - resnet152
          - resnext50_32x4d
          - wide_resnet50_2
          - mobilenet_v2
        precision: float16
        batch_size: 1
    computation-communication-overlap:
      <<: *default_pytorch_mode
    sharding-matmul:
      <<: *default_pytorch_mode