# SuperBench Config
version: v0.12
superbench:
  enable: null
  monitor:
    enable: false
  var:
    default_local_mode: &default_local_mode
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: yes
    default_pytorch_mode: &default_pytorch_mode
      enable: true
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch
    common_model_config: &common_model_config
      model_ddp_parameter: &model_ddp_param
        duration: 0
        num_warmup: 128
        num_steps: 512
        sample_count: 8192
        batch_size: 128
        precision: [float32, float16]
        model_action: [train]
        pin_memory: yes
        num_workers: 0
  benchmarks:
    kernel-launch:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
      parameters:
        m: 7680
        n: 8192
        k: 8192
    hipblaslt-gemm:
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: yes
      parameters:
        in_types: ["fp32", "fp16", "bf16"]
        tolerant_fail: yes
        num_warmup: 100
        num_steps: 1000
        shapes:
          - 4096,4096,4096
          - 8192,8192,8192
          - 16384,16384,16384
    gpu-stream:
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: yes
      parameters:
        array_size: 268435456
        num_loops: 100
        precision: double
    rccl-bw:allreduce-r8:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:allreduce-r4:
      enable: true
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
          env:
            NCCL_BUFFSIZE: 4194304
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:allreduce-r4-graph:
      enable: true
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
          env:
            NCCL_BUFFSIZE: 4194304
            NCCL_RINGS: "0 1 2 3|0 3 2 1|0 1 3 2|0 2 3 1|0 2 1 3|0 3 1 2"
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:allreduce-r4-pcie:
      enable: true
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
          env:
            NCCL_BUFFSIZE: 4194304
            NCCL_SIMPLE_CHANNELS: 20
            RCCL_P2P_XHCL_CHANNEL_NUM: 16
            RCCL_COLL_XHCL_CHANNEL_NUM: 16
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:allreduce-r4-graph-pcie:
      enable: true
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
          env:
            NCCL_BUFFSIZE: 4194304
            NCCL_SIMPLE_CHANNELS: 20
            RCCL_P2P_XHCL_CHANNEL_NUM: 16
            RCCL_COLL_XHCL_CHANNEL_NUM: 16
            NCCL_RINGS: "0 1 2 3|0 3 2 1|0 1 3 2|0 2 3 1|0 2 1 3|0 3 1 2"
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: allreduce
    rccl-bw:alltoall-r8:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: alltoall
    rccl-bw:alltoall-r4:
      enable: true
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
      parameters:
        maxbytes: 16G
        ngpus: 1
        operation: alltoall
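    # The gpu-hpl, gpu-hpl-mxp, and gpu-hpcg entries below are disabled by
    # default; enable the variant sized for the GPU count in use (r8/r4/r2/r1).
    # For the HPL-family runs, the process grid must satisfy p * q = proc_num,
    # e.g. the r8 variants use a 4 x 2 grid for 8 MPI ranks.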
    gpu-hpl:r8:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
      parameters:
        p: 4
        q: 2
        n: 256000
        nb: 512
        bcast: 1
        warmup: 1
        iterations: 5
    gpu-hpl:r4:
      enable: false
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
      parameters:
        p: 4
        q: 1
        n: 180224
        nb: 512
        bcast: 1
        warmup: 1
        iterations: 5
    gpu-hpl:r2:
      enable: false
      modes:
        - name: mpi
          proc_num: 2
          node_num: 1
          bind_to: none
      parameters:
        p: 2
        q: 1
        n: 128000
        nb: 512
        bcast: 1
        warmup: 1
        iterations: 5
    gpu-hpl:r1:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          node_num: 1
          bind_to: none
      parameters:
        p: 1
        q: 1
        n: 90624
        nb: 512
        bcast: 1
        nbmin: 16
        warmup: 1
        iterations: 5
    gpu-hpl-mxp:r8:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
      parameters:
        p: 4
        q: 2
        n: 344064
        nb: 4096
        bcast: 1
        warmup: 1
        iterations: 5
    gpu-hpl-mxp:r4:
      enable: false
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
      parameters:
        p: 4
        q: 1
        n: 245760
        nb: 4096
        bcast: 1
        warmup: 1
        iterations: 5
    gpu-hpl-mxp:r2:
      enable: false
      modes:
        - name: mpi
          proc_num: 2
          node_num: 1
          bind_to: none
      parameters:
        p: 2
        q: 1
        n: 172032
        nb: 4096
        bcast: 1
        warmup: 1
        iterations: 5
    gpu-hpl-mxp:r1:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          node_num: 1
          bind_to: none
      parameters:
        p: 1
        q: 1
        n: 122880
        nb: 4096
        bcast: 1
        warmup: 1
        iterations: 5
    gpu-hpcg:r8:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 10
        npx: 2
        npy: 2
        npz: 2
    gpu-hpcg:r4:
      enable: false
      modes:
        - name: mpi
          proc_num: 4
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 10
        npx: 2
        npy: 2
        npz: 1
    gpu-hpcg:r2:
      enable: false
      modes:
        - name: mpi
          proc_num: 2
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 10
        npx: 2
        npy: 1
        npz: 1
    gpu-hpcg:r1:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          node_num: 1
          bind_to: none
      parameters:
        nx: 560
        ny: 280
        nz: 280
        rt: 10
        npx: 1
        npy: 1
        npz: 1
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: no
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: no
    ib-loopback:
      enable: false
      modes:
        - name: local
          proc_num: 16
          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3,4,5,6,7,0,1,2,3,4,5,6,7 numactl -N $(({proc_rank}/8)) -m $(({proc_rank}/8))
          parallel: no
      parameters:
        msg_size: 8388608
    disk-benchmark:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: no
      parameters:
        block_devices: []
    gpu-copy-bw:correctness:
      enable: true
      modes:
        - name: local
          parallel: no
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
        size: 4096
        num_warm_up: 0
        num_loops: 1
        check_data: true
    gpu-copy-bw:perf:
      enable: true
      modes:
        - name: local
          parallel: no
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
    dist-inference:
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch
      parameters:
        num_layers: 20
        num_warmup: 20
        num_steps: 100
        use_cuda_graph: true
        precision: float16
        hidden_size: 4096
        input_size: 4096
        batch_size: 1024
    ort-inference:
      <<: *default_local_mode
      parameters:
        execution_provider: rocm
        pytorch_models:
          - resnet50
          - resnet152
          - resnext50_32x4d
          - wide_resnet50_2
          - mobilenet_v2
        precision: float16
        batch_size: 1
    computation-communication-overlap:
      <<: *default_pytorch_mode
    sharding-matmul:
      <<: *default_pytorch_mode
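
# Example invocation (a sketch; file names below are placeholders, not part of
# this config): deploy SuperBench to the target host(s), then run this file:
#   sb deploy -f local.ini
#   sb run -f local.ini -c amd_gpu.yaml
# where local.ini is the host inventory and amd_gpu.yaml is the name this
# config is saved under.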