# SuperBench Config
#
# Layout:
#   superbench.monitor    - monitor settings.
#   superbench.var        - reusable anchors merged/aliased by the entries below.
#   superbench.benchmarks - one entry per benchmark (enable flag, modes, parameters).
version: v0.11
superbench:
  # Explicit null instead of a bare empty value; the loaded value is the same.
  enable: null
  monitor:
    enable: true
    sample_duration: 1
    sample_interval: 10
  var:
    # Anchor for the `*default_timeout` alias used by ib-loopback. The alias was
    # referenced without any matching anchor, which makes the file unloadable.
    # NOTE(review): 600 mirrors the explicit timeout on dist-inference - confirm
    # the intended value.
    default_timeout: &default_timeout 600
    # One local process per GPU rank, all ranks launched in parallel.
    default_local_mode: &default_local_mode
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: CUDA_VISIBLE_DEVICES={proc_rank}
          parallel: true
    # Single-node torch.distributed launch with 8 processes.
    default_pytorch_mode: &default_pytorch_mode
      enable: true
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch
    # Shared parameters for the model-benchmarks:* entries; each entry may
    # override individual keys after the merge.
    common_model_config: &common_model_config
      duration: 0
      num_warmup: 128
      num_steps: 512
      sample_count: 8192
      batch_size: 128
      precision: [float32, float16]
      model_action: [train]
      pin_memory: true
      num_workers: 0
  benchmarks:
    kernel-launch:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
      parameters:
        precision:
          - 'fp64'
          - 'fp32'
          - 'fp16'
          - 'fp64_tc'
          - 'tf32_tc'
          - 'bf16_tc'
          - 'fp16_tc'
          - 'int8_tc'
    cublaslt-gemm:
      <<: *default_local_mode
      parameters:
        in_types: ['fp8e4m3', 'fp8e5m2', 'fp64', 'fp32', 'fp16', 'bf16', 'int8']
        shapes:
          - 4096,4096,4096
          - 8192,8192,8192
          - 16384,16384,16384
    gpu-burn:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        time: 900
        doubles: true
        tensor_core: true
    nccl-bw:default:
      enable: true
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        ngpus: 8
    nccl-bw:gdr-only:
      enable: true
      modes:
        - name: local
          proc_num: 1
          parallel: false
          # Values are quoted so they stay strings.
          env:
            NCCL_IB_PCI_RELAXED_ORDERING: '1'
            NCCL_NET_GDR_LEVEL: '5'
            NCCL_P2P_DISABLE: '1'
            NCCL_SHM_DISABLE: '1'
            NCCL_MIN_NCHANNELS: '16'
            NCCL_IB_DISABLE: '0'
      parameters:
        ngpus: 8
    nccl-lat:default:
      enable: true
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
      parameters:
        maxbytes: 16M
        warmup_iters: 20
        iters: 1000
        graph_iters: 1
    # NOTE(review): this is the only benchmark without an `enable` key (neither
    # set directly nor merged from a default mode) - confirm whether it is
    # meant to run.
    ib-loopback:
      timeout: *default_timeout
      modes:
        - name: local
          proc_num: 4
          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,2,4,6 NUMA_NODES=0,0,1,1
          parallel: true
        - name: local
          proc_num: 4
          prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=0,0,1,1
          parallel: true
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    mem-bw:
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
          parallel: false
    disk-benchmark:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        block_devices:
          - /dev/nvme0n1
          - /dev/nvme1n1
          - /dev/nvme2n1
          - /dev/nvme3n1
          - /dev/nvme4n1
          - /dev/nvme5n1
          - /dev/nvme6n1
          - /dev/nvme7n1
        seq_read_runtime: 60
        seq_write_runtime: 60
        seq_readwrite_runtime: 60
        rand_read_runtime: 60
        rand_write_runtime: 60
        rand_readwrite_runtime: 60
    gpu-copy-bw:correctness:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
        size: 4096
        num_warm_up: 0
        num_loops: 1
        check_data: true
    gpu-copy-bw:perf:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type: [htod, dtoh, dtod, one_to_all, all_to_one, all_to_all]
        copy_type: [sm, dma]
    cudnn-function:
      <<: *default_local_mode
    cublas-function:
      <<: *default_local_mode
    matmul:
      <<: *default_local_mode
      frameworks:
        - pytorch
    sharding-matmul:
      <<: *default_pytorch_mode
    computation-communication-overlap:
      <<: *default_pytorch_mode
    dist-inference:
      enable: true
      timeout: 600
      modes:
        - name: mpi
          proc_num: 8
          node_num: 1
          env:
            NCCL_TOPO_FILE: '/opt/microsoft/ndv5-topo.xml'
      frameworks:
        - pytorch
      parameters:
        num_layers: 50
        num_warmup: 20
        num_steps: 100
        use_cuda_graph: true
        precision: float16
        hidden_size: 128
        input_size: 128
        batch_size: 1024
    ib-traffic:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
      parameters:
        msg_size: 8388608
        ib_dev: mlx5_$LOCAL_RANK
        gpu_dev: $LOCAL_RANK
        numa_dev: $((LOCAL_RANK/2))
    gpcnet-network-test:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          mca:
            pml: ucx
            btl: ^uct
            btl_tcp_if_include: eth0
    gpcnet-network-load-test:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          mca:
            pml: ucx
            btl: ^uct
            btl_tcp_if_include: eth0
    tcp-connectivity:
      enable: false
      modes:
        - name: local
          parallel: false
      parameters:
        port: 22
    ort-inference:
      <<: *default_local_mode
    tensorrt-inference:
      <<: *default_local_mode
      parameters:
        pytorch_models:
          - resnet50
          - resnet101
          - resnet152
          - densenet169
          - densenet201
          - bert-base
          - bert-large
        seq_length: 224
        batch_size: 32
        precision: int8
    model-benchmarks:gpt:
      <<: *default_pytorch_mode
      models:
        - gpt2-small
        - gpt2-large
      parameters:
        <<: *common_model_config
        # Keys listed here override the merged common_model_config values.
        precision: [float32, float16, fp8_hybrid]
        batch_size: 32
        seq_len: 224
    model-benchmarks:bert:
      <<: *default_pytorch_mode
      models:
        - bert-base
        - bert-large
      parameters:
        <<: *common_model_config
        precision: [float32, float16, fp8_hybrid]
        seq_len: 224
    model-benchmarks:lstm:
      <<: *default_pytorch_mode
      models:
        - lstm
      parameters:
        <<: *common_model_config
        batch_size: 1024
        input_size: 224
        hidden_size: 1000
        seq_len: 32
        pin_memory: false
    model-benchmarks:resnet:
      <<: *default_pytorch_mode
      models:
        - resnet50
        - resnet101
        - resnet152
      parameters:
        <<: *common_model_config
        batch_size: 384
        num_steps: 512
    model-benchmarks:densenet:
      <<: *default_pytorch_mode
      models:
        - densenet169
        - densenet201
      parameters:
        <<: *common_model_config
        pin_memory: false
    model-benchmarks:vgg:
      <<: *default_pytorch_mode
      models:
        - vgg11
        - vgg13
        - vgg16
        - vgg19
      parameters:
        <<: *common_model_config
        pin_memory: false