---
# SuperBench Config
#
# Layout of this file:
#   version               - version string of this config (v0.7).
#   superbench.monitor    - monitor switch and sampling settings.
#   superbench.var        - anchored mappings reused further down via `<<:`.
#   superbench.benchmarks - one mapping per benchmark, keyed by benchmark name.
#
# Notes for maintainers:
#   * `<<: *anchor` is a shallow merge: keys written next to the merge key
#     replace the merged-in value as a whole; nested mappings are not merged.
#   * Booleans are spelled true/false (not yes/no) so that YAML 1.1 and
#     YAML 1.2 loaders resolve them to the same value.
#   * `{proc_rank}` in `prefix` values is a placeholder, presumably filled in
#     per process by the SuperBench runner - confirm against the runner docs.
version: v0.7
superbench:
  # NOTE(review): presumably null means "no global selection", leaving each
  # benchmark's own `enable` flag in charge - confirm against SuperBench docs.
  enable: null
  monitor:
    enable: true
    sample_duration: 1
    sample_interval: 10
  var:
    # Local mode with 8 processes; each process gets CUDA_VISIBLE_DEVICES set
    # to its own proc_rank and the processes run in parallel.
    default_local_mode: &default_local_mode
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: CUDA_VISIBLE_DEVICES={proc_rank}
          parallel: true
    # torch.distributed mode: 8 processes on 1 node, PyTorch framework only.
    default_pytorch_mode: &default_pytorch_mode
      enable: true
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch
    # Baseline parameters for the *_models benchmarks; individual benchmarks
    # override single keys (batch_size, num_steps, pin_memory, ...) below.
    common_model_config: &common_model_config
      duration: 0
      num_warmup: 64
      num_steps: 2048
      sample_count: 8192
      batch_size: 32
      precision:
        - float32
        - float16
      model_action:
        - train
      pin_memory: true
  benchmarks:
    kernel-launch:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
    gpu-burn:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        time: 900
        doubles: true
        tensor_core: true
    nccl-bw:default:
      enable: true
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        ngpus: 8
    # Same as nccl-bw:default plus NCCL environment overrides. The env values
    # are quoted on purpose so they stay strings rather than integers.
    nccl-bw:gdr-only:
      enable: true
      modes:
        - name: local
          proc_num: 1
          parallel: false
          env:
            NCCL_IB_PCI_RELAXED_ORDERING: '1'
            NCCL_NET_GDR_LEVEL: '5'
            NCCL_P2P_DISABLE: '1'
            NCCL_SHM_DISABLE: '1'
            NCCL_MIN_NCHANNELS: '16'
            NCCL_IB_DISABLE: '0'
      parameters:
        ngpus: 8
    # Two mode entries of 4 processes each: the first uses IB devices
    # 0,2,4,6, the second 1,3,5,7; both use the same NUMA_NODES list.
    ib-loopback:
      enable: true
      modes:
        - name: local
          proc_num: 4
          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,2,4,6 NUMA_NODES=1,0,3,2
          parallel: true
        - name: local
          proc_num: 4
          prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=1,0,3,2
          parallel: true
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth
    # 8 processes, not run in parallel; each command is prefixed with
    # `numactl -N <proc_rank / 2>` in addition to CUDA_VISIBLE_DEVICES.
    mem-bw:
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
          parallel: false
    disk-benchmark:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        block_devices:
          - /dev/nvme0n1
          - /dev/nvme1n1
          - /dev/nvme2n1
          - /dev/nvme3n1
          - /dev/nvme4n1
          - /dev/nvme5n1
          - /dev/nvme6n1
          - /dev/nvme7n1
        seq_read_runtime: 60
        seq_write_runtime: 60
        seq_readwrite_runtime: 60
        rand_read_runtime: 60
        rand_write_runtime: 60
        rand_readwrite_runtime: 60
    # Single-loop run with data checking enabled and no warm-up.
    gpu-copy-bw:correctness:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type:
          - htod
          - dtoh
          - dtod
        copy_type:
          - sm
          - dma
        size: 4096
        num_warm_up: 0
        num_loops: 1
        check_data: true
    # Same mem_type/copy_type matrix as the correctness run; size, warm-up,
    # loop count and check_data are not set here.
    gpu-copy-bw:perf:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type:
          - htod
          - dtoh
          - dtod
        copy_type:
          - sm
          - dma
    cudnn-function:
      <<: *default_local_mode
    cublas-function:
      <<: *default_local_mode
    matmul:
      <<: *default_local_mode
      frameworks:
        - pytorch
    sharding-matmul:
      <<: *default_pytorch_mode
    computation-communication-overlap:
      <<: *default_pytorch_mode
    # The $LOCAL_RANK / $((...)) values are passed through as plain strings;
    # presumably expanded by the launched shell - confirm.
    ib-traffic:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
      parameters:
        msg_size: 8388608
        ib_dev: mlx5_$LOCAL_RANK
        gpu_dev: $LOCAL_RANK
        numa_dev: $((LOCAL_RANK/2))
    gpcnet-network-test:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          mca:
            pml: ucx
            btl: ^uct
            btl_tcp_if_include: eth0
    gpcnet-network-load-test:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          mca:
            pml: ucx
            btl: ^uct
            btl_tcp_if_include: eth0
    tcp-connectivity:
      enable: false
      modes:
        - name: local
          parallel: false
      parameters:
        port: 22
    ort-inference:
      <<: *default_local_mode
    tensorrt-inference:
      <<: *default_local_mode
      parameters:
        pytorch_models:
          - resnet50
          - resnet101
          - resnet152
          - densenet169
          - densenet201
          - bert-base
          - bert-large
        seq_length: 224
        batch_size: 32
        precision: int8
    # Model benchmarks: each merges common_model_config into `parameters`
    # and then overrides or adds individual keys.
    gpt_models:
      <<: *default_pytorch_mode
      models:
        - gpt2-small
        - gpt2-large
      parameters:
        <<: *common_model_config
        batch_size: 8
        seq_len: 224
    bert_models:
      <<: *default_pytorch_mode
      models:
        - bert-base
        - bert-large
      parameters:
        <<: *common_model_config
        seq_len: 224
    lstm_models:
      <<: *default_pytorch_mode
      models:
        - lstm
      parameters:
        <<: *common_model_config
        batch_size: 224
        input_size: 224
        hidden_size: 1000
        seq_len: 32
        pin_memory: false
    resnet_models:
      <<: *default_pytorch_mode
      models:
        - resnet50
        - resnet101
        - resnet152
      parameters:
        <<: *common_model_config
        batch_size: 192
        num_steps: 512
    densenet_models:
      <<: *default_pytorch_mode
      models:
        - densenet169
        - densenet201
      parameters:
        <<: *common_model_config
        pin_memory: false
    vgg_models:
      <<: *default_pytorch_mode
      models:
        - vgg11
        - vgg13
        - vgg16
        - vgg19
      parameters:
        <<: *common_model_config
        pin_memory: false