# SuperBench Config
#
# Layout:
#   superbench.monitor     - monitor settings
#   superbench.var         - anchored defaults reused below through merge keys
#   superbench.benchmarks  - one mapping per benchmark
#
# Booleans are written as true/false (never yes/no) so that YAML 1.1 and
# YAML 1.2 loaders resolve them to the same value.
version: v0.12
superbench:
  # NOTE(review): null presumably defers to each benchmark's own `enable`
  # flag - confirm against the SuperBench configuration docs.
  enable: null
  monitor:
    enable: true
    sample_duration: 1
    sample_interval: 10

  var:
    # Local mode with 8 processes; each process gets CUDA_VISIBLE_DEVICES
    # set to its own rank through the command prefix.
    default_local_mode: &default_local_mode
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: CUDA_VISIBLE_DEVICES={proc_rank}
          parallel: true

    # torch.distributed mode: 8 processes on 1 node, pytorch framework.
    default_pytorch_mode: &default_pytorch_mode
      enable: true
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch

    # Parameters shared by every *_models benchmark at the end of this file.
    common_model_config: &common_model_config
      duration: 0
      num_warmup: 16
      num_steps: 128
      batch_size: 1
      precision:
        - float32
        - float16
      model_action:
        - train

  benchmarks:
    gpu-burn:
      enable: true
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        time: 300
        doubles: true
        tensor_core: true

    nccl-bw:default:
      enable: true
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        ngpus: 8

    # Same parameters as nccl-bw:default, plus NCCL environment overrides
    # attached to the mode. Values stay quoted so they remain strings.
    nccl-bw:gdr-only:
      enable: true
      modes:
        - name: local
          proc_num: 1
          parallel: false
          env:
            NCCL_IB_PCI_RELAXED_ORDERING: '1'
            NCCL_NET_GDR_LEVEL: '5'
            NCCL_P2P_DISABLE: '1'
            NCCL_SHM_DISABLE: '1'
            NCCL_MIN_NCHANNELS: '16'
            NCCL_IB_DISABLE: '0'
      parameters:
        ngpus: 8

    # Two local modes of 4 processes each; they differ only in IB_DEVICES
    # (even indices first, then odd indices).
    ib-loopback:
      enable: true
      modes:
        - name: local
          proc_num: 4
          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,2,4,6 NUMA_NODES=1,0,3,2
          parallel: true
        - name: local
          proc_num: 4
          prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=1,0,3,2
          parallel: true

    # Disabled in this configuration.
    disk-benchmark:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        block_devices:
          - /dev/nvme0n1

    # Disabled in this configuration.
    cpu-memory-bw-latency:
      enable: false
      modes:
        - name: local
          proc_num: 1
          parallel: false
      parameters:
        tests:
          - bandwidth_matrix
          - latency_matrix
          - max_bandwidth

    # 8 processes run one at a time (parallel: false); the prefix sets
    # CUDA_VISIBLE_DEVICES to the rank and passes rank/2 to `numactl -N`.
    mem-bw:
      enable: true
      modes:
        - name: local
          proc_num: 8
          prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -N $(({proc_rank}/2))
          parallel: false

    # Single-loop run with check_data enabled.
    gpu-copy-bw:correctness:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type:
          - htod
          - dtoh
          - dtod
        copy_type:
          - sm
          - dma
        size: 4096
        num_warm_up: 0
        num_loops: 1
        check_data: true

    # Same mem_type/copy_type matrix as the correctness variant, without the
    # size, loop and check_data overrides.
    gpu-copy-bw:perf:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        mem_type:
          - htod
          - dtoh
          - dtod
        copy_type:
          - sm
          - dma

    nvbandwidth:
      enable: true
      modes:
        - name: local
          parallel: false
      parameters:
        buffer_size: 128
        test_cases:
          - host_to_device_memcpy_ce
          - device_to_host_memcpy_ce
          - host_to_device_memcpy_sm
          - device_to_host_memcpy_sm
        num_loops: 18
        skip_verification: false
        disable_affinity: false
        use_mean: false

    # The following benchmarks take their settings from the anchored
    # defaults under `var`.
    kernel-launch:
      <<: *default_local_mode

    gemm-flops:
      <<: *default_local_mode

    cudnn-function:
      <<: *default_local_mode

    cublas-function:
      <<: *default_local_mode

    matmul:
      <<: *default_local_mode
      frameworks:
        - pytorch

    sharding-matmul:
      <<: *default_pytorch_mode

    computation-communication-overlap:
      <<: *default_pytorch_mode

    # Disabled in this configuration. The $LOCAL_RANK expressions are plain
    # strings as far as YAML is concerned.
    ib-traffic:
      enable: false
      modes:
        - name: mpi
          proc_num: 8
      parameters:
        msg_size: 8388608
        ib_dev: mlx5_$LOCAL_RANK
        gpu_dev: $LOCAL_RANK
        numa_dev: $((LOCAL_RANK/2))

    # Disabled in this configuration.
    gpcnet-network-test:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          mca:
            pml: ucx
            btl: ^uct
            btl_tcp_if_include: eth0
          env:
            UCX_NET_DEVICES: mlx5_0:1

    # Disabled in this configuration; same mode settings as
    # gpcnet-network-test.
    gpcnet-network-load-test:
      enable: false
      modes:
        - name: mpi
          proc_num: 1
          mca:
            pml: ucx
            btl: ^uct
            btl_tcp_if_include: eth0
          env:
            UCX_NET_DEVICES: mlx5_0:1

    # Disabled in this configuration.
    tcp-connectivity:
      enable: false
      modes:
        - name: local
          parallel: false
      parameters:
        port: 22

    ort-inference:
      <<: *default_local_mode
      parameters:
        batch_size: 1

    tensorrt-inference:
      <<: *default_local_mode
      parameters:
        pytorch_models:
          - resnet50
          - resnet101
          - resnet152
          - densenet169
          - densenet201
          - bert-base
          - bert-large
        seq_length: 224
        batch_size: 1
        precision: int8

    # No `enable` key is set on this benchmark.
    megatron-gpt:
      modes:
        - name: mpi
          proc_num: 1
          node_num: all
      parameters:
        code_base: /opt/superbench/third_party/Megatron/Megatron-DeepSpeed/
        dataset_url: https://huggingface.co/datasets/suolyer/pile_bookcorpus2/raw/main/test.json
        batch_size: 2048
        num_warmup: 0
        num_steps: 10
        precision:
          - float16
          - bfloat16
        deepspeed: true
        sequence_parallel: true
        use_rotary_position_embeddings: true

    # Model benchmarks: each merges default_pytorch_mode for its mode
    # settings and common_model_config for its parameters.
    gpt_models:
      <<: *default_pytorch_mode
      models:
        - gpt2-small
        - gpt2-large
      parameters:
        <<: *common_model_config

    bert_models:
      <<: *default_pytorch_mode
      models:
        - bert-base
        - bert-large
      parameters:
        <<: *common_model_config

    lstm_models:
      <<: *default_pytorch_mode
      models:
        - lstm
      parameters:
        <<: *common_model_config

    resnet_models:
      <<: *default_pytorch_mode
      models:
        - resnet50
        - resnet101
        - resnet152
      parameters:
        <<: *common_model_config

    densenet_models:
      <<: *default_pytorch_mode
      models:
        - densenet169
        - densenet201
      parameters:
        <<: *common_model_config

    vgg_models:
      <<: *default_pytorch_mode
      models:
        - vgg11
        - vgg13
        - vgg16
        - vgg19
      parameters:
        <<: *common_model_config