# SuperBench Config
#
# Layout:
#   superbench.var        - reusable anchored mappings (not benchmarks themselves)
#   superbench.benchmarks - one entry per benchmark; each pulls in a shared
#                           mapping from `var` via the `<<` merge key.
#
# NOTE: `<<` is a shallow merge. Keys written explicitly in an entry replace
# the merged-in key wholesale; nested mappings/lists are not deep-merged.
version: v0.2
superbench:
  # NOTE(review): null presumably means "no explicit selection", leaving the
  # per-benchmark `enable` flags to decide what runs - confirm with the consumer.
  enable: null
  var:
    # Shared settings for benchmarks run in `local` mode with 8 processes.
    default_local_mode: &default_local_mode
      enable: true
      modes:
        - name: local
          proc_num: 8
          # `{proc_rank}` is a placeholder left for the consumer to fill in;
          # presumably the per-process rank, pinning one device per process.
          prefix: CUDA_VISIBLE_DEVICES={proc_rank}
          # Canonical boolean; the former `yes` is only a boolean under
          # YAML 1.1 parsers and a plain string under YAML 1.2.
          parallel: true
    # Shared settings for benchmarks run in `torch.distributed` mode:
    # 8 processes on 1 node, pytorch framework.
    default_pytorch_mode: &default_pytorch_mode
      enable: true
      modes:
        - name: torch.distributed
          proc_num: 8
          node_num: 1
      frameworks:
        - pytorch
    # Shared `parameters` for the model benchmarks; each model group below
    # adds its own batch_size on top.
    common_model_config: &common_model_config
      duration: 0
      num_warmup: 16
      num_steps: 128
      precision:
        - float32
        - float16
      model_action:
        - train
  benchmarks:
    # --- Benchmarks using the local-mode defaults ---
    kernel-launch:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
    cudnn-function:
      <<: *default_local_mode
    cublas-function:
      <<: *default_local_mode
    matmul:
      <<: *default_local_mode
      # default_local_mode defines no `frameworks`, so it is set here.
      frameworks:
        - pytorch
    # --- Benchmarks using the torch.distributed defaults ---
    sharding-matmul:
      <<: *default_pytorch_mode
    computation-communication-overlap:
      <<: *default_pytorch_mode
    # --- Model benchmarks: distributed defaults + common model parameters ---
    gpt_models:
      <<: *default_pytorch_mode
      models:
        - gpt2-small
        - gpt2-large
      parameters:
        <<: *common_model_config
        batch_size: 4
    bert_models:
      <<: *default_pytorch_mode
      models:
        - bert-base
        - bert-large
      parameters:
        <<: *common_model_config
        batch_size: 8
    lstm_models:
      <<: *default_pytorch_mode
      models:
        - lstm
      parameters:
        <<: *common_model_config
        batch_size: 128
    resnet_models:
      <<: *default_pytorch_mode
      models:
        - resnet50
        - resnet101
        - resnet152
      parameters:
        <<: *common_model_config
        batch_size: 128
    densenet_models:
      <<: *default_pytorch_mode
      models:
        - densenet169
        - densenet201
      parameters:
        <<: *common_model_config
        batch_size: 128
    vgg_models:
      <<: *default_pytorch_mode
      models:
        - vgg11
        - vgg13
        - vgg16
        - vgg19
      parameters:
        <<: *common_model_config
        batch_size: 128