Benchmarks: Add Configuration - Add microbenchmark in the validation config...

Benchmarks: Add Configuration - Add microbenchmark in the validation config file for HPE (AMD MI100) (#176) **Description** Add microbenchmark in the validation config file for AMD MI100. **Major Revision** - add rccl-bw, mem-bw, ib-loopback, gemm-flops, kernel-launch config for mi100

Benchmarks: Add Configuration - Add microbenchmark in the validation config...
Benchmarks: Add Configuration - Add microbenchmark in the validation config file for HPE (AMD MI100) (#176) **Description** Add microbenchmark in the validation config file for AMD MI100. **Major Revision** - add rccl-bw, mem-bw, ib-loopback, gemm-flops, kernel-launch config for mi100
47daedbe · Yuting Jiang · GitHub · 2ebb44cc · 47daedbe
Unverified Commit 47daedbe authored Sep 02, 2021 by Yuting Jiang Committed by GitHub Sep 02, 2021
Hide whitespace changes
Inline Side-by-side

Showing with 140 additions and 0 deletions

superbench/config/amd_mi100_hpe.yaml superbench/config/amd_mi100_hpe.yaml +140 -0

No files found.
--- a/superbench/config/amd_mi100_hpe.yaml
+++ b/superbench/config/amd_mi100_hpe.yaml
+# SuperBench Config
+#
+# Server:
+#   - Product: HPE Apollo 6500
+# Config schema version string.
+version: v0.2
+superbench:
+  # No top-level selection is set here; each benchmark below carries its own
+  # `enable` flag. NOTE(review): confirm how the runner interprets null.
+  enable: null
+  # Reusable anchors that the benchmark entries merge in via `<<:`.
+  var:
+    # Shared settings for local-mode benchmarks: enabled, 8 processes, each
+    # launched with HIP_VISIBLE_DEVICES set to its own {proc_rank}.
+    default_local_mode: &default_local_mode
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: HIP_VISIBLE_DEVICES={proc_rank}
+          # Canonical boolean: `yes` is only a boolean under YAML 1.1 and
+          # would load as the string "yes" under a YAML 1.2 core-schema parser.
+          parallel: true
+    # Shared settings for the model benchmarks: enabled, torch.distributed
+    # mode with 8 processes on 1 node, PyTorch as the only framework.
+    default_pytorch_mode: &default_pytorch_mode
+      enable: true
+      modes:
+        - name: torch.distributed
+          proc_num: 8
+          node_num: 1
+      frameworks: [pytorch]
+    # Default parameters merged into every model benchmark's `parameters`
+    # mapping; individual benchmarks override single keys as needed.
+    common_model_config: &common_model_config
+      duration: 0
+      num_warmup: 64
+      num_steps: 2048
+      sample_count: 8192
+      batch_size: 32
+      precision:
+        - float32
+        - float16
+      model_action:
+        - train
+      # Canonical boolean: `yes` is only a boolean under YAML 1.1.
+      pin_memory: true
+  benchmarks:
+    # Uses the shared local-mode settings with no overrides.
+    kernel-launch:
+      <<: *default_local_mode
+    # Runs in MPI mode with 8 processes; the `env` mapping sets the NCCL
+    # socket interface name and the NCCL_IB_GDR_LEVEL value for the run.
+    rccl-bw:
+      enable: true
+      modes:
+        - name: mpi
+          proc_num: 8
+          env:
+            NCCL_SOCKET_IFNAME: ens17f0
+            NCCL_IB_GDR_LEVEL: 1
+      parameters:
+        maxbytes: 128M
+        minbytes: 32M
+        iters: 50
+        ngpus: 1
+        operations: allreduce
+    # Uses the shared local-mode settings with no overrides.
+    mem-bw:
+      <<: *default_local_mode
+    # Shared local-mode settings plus explicit m/n/k parameter values.
+    gemm-flops:
+      <<: *default_local_mode
+      parameters:
+        m: 7680
+        n: 8192
+        k: 8192
+    # Local mode with 4 processes; each command is prefixed with its
+    # PROC_RANK and the fixed IB_DEVICES list 0,1,2,3.
+    ib-loopback:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 4
+          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1,2,3
+          # Canonical boolean: `no` is only a boolean under YAML 1.1.
+          parallel: false
+    # Disabled in this config; no block devices are listed.
+    disk-benchmark:
+      enable: false
+      modes:
+        - name: local
+          proc_num: 1
+          # Canonical boolean: `no` is only a boolean under YAML 1.1.
+          parallel: false
+      parameters:
+        block_devices: []
+    # Disabled in this config. When enabled, 32 local processes are started;
+    # the prefix maps each rank to a device index (rank % 8) and binds CPU
+    # and memory to a NUMA node (rank % 4) via numactl.
+    gpu-sm-copy-bw:
+      enable: false
+      modes:
+        - name: local
+          proc_num: 32
+          # HIP_VISIBLE_DEVICES matches the device-selection variable used by
+          # default_local_mode in this AMD config.
+          # NOTE(review): confirm the benchmark binary is built for ROCm.
+          prefix: HIP_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
+          # Canonical boolean: `no` is only a boolean under YAML 1.1.
+          parallel: false
+      parameters:
+        dtoh: true
+        htod: true
+    # GPT benchmark: shared PyTorch mode, common model parameters with
+    # batch_size overridden to 8 and seq_len added.
+    gpt_models:
+      <<: *default_pytorch_mode
+      models: [gpt2-large]
+      parameters:
+        <<: *common_model_config
+        batch_size: 8
+        seq_len: 224
+    # BERT benchmarks: shared PyTorch mode, common model parameters plus
+    # seq_len.
+    bert_models:
+      <<: *default_pytorch_mode
+      models: [bert-base, bert-large]
+      parameters:
+        <<: *common_model_config
+        seq_len: 224
+    # LSTM benchmark: shared PyTorch mode; overrides batch_size and
+    # pin_memory from the common model parameters and adds LSTM sizes.
+    lstm_models:
+      <<: *default_pytorch_mode
+      models:
+        - lstm
+      parameters:
+        <<: *common_model_config
+        batch_size: 224
+        input_size: 224
+        hidden_size: 1000
+        seq_len: 32
+        # Explicit key overrides the merged pin_memory default. Canonical
+        # boolean: `no` is only a boolean under YAML 1.1.
+        pin_memory: false
+    # ResNet benchmarks: shared PyTorch mode and common model parameters,
+    # with pin_memory overridden.
+    resnet_models:
+      <<: *default_pytorch_mode
+      models:
+        - resnet50
+        - resnet101
+        - resnet152
+      parameters:
+        <<: *common_model_config
+        # Explicit key overrides the merged pin_memory default. Canonical
+        # boolean: `no` is only a boolean under YAML 1.1.
+        pin_memory: false
+    # DenseNet benchmarks: shared PyTorch mode and common model parameters,
+    # with pin_memory overridden.
+    densenet_models:
+      <<: *default_pytorch_mode
+      models:
+        - densenet169
+        - densenet201
+      parameters:
+        <<: *common_model_config
+        # Explicit key overrides the merged pin_memory default. Canonical
+        # boolean: `no` is only a boolean under YAML 1.1.
+        pin_memory: false
+    # VGG benchmarks: shared PyTorch mode and common model parameters,
+    # with pin_memory overridden.
+    vgg_models:
+      <<: *default_pytorch_mode
+      models:
+        - vgg11
+        - vgg13
+        - vgg16
+        - vgg19
+      parameters:
+        <<: *common_model_config
+        # Explicit key overrides the merged pin_memory default. Canonical
+        # boolean: `no` is only a boolean under YAML 1.1.
+        pin_memory: false