Benchmarks: Update - Update benchmarks in configuration file (#208)

**Description**
Update the benchmarks in the configuration files for single-node validation of SuperBench v0.3.

**Major Revision**
- Fix the nccl-bw/rccl-bw parameters used for single-node validation in the configs.
- Update the benchmarks, including newly added ones, in amd_mi100_hpe.yaml, amd_mi100_z53.yaml, and azure_ndv4.yaml.
- Fix the wrong GPU-visibility prefix (use HIP_VISIBLE_DEVICES instead of CUDA_VISIBLE_DEVICES in the AMD MI100 configs).

Benchmarks: Update - Update benchmarks in configuration file (#208)
**Description**
Update the benchmarks in the configuration files for single-node validation of SuperBench v0.3.

**Major Revision**
- Fix the nccl-bw/rccl-bw parameters used for single-node validation in the configs.
- Update the benchmarks, including newly added ones, in amd_mi100_hpe.yaml, amd_mi100_z53.yaml, and azure_ndv4.yaml.
- Fix the wrong GPU-visibility prefix (use HIP_VISIBLE_DEVICES instead of CUDA_VISIBLE_DEVICES in the AMD MI100 configs).
a58f218b · Yuting Jiang · GitHub · 6fb0fb12 · a58f218b · a58f218b
Unverified Commit a58f218b authored Sep 23, 2021 by Yuting Jiang Committed by GitHub Sep 23, 2021
4 changed files
--- a/superbench/config/amd_mi100_hpe.yaml
+++ b/superbench/config/amd_mi100_hpe.yaml
@@ -40,16 +40,12 @@ superbench:
    rccl-bw:
      enable: true
      modes:
-        - name: mpi
-          proc_num: 8
-          env:  
-            NCCL_SOCKET_IFNAME: ens17f0 
-            NCCL_IB_GDR_LEVEL: 1
+        - name: local
+          proc_num: 1
+          parallel: no
      parameters:
-        maxbytes: 128M
-        minbytes: 32M
-        iters: 50
-        ngpus: 1
+        maxbytes: 8G
+        ngpus: 8
        operation: allreduce
    mem-bw:
      <<: *default_local_mode
@@ -79,7 +75,7 @@ superbench:
      modes:
        - name: local
          proc_num: 32
-          prefix: CUDA_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
+          prefix: HIP_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
          parallel: no
      parameters:
        mem_type:

--- a/superbench/config/amd_mi100_z53.yaml
+++ b/superbench/config/amd_mi100_z53.yaml
@@ -13,7 +13,7 @@ superbench:
      modes:
        - name: local
          proc_num: 8
-          prefix: CUDA_VISIBLE_DEVICES={proc_rank}
+          prefix: HIP_VISIBLE_DEVICES={proc_rank}
          parallel: yes
    default_pytorch_mode: &default_pytorch_mode
      enable: true
@@ -36,6 +36,52 @@ superbench:
        - train
      pin_memory: yes
  benchmarks:
+    kernel-launch:
+      <<: *default_local_mode
+    rccl-bw:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        maxbytes: 8G
+        ngpus: 8
+        operation: allreduce
+    mem-bw:
+      <<: *default_local_mode
+    gemm-flops:
+      <<: *default_local_mode
+      parameters:
+        m: 7680 
+        n: 8192 
+        k: 8192
+    ib-loopback:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 2
+          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,1
+          parallel: no
+    disk-benchmark:
+      enable: false
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        block_devices: []
+    gpu-sm-copy-bw:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 32
+          prefix: HIP_VISIBLE_DEVICES=$(({proc_rank}%8)) numactl -N $(({proc_rank}%4)) -m $(({proc_rank}%4))
+          parallel: no
+      parameters:
+        mem_type:
+          - dtoh
+          - htod
    gpt_models:
      <<: *default_pytorch_mode
      models:

--- a/superbench/config/azure_ndv4.yaml
+++ b/superbench/config/azure_ndv4.yaml
@@ -35,6 +35,32 @@ superbench:
      <<: *default_local_mode
    gemm-flops:
      <<: *default_local_mode
+    nccl-bw:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 1
+          parallel: no
+      parameters:
+        ngpus: 8
+    ib-loopback:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 4
+          prefix: PROC_RANK={proc_rank} IB_DEVICES=0,2,4,6 NUMA_NODES=1,0,3,2
+          parallel: yes
+        - name: local
+          proc_num: 4
+          prefix: PROC_RANK={proc_rank} IB_DEVICES=1,3,5,7 NUMA_NODES=1,0,3,2
+          parallel: yes
+    mem-bw:
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: CUDA_VISIBLE_DEVICES={proc_rank} numactl -c $(({proc_rank}/2))
+          parallel: yes
    disk-benchmark:
      enable: false
      modes:

--- a/superbench/config/default.yaml
+++ b/superbench/config/default.yaml
@@ -32,7 +32,10 @@ superbench:
      enable: true
      modes:
        - name: local
-          prefix: NCCL_DEBUG=INFO NCCL_IB_DISABLE=1
+          proc_num: 1
+          parallel: no
+      parameters:
+        ngpus: 8
    ib-loopback:
      enable: true
      modes: