Benchmarks: Add Configuration - Add validation config file for azure NDv4. (#103)

* add config file for ndv4.

Benchmarks: Add Configuration - Add validation config file for azure NDv4. (#103)
* add config file for ndv4.
f22bb3f2 · guoshzhao · GitHub · 9c748527 · f22bb3f2 · f22bb3f2
Unverified Commit f22bb3f2 authored Jun 28, 2021 by guoshzhao Committed by GitHub Jun 28, 2021
Showing with 108 additions and 1 deletion

superbench/benchmarks/model_benchmarks/model_base.py superbench/benchmarks/model_benchmarks/model_base.py +5 -1

superbench/config/azure_ndv4.yaml superbench/config/azure_ndv4.yaml +103 -0

No files found.
--- a/superbench/benchmarks/model_benchmarks/model_base.py
+++ b/superbench/benchmarks/model_benchmarks/model_base.py
@@ -164,7 +164,11 @@ class ModelBenchmark(Benchmark):
            return False
        self._judge_gpu_availability()
-        logger.info('GPU availablility - model: {}, availablility: {}.'.format(self._name, self._gpu_available))
+        logger.info(
+            'Model placement - model: {}, GPU availablility: {}, pin memory: {}.'.format(
+                self._name, self._gpu_available, self._args.pin_memory
+            )
+        )
        if not self._init_distributed_setting():
            self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)

--- a/superbench/config/azure_ndv4.yaml
+++ b/superbench/config/azure_ndv4.yaml
+# SuperBench Config: validation settings for Azure NDv4.
+superbench:
+  enable: null
+  var:
+    default_local_mode: &default_local_mode  # anchor merged by kernel-launch, gemm-flops, cudnn-function, cublas-function and matmul
+      enable: true
+      modes:
+        - name: local
+          proc_num: 8
+          prefix: CUDA_VISIBLE_DEVICES={proc_rank}
+          parallel: yes  # NOTE(review): `yes`/`no` are booleans only under YAML 1.1 rules - confirm the loader
+    default_pytorch_mode: &default_pytorch_mode  # anchor merged by sharding-matmul, computation-communication-overlap and every *_models entry
+      enable: true
+      modes:
+        - name: torch.distributed
+          proc_num: 8
+          node_num: 1
+      frameworks:
+        - pytorch
+    common_model_config: &common_model_config  # shared defaults merged into the `parameters` map of each *_models entry
+      duration: 0
+      num_warmup: 64
+      num_steps: 2048
+      sample_count: 8192
+      batch_size: 32
+      precision:
+        - float32
+        - float16
+      model_action:
+        - train
+      pin_memory: yes  # default; lstm, densenet and vgg entries below set it to `no`
+  benchmarks:
+    kernel-launch:
+      <<: *default_local_mode
+    gemm-flops:
+      <<: *default_local_mode
+    cudnn-function:
+      <<: *default_local_mode
+    cublas-function:
+      <<: *default_local_mode
+    matmul:
+      <<: *default_local_mode
+      frameworks:  # extra key: default_local_mode itself defines no `frameworks`
+        - pytorch
+    sharding-matmul:
+      <<: *default_pytorch_mode
+    computation-communication-overlap:
+      <<: *default_pytorch_mode
+    gpt_models:
+      <<: *default_pytorch_mode
+      models:
+        - gpt2-large
+      parameters:
+        <<: *common_model_config
+        batch_size: 8  # overrides the merged default of 32
+        seq_len: 224  # not part of common_model_config; set per entry
+    bert_models:
+      <<: *default_pytorch_mode
+      models:
+        - bert-base
+        - bert-large
+      parameters:
+        <<: *common_model_config
+        seq_len: 224  # not part of common_model_config; set per entry
+    lstm_models:
+      <<: *default_pytorch_mode
+      models:
+        - lstm
+      parameters:
+        <<: *common_model_config
+        batch_size: 224  # overrides the merged default of 32
+        input_size: 224
+        hidden_size: 1000
+        seq_len: 32
+        pin_memory: no  # overrides the merged default of `yes`
+    resnet_models:
+      <<: *default_pytorch_mode
+      models:
+        - resnet50
+        - resnet101
+        - resnet152
+      parameters:
+        <<: *common_model_config
+        batch_size: 192  # overrides the merged default of 32
+        num_steps: 512  # overrides the merged default of 2048
+    densenet_models:
+      <<: *default_pytorch_mode
+      models:
+        - densenet169
+        - densenet201
+      parameters:
+        <<: *common_model_config
+        pin_memory: no  # overrides the merged default of `yes`
+    vgg_models:
+      <<: *default_pytorch_mode
+      models:
+        - vgg11
+        - vgg13
+        - vgg16
+        - vgg19
+      parameters:
+        <<: *common_model_config
+        pin_memory: no  # overrides the merged default of `yes`