Unverified Commit 52848d2f authored by guoshzhao, committed by GitHub
Browse files

Benchmarks: Add Feature - Add flag to disable GPU. (#15)



* add flag to disable GPU.

* fix spelling

* fix unittest.

* address comments.
Co-authored-by: Guoshuai Zhao <guzhao@microsoft.com>
parent 83a4e93f
......@@ -56,6 +56,7 @@ def __init__(self, name, parameters=''):
self._loss_fn = None
self._target = None
self._supported_precision = []
self._gpu_available = None
def add_parser_arguments(self):
"""Add the specified arguments."""
......@@ -66,21 +67,21 @@ def add_parser_arguments(self):
type=int,
default=64,
required=False,
help='The number of warmup step',
help='The number of warmup step.',
)
self._parser.add_argument(
'--num_steps',
type=int,
default=2048,
required=False,
help='The number of test step',
help='The number of test step.',
)
self._parser.add_argument(
'--batch_size',
type=int,
default=32,
required=False,
help='The number of batch size',
help='The number of batch size.',
)
self._parser.add_argument(
'--precision',
......@@ -103,7 +104,7 @@ def add_parser_arguments(self):
type=DistributedImpl,
default=None,
required=False,
help='Distributed implementations. E.g. {}'.format(' '.join(DistributedImpl.get_values())),
help='Distributed implementations. E.g. {}.'.format(' '.join(DistributedImpl.get_values())),
)
self._parser.add_argument(
......@@ -111,9 +112,21 @@ def add_parser_arguments(self):
type=DistributedBackend,
default=None,
required=False,
help='Distributed backends. E.g. {}'.format(' '.join(DistributedBackend.get_values())),
help='Distributed backends. E.g. {}.'.format(' '.join(DistributedBackend.get_values())),
)
self._parser.add_argument(
'--no_gpu',
action='store_true',
default=False,
help='Disable GPU training.',
)
@abstractmethod
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
pass
@abstractmethod
def _init_distributed_setting(self):
"""Initialize the distributed library and bind the worker to GPU.
......@@ -150,6 +163,9 @@ def _preprocess(self):
if not super()._preprocess():
return False
self._judge_gpu_availability()
logger.info('GPU availablility - model: {}, availablility: {}.'.format(self._name, self._gpu_available))
if not self._init_distributed_setting():
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
return False
......
......@@ -27,6 +27,10 @@ def __init__(self, name, parameters=''):
self._framework = Framework.PYTORCH
torch.backends.cudnn.benchmark = True
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
self._gpu_available = not self._args.no_gpu and torch.cuda.is_available()
def _init_distributed_setting(self):
"""Initialize the distributed library and bind the worker to GPU.
......@@ -63,7 +67,7 @@ def _init_distributed_setting(self):
)
return False
if torch.cuda.is_available():
if self._gpu_available:
torch.cuda.set_device(self._local_rank)
return True
......
......@@ -28,7 +28,7 @@ def add_parser_arguments(self):
type=int,
default=1024,
required=False,
help='Hidden size',
help='Hidden size.',
)
self._parser.add_argument(
......@@ -36,9 +36,13 @@ def add_parser_arguments(self):
type=int,
default=512,
required=False,
help='Sequence length',
help='Sequence length.',
)
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
self._gpu_available = False
def _init_distributed_setting(self):
"""Initialize the distributed library and bind the worker to GPU."""
return True
......@@ -141,9 +145,9 @@ def test_arguments_related_interfaces():
"""optional arguments:
--run_count int The run count of benchmark.
--duration int The elapsed time of benchmark in seconds.
--num_warmup int The number of warmup step
--num_steps int The number of test step
--batch_size int The number of batch size
--num_warmup int The number of warmup step.
--num_steps int The number of test step.
--batch_size int The number of batch size.
--precision Precision [Precision ...]
Model precision. E.g. float16 float32 float64 bfloat16
uint8 int8 int16 int32 int64.
......@@ -151,11 +155,12 @@ def test_arguments_related_interfaces():
Benchmark model process. E.g. train inference.
--distributed_impl DistributedImpl
Distributed implementations. E.g. ddp mirrored
multiworkermirrored parameterserver horovod
multiworkermirrored parameterserver horovod.
--distributed_backend DistributedBackend
Distributed backends. E.g. nccl mpi gloo
--hidden_size int Hidden size
--seq_len int Sequence length"""
Distributed backends. E.g. nccl mpi gloo.
--no_gpu Disable GPU training.
--hidden_size int Hidden size.
--seq_len int Sequence length."""
)
assert (settings == expected_settings)
......@@ -171,9 +176,9 @@ def test_preprocess():
"""optional arguments:
--run_count int The run count of benchmark.
--duration int The elapsed time of benchmark in seconds.
--num_warmup int The number of warmup step
--num_steps int The number of test step
--batch_size int The number of batch size
--num_warmup int The number of warmup step.
--num_steps int The number of test step.
--batch_size int The number of batch size.
--precision Precision [Precision ...]
Model precision. E.g. float16 float32 float64 bfloat16
uint8 int8 int16 int32 int64.
......@@ -181,11 +186,12 @@ def test_preprocess():
Benchmark model process. E.g. train inference.
--distributed_impl DistributedImpl
Distributed implementations. E.g. ddp mirrored
multiworkermirrored parameterserver horovod
multiworkermirrored parameterserver horovod.
--distributed_backend DistributedBackend
Distributed backends. E.g. nccl mpi gloo
--hidden_size int Hidden size
--seq_len int Sequence length"""
Distributed backends. E.g. nccl mpi gloo.
--no_gpu Disable GPU training.
--hidden_size int Hidden size.
--seq_len int Sequence length."""
)
print(settings)
assert (settings == expected_settings)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment