Unverified Commit 52848d2f authored by guoshzhao's avatar guoshzhao Committed by GitHub
Browse files

Benchmarks: Add Feature - Add flag to disable GPU. (#15)



* add flag to disable GPU.

* fix spelling

* fix unittest.

* address comments.
Co-authored-by: Guoshuai Zhao <guzhao@microsoft.com>
parent 83a4e93f
...@@ -56,6 +56,7 @@ def __init__(self, name, parameters=''): ...@@ -56,6 +56,7 @@ def __init__(self, name, parameters=''):
self._loss_fn = None self._loss_fn = None
self._target = None self._target = None
self._supported_precision = [] self._supported_precision = []
self._gpu_available = None
def add_parser_arguments(self): def add_parser_arguments(self):
"""Add the specified arguments.""" """Add the specified arguments."""
...@@ -66,21 +67,21 @@ def add_parser_arguments(self): ...@@ -66,21 +67,21 @@ def add_parser_arguments(self):
type=int, type=int,
default=64, default=64,
required=False, required=False,
help='The number of warmup step', help='The number of warmup step.',
) )
self._parser.add_argument( self._parser.add_argument(
'--num_steps', '--num_steps',
type=int, type=int,
default=2048, default=2048,
required=False, required=False,
help='The number of test step', help='The number of test step.',
) )
self._parser.add_argument( self._parser.add_argument(
'--batch_size', '--batch_size',
type=int, type=int,
default=32, default=32,
required=False, required=False,
help='The number of batch size', help='The number of batch size.',
) )
self._parser.add_argument( self._parser.add_argument(
'--precision', '--precision',
...@@ -103,7 +104,7 @@ def add_parser_arguments(self): ...@@ -103,7 +104,7 @@ def add_parser_arguments(self):
type=DistributedImpl, type=DistributedImpl,
default=None, default=None,
required=False, required=False,
help='Distributed implementations. E.g. {}'.format(' '.join(DistributedImpl.get_values())), help='Distributed implementations. E.g. {}.'.format(' '.join(DistributedImpl.get_values())),
) )
self._parser.add_argument( self._parser.add_argument(
...@@ -111,9 +112,21 @@ def add_parser_arguments(self): ...@@ -111,9 +112,21 @@ def add_parser_arguments(self):
type=DistributedBackend, type=DistributedBackend,
default=None, default=None,
required=False, required=False,
help='Distributed backends. E.g. {}'.format(' '.join(DistributedBackend.get_values())), help='Distributed backends. E.g. {}.'.format(' '.join(DistributedBackend.get_values())),
)
self._parser.add_argument(
'--no_gpu',
action='store_true',
default=False,
help='Disable GPU training.',
) )
@abstractmethod
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
pass
@abstractmethod @abstractmethod
def _init_distributed_setting(self): def _init_distributed_setting(self):
"""Initialize the distributed library and bind the worker to GPU. """Initialize the distributed library and bind the worker to GPU.
...@@ -150,6 +163,9 @@ def _preprocess(self): ...@@ -150,6 +163,9 @@ def _preprocess(self):
if not super()._preprocess(): if not super()._preprocess():
return False return False
self._judge_gpu_availability()
logger.info('GPU availability - model: {}, availability: {}.'.format(self._name, self._gpu_available))
if not self._init_distributed_setting(): if not self._init_distributed_setting():
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE) self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
return False return False
......
...@@ -27,6 +27,10 @@ def __init__(self, name, parameters=''): ...@@ -27,6 +27,10 @@ def __init__(self, name, parameters=''):
self._framework = Framework.PYTORCH self._framework = Framework.PYTORCH
torch.backends.cudnn.benchmark = True torch.backends.cudnn.benchmark = True
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
self._gpu_available = not self._args.no_gpu and torch.cuda.is_available()
def _init_distributed_setting(self): def _init_distributed_setting(self):
"""Initialize the distributed library and bind the worker to GPU. """Initialize the distributed library and bind the worker to GPU.
...@@ -63,7 +67,7 @@ def _init_distributed_setting(self): ...@@ -63,7 +67,7 @@ def _init_distributed_setting(self):
) )
return False return False
if torch.cuda.is_available(): if self._gpu_available:
torch.cuda.set_device(self._local_rank) torch.cuda.set_device(self._local_rank)
return True return True
......
...@@ -28,7 +28,7 @@ def add_parser_arguments(self): ...@@ -28,7 +28,7 @@ def add_parser_arguments(self):
type=int, type=int,
default=1024, default=1024,
required=False, required=False,
help='Hidden size', help='Hidden size.',
) )
self._parser.add_argument( self._parser.add_argument(
...@@ -36,9 +36,13 @@ def add_parser_arguments(self): ...@@ -36,9 +36,13 @@ def add_parser_arguments(self):
type=int, type=int,
default=512, default=512,
required=False, required=False,
help='Sequence length', help='Sequence length.',
) )
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
self._gpu_available = False
def _init_distributed_setting(self): def _init_distributed_setting(self):
"""Initialize the distributed library and bind the worker to GPU.""" """Initialize the distributed library and bind the worker to GPU."""
return True return True
...@@ -141,9 +145,9 @@ def test_arguments_related_interfaces(): ...@@ -141,9 +145,9 @@ def test_arguments_related_interfaces():
"""optional arguments: """optional arguments:
--run_count int The run count of benchmark. --run_count int The run count of benchmark.
--duration int The elapsed time of benchmark in seconds. --duration int The elapsed time of benchmark in seconds.
--num_warmup int The number of warmup step --num_warmup int The number of warmup step.
--num_steps int The number of test step --num_steps int The number of test step.
--batch_size int The number of batch size --batch_size int The number of batch size.
--precision Precision [Precision ...] --precision Precision [Precision ...]
Model precision. E.g. float16 float32 float64 bfloat16 Model precision. E.g. float16 float32 float64 bfloat16
uint8 int8 int16 int32 int64. uint8 int8 int16 int32 int64.
...@@ -151,11 +155,12 @@ def test_arguments_related_interfaces(): ...@@ -151,11 +155,12 @@ def test_arguments_related_interfaces():
Benchmark model process. E.g. train inference. Benchmark model process. E.g. train inference.
--distributed_impl DistributedImpl --distributed_impl DistributedImpl
Distributed implementations. E.g. ddp mirrored Distributed implementations. E.g. ddp mirrored
multiworkermirrored parameterserver horovod multiworkermirrored parameterserver horovod.
--distributed_backend DistributedBackend --distributed_backend DistributedBackend
Distributed backends. E.g. nccl mpi gloo Distributed backends. E.g. nccl mpi gloo.
--hidden_size int Hidden size --no_gpu Disable GPU training.
--seq_len int Sequence length""" --hidden_size int Hidden size.
--seq_len int Sequence length."""
) )
assert (settings == expected_settings) assert (settings == expected_settings)
...@@ -171,9 +176,9 @@ def test_preprocess(): ...@@ -171,9 +176,9 @@ def test_preprocess():
"""optional arguments: """optional arguments:
--run_count int The run count of benchmark. --run_count int The run count of benchmark.
--duration int The elapsed time of benchmark in seconds. --duration int The elapsed time of benchmark in seconds.
--num_warmup int The number of warmup step --num_warmup int The number of warmup step.
--num_steps int The number of test step --num_steps int The number of test step.
--batch_size int The number of batch size --batch_size int The number of batch size.
--precision Precision [Precision ...] --precision Precision [Precision ...]
Model precision. E.g. float16 float32 float64 bfloat16 Model precision. E.g. float16 float32 float64 bfloat16
uint8 int8 int16 int32 int64. uint8 int8 int16 int32 int64.
...@@ -181,11 +186,12 @@ def test_preprocess(): ...@@ -181,11 +186,12 @@ def test_preprocess():
Benchmark model process. E.g. train inference. Benchmark model process. E.g. train inference.
--distributed_impl DistributedImpl --distributed_impl DistributedImpl
Distributed implementations. E.g. ddp mirrored Distributed implementations. E.g. ddp mirrored
multiworkermirrored parameterserver horovod multiworkermirrored parameterserver horovod.
--distributed_backend DistributedBackend --distributed_backend DistributedBackend
Distributed backends. E.g. nccl mpi gloo Distributed backends. E.g. nccl mpi gloo.
--hidden_size int Hidden size --no_gpu Disable GPU training.
--seq_len int Sequence length""" --hidden_size int Hidden size.
--seq_len int Sequence length."""
) )
print(settings) print(settings)
assert (settings == expected_settings) assert (settings == expected_settings)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment