Unverified Commit 52848d2f authored by guoshzhao, committed by GitHub
Browse files

Benchmarks: Add Feature - Add flag to disable GPU. (#15)



* add flag to disable GPU.

* fix spelling

* fix unittest.

* address comments.
Co-authored-by: Guoshuai Zhao <guzhao@microsoft.com>
parent 83a4e93f
......@@ -56,6 +56,7 @@ def __init__(self, name, parameters=''):
self._loss_fn = None
self._target = None
self._supported_precision = []
self._gpu_available = None
def add_parser_arguments(self):
"""Add the specified arguments."""
......@@ -66,21 +67,21 @@ def add_parser_arguments(self):
type=int,
default=64,
required=False,
help='The number of warmup step',
help='The number of warmup step.',
)
self._parser.add_argument(
'--num_steps',
type=int,
default=2048,
required=False,
help='The number of test step',
help='The number of test step.',
)
self._parser.add_argument(
'--batch_size',
type=int,
default=32,
required=False,
help='The number of batch size',
help='The number of batch size.',
)
self._parser.add_argument(
'--precision',
......@@ -103,7 +104,7 @@ def add_parser_arguments(self):
type=DistributedImpl,
default=None,
required=False,
help='Distributed implementations. E.g. {}'.format(' '.join(DistributedImpl.get_values())),
help='Distributed implementations. E.g. {}.'.format(' '.join(DistributedImpl.get_values())),
)
self._parser.add_argument(
......@@ -111,9 +112,21 @@ def add_parser_arguments(self):
type=DistributedBackend,
default=None,
required=False,
help='Distributed backends. E.g. {}'.format(' '.join(DistributedBackend.get_values())),
help='Distributed backends. E.g. {}.'.format(' '.join(DistributedBackend.get_values())),
)
self._parser.add_argument(
'--no_gpu',
action='store_true',
default=False,
help='Disable GPU training.',
)
@abstractmethod
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
pass
@abstractmethod
def _init_distributed_setting(self):
"""Initialize the distributed library and bind the worker to GPU.
......@@ -150,6 +163,9 @@ def _preprocess(self):
if not super()._preprocess():
return False
self._judge_gpu_availability()
logger.info('GPU availablility - model: {}, availablility: {}.'.format(self._name, self._gpu_available))
if not self._init_distributed_setting():
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
return False
......
......@@ -27,6 +27,10 @@ def __init__(self, name, parameters=''):
self._framework = Framework.PYTORCH
torch.backends.cudnn.benchmark = True
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
self._gpu_available = not self._args.no_gpu and torch.cuda.is_available()
def _init_distributed_setting(self):
"""Initialize the distributed library and bind the worker to GPU.
......@@ -63,7 +67,7 @@ def _init_distributed_setting(self):
)
return False
if torch.cuda.is_available():
if self._gpu_available:
torch.cuda.set_device(self._local_rank)
return True
......
......@@ -28,7 +28,7 @@ def add_parser_arguments(self):
type=int,
default=1024,
required=False,
help='Hidden size',
help='Hidden size.',
)
self._parser.add_argument(
......@@ -36,9 +36,13 @@ def add_parser_arguments(self):
type=int,
default=512,
required=False,
help='Sequence length',
help='Sequence length.',
)
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
self._gpu_available = False
def _init_distributed_setting(self):
"""Initialize the distributed library and bind the worker to GPU."""
return True
......@@ -141,9 +145,9 @@ def test_arguments_related_interfaces():
"""optional arguments:
--run_count int The run count of benchmark.
--duration int The elapsed time of benchmark in seconds.
--num_warmup int The number of warmup step
--num_steps int The number of test step
--batch_size int The number of batch size
--num_warmup int The number of warmup step.
--num_steps int The number of test step.
--batch_size int The number of batch size.
--precision Precision [Precision ...]
Model precision. E.g. float16 float32 float64 bfloat16
uint8 int8 int16 int32 int64.
......@@ -151,11 +155,12 @@ def test_arguments_related_interfaces():
Benchmark model process. E.g. train inference.
--distributed_impl DistributedImpl
Distributed implementations. E.g. ddp mirrored
multiworkermirrored parameterserver horovod
multiworkermirrored parameterserver horovod.
--distributed_backend DistributedBackend
Distributed backends. E.g. nccl mpi gloo
--hidden_size int Hidden size
--seq_len int Sequence length"""
Distributed backends. E.g. nccl mpi gloo.
--no_gpu Disable GPU training.
--hidden_size int Hidden size.
--seq_len int Sequence length."""
)
assert (settings == expected_settings)
......@@ -171,9 +176,9 @@ def test_preprocess():
"""optional arguments:
--run_count int The run count of benchmark.
--duration int The elapsed time of benchmark in seconds.
--num_warmup int The number of warmup step
--num_steps int The number of test step
--batch_size int The number of batch size
--num_warmup int The number of warmup step.
--num_steps int The number of test step.
--batch_size int The number of batch size.
--precision Precision [Precision ...]
Model precision. E.g. float16 float32 float64 bfloat16
uint8 int8 int16 int32 int64.
......@@ -181,11 +186,12 @@ def test_preprocess():
Benchmark model process. E.g. train inference.
--distributed_impl DistributedImpl
Distributed implementations. E.g. ddp mirrored
multiworkermirrored parameterserver horovod
multiworkermirrored parameterserver horovod.
--distributed_backend DistributedBackend
Distributed backends. E.g. nccl mpi gloo
--hidden_size int Hidden size
--seq_len int Sequence length"""
Distributed backends. E.g. nccl mpi gloo.
--no_gpu Disable GPU training.
--hidden_size int Hidden size.
--seq_len int Sequence length."""
)
print(settings)
assert (settings == expected_settings)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment