Unverified Commit 52848d2f authored by guoshzhao's avatar guoshzhao Committed by GitHub
Browse files

Benchmarks: Add Feature - Add flag to disable GPU. (#15)



* add flag to disable GPU.

* fix spelling

* fix unittest.

* address comments.
Co-authored-by: Guoshuai Zhao <guzhao@microsoft.com>
parent 83a4e93f
...@@ -56,6 +56,7 @@ def __init__(self, name, parameters=''): ...@@ -56,6 +56,7 @@ def __init__(self, name, parameters=''):
self._loss_fn = None self._loss_fn = None
self._target = None self._target = None
self._supported_precision = [] self._supported_precision = []
self._gpu_available = None
def add_parser_arguments(self): def add_parser_arguments(self):
"""Add the specified arguments.""" """Add the specified arguments."""
...@@ -66,21 +67,21 @@ def add_parser_arguments(self): ...@@ -66,21 +67,21 @@ def add_parser_arguments(self):
type=int, type=int,
default=64, default=64,
required=False, required=False,
help='The number of warmup step', help='The number of warmup step.',
) )
self._parser.add_argument( self._parser.add_argument(
'--num_steps', '--num_steps',
type=int, type=int,
default=2048, default=2048,
required=False, required=False,
help='The number of test step', help='The number of test step.',
) )
self._parser.add_argument( self._parser.add_argument(
'--batch_size', '--batch_size',
type=int, type=int,
default=32, default=32,
required=False, required=False,
help='The number of batch size', help='The number of batch size.',
) )
self._parser.add_argument( self._parser.add_argument(
'--precision', '--precision',
...@@ -103,7 +104,7 @@ def add_parser_arguments(self): ...@@ -103,7 +104,7 @@ def add_parser_arguments(self):
type=DistributedImpl, type=DistributedImpl,
default=None, default=None,
required=False, required=False,
help='Distributed implementations. E.g. {}'.format(' '.join(DistributedImpl.get_values())), help='Distributed implementations. E.g. {}.'.format(' '.join(DistributedImpl.get_values())),
) )
self._parser.add_argument( self._parser.add_argument(
...@@ -111,9 +112,21 @@ def add_parser_arguments(self): ...@@ -111,9 +112,21 @@ def add_parser_arguments(self):
type=DistributedBackend, type=DistributedBackend,
default=None, default=None,
required=False, required=False,
help='Distributed backends. E.g. {}'.format(' '.join(DistributedBackend.get_values())), help='Distributed backends. E.g. {}.'.format(' '.join(DistributedBackend.get_values())),
)
self._parser.add_argument(
'--no_gpu',
action='store_true',
default=False,
help='Disable GPU training.',
) )
@abstractmethod
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
pass
@abstractmethod @abstractmethod
def _init_distributed_setting(self): def _init_distributed_setting(self):
"""Initialize the distributed library and bind the worker to GPU. """Initialize the distributed library and bind the worker to GPU.
...@@ -150,6 +163,9 @@ def _preprocess(self): ...@@ -150,6 +163,9 @@ def _preprocess(self):
if not super()._preprocess(): if not super()._preprocess():
return False return False
self._judge_gpu_availability()
logger.info('GPU availability - model: {}, availability: {}.'.format(self._name, self._gpu_available))
if not self._init_distributed_setting(): if not self._init_distributed_setting():
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE) self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
return False return False
......
...@@ -27,6 +27,10 @@ def __init__(self, name, parameters=''): ...@@ -27,6 +27,10 @@ def __init__(self, name, parameters=''):
self._framework = Framework.PYTORCH self._framework = Framework.PYTORCH
torch.backends.cudnn.benchmark = True torch.backends.cudnn.benchmark = True
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
self._gpu_available = not self._args.no_gpu and torch.cuda.is_available()
def _init_distributed_setting(self): def _init_distributed_setting(self):
"""Initialize the distributed library and bind the worker to GPU. """Initialize the distributed library and bind the worker to GPU.
...@@ -63,7 +67,7 @@ def _init_distributed_setting(self): ...@@ -63,7 +67,7 @@ def _init_distributed_setting(self):
) )
return False return False
if torch.cuda.is_available(): if self._gpu_available:
torch.cuda.set_device(self._local_rank) torch.cuda.set_device(self._local_rank)
return True return True
......
...@@ -28,7 +28,7 @@ def add_parser_arguments(self): ...@@ -28,7 +28,7 @@ def add_parser_arguments(self):
type=int, type=int,
default=1024, default=1024,
required=False, required=False,
help='Hidden size', help='Hidden size.',
) )
self._parser.add_argument( self._parser.add_argument(
...@@ -36,9 +36,13 @@ def add_parser_arguments(self): ...@@ -36,9 +36,13 @@ def add_parser_arguments(self):
type=int, type=int,
default=512, default=512,
required=False, required=False,
help='Sequence length', help='Sequence length.',
) )
def _judge_gpu_availability(self):
"""Judge GPUs' availability according to arguments and running environment."""
self._gpu_available = False
def _init_distributed_setting(self): def _init_distributed_setting(self):
"""Initialize the distributed library and bind the worker to GPU.""" """Initialize the distributed library and bind the worker to GPU."""
return True return True
...@@ -141,9 +145,9 @@ def test_arguments_related_interfaces(): ...@@ -141,9 +145,9 @@ def test_arguments_related_interfaces():
"""optional arguments: """optional arguments:
--run_count int The run count of benchmark. --run_count int The run count of benchmark.
--duration int The elapsed time of benchmark in seconds. --duration int The elapsed time of benchmark in seconds.
--num_warmup int The number of warmup step --num_warmup int The number of warmup step.
--num_steps int The number of test step --num_steps int The number of test step.
--batch_size int The number of batch size --batch_size int The number of batch size.
--precision Precision [Precision ...] --precision Precision [Precision ...]
Model precision. E.g. float16 float32 float64 bfloat16 Model precision. E.g. float16 float32 float64 bfloat16
uint8 int8 int16 int32 int64. uint8 int8 int16 int32 int64.
...@@ -151,11 +155,12 @@ def test_arguments_related_interfaces(): ...@@ -151,11 +155,12 @@ def test_arguments_related_interfaces():
Benchmark model process. E.g. train inference. Benchmark model process. E.g. train inference.
--distributed_impl DistributedImpl --distributed_impl DistributedImpl
Distributed implementations. E.g. ddp mirrored Distributed implementations. E.g. ddp mirrored
multiworkermirrored parameterserver horovod multiworkermirrored parameterserver horovod.
--distributed_backend DistributedBackend --distributed_backend DistributedBackend
Distributed backends. E.g. nccl mpi gloo Distributed backends. E.g. nccl mpi gloo.
--hidden_size int Hidden size --no_gpu Disable GPU training.
--seq_len int Sequence length""" --hidden_size int Hidden size.
--seq_len int Sequence length."""
) )
assert (settings == expected_settings) assert (settings == expected_settings)
...@@ -171,9 +176,9 @@ def test_preprocess(): ...@@ -171,9 +176,9 @@ def test_preprocess():
"""optional arguments: """optional arguments:
--run_count int The run count of benchmark. --run_count int The run count of benchmark.
--duration int The elapsed time of benchmark in seconds. --duration int The elapsed time of benchmark in seconds.
--num_warmup int The number of warmup step --num_warmup int The number of warmup step.
--num_steps int The number of test step --num_steps int The number of test step.
--batch_size int The number of batch size --batch_size int The number of batch size.
--precision Precision [Precision ...] --precision Precision [Precision ...]
Model precision. E.g. float16 float32 float64 bfloat16 Model precision. E.g. float16 float32 float64 bfloat16
uint8 int8 int16 int32 int64. uint8 int8 int16 int32 int64.
...@@ -181,11 +186,12 @@ def test_preprocess(): ...@@ -181,11 +186,12 @@ def test_preprocess():
Benchmark model process. E.g. train inference. Benchmark model process. E.g. train inference.
--distributed_impl DistributedImpl --distributed_impl DistributedImpl
Distributed implementations. E.g. ddp mirrored Distributed implementations. E.g. ddp mirrored
multiworkermirrored parameterserver horovod multiworkermirrored parameterserver horovod.
--distributed_backend DistributedBackend --distributed_backend DistributedBackend
Distributed backends. E.g. nccl mpi gloo Distributed backends. E.g. nccl mpi gloo.
--hidden_size int Hidden size --no_gpu Disable GPU training.
--seq_len int Sequence length""" --hidden_size int Hidden size.
--seq_len int Sequence length."""
) )
print(settings) print(settings)
assert (settings == expected_settings) assert (settings == expected_settings)
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment