Unverified Commit 216c5b5c authored by guoshzhao, committed by GitHub


Benchmarks: Add Feature - Add DistributedImpl and DistributedBackend arguments for micro benchmark. (#100)
parent 3d72c078
@@ -6,7 +6,8 @@
 import importlib
 from superbench.benchmarks.return_code import ReturnCode
-from superbench.benchmarks.context import Platform, Framework, Precision, ModelAction, BenchmarkType, BenchmarkContext
+from superbench.benchmarks.context import Platform, Framework, Precision, ModelAction, \
+    DistributedImpl, DistributedBackend, BenchmarkType, BenchmarkContext
 from superbench.common.utils import LazyImport
 
 BenchmarkRegistry = LazyImport(
@@ -21,6 +22,6 @@
 )
 
 __all__ = [
-    'ReturnCode', 'Platform', 'Framework', 'BenchmarkType', 'Precision', 'ModelAction', 'BenchmarkContext',
-    'BenchmarkRegistry'
+    'ReturnCode', 'Platform', 'Framework', 'BenchmarkType', 'Precision', 'ModelAction', 'DistributedImpl',
+    'DistributedBackend', 'BenchmarkContext', 'BenchmarkRegistry'
 ]
@@ -61,6 +61,22 @@ class ModelAction(Enum):
     INFERENCE = 'inference'
 
 
+class DistributedImpl(Enum):
+    """The Enum class representing different distributed implementations."""
+    DDP = 'ddp'
+    MIRRORED = 'mirrored'
+    MW_MIRRORED = 'multiworkermirrored'
+    PS = 'parameterserver'
+    HOROVOD = 'horovod'
+
+
+class DistributedBackend(Enum):
+    """The Enum class representing different distributed backends."""
+    NCCL = 'nccl'
+    MPI = 'mpi'
+    GLOO = 'gloo'
+
+
 class BenchmarkContext():
     """Context class of all benchmarks.
......
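Review note: the `--distributed_impl` help strings added below call `DistributedImpl.get_values()`, so the `Enum` imported from `superbench.benchmarks.context` is presumably a customized base that exposes member values. This diff does not show that base, so the following is only a minimal sketch of how such a helper could look; `ValueEnum` and its method body are assumptions, not the repository's actual implementation:

```python
from enum import Enum


class ValueEnum(Enum):
    """Hypothetical stand-in for the customized Enum in superbench.benchmarks.context."""
    @classmethod
    def get_values(cls):
        # Collect the string value of every member, e.g. ['ddp', 'mirrored', ...].
        return [member.value for member in cls]


class DistributedImpl(ValueEnum):
    DDP = 'ddp'
    MIRRORED = 'mirrored'
    MW_MIRRORED = 'multiworkermirrored'
    PS = 'parameterserver'
    HOROVOD = 'horovod'


print(' '.join(DistributedImpl.get_values()))
# -> ddp mirrored multiworkermirrored parameterserver horovod
```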
@@ -21,7 +21,7 @@
 import torch
 
 from superbench.common.utils import logger
-from superbench.benchmarks import BenchmarkRegistry, ReturnCode
+from superbench.benchmarks import DistributedImpl, DistributedBackend, BenchmarkRegistry, ReturnCode
 from superbench.benchmarks.micro_benchmarks import MicroBenchmark
 from superbench.benchmarks.context import Enum
@@ -114,6 +114,21 @@ def add_parser_arguments(self):
             required=False,
             help='The number of test step.',
         )
+        self._parser.add_argument(
+            '--distributed_impl',
+            type=DistributedImpl,
+            default=DistributedImpl.DDP,
+            required=False,
+            help='Distributed implementations. E.g. {}.'.format(' '.join(DistributedImpl.get_values())),
+        )
+        self._parser.add_argument(
+            '--distributed_backend',
+            type=DistributedBackend,
+            default=DistributedBackend.NCCL,
+            required=False,
+            help='Distributed backends. E.g. {}.'.format(' '.join(DistributedBackend.get_values())),
+        )
 
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
@@ -124,8 +139,17 @@ def _preprocess(self):
         if not super()._preprocess():
             return False
 
+        if self._args.distributed_impl != DistributedImpl.DDP:
+            self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
+            logger.error(
+                'Unsupported distributed implementation - model: {}, distributed implementation: {}.'.format(
+                    self._name, self._args.distributed_impl
+                )
+            )
+            return False
+
         try:
-            torch.distributed.init_process_group(backend='nccl')
+            torch.distributed.init_process_group(backend=self._args.distributed_backend.value)
             self.__world_size = int(os.environ['WORLD_SIZE'])
             self.__local_rank = int(os.environ['LOCAL_RANK'])
             # if self.__world_size < 2:
......
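For reviewers who want to exercise the changed `_preprocess` path in isolation: below is a minimal standalone sketch of the same pattern, choosing the process-group backend from a flag instead of the previously hard-coded `'nccl'`. The script and flag names are illustrative only; launch with `torchrun` (or any launcher that sets `WORLD_SIZE` and `LOCAL_RANK`), e.g. `torchrun --nproc_per_node=2 init_demo.py --distributed_backend gloo`.

```python
import argparse
import os

import torch.distributed as dist

parser = argparse.ArgumentParser()
parser.add_argument('--distributed_backend', default='nccl', choices=['nccl', 'mpi', 'gloo'])
args = parser.parse_args()

# torchrun exports WORLD_SIZE/LOCAL_RANK (plus MASTER_ADDR/MASTER_PORT for the
# default env:// rendezvous), mirroring the variables _preprocess() reads.
dist.init_process_group(backend=args.distributed_backend)
world_size = int(os.environ['WORLD_SIZE'])
local_rank = int(os.environ['LOCAL_RANK'])
print('rank {} of {} initialized with backend {}'.format(local_rank, world_size, args.distributed_backend))
dist.destroy_process_group()
```

Making the backend configurable matters in practice: `nccl` needs CUDA devices and `mpi` needs a PyTorch build with MPI support, while `gloo` runs on CPU-only hosts.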
@@ -18,7 +18,7 @@
 import torch
 
 from superbench.common.utils import logger
-from superbench.benchmarks import BenchmarkRegistry, ReturnCode
+from superbench.benchmarks import DistributedImpl, DistributedBackend, BenchmarkRegistry, ReturnCode
 from superbench.benchmarks.micro_benchmarks import MicroBenchmark
 from superbench.benchmarks.context import Enum
@@ -91,6 +91,21 @@ def add_parser_arguments(self):
             required=False,
             help='The number of test step.',
         )
+        self._parser.add_argument(
+            '--distributed_impl',
+            type=DistributedImpl,
+            default=DistributedImpl.DDP,
+            required=False,
+            help='Distributed implementations. E.g. {}.'.format(' '.join(DistributedImpl.get_values())),
+        )
+        self._parser.add_argument(
+            '--distributed_backend',
+            type=DistributedBackend,
+            default=DistributedBackend.NCCL,
+            required=False,
+            help='Distributed backends. E.g. {}.'.format(' '.join(DistributedBackend.get_values())),
+        )
 
     def _preprocess(self):
         """Preprocess/preparation operations before the benchmarking.
@@ -101,6 +116,15 @@ def _preprocess(self):
         if not super()._preprocess():
             return False
 
+        if self._args.distributed_impl != DistributedImpl.DDP:
+            self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
+            logger.error(
+                'Unsupported distributed implementation - model: {}, distributed implementation: {}.'.format(
+                    self._name, self._args.distributed_impl
+                )
+            )
+            return False
+
         if ShardingMode.ALLGATHER in self._args.mode or ShardingMode.ALLREDUCE in self._args.mode:
             try:
                 torch.distributed.init_process_group(backend='nccl')
......
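The same two arguments are wired into this benchmark as well. One subtlety worth noting for both files: `argparse` invokes the `type` callable on the raw command-line string, and an `Enum` subclass is such a callable via lookup-by-value, which is what makes `type=DistributedImpl` work. A self-contained sketch (using a local two-member copy of the enum rather than the superbench import):

```python
import argparse
from enum import Enum


class DistributedImpl(Enum):
    DDP = 'ddp'
    HOROVOD = 'horovod'


parser = argparse.ArgumentParser()
# argparse calls DistributedImpl('ddp'), DistributedImpl('horovod'), etc.;
# an invalid string raises ValueError, which argparse turns into a usage error.
parser.add_argument('--distributed_impl', type=DistributedImpl, default=DistributedImpl.DDP)

args = parser.parse_args(['--distributed_impl', 'horovod'])
assert args.distributed_impl is DistributedImpl.HOROVOD
```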
@@ -8,7 +8,7 @@
 from abc import abstractmethod
 
 from superbench.common.utils import logger
-from superbench.benchmarks import Precision, ModelAction, BenchmarkType, ReturnCode
+from superbench.benchmarks import Precision, ModelAction, DistributedImpl, DistributedBackend, BenchmarkType, ReturnCode
 from superbench.benchmarks.base import Benchmark
 from superbench.benchmarks.context import Enum
@@ -20,22 +20,6 @@ class Optimizer(Enum):
     ADAMW = 'adamw'
 
 
-class DistributedImpl(Enum):
-    """The Enum class representing different distributed implementations."""
-    DDP = 'ddp'
-    MIRRORED = 'mirrored'
-    MW_MIRRORED = 'multiworkermirrored'
-    PS = 'parameterserver'
-    HOROVOD = 'horovod'
-
-
-class DistributedBackend(Enum):
-    """The Enum class representing different distributed backends."""
-    NCCL = 'nccl'
-    MPI = 'mpi'
-    GLOO = 'gloo'
-
-
 class ModelBenchmark(Benchmark):
     """The base class of E2E model benchmarks."""
 
     def __init__(self, name, parameters=''):
......
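With the enum classes moved out of `model_base.py` into `superbench.benchmarks.context` and re-exported through the package's `__all__`, micro and model benchmarks now share a single definition. A quick check of the resulting public API, based only on the values shown in this diff:

```python
from superbench.benchmarks import DistributedImpl, DistributedBackend

assert DistributedImpl.DDP.value == 'ddp'
assert DistributedBackend.NCCL.value == 'nccl'
```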