Unverified Commit 216c5b5c authored by guoshzhao's avatar guoshzhao Committed by GitHub
Browse files

Benchmarks: Add Feature - Add DistributedImpl and DistributedBackend arguments...

Benchmarks: Add Feature - Add DistributedImpl and DistributedBackend arguments for micro benchmark. (#100)
parent 3d72c078
......@@ -6,7 +6,8 @@
import importlib
from superbench.benchmarks.return_code import ReturnCode
from superbench.benchmarks.context import Platform, Framework, Precision, ModelAction, BenchmarkType, BenchmarkContext
from superbench.benchmarks.context import Platform, Framework, Precision, ModelAction, \
DistributedImpl, DistributedBackend, BenchmarkType, BenchmarkContext
from superbench.common.utils import LazyImport
BenchmarkRegistry = LazyImport(
......@@ -21,6 +22,6 @@
)
__all__ = [
'ReturnCode', 'Platform', 'Framework', 'BenchmarkType', 'Precision', 'ModelAction', 'BenchmarkContext',
'BenchmarkRegistry'
'ReturnCode', 'Platform', 'Framework', 'BenchmarkType', 'Precision', 'ModelAction', 'DistributedImpl',
'DistributedBackend', 'BenchmarkContext', 'BenchmarkRegistry'
]
......@@ -61,6 +61,22 @@ class ModelAction(Enum):
INFERENCE = 'inference'
class DistributedImpl(Enum):
    """The Enum class representing different distributed implementations."""
    # PyTorch DistributedDataParallel — the only implementation currently
    # accepted by the torch micro benchmarks (see the DDP check in _preprocess).
    DDP = 'ddp'
    # presumably TensorFlow MirroredStrategy (single-node) — TODO confirm
    MIRRORED = 'mirrored'
    # presumably TensorFlow MultiWorkerMirroredStrategy (multi-node) — TODO confirm
    MW_MIRRORED = 'multiworkermirrored'
    # Parameter-server style distribution.
    PS = 'parameterserver'
    # Horovod distributed training framework.
    HOROVOD = 'horovod'
class DistributedBackend(Enum):
    """The Enum class representing different distributed backends.

    The `.value` of a member is passed directly to
    `torch.distributed.init_process_group(backend=...)`.
    """
    # NVIDIA Collective Communications Library (GPU collectives).
    NCCL = 'nccl'
    # Message Passing Interface backend.
    MPI = 'mpi'
    # Gloo collective communications library (CPU collectives).
    GLOO = 'gloo'
class BenchmarkContext():
"""Context class of all benchmarks.
......
......@@ -21,7 +21,7 @@
import torch
from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, ReturnCode
from superbench.benchmarks import DistributedImpl, DistributedBackend, BenchmarkRegistry, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmark
from superbench.benchmarks.context import Enum
......@@ -114,6 +114,21 @@ def add_parser_arguments(self):
required=False,
help='The number of test step.',
)
self._parser.add_argument(
'--distributed_impl',
type=DistributedImpl,
default=DistributedImpl.DDP,
required=False,
help='Distributed implementations. E.g. {}.'.format(' '.join(DistributedImpl.get_values())),
)
self._parser.add_argument(
'--distributed_backend',
type=DistributedBackend,
default=DistributedBackend.NCCL,
required=False,
help='Distributed backends. E.g. {}.'.format(' '.join(DistributedBackend.get_values())),
)
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
......@@ -124,8 +139,17 @@ def _preprocess(self):
if not super()._preprocess():
return False
if self._args.distributed_impl != DistributedImpl.DDP:
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
logger.error(
'Unsupported distributed implementation - model: {}, distributed implementation: {}.'.format(
self._name, self._args.distributed_impl
)
)
return False
try:
torch.distributed.init_process_group(backend='nccl')
torch.distributed.init_process_group(backend=self._args.distributed_backend.value)
self.__world_size = int(os.environ['WORLD_SIZE'])
self.__local_rank = int(os.environ['LOCAL_RANK'])
# if self.__world_size < 2:
......
......@@ -18,7 +18,7 @@
import torch
from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, ReturnCode
from superbench.benchmarks import DistributedImpl, DistributedBackend, BenchmarkRegistry, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmark
from superbench.benchmarks.context import Enum
......@@ -91,6 +91,21 @@ def add_parser_arguments(self):
required=False,
help='The number of test step.',
)
self._parser.add_argument(
'--distributed_impl',
type=DistributedImpl,
default=DistributedImpl.DDP,
required=False,
help='Distributed implementations. E.g. {}.'.format(' '.join(DistributedImpl.get_values())),
)
self._parser.add_argument(
'--distributed_backend',
type=DistributedBackend,
default=DistributedBackend.NCCL,
required=False,
help='Distributed backends. E.g. {}.'.format(' '.join(DistributedBackend.get_values())),
)
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
......@@ -101,6 +116,15 @@ def _preprocess(self):
if not super()._preprocess():
return False
if self._args.distributed_impl != DistributedImpl.DDP:
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
logger.error(
'Unsupported distributed implementation - model: {}, distributed implementation: {}.'.format(
self._name, self._args.distributed_impl
)
)
return False
if ShardingMode.ALLGATHER in self._args.mode or ShardingMode.ALLREDUCE in self._args.mode:
try:
torch.distributed.init_process_group(backend='nccl')
......
......@@ -8,7 +8,7 @@
from abc import abstractmethod
from superbench.common.utils import logger
from superbench.benchmarks import Precision, ModelAction, BenchmarkType, ReturnCode
from superbench.benchmarks import Precision, ModelAction, DistributedImpl, DistributedBackend, BenchmarkType, ReturnCode
from superbench.benchmarks.base import Benchmark
from superbench.benchmarks.context import Enum
......@@ -20,22 +20,6 @@ class Optimizer(Enum):
ADAMW = 'adamw'
class DistributedImpl(Enum):
    """The Enum class representing different distributed implementations."""
    # PyTorch DistributedDataParallel.
    DDP = 'ddp'
    # presumably TensorFlow MirroredStrategy (single-node) — TODO confirm
    MIRRORED = 'mirrored'
    # presumably TensorFlow MultiWorkerMirroredStrategy (multi-node) — TODO confirm
    MW_MIRRORED = 'multiworkermirrored'
    # Parameter-server style distribution.
    PS = 'parameterserver'
    # Horovod distributed training framework.
    HOROVOD = 'horovod'
class DistributedBackend(Enum):
    """The Enum class representing different distributed backends."""
    # NVIDIA Collective Communications Library (GPU collectives).
    NCCL = 'nccl'
    # Message Passing Interface backend.
    MPI = 'mpi'
    # Gloo collective communications library (CPU collectives).
    GLOO = 'gloo'
class ModelBenchmark(Benchmark):
"""The base class of E2E model benchmarks."""
def __init__(self, name, parameters=''):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment