Unverified Commit 216c5b5c authored by guoshzhao's avatar guoshzhao Committed by GitHub
Browse files

Benchmarks: Add Feature - Add DistributedImpl and DistributedBackend arguments...

Benchmarks: Add Feature - Add DistributedImpl and DistributedBackend arguments for micro benchmark. (#100)
parent 3d72c078
......@@ -6,7 +6,8 @@
import importlib
from superbench.benchmarks.return_code import ReturnCode
from superbench.benchmarks.context import Platform, Framework, Precision, ModelAction, BenchmarkType, BenchmarkContext
from superbench.benchmarks.context import Platform, Framework, Precision, ModelAction, \
DistributedImpl, DistributedBackend, BenchmarkType, BenchmarkContext
from superbench.common.utils import LazyImport
BenchmarkRegistry = LazyImport(
......@@ -21,6 +22,6 @@
)
__all__ = [
'ReturnCode', 'Platform', 'Framework', 'BenchmarkType', 'Precision', 'ModelAction', 'BenchmarkContext',
'BenchmarkRegistry'
'ReturnCode', 'Platform', 'Framework', 'BenchmarkType', 'Precision', 'ModelAction', 'DistributedImpl',
'DistributedBackend', 'BenchmarkContext', 'BenchmarkRegistry'
]
......@@ -61,6 +61,22 @@ class ModelAction(Enum):
INFERENCE = 'inference'
class DistributedImpl(Enum):
    """The Enum class representing different distributed implementations."""
    # PyTorch DistributedDataParallel — the only implementation currently
    # accepted by the torch micro benchmarks (see the DDP check in _preprocess).
    DDP = 'ddp'
    # presumably TensorFlow MirroredStrategy (single-node) — TODO confirm
    MIRRORED = 'mirrored'
    # presumably TensorFlow MultiWorkerMirroredStrategy (multi-node) — TODO confirm
    MW_MIRRORED = 'multiworkermirrored'
    # Parameter-server style distribution.
    PS = 'parameterserver'
    # Horovod distributed training framework.
    HOROVOD = 'horovod'
class DistributedBackend(Enum):
    """The Enum class representing different distributed backends.

    The `.value` of a member is passed directly to
    `torch.distributed.init_process_group(backend=...)`.
    """
    # NVIDIA Collective Communications Library (GPU collectives).
    NCCL = 'nccl'
    # Message Passing Interface backend.
    MPI = 'mpi'
    # Gloo collective communications library (CPU collectives).
    GLOO = 'gloo'
class BenchmarkContext():
"""Context class of all benchmarks.
......
......@@ -21,7 +21,7 @@
import torch
from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, ReturnCode
from superbench.benchmarks import DistributedImpl, DistributedBackend, BenchmarkRegistry, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmark
from superbench.benchmarks.context import Enum
......@@ -114,6 +114,21 @@ def add_parser_arguments(self):
required=False,
help='The number of test step.',
)
self._parser.add_argument(
'--distributed_impl',
type=DistributedImpl,
default=DistributedImpl.DDP,
required=False,
help='Distributed implementations. E.g. {}.'.format(' '.join(DistributedImpl.get_values())),
)
self._parser.add_argument(
'--distributed_backend',
type=DistributedBackend,
default=DistributedBackend.NCCL,
required=False,
help='Distributed backends. E.g. {}.'.format(' '.join(DistributedBackend.get_values())),
)
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
......@@ -124,8 +139,17 @@ def _preprocess(self):
if not super()._preprocess():
return False
if self._args.distributed_impl != DistributedImpl.DDP:
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
logger.error(
'Unsupported distributed implementation - model: {}, distributed implementation: {}.'.format(
self._name, self._args.distributed_impl
)
)
return False
try:
torch.distributed.init_process_group(backend='nccl')
torch.distributed.init_process_group(backend=self._args.distributed_backend.value)
self.__world_size = int(os.environ['WORLD_SIZE'])
self.__local_rank = int(os.environ['LOCAL_RANK'])
# if self.__world_size < 2:
......
......@@ -18,7 +18,7 @@
import torch
from superbench.common.utils import logger
from superbench.benchmarks import BenchmarkRegistry, ReturnCode
from superbench.benchmarks import DistributedImpl, DistributedBackend, BenchmarkRegistry, ReturnCode
from superbench.benchmarks.micro_benchmarks import MicroBenchmark
from superbench.benchmarks.context import Enum
......@@ -91,6 +91,21 @@ def add_parser_arguments(self):
required=False,
help='The number of test step.',
)
self._parser.add_argument(
'--distributed_impl',
type=DistributedImpl,
default=DistributedImpl.DDP,
required=False,
help='Distributed implementations. E.g. {}.'.format(' '.join(DistributedImpl.get_values())),
)
self._parser.add_argument(
'--distributed_backend',
type=DistributedBackend,
default=DistributedBackend.NCCL,
required=False,
help='Distributed backends. E.g. {}.'.format(' '.join(DistributedBackend.get_values())),
)
def _preprocess(self):
"""Preprocess/preparation operations before the benchmarking.
......@@ -101,6 +116,15 @@ def _preprocess(self):
if not super()._preprocess():
return False
if self._args.distributed_impl != DistributedImpl.DDP:
self._result.set_return_code(ReturnCode.DISTRIBUTED_SETTING_INIT_FAILURE)
logger.error(
'Unsupported distributed implementation - model: {}, distributed implementation: {}.'.format(
self._name, self._args.distributed_impl
)
)
return False
if ShardingMode.ALLGATHER in self._args.mode or ShardingMode.ALLREDUCE in self._args.mode:
try:
torch.distributed.init_process_group(backend='nccl')
......
......@@ -8,7 +8,7 @@
from abc import abstractmethod
from superbench.common.utils import logger
from superbench.benchmarks import Precision, ModelAction, BenchmarkType, ReturnCode
from superbench.benchmarks import Precision, ModelAction, DistributedImpl, DistributedBackend, BenchmarkType, ReturnCode
from superbench.benchmarks.base import Benchmark
from superbench.benchmarks.context import Enum
......@@ -20,22 +20,6 @@ class Optimizer(Enum):
ADAMW = 'adamw'
class DistributedImpl(Enum):
    """The Enum class representing different distributed implementations."""
    # PyTorch DistributedDataParallel.
    DDP = 'ddp'
    # presumably TensorFlow MirroredStrategy (single-node) — TODO confirm
    MIRRORED = 'mirrored'
    # presumably TensorFlow MultiWorkerMirroredStrategy (multi-node) — TODO confirm
    MW_MIRRORED = 'multiworkermirrored'
    # Parameter-server style distribution.
    PS = 'parameterserver'
    # Horovod distributed training framework.
    HOROVOD = 'horovod'
class DistributedBackend(Enum):
    """The Enum class representing different distributed backends."""
    # NVIDIA Collective Communications Library (GPU collectives).
    NCCL = 'nccl'
    # Message Passing Interface backend.
    MPI = 'mpi'
    # Gloo collective communications library (CPU collectives).
    GLOO = 'gloo'
class ModelBenchmark(Benchmark):
"""The base class of E2E model benchmarks."""
def __init__(self, name, parameters=''):
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment