Commit 67ea635f authored by aiss's avatar aiss

push dsv0.8.2 version

parent 1b2721ad
* @jeffra @samyam @tjruwase @ShadenSmith @conglongli @awan-10 @arashashari @cli99 @eltonzheng @minjiaz @RezaYazdaniAminabadi @niumanar
# This file is used to subscribe for notifications for PRs
# related to specific file paths, does not necessarily mean
# approval is required from these people before merging.
#
# Learn more about CODEOWNERS syntax here:
# https://docs.github.com/en/repositories/managing-your-repositorys-settings-and-features/customizing-your-repository/about-code-owners
# top-level repo folders
/.github/ @jeffra @mrwyattii
/azure/ @jeffra @awan-10
/benchmarks/ @jeffra @awan-10 @mrwyattii @molly-smith
/bin/ @jeffra
/csrc/ @RezaYazdaniAminabadi @awan-10 @jeffra @cmikeh2 @arashb
/deepspeed/ @jeffra
/docker/ @jeffra @awan-10
/docs/ @jeffra @mrwyattii
/examples/ @jeffra @awan-10 @mrwyattii
/op_builder/ @jeffra @RezaYazdaniAminabadi @cmikeh2
/release/ @jeffra @mrwyattii
/requirements/ @jeffra @mrwyattii
/scripts/ @jeffra @awan-10
/tests/ @jeffra @mrwyattii @tjruwase
# deepspeed
/deepspeed/autotuning/ @cli99
/deepspeed/checkpoint/ @tjruwase
/deepspeed/comm/ @awan-10
/deepspeed/compression/ @yaozhewei @minjiaz @xiaoxiawu-microsoft @conglongli
/deepspeed/elasticity/ @jeffra @awan-10
/deepspeed/launcher/ @jeffra @awan-10
/deepspeed/module_inject/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/moe/ @awan-10
/deepspeed/monitor/ @awan-10 @jeffra
/deepspeed/nebula/ @tjruwase @jeffra
/deepspeed/ops/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/pipe/ @ShadenSmith @duli2012
/deepspeed/profiling/ @cli99
/deepspeed/utils/ @jeffra @tjruwase @awan-10
# inference
/deepspeed/inference/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
/deepspeed/model_implementations/ @RezaYazdaniAminabadi @jeffra @mrwyattii @awan-10 @cmikeh2 @arashb
# training
/deepspeed/runtime/ @jeffra @tjruwase
/deepspeed/runtime/activation_checkpointing/ @jeffra @tjruwase
/deepspeed/runtime/checkpoint_engine/ @tjruwase @jeffra
/deepspeed/runtime/comm/ @awan-10
/deepspeed/runtime/compression/ @awan-10 @conglongli
/deepspeed/runtime/data_pipeline/ @conglongli
/deepspeed/runtime/fp16/ @jeffra @tjruwase
/deepspeed/runtime/fp16/onebit/ @conglongli @awan-10
/deepspeed/runtime/pipe/ @ShadenSmith @duli2012
/deepspeed/runtime/swap_tensor/ @tjruwase @mrwyattii
/deepspeed/runtime/zero/ @jeffra @tjruwase @samyam @mrwyattii
@@ -2,3 +2,6 @@ include *.txt README.md
recursive-include requirements *.txt
recursive-include deepspeed *.cpp *.h *.cu *.hip *.tr *.cuh *.cc *.json
recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc
recursive-include op_builder *.py
recursive-include benchmarks *.py
recursive-include accelerator *.py
@@ -6,3 +6,4 @@ recursive-include deepspeed *.tr
recursive-exclude deepspeed/ops/csrc *.cpp *.h *.cu *.cuh *.cc
prune csrc
prune op_builder
prune accelerator
'''Copyright The Microsoft DeepSpeed Team'''
from .abstract_accelerator import DeepSpeedAccelerator
from .real_accelerator import get_accelerator, set_accelerator
'''Copyright The Microsoft DeepSpeed Team'''
import abc
from abc import ABC
class DeepSpeedAccelerator(ABC):
def __init__(self):
self._name = None
self._communication_backend_name = None
# Device APIs
@abc.abstractmethod
def device_name(self, device_index):
...
@abc.abstractmethod
def device(self, device_index):
...
@abc.abstractmethod
def set_device(self, device_index):
...
@abc.abstractmethod
def current_device(self):
...
@abc.abstractmethod
def current_device_name(self):
...
@abc.abstractmethod
def device_count(self):
...
@abc.abstractmethod
def synchronize(self, device_index=None):
...
# RNG APIs
@abc.abstractmethod
def random(self):
...
@abc.abstractmethod
def set_rng_state(self, new_state, device_index=None):
...
@abc.abstractmethod
def get_rng_state(self, device_index=None):
...
@abc.abstractmethod
def manual_seed(self, seed):
...
@abc.abstractmethod
def manual_seed_all(self, seed):
...
@abc.abstractmethod
def initial_seed(self, seed):
...
@abc.abstractmethod
def default_generator(self, device_index):
...
# Streams/Events
@property
@abc.abstractmethod
def Stream(self):
...
@abc.abstractmethod
def stream(self, stream):
...
@abc.abstractmethod
def current_stream(self, device_index=None):
...
@abc.abstractmethod
def default_stream(self, device_index=None):
...
@property
@abc.abstractmethod
def Event(self):
...
# Memory management
@abc.abstractmethod
def empty_cache(self):
...
@abc.abstractmethod
def memory_allocated(self, device_index=None):
...
@abc.abstractmethod
def max_memory_allocated(self, device_index=None):
...
@abc.abstractmethod
def reset_max_memory_allocated(self, device_index=None):
...
@abc.abstractmethod
def memory_cached(self, device_index=None):
...
@abc.abstractmethod
def max_memory_cached(self, device_index=None):
...
@abc.abstractmethod
def reset_max_memory_cached(self, device_index=None):
...
@abc.abstractmethod
def memory_stats(self, device_index=None):
...
@abc.abstractmethod
def reset_peak_memory_stats(self, device_index=None):
...
@abc.abstractmethod
def memory_reserved(self, device_index=None):
...
@abc.abstractmethod
def max_memory_reserved(self, device_index=None):
...
@abc.abstractmethod
def total_memory(self, device_index=None):
...
# Data types
@abc.abstractmethod
def is_bf16_supported(self):
...
@abc.abstractmethod
def is_fp16_supported(self):
...
# Misc
@abc.abstractmethod
def amp(self):
...
@abc.abstractmethod
def is_available(self):
...
@abc.abstractmethod
def range_push(self, msg):
...
@abc.abstractmethod
def range_pop(self):
...
@abc.abstractmethod
def lazy_call(self, callback):
...
@abc.abstractmethod
def communication_backend_name(self):
...
# Tensor operations
@property
@abc.abstractmethod
def BFloat16Tensor(self):
...
@property
@abc.abstractmethod
def ByteTensor(self):
...
@property
@abc.abstractmethod
def DoubleTensor(self):
...
@property
@abc.abstractmethod
def FloatTensor(self):
...
@property
@abc.abstractmethod
def HalfTensor(self):
...
@property
@abc.abstractmethod
def IntTensor(self):
...
@property
@abc.abstractmethod
def LongTensor(self):
...
@abc.abstractmethod
def pin_memory(self, tensor):
...
@abc.abstractmethod
def on_accelerator(self, tensor):
...
@abc.abstractmethod
def op_builder_dir(self):
...
# create an instance of op builder, specified by class_name
@abc.abstractmethod
def create_op_builder(self, class_name):
...
# return an op builder class, specified by class_name
@abc.abstractmethod
def get_op_builder(self, class_name):
...
@abc.abstractmethod
def build_extension(self):
...
'''Copyright The Microsoft DeepSpeed Team'''
import os
import pkgutil
import importlib
from .abstract_accelerator import DeepSpeedAccelerator
# torch may not be installed during the setup stage; passing on a missing
# torch import still allows the op builder related APIs to be used.
try:
import torch.cuda
except ImportError:
pass
class CUDA_Accelerator(DeepSpeedAccelerator):
def __init__(self):
self._name = 'cuda'
self._communication_backend_name = 'nccl'
# begin initialize for create_op_builder()
# put all valid class name <--> class type mapping into class_dict
op_builder_dir = self.op_builder_dir()
op_builder_module = importlib.import_module(op_builder_dir)
for _, module_name, _ in pkgutil.iter_modules([os.path.dirname(op_builder_module.__file__)]):
# avoid self references
if module_name != 'all_ops' and module_name != 'builder':
module = importlib.import_module("{}.{}".format(
op_builder_dir,
module_name))
for member_name in module.__dir__():
if member_name.endswith(
'Builder'
) and member_name != "OpBuilder" and member_name != "CUDAOpBuilder" and member_name != "TorchCPUOpBuilder": # avoid abstract classes
if not member_name in self.class_dict:
self.class_dict[member_name] = getattr(module, member_name)
# end initialize for create_op_builder()
# Device APIs
def device_name(self, device_index=None):
if device_index is None:
return 'cuda'
return 'cuda:{}'.format(device_index)
def device(self, device_index=None):
return torch.cuda.device(device_index)
def set_device(self, device_index):
torch.cuda.set_device(device_index)
def current_device(self):
return torch.cuda.current_device()
def current_device_name(self):
return 'cuda:{}'.format(torch.cuda.current_device())
def device_count(self):
return torch.cuda.device_count()
def synchronize(self, device_index=None):
return torch.cuda.synchronize(device_index)
# RNG APIs
def random(self):
return torch.random
def set_rng_state(self, new_state, device_index=None):
if device_index is None:
return torch.cuda.set_rng_state(new_state)
return torch.cuda.set_rng_state(new_state, device_index)
def get_rng_state(self, device_index=None):
if device_index is None:
return torch.cuda.get_rng_state()
return torch.cuda.get_rng_state(device_index)
def manual_seed(self, seed):
return torch.cuda.manual_seed(seed)
def manual_seed_all(self, seed):
return torch.cuda.manual_seed_all(seed)
def initial_seed(self, seed):
return torch.cuda.initial_seed(seed)
def default_generator(self, device_index):
return torch.cuda.default_generators[device_index]
# Streams/Events
@property
def Stream(self):
return torch.cuda.Stream
def stream(self, stream):
return torch.cuda.stream(stream)
def current_stream(self, device_index=None):
return torch.cuda.current_stream(device_index)
def default_stream(self, device_index=None):
return torch.cuda.default_stream(device_index)
@property
def Event(self):
return torch.cuda.Event
# Memory management
def empty_cache(self):
return torch.cuda.empty_cache()
def memory_allocated(self, device_index=None):
return torch.cuda.memory_allocated(device_index)
def max_memory_allocated(self, device_index=None):
return torch.cuda.max_memory_allocated(device_index)
def reset_max_memory_allocated(self, device_index=None):
return torch.cuda.reset_max_memory_allocated(device_index)
def memory_cached(self, device_index=None):
return torch.cuda.memory_cached(device_index)
def max_memory_cached(self, device_index=None):
return torch.cuda.max_memory_cached(device_index)
def reset_max_memory_cached(self, device_index=None):
return torch.cuda.reset_max_memory_cached(device_index)
def memory_stats(self, device_index=None):
if hasattr(torch.cuda, 'memory_stats'):
return torch.cuda.memory_stats(device_index)
def reset_peak_memory_stats(self, device_index=None):
if hasattr(torch.cuda, 'reset_peak_memory_stats'):
return torch.cuda.reset_peak_memory_stats(device_index)
def memory_reserved(self, device_index=None):
if hasattr(torch.cuda, 'memory_reserved'):
return torch.cuda.memory_reserved(device_index)
def max_memory_reserved(self, device_index=None):
if hasattr(torch.cuda, 'max_memory_reserved'):
return torch.cuda.max_memory_reserved(device_index)
def total_memory(self, device_index=None):
return torch.cuda.get_device_properties(device_index).total_memory
# Data types
def is_bf16_supported(self):
return torch.cuda.is_bf16_supported()
def is_fp16_supported(self):
major, _ = torch.cuda.get_device_capability()
if major >= 7:
return True
else:
return False
# Misc
def amp(self):
if hasattr(torch.cuda, 'amp'):
return torch.cuda.amp
return None
def is_available(self):
return torch.cuda.is_available()
def range_push(self, msg):
if hasattr(torch.cuda.nvtx, 'range_push'):
return torch.cuda.nvtx.range_push(msg)
def range_pop(self):
if hasattr(torch.cuda.nvtx, 'range_pop'):
return torch.cuda.nvtx.range_pop()
def lazy_call(self, callback):
return torch.cuda._lazy_call(callback)
def communication_backend_name(self):
return self._communication_backend_name
# Tensor operations
@property
def BFloat16Tensor(self):
return torch.cuda.BFloat16Tensor
@property
def ByteTensor(self):
return torch.cuda.ByteTensor
@property
def DoubleTensor(self):
return torch.cuda.DoubleTensor
@property
def FloatTensor(self):
return torch.cuda.FloatTensor
@property
def HalfTensor(self):
return torch.cuda.HalfTensor
@property
def IntTensor(self):
return torch.cuda.IntTensor
@property
def LongTensor(self):
return torch.cuda.LongTensor
def pin_memory(self, tensor):
return tensor.pin_memory()
def on_accelerator(self, tensor):
device_str = str(tensor.device)
if device_str.startswith('cuda:'):
return True
else:
return False
def op_builder_dir(self):
try:
# during installation time op_builder is visible, otherwise return deepspeed.ops.op_builder
import op_builder # noqa: F401
return "op_builder"
except ImportError:
return "deepspeed.ops.op_builder"
# dict that holds class name <--> class type mapping i.e.
# 'AsyncIOBuilder': <class 'op_builder.async_io.AsyncIOBuilder'>
# this dict will be filled at init stage
class_dict = {}
# create an instance of op builder and return, name specified by class_name
def create_op_builder(self, class_name):
if class_name in self.class_dict:
return self.class_dict[class_name]()
else:
return None
# return an op builder class, name specified by class_name
def get_op_builder(self, class_name):
if class_name in self.class_dict:
return self.class_dict[class_name]
else:
return None
def build_extension(self):
from torch.utils.cpp_extension import BuildExtension
return BuildExtension
'''Copyright The Microsoft DeepSpeed Team'''
try:
from accelerator.abstract_accelerator import DeepSpeedAccelerator as dsa1
except ImportError as e:
dsa1 = None
try:
from deepspeed.accelerator.abstract_accelerator import DeepSpeedAccelerator as dsa2
except ImportError as e:
dsa2 = None
ds_accelerator = None
def _validate_accelerator(accel_obj):
# abstract_accelerator is importable under two paths: accelerator.abstract_accelerator
# at build time and deepspeed.accelerator.abstract_accelerator at run time. An
# extension may subclass DeepSpeedAccelerator from either path, so accel_obj is
# checked against both base classes; an instance of either is considered a
# conforming accelerator object.
if not ((dsa1 is not None and isinstance(accel_obj, dsa1)) or
(dsa2 is not None and isinstance(accel_obj, dsa2))):
raise AssertionError(
f'{accel_obj.__class__.__name__} accelerator is not subclass of DeepSpeedAccelerator'
)
# TODO: turn off is_available test since this breaks tests
#assert accel_obj.is_available(), \
# f'{accel_obj.__class__.__name__} accelerator fails is_available() test'
def get_accelerator():
global ds_accelerator
if ds_accelerator is None:
try:
from intel_extension_for_deepspeed import XPU_Accelerator
except ImportError as e:
pass
else:
ds_accelerator = XPU_Accelerator()
_validate_accelerator(ds_accelerator)
return ds_accelerator
from .cuda_accelerator import CUDA_Accelerator
ds_accelerator = CUDA_Accelerator()
_validate_accelerator(ds_accelerator)
return ds_accelerator
def set_accelerator(accel_obj):
global ds_accelerator
_validate_accelerator(accel_obj)
ds_accelerator = accel_obj
'''
-----------[code] test_get.py -----------
from deepspeed.accelerator import get_accelerator
my_accelerator = get_accelerator()
print(f'{my_accelerator._name=}')
print(f'{my_accelerator._communication_backend_name=}')
print(f'{my_accelerator.HalfTensor().device=}')
print(f'{my_accelerator.total_memory()=}')
-----------[code] test_get.py -----------
---[output] python test_get.py---------
my_accelerator._name='cuda'
my_accelerator._communication_backend_name='nccl'
my_accelerator.HalfTensor().device=device(type='cuda', index=0)
my_accelerator.total_memory()=34089730048
---[output] python test_get.py---------
**************************************************************************
-----------[code] test_set.py -----------
from deepspeed.accelerator.cuda_accelerator import CUDA_Accelerator
cu_accel = CUDA_Accelerator()
print(f'{id(cu_accel)=}')
from deepspeed.accelerator import set_accelerator, get_accelerator
set_accelerator(cu_accel)
my_accelerator = get_accelerator()
print(f'{id(my_accelerator)=}')
print(f'{my_accelerator._name=}')
print(f'{my_accelerator._communication_backend_name=}')
print(f'{my_accelerator.HalfTensor().device=}')
print(f'{my_accelerator.total_memory()=}')
-----------[code] test_set.py -----------
---[output] python test_set.py---------
id(cu_accel)=139648165478304
id(my_accelerator)=139648165478304
my_accelerator._name='cuda'
my_accelerator._communication_backend_name='nccl'
my_accelerator.HalfTensor().device=device(type='cuda', index=0)
my_accelerator.total_memory()=34089730048
---[output] python test_set.py---------
'''
# Getting Started with DeepSpeed on Azure
Please see our [Azure tutorial](https://www.deepspeed.ai/tutorials/azure/) to get started with DeepSpeed on Azure.
The recommended and simplest way to try DeepSpeed on Azure is through [AzureML](https://azure.microsoft.com/en-us/services/machine-learning/); the tutorial above walks through the details.
'''Copyright The Microsoft DeepSpeed Team'''
# Running Communication Benchmarks
To run benchmarks, there are two options:
1. Run a single communication operation:
For example, run with a single large message size:
<pre>
deepspeed all_reduce.py
</pre>
Scan across message sizes:
<pre>
deepspeed all_reduce.py --scan
</pre>
2. Run all available communication benchmarks:
<pre>
deepspeed run_all.py
</pre>
Like the individual benchmarks, `run_all.py` supports scanning arguments for the max message size, bw-unit, etc. Simply pass the desired arguments to `run_all.py` and they'll be propagated to each comm op.
<pre>
usage: ds_bench [-h] [--local_rank LOCAL_RANK] [--trials TRIALS] [--warmups WARMUPS] [--maxsize MAXSIZE] [--async-op] [--bw-unit {Gbps,GBps}] [--backend {nccl}] [--dist {deepspeed,torch}] [--scan] [--raw] [--all-reduce] [--all-gather] [--all-to-all]
[--pt2pt] [--broadcast] [--dtype DTYPE] [--mem-factor MEM_FACTOR] [--debug]
optional arguments:
-h, --help show this help message and exit
--local_rank LOCAL_RANK
--trials TRIALS Number of timed iterations
--warmups WARMUPS Number of warmup (non-timed) iterations
--maxsize MAXSIZE Max message size as a power of 2
--async-op Enables non-blocking communication
--bw-unit {Gbps,GBps}
--backend {nccl} Communication library to use
--dist {deepspeed,torch}
Distributed DL framework to use
--scan Enables scanning all message sizes
--raw Print the message size and latency without units
--all-reduce Run all_reduce
--all-gather Run all_gather
--all-to-all Run all_to_all
--pt2pt Run pt2pt
--broadcast Run broadcast
--dtype DTYPE PyTorch tensor dtype
--mem-factor MEM_FACTOR
Proportion of max available GPU memory to use for single-size evals
--debug Enables all_to_all debug prints
</pre>
Note that `ds_bench` is a pre-packaged wrapper around `run_all.py`. Users can pass the same arguments as well:
<pre>
<path to deepspeed>/bin/ds_bench --scan --trials=10
</pre>
Finally, users can choose specific communication operations to run in `run_all.py` or `ds_bench` by passing them as arguments (all operations are run by default). For example:
<pre>
deepspeed run_all.py --scan --all-reduce --all-to-all --broadcast
</pre>
# Adding Communication Benchmarks
To add a new communication benchmark, follow this general procedure (a minimal sketch is shown after the list):
1. Copy a similar benchmark file (e.g., to add `reduce_scatter`, copy `all_reduce.py` as a template)
2. Add a new bandwidth formula to `utils.get_bw`, a new maximum-tensor-element formula to `utils.max_numel`, and a new argument to `utils.benchmark_parser`
3. Replace the comm op calls in the new file (e.g., via find-and-replace)
4. Find a good default `mem_factor` for use in the `run_<collective>_single()` function
5. Add the new comm op to `run_all.py`
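For illustration only, below is a minimal sketch of what a hypothetical `reduce_scatter.py` (step 1, copied from `all_reduce.py`) could look like. It is not part of this commit; it assumes step 2 has already added a `'reduce_scatter'` branch to `utils.get_bw` and `utils.max_numel`, and it reuses the existing helpers (`sync_all`, `get_metric_strings`, `print_rank_0`, `benchmark_parser`, `init_processes`) from `benchmarks/communication/utils.py`. Only the single-size path is shown; a `--scan` loop would mirror `all_reduce.py`.
<pre>
'''Hypothetical reduce_scatter benchmark sketch (not part of this commit)'''
import time
import torch

from benchmarks.communication.utils import *
from benchmarks.communication.constants import *
from deepspeed.accelerator import get_accelerator


def timed_reduce_scatter(input, output, args):
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    # reduce_scatter takes a list of input chunks, one per rank, each the size of output
    input_list = list(torch.chunk(input, dist.get_world_size()))

    sync_all()
    # Warmups, establish connections, etc.
    for i in range(args.warmups):
        dist.reduce_scatter(output, input_list, async_op=args.async_op)
    sync_all()

    # time the actual comm op `trials` times and average it
    pre = time.perf_counter()
    for i in range(args.trials):
        dist.reduce_scatter(output, input_list, async_op=args.async_op)
    sync_all()
    duration = time.perf_counter() - pre

    avg_duration = duration / args.trials
    size = input.element_size() * input.nelement()
    # step 2: assumes a 'reduce_scatter' branch was added to utils.get_bw
    tput, busbw = get_bw('reduce_scatter', size, avg_duration, args)
    tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
    desc = f'{input.nelement()}x{input.element_size()}'
    if not args.raw:
        size = convert_size(size)
    print_rank_0(f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")


def run_reduce_scatter(local_rank, args):
    if args.dist == 'torch':
        import torch.distributed as dist
    elif args.dist == 'deepspeed':
        import deepspeed.comm as dist

    print_header(args, 'reduce_scatter')
    world_size = dist.get_world_size()
    global_rank = dist.get_rank()

    # step 2: assumes a 'reduce_scatter' case was added to utils.max_numel (step 4: tune mem_factor)
    elements_per_gpu = max_numel(comm_op='reduce_scatter',
                                 dtype=getattr(torch, args.dtype),
                                 mem_factor=args.mem_factor,
                                 local_rank=local_rank,
                                 args=args)
    # element count must be divisible by world_size so torch.chunk yields equal chunks
    elements_per_gpu = (elements_per_gpu // world_size) * world_size

    mat = torch.ones(elements_per_gpu, dtype=getattr(torch, args.dtype)).to(
        get_accelerator().device_name(local_rank))
    input = (mat.mul_(float(global_rank))).view(-1)
    output = torch.zeros(elements_per_gpu // world_size,
                         dtype=getattr(torch, args.dtype)).to(
                             get_accelerator().device_name(local_rank))

    sync_all()
    timed_reduce_scatter(input, output, args)


if __name__ == "__main__":
    args = benchmark_parser().parse_args()
    rank = args.local_rank
    init_processes(local_rank=rank, args=args)
    run_reduce_scatter(local_rank=rank, args=args)
</pre>
Registering `run_reduce_scatter` in `run_all.py` behind a `--reduce-scatter` flag (step 5) would then let it participate in the combined benchmark run.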
'''Copyright The Microsoft DeepSpeed Team'''
'''Copyright The Microsoft DeepSpeed Team'''
from benchmarks.communication.utils import *
from benchmarks.communication.constants import *
from deepspeed.accelerator import get_accelerator
import time
# Run all_gather and print metrics
def timed_all_gather(input, output, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
sync_all()
# Warmups, establish connections, etc.
for i in range(args.warmups):
# use all_gather_base if available
if args.dist == 'torch':
if hasattr(torch.distributed, "_all_gather_base"):
dist._all_gather_base(output, input, group=None, async_op=args.async_op)
else:
output_tensors = list(torch.chunk(output, dist.get_world_size()))
dist.all_gather(output_tensors, input, group=None, async_op=args.async_op)
elif args.dist == 'deepspeed':
dist.allgather_fn(output, input, group=None, async_op=args.async_op)
sync_all()
# time the actual comm op trials times and average it
pre = time.perf_counter()
for i in range(args.trials):
# use all_gather_base if available
if args.dist == 'torch':
if hasattr(torch.distributed, "_all_gather_base"):
dist._all_gather_base(output, input, group=None, async_op=args.async_op)
else:
output_tensors = list(torch.chunk(output, dist.get_world_size()))
dist.all_gather(output_tensors, input, group=None, async_op=args.async_op)
elif args.dist == 'deepspeed':
dist.allgather_fn(output, input, group=None, async_op=args.async_op)
sync_all()
duration = time.perf_counter() - pre
# maintain and clean performance data
avg_duration = duration / args.trials
size = input.element_size() * input.nelement()
n = dist.get_world_size()
tput, busbw = get_bw('all_gather', size, avg_duration, args)
tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
desc = f'{input.nelement()}x{input.element_size()}'
if not args.raw:
size = convert_size(size)
print_rank_0(
f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
def run_all_gather(local_rank, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
# Prepare benchmark header
print_header(args, 'all_gather')
global_rank = dist.get_rank()
world_size = dist.get_world_size()
if args.scan:
# Create list of message sizes
M_LIST = []
for x in (2**p for p in range(1, args.maxsize)):
M_LIST.append(x)
sync_all()
# loop over various tensor sizes
for M in M_LIST:
global_rank = dist.get_rank()
try:
mat = torch.ones(world_size,
M,
dtype=getattr(
torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
sync_all()
input = ((mat.mul_(float(global_rank))).view(-1))
# Delete original mat to avoid OOM
del mat
get_accelerator().empty_cache()
output = torch.zeros(input.nelement() * world_size,
dtype=getattr(
torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print('WARNING: Ran out of GPU memory. Exiting comm op.')
sync_all()
break
sync_all()
timed_all_gather(input, output, args)
else:
# all_gather_base saves memory
if (args.dist == 'torch'
and hasattr(torch.distributed,
"_all_gather_base")) or (args.dist == 'deepspeed'
and dist.has_allgather_base):
mem_factor = args.mem_factor + 0.2
else:
mem_factor = args.mem_factor
# Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
sync_all()
elements_per_gpu = max_numel(comm_op='all_gather',
dtype=getattr(torch,
args.dtype),
mem_factor=mem_factor,
local_rank=local_rank,
args=args)
try:
mat = torch.ones(elements_per_gpu,
dtype=getattr(torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
# multiply each GPU's tensor by the rank to ease debugging
input = ((mat.mul_(float(global_rank))).view(-1))
# Delete original mat to avoid OOM
del mat
get_accelerator().empty_cache()
output = torch.zeros(
elements_per_gpu * world_size,
dtype=getattr(torch,
args.dtype)).to(get_accelerator().device_name(local_rank))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print(
'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
)
sync_all()
return
sync_all()
timed_all_gather(input, output, args)
if __name__ == "__main__":
args = benchmark_parser().parse_args()
rank = args.local_rank
init_processes(local_rank=rank, args=args)
run_all_gather(local_rank=rank, args=args)
'''Copyright The Microsoft DeepSpeed Team'''
from benchmarks.communication.utils import *
from benchmarks.communication.constants import *
from deepspeed.accelerator import get_accelerator
import time
def timed_all_reduce(input, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
sync_all()
# Warmups, establish connections, etc.
for i in range(args.warmups):
dist.all_reduce(input, async_op=args.async_op)
sync_all()
# time the actual comm op trials times and average it
pre = time.perf_counter()
for i in range(args.trials):
dist.all_reduce(input, async_op=args.async_op)
sync_all()
duration = time.perf_counter() - pre
# maintain and clean performance data
avg_duration = duration / args.trials
size = input.element_size() * input.nelement()
n = dist.get_world_size()
tput, busbw = get_bw('all_reduce', size, avg_duration, args)
tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
desc = f'{input.nelement()}x{input.element_size()}'
if not args.raw:
size = convert_size(size)
print_rank_0(
f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
def run_all_reduce(local_rank, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
# Prepare benchmark header
print_header(args, 'all_reduce')
world_size = dist.get_world_size()
global_rank = dist.get_rank()
if args.scan:
M_LIST = []
for x in (2**p for p in range(1, args.maxsize)):
M_LIST.append(x)
sync_all()
# loop over various tensor sizes
for M in M_LIST:
global_rank = dist.get_rank()
try:
mat = torch.ones(world_size,
M,
dtype=getattr(
torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
sync_all()
input = ((mat.mul_(float(global_rank))).view(-1))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print('WARNING: Ran out of GPU memory. Exiting comm op.')
sync_all()
break
sync_all()
timed_all_reduce(input, args)
else:
# Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
# Don't need output tensor, so we double mem_factor
elements_per_gpu = max_numel(comm_op='all_reduce',
dtype=getattr(torch,
args.dtype),
mem_factor=args.mem_factor * 2,
local_rank=local_rank,
args=args)
try:
mat = torch.ones(elements_per_gpu,
dtype=getattr(torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
input = ((mat.mul_(float(global_rank))).view(-1))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print(
'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
)
sync_all()
return
sync_all()
timed_all_reduce(input, args)
if __name__ == "__main__":
args = benchmark_parser().parse_args()
rank = args.local_rank
init_processes(local_rank=rank, args=args)
run_all_reduce(local_rank=rank, args=args)
'''Copyright The Microsoft DeepSpeed Team'''
from benchmarks.communication.utils import *
from benchmarks.communication.constants import *
from deepspeed.accelerator import get_accelerator
import time
def timed_all_to_all(input, output, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
sync_all()
# Warmups, establish connections, etc.
for i in range(args.warmups):
dist.all_to_all_single(output, input, async_op=args.async_op)
sync_all()
# time the actual comm op trials times and average it
pre = time.perf_counter()
for i in range(args.trials):
dist.all_to_all_single(output, input, async_op=args.async_op)
sync_all()
duration = time.perf_counter() - pre
# maintain and clean performance data
avg_duration = duration / args.trials
size = input.element_size() * input.nelement()
n = dist.get_world_size()
tput, busbw = get_bw('all_to_all', size, avg_duration, args)
tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
desc = f'{input.nelement()}x{input.element_size()}'
if not args.raw:
size = convert_size(size)
print_rank_0(
f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
def run_all_to_all(local_rank, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
world_size = dist.get_world_size()
global_rank = dist.get_rank()
# Prepare benchmark header
print_header(args, 'all_to_all')
if args.scan:
M_LIST = []
for x in (2**p for p in range(1, args.maxsize)):
M_LIST.append(x)
sync_all()
# loop over various tensor sizes
for M in M_LIST:
global_rank = dist.get_rank()
try:
mat = torch.ones(world_size,
M,
dtype=getattr(
torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
assert mat.numel() % world_size == 0, f"tensor cannot be divided in {world_size} chunks"
sync_all()
input = ((mat.mul_(float(global_rank))).view(-1))
output = (mat.clone().view(-1))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print('WARNING: Ran out of GPU memory. Exiting comm op.')
sync_all()
break
sync_all()
timed_all_to_all(input, output, args)
else:
# Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
elements_per_gpu = max_numel(comm_op='all_to_all',
dtype=getattr(torch,
args.dtype),
mem_factor=args.mem_factor,
local_rank=local_rank,
args=args)
try:
mat = torch.ones(elements_per_gpu,
dtype=getattr(torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
assert mat.numel() % world_size == 0, f"tensor with {mat.numel()} elements cannot be divided in {world_size} chunks"
input = ((mat.mul_(float(global_rank))).view(-1))
# Delete original mat to avoid OOM
del mat
get_accelerator().empty_cache()
output = torch.zeros(
elements_per_gpu,
dtype=getattr(torch,
args.dtype)).to(get_accelerator().device_name(local_rank))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print(
'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
)
sync_all()
return
sync_all()
if args.debug:
for i in range(world_size):
if i == global_rank:
print(f"Before AllToAll Input List at rank {global_rank}: {input}")
dist.barrier()
timed_all_to_all(input, output, args)
if args.debug:
for i in range(world_size):
if i == global_rank:
print(f"AllToAll Results at rank {global_rank}: {output}")
dist.barrier()
if __name__ == "__main__":
args = benchmark_parser().parse_args()
rank = args.local_rank
init_processes(local_rank=rank, args=args)
run_all_to_all(local_rank=rank, args=args)
'''Copyright The Microsoft DeepSpeed Team'''
import torch
from benchmarks.communication.utils import *
from benchmarks.communication.constants import *
from deepspeed.accelerator import get_accelerator
import time
def timed_broadcast(input, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
sync_all()
# Warmups, establish connections, etc.
for i in range(args.warmups):
dist.broadcast(input, 0, async_op=args.async_op)
sync_all()
# time the actual comm op trials times and average it
pre = time.perf_counter()
for i in range(args.trials):
dist.broadcast(input, 0, async_op=args.async_op)
sync_all()
duration = time.perf_counter() - pre
# maintain and clean performance data
avg_duration = duration / args.trials
size = input.element_size() * input.nelement()
n = dist.get_world_size()
tput, busbw = get_bw('broadcast', size, avg_duration, args)
tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
desc = f'{input.nelement()}x{input.element_size()}'
if not args.raw:
size = convert_size(size)
print_rank_0(
f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
def run_broadcast(local_rank, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
# Prepare benchmark header
print_header(args, 'broadcast')
world_size = dist.get_world_size()
global_rank = dist.get_rank()
if args.scan:
M_LIST = []
for x in (2**p for p in range(1, args.maxsize)):
M_LIST.append(x)
sync_all()
# loop over various tensor sizes
for M in M_LIST:
global_rank = dist.get_rank()
try:
mat = torch.ones(world_size,
M,
dtype=getattr(
torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
sync_all()
input = ((mat.mul_(float(global_rank))).view(-1))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print('WARNING: Ran out of GPU memory. Exiting comm op.')
sync_all()
break
sync_all()
timed_broadcast(input, args)
else:
# Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
# Don't need output tensor, so we double mem_factor
elements_per_gpu = max_numel(comm_op='broadcast',
dtype=getattr(torch,
args.dtype),
mem_factor=args.mem_factor * 2,
local_rank=local_rank,
args=args)
try:
mat = torch.ones(elements_per_gpu,
dtype=getattr(torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
input = ((mat.mul_(float(global_rank))).view(-1))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print(
'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
)
sync_all()
return
sync_all()
timed_broadcast(input, args)
if __name__ == "__main__":
args = benchmark_parser().parse_args()
rank = args.local_rank
init_processes(local_rank=rank, args=args)
run_broadcast(local_rank=rank, args=args)
'''Copyright The Microsoft DeepSpeed Team'''
from deepspeed.accelerator import get_accelerator
DEFAULT_WARMUPS = 5
DEFAULT_TRIALS = 50
DEFAULT_TYPE = 'float'
DEFAULT_BACKEND = get_accelerator().communication_backend_name()
DEFAULT_UNIT = 'Gbps'
DEFAULT_DIST = 'deepspeed'
DEFAULT_MAXSIZE = 24
'''Copyright The Microsoft DeepSpeed Team'''
from benchmarks.communication.utils import *
from benchmarks.communication.constants import *
from deepspeed.accelerator import get_accelerator
import time
def timed_pt2pt(input, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
sync_all()
# Warmups, establish connections, etc.
for i in range(args.warmups):
if dist.get_rank() == 0:
if args.async_op:
dist.isend(input, 1)
else:
dist.send(input, 1)
if dist.get_rank() == 1:
if args.async_op:
dist.irecv(input, src=0)
else:
dist.recv(input, src=0)
sync_all()
# time the actual comm op trials times and average it
pre = time.perf_counter()
for i in range(args.trials):
if dist.get_rank() == 0:
if args.async_op:
dist.isend(input, 1)
else:
dist.send(input, 1)
if dist.get_rank() == 1:
if args.async_op:
dist.irecv(input, src=0)
else:
dist.recv(input, src=0)
sync_all()
duration = time.perf_counter() - pre
# maintain and clean performance data
avg_duration = duration / args.trials
size = input.element_size() * input.nelement()
n = dist.get_world_size()
tput, busbw = get_bw('pt2pt', size, avg_duration, args)
tput_str, busbw_str, duration_str = get_metric_strings(args, tput, busbw, avg_duration)
desc = f'{input.nelement()}x{input.element_size()}'
if not args.raw:
size = convert_size(size)
print_rank_0(
f"{size:<20} {desc:25s} {duration_str:20s} {tput_str:20s} {busbw_str:20s}")
def run_pt2pt(local_rank, args):
if args.dist == 'torch':
import torch.distributed as dist
elif args.dist == 'deepspeed':
import deepspeed.comm as dist
# Prepare benchmark header
print_header(args, 'pt2pt')
global_rank = dist.get_rank()
world_size = dist.get_world_size()
if args.scan:
# Create list of message sizes
M_LIST = []
for x in (2**p for p in range(1, args.maxsize)):
M_LIST.append(x)
sync_all()
# loop over various tensor sizes
for M in M_LIST:
global_rank = dist.get_rank()
try:
mat = torch.ones(world_size,
M,
dtype=getattr(
torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
sync_all()
input = ((mat.mul_(float(global_rank))).view(-1))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print('WARNING: Ran out of GPU memory. Exiting comm op.')
sync_all()
break
sync_all()
timed_pt2pt(input, args)
else:
# Send the biggest message size our GPUs can fit. If you're facing OOM errors, reduce the mem_factor
# Don't need output tensor, so double mem_factor
elements_per_gpu = max_numel(comm_op='pt2pt',
dtype=getattr(torch,
args.dtype),
mem_factor=args.mem_factor * 2,
local_rank=local_rank,
args=args)
try:
mat = torch.ones(elements_per_gpu,
dtype=getattr(torch,
args.dtype)).to(
get_accelerator().device_name(local_rank))
input = ((mat.mul_(float(global_rank))).view(-1))
except RuntimeError as e:
if 'out of memory' in str(e):
if dist.get_rank() == 0:
print(
'WARNING: Ran out of GPU memory. Try to reduce the --mem-factor argument!'
)
sync_all()
return
sync_all()
timed_pt2pt(input, args)
if __name__ == "__main__":
args = benchmark_parser().parse_args()
rank = args.local_rank
init_processes(local_rank=rank, args=args)
run_pt2pt(local_rank=rank, args=args)
'''Copyright The Microsoft DeepSpeed Team'''
from benchmarks.communication.utils import *
from benchmarks.communication.all_reduce import run_all_reduce
from benchmarks.communication.all_gather import run_all_gather
from benchmarks.communication.all_to_all import run_all_to_all
from benchmarks.communication.pt2pt import run_pt2pt
from benchmarks.communication.broadcast import run_broadcast
from benchmarks.communication.constants import *
# For importing
def main(args, rank):
init_processes(local_rank=rank, args=args)
ops_to_run = []
if args.all_reduce:
ops_to_run.append('all_reduce')
if args.all_gather:
ops_to_run.append('all_gather')
if args.broadcast:
ops_to_run.append('broadcast')
if args.pt2pt:
ops_to_run.append('pt2pt')
if args.all_to_all:
ops_to_run.append('all_to_all')
if len(ops_to_run) == 0:
ops_to_run = ['all_reduce', 'all_gather', 'all_to_all', 'broadcast', 'pt2pt']
for comm_op in ops_to_run:
if comm_op == 'all_reduce':
run_all_reduce(local_rank=rank, args=args)
if comm_op == 'all_gather':
run_all_gather(local_rank=rank, args=args)
if comm_op == 'all_to_all':
run_all_to_all(local_rank=rank, args=args)
if comm_op == 'pt2pt':
run_pt2pt(local_rank=rank, args=args)
if comm_op == 'broadcast':
run_broadcast(local_rank=rank, args=args)
# For directly calling benchmark
if __name__ == "__main__":
args = benchmark_parser().parse_args()
rank = args.local_rank
main(args, rank)
'''Copyright The Microsoft DeepSpeed Team'''
import torch
import os
import math
import argparse
from benchmarks.communication.constants import *
from deepspeed.accelerator import get_accelerator
global dist
def init_torch_distributed(backend):
global dist
import torch.distributed as dist
torch.distributed.init_process_group(backend)
local_rank = int(os.environ['LOCAL_RANK'])
get_accelerator().set_device(local_rank)
def init_deepspeed_comm(backend):
global dist
import deepspeed
import deepspeed.comm as dist
deepspeed.init_distributed(dist_backend=backend)
local_rank = int(os.environ['LOCAL_RANK'])
get_accelerator().set_device(local_rank)
def init_processes(local_rank, args):
if args.dist == 'deepspeed':
init_deepspeed_comm(args.backend)
elif args.dist == 'torch':
init_torch_distributed(args.backend)
else:
print_rank_0(f"distributed framework {args.dist} not supported")
exit(0)
def print_rank_0(message):
if dist.get_rank() == 0:
print(message)
def print_header(args, comm_op):
if comm_op == 'pt2pt':
world_size = 2
else:
world_size = dist.get_world_size()
tput = f'Throughput ({args.bw_unit})'
busbw = f'BusBW ({args.bw_unit})'
header = f"\n---- Performance of {comm_op} on {world_size} devices ---------------------------------------------------------\n"
duration_str = 'Duration'
if args.raw:
duration_str += ' (us)'
header += f"{'Size (Bytes)':20s} {'Description':25s} {duration_str:20s} {tput:20s} {busbw:20s}\n"
header += "----------------------------------------------------------------------------------------------------"
print_rank_0(header)
def get_bw(comm_op, size, duration, args):
n = dist.get_world_size()
tput = 0
busbw = 0
if comm_op == "all_to_all":
tput = (size / duration)
busbw = (size / duration) * ((n - 1) / n)
elif comm_op == "all_gather":
size *= n
tput = (size / duration)
busbw = (size / duration) * ((n - 1) / n)
elif comm_op == "all_reduce":
tput = (size * 2 / duration)
busbw = (size / duration) * (2 * (n - 1) / n)
elif comm_op == "pt2pt" or comm_op == "broadcast":
tput = (size / duration)
busbw = tput
else:
print_rank_0("wrong comm_op specified")
exit(0)
if args.bw_unit == 'Gbps':
tput *= 8
busbw *= 8
return tput, busbw
def get_metric_strings(args, tput, busbw, duration):
duration_ms = duration * 1e3
duration_us = duration * 1e6
tput = f'{tput / 1e9:.3f}'
busbw = f'{busbw /1e9:.3f}'
if duration_us < 1e3 or args.raw:
duration = f'{duration_us:.3f}'
if not args.raw:
duration += ' us'
else:
duration = f'{duration_ms:.3f} ms'
return tput, busbw, duration
def sync_all():
get_accelerator().synchronize()
dist.barrier()
def max_numel(comm_op, dtype, mem_factor, local_rank, args):
dtype_size = _element_size(dtype)
max_memory_per_gpu = get_accelerator().total_memory(local_rank) * mem_factor
if comm_op == 'all_reduce' or comm_op == 'pt2pt' or comm_op == 'broadcast':
elements_per_gpu = int(max_memory_per_gpu // dtype_size)
elif comm_op == 'all_gather':
# all_gather performance is lower for non-powers of two, and the output buffer size scales with world size
# Therefore, divide by world size and round down to nearest power of 2
elements_per_gpu = int(max_memory_per_gpu // dtype_size // dist.get_world_size())
elements_per_gpu = int(pow(2, int(math.log(elements_per_gpu, 2))))
elif comm_op == 'all_to_all':
# Number of elements must be divisible by world_size
# all_to_all performance is lower for non-powers of two. Round down like all_gather.
elements_per_gpu = int(max_memory_per_gpu // dtype_size)
elements_per_gpu = int(dist.get_world_size() *
round(elements_per_gpu / dist.get_world_size()))
elements_per_gpu = int(pow(2, int(math.log(elements_per_gpu, 2))))
else:
print(f"This communication operation: {comm_op} is not supported yet")
exit(0)
return elements_per_gpu
# Helper function to pretty-print message sizes
def convert_size(size_bytes):
if size_bytes == 0:
return "0B"
size_name = ("B", "KB", "MB", "GB", "TB", "PB", "EB", "ZB", "YB")
i = int(math.floor(math.log(size_bytes, 1024)))
p = math.pow(1024, i)
s = round(size_bytes / p, 2)
return "%s %s" % (s, size_name[i])
# Copied from torch; duplicated here for compatibility with older torch versions.
def _element_size(dtype):
"""
Returns the element size for a dtype, in bytes
"""
if not isinstance(dtype, torch.dtype):
raise RuntimeError(f'expected torch.dtype, but got {type(dtype)}')
if dtype.is_complex:
return torch.finfo(dtype).bits >> 2
elif dtype.is_floating_point:
return torch.finfo(dtype).bits >> 3
elif dtype == torch.bool:
# NOTE: torch.bool is not supported in torch.iinfo()
return 1
else:
return torch.iinfo(dtype).bits >> 3
def benchmark_parser():
parser = argparse.ArgumentParser()
parser.add_argument("--local_rank", type=int)
parser.add_argument("--trials",
type=int,
default=DEFAULT_TRIALS,
help='Number of timed iterations')
parser.add_argument("--warmups",
type=int,
default=DEFAULT_WARMUPS,
help='Number of warmup (non-timed) iterations')
parser.add_argument("--maxsize",
type=int,
default=DEFAULT_MAXSIZE,
help='Max message size as a power of 2')
parser.add_argument("--async-op",
action="store_true",
help='Enables non-blocking communication')
parser.add_argument("--bw-unit",
type=str,
default=DEFAULT_UNIT,
choices=['Gbps',
'GBps'])
parser.add_argument("--backend",
type=str,
default=DEFAULT_BACKEND,
choices=['nccl',
'ccl'],
help='Communication library to use')
parser.add_argument("--dist",
type=str,
default=DEFAULT_DIST,
choices=['deepspeed',
'torch'],
help='Distributed DL framework to use')
parser.add_argument("--scan",
action="store_true",
help='Enables scanning all message sizes')
parser.add_argument("--raw",
action="store_true",
help='Print the message size and latency without units')
parser.add_argument("--all-reduce", action="store_true", help='Run all_reduce')
parser.add_argument("--all-gather", action="store_true", help='Run all_gather')
parser.add_argument("--all-to-all", action="store_true", help='Run all_to_all')
parser.add_argument("--pt2pt", action="store_true", help='Run pt2pt')
parser.add_argument("--broadcast", action="store_true", help='Run broadcast')
parser.add_argument("--dtype",
type=str,
default=DEFAULT_TYPE,
help='PyTorch tensor dtype')
parser.add_argument(
"--mem-factor",
type=float,
default=.4,
help='Proportion of max available GPU memory to use for single-size evals')
parser.add_argument("--debug",
action="store_true",
help='Enables all_to_all debug prints')
return parser