Unverified commit 31f46fee authored by Jeff Rasley, committed by GitHub

DeepSpeed JIT op + PyPI support (#496)


Co-authored-by: Shaden Smith <Shaden.Smith@microsoft.com>
Co-authored-by: Reza Yazdani <reyazda@microsoft.com>
parent 0ad4fd88
'''
Copyright 2020 The Microsoft DeepSpeed Team
Copyright NVIDIA/apex
This file is adapted from NVIDIA/apex, commit a109f85
'''
import torch
class MultiTensorApply(object):
    def __init__(self, chunk_size):
        self.chunk_size = chunk_size

    def __call__(self, op, noop_flag_buffer, tensor_lists, *args):
        return op(self.chunk_size, noop_flag_buffer, tensor_lists, *args)
../../csrc
\ No newline at end of file
-from deepspeed.ops.lamb.fused_lamb import FusedLamb
+from .fused_lamb import FusedLamb

@@ -5,8 +5,8 @@ Copyright NVIDIA/apex
 This file is adapted from NVIDIA/apex/optimizer/fused_adam and implements the LAMB optimizer
 '''
 import types
-import importlib
 import torch
+from ..op_builder import FusedLambBuilder

 class FusedLamb(torch.optim.Optimizer):
@@ -48,15 +48,7 @@ class FusedLamb(torch.optim.Optimizer):
                  max_coeff=10.0,
                  min_coeff=0.01,
                  amsgrad=False):
-        global fused_lamb_cuda
-        try:
-            fused_lamb_cuda = importlib.import_module(
-                "deepspeed.ops.lamb.fused_lamb_cuda")
-        except ImportError as err:
-            print(
-                "Unable to import Lamb cuda extension, please build DeepSpeed with cuda/cpp extensions."
-            )
-            raise err
+        self.fused_lamb_cuda = FusedLambBuilder().load()

         if amsgrad:
             raise RuntimeError('FusedLamb does not support the AMSGrad variant.')
@@ -173,22 +165,22 @@ class FusedLamb(torch.optim.Optimizer):
                 out_p = torch.tensor(
                     [],
                     dtype=torch.float) if output_param is None else output_param
-                lamb_coeff = fused_lamb_cuda.lamb(p.data,
+                lamb_coeff = self.fused_lamb_cuda.lamb(p.data,
                                                   out_p,
                                                   exp_avg,
                                                   exp_avg_sq,
                                                   grad,
                                                   group['lr'],
                                                   beta1,
                                                   beta2,
                                                   max_coeff,
                                                   min_coeff,
                                                   group['eps'],
                                                   combined_scale,
                                                   state['step'],
                                                   self.eps_mode,
                                                   bias_correction,
                                                   group['weight_decay'])
                 self.lamb_coeffs.append(lamb_coeff)
         return loss
...
../../op_builder
\ No newline at end of file
@@ -2,13 +2,12 @@
 # https://github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/matmul.py
 import importlib
 import warnings
-try:
-    import triton
-except ImportError:
-    warnings.warn("Unable to import triton, sparse attention will not be accessible")
 import torch
 import math
-from deepspeed.ops.sparse_attention.trsrc import matmul
+from .trsrc import matmul
+from ..op_builder import SparseAttnBuilder
+
+triton = None

 ##############
@@ -27,6 +26,9 @@ class _sparse_matmul(torch.autograd.Function):
     # between `seg_size` elements
     @staticmethod
     def load_balance(sizes, block):
+        global triton
+        if triton is None:
+            triton = importlib.import_module('triton')
         # segment size
         # heuristics taken from OpenAI blocksparse code
         # https://github.com/openai/blocksparse/blob/master/blocksparse/matmul.py#L95
@@ -83,11 +85,18 @@ class _sparse_matmul(torch.autograd.Function):
     ##########################
     # SPARSE = DENSE x DENSE #
     ##########################
-    cpp_utils = importlib.import_module('deepspeed.ops.sparse_attention.cpp_utils')
-    sdd_segment = cpp_utils.sdd_segment
+    cpp_utils = None
+    sdd_segment = None
+
+    @staticmethod
+    def _load_utils():
+        if _sparse_matmul.cpp_utils is None:
+            _sparse_matmul.cpp_utils = SparseAttnBuilder().load()
+            _sparse_matmul.sdd_segment = _sparse_matmul.cpp_utils.sdd_segment

     @staticmethod
     def make_sdd_lut(layout, block, dtype, device):
+        _sparse_matmul._load_utils()
         start_width = 64 // block
         segmented = _sparse_matmul.sdd_segment(layout.type(torch.int32), start_width)
         luts, widths, packs = [], [], []
@@ -118,6 +127,10 @@ class _sparse_matmul(torch.autograd.Function):
                  packs,
                  bench,
                  time):
+        global triton
+        if triton is None:
+            triton = importlib.import_module('triton')
+
         if trans_c:
             a, b = b, a
             trans_a, trans_b = not trans_b, not trans_a
@@ -332,6 +345,10 @@ class _sparse_matmul(torch.autograd.Function):
                  packs,
                  bench,
                  time):
+        global triton
+        if triton is None:
+            triton = importlib.import_module('triton')
+
         # shapes / dtypes
         AS0 = a.size(0)
         AS1 = a.size(1)
@@ -413,6 +430,10 @@ class _sparse_matmul(torch.autograd.Function):
                  packs,
                  bench,
                  time):
+        global triton
+        if triton is None:
+            triton = importlib.import_module('triton')
+
         # shapes / dtypes
         AS0 = spdims[0]
         AS1 = block * spdims[2 if trans_a else 1]
...
@@ -2,17 +2,17 @@
 # https://github.com/ptillet/torch-blocksparse/blob/master/torch_blocksparse/matmul.py
 import warnings
-try:
-    import triton
-except ImportError:
-    warnings.warn("Unable to import triton, sparse attention will not be accessible")
+import importlib
 import torch
 import math
-from deepspeed.ops.sparse_attention.trsrc import softmax_fwd, softmax_bwd
+from .trsrc import softmax_fwd, softmax_bwd

 fwd_kernels = dict()
 bwd_kernels = dict()

+# Delay importing triton unless we need it
+triton = None

 class _sparse_softmax(torch.autograd.Function):
@@ -52,6 +52,10 @@ class _sparse_softmax(torch.autograd.Function):
                    apply_attn_mask,
                    kp_mask_mode,
                    attn_mask_mode):
+        global triton
+        if triton is None:
+            triton = importlib.import_module('triton')
+
         if max_k >= 32768:
             raise NotImplementedError('Reductions larger than 32768 elements '\
                                       'are not yet implemented')
@@ -112,6 +116,10 @@ class _sparse_softmax(torch.autograd.Function):
                 maxlut,
                 bench,
                 time):
+        global triton
+        if triton is None:
+            triton = importlib.import_module('triton')
+
         apply_scale = False if scale == 1.0 else True

         # handle None rpe
@@ -180,6 +188,10 @@ class _sparse_softmax(torch.autograd.Function):
     @staticmethod
     def backward(ctx, dx):
+        global triton
+        if triton is None:
+            triton = importlib.import_module('triton')
+
         # retrieve from context
         x, lut = ctx.saved_tensors
         # run kernel
...
-from deepspeed.ops.transformer.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
+from .transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig

@@ -8,6 +8,8 @@ import torch
 from torch import nn
 from torch.autograd import Function

+from ..op_builder import TransformerBuilder, StochasticTransformerBuilder
+
 # Cuda modules will be imported if needed
 transformer_cuda_module = None
 stochastic_transformer_cuda_module = None
@@ -483,19 +485,12 @@ class DeepSpeedTransformerLayer(nn.Module):
             self.norm_w = initial_weights[7]
             self.norm_b = initial_biases[7]

-        # Import cuda modules if needed
+        # Load cuda modules if needed
         global transformer_cuda_module, stochastic_transformer_cuda_module
-        if transformer_cuda_module is None or stochastic_transformer_cuda_module is None:
-            try:
-                transformer_cuda_module = importlib.import_module(
-                    "deepspeed.ops.transformer.transformer_cuda")
-                stochastic_transformer_cuda_module = importlib.import_module(
-                    "deepspeed.ops.transformer.stochastic_transformer_cuda")
-            except ImportError as err:
-                print(
-                    "Unable to import transformer cuda extension, please build DeepSpeed with cuda/cpp extensions."
-                )
-                raise err
+        if transformer_cuda_module is None and not self.config.stochastic_mode:
+            transformer_cuda_module = TransformerBuilder().load()
+        if stochastic_transformer_cuda_module is None and self.config.stochastic_mode:
+            stochastic_transformer_cuda_module = StochasticTransformerBuilder().load()

         # create the layer in cuda kernels.
         cuda_module = stochastic_transformer_cuda_module if self.config.stochastic_mode else transformer_cuda_module
...
@@ -7,8 +7,6 @@ import torch
 import warnings
 import torch.distributed as dist

-import apex
-from apex import amp
 from torch.nn.modules import Module
 from torch.distributed.distributed_c10d import _get_global_rank
 from tensorboardX import SummaryWriter
@@ -36,22 +34,17 @@ from deepspeed.utils.timer import ThroughputTimer, SynchronizedWallClockTimer
 from deepspeed.runtime.progressive_layer_drop import ProgressiveLayerDrop
 from .utils import ensure_directory_exists
+from ..ops.op_builder import UtilsBuilder
+from ..ops.adam import DeepSpeedCPUAdam
+from ..ops.adam import FusedAdam

 MEMORY_OPT_ALLREDUCE_SIZE = 500000000

 try:
-    from apex_C import flatten
-    from apex_C import unflatten
+    from apex import amp
 except ImportError:
-    try:
-        _ = warned_flatten
-    except NameError:
-        logger.warning(
-            "Warning: apex was installed without --cpp_ext. Falling back to Python flatten and unflatten."
-        )
-        warned_flatten = True
-    from torch._utils import _flatten_dense_tensors as flatten
-    from torch._utils import _unflatten_dense_tensors as unflatten
+    # Fail silently so we don't spam logs unnecessarily if user isn't using amp
+    pass

 def split_half_float_double_csr(tensors):
@@ -201,6 +194,11 @@ class DeepSpeedEngine(Module):
         if self.dump_state():
             print_configuration(self, 'DeepSpeedEngine')

+        # Load pre-installed or JIT compile (un)flatten ops
+        util_ops = UtilsBuilder().load()
+        self.flatten = util_ops.flatten
+        self.unflatten = util_ops.unflatten
+
     def _mpi_check(self, args, dist_init_required):
         if hasattr(args, 'deepspeed_mpi') and args.deepspeed_mpi:
             from mpi4py import MPI
@@ -558,6 +556,12 @@ class DeepSpeedEngine(Module):
             amp_params = self.amp_params()
             if self.global_rank == 0:
                 logger.info(f"Initializing AMP with these params: {amp_params}")
+            try:
+                logger.info("Initializing Apex amp from: {}".format(amp.__path__))
+            except NameError:
+                # If apex/amp is available it will be imported above
+                raise RuntimeError(
+                    "Unable to import apex/amp, please make sure it is installed")
             self.module, self.optimizer = amp.initialize(self.module, basic_optimizer, **amp_params)
             self._broadcast_model()
         elif self.fp16_enabled():
@@ -584,17 +588,18 @@ class DeepSpeedEngine(Module):
             # T|F T F torch.optim.Adam
             # T F T|F DeepSpeedCPUAdam(adam_w_mode)
             # F F T|F FusedAdam(adam_w_mode)
-            if torch_adam and adam_w_mode:
-                optimizer = torch.optim.AdamW(model_parameters, **optimizer_parameters)
-            elif torch_adam and not adam_w_mode:
-                optimizer = torch.optim.Adam(model_parameters, **optimizer_parameters)
-            elif self.zero_cpu_offload() and not torch_adam:
-                from deepspeed.ops.adam import DeepSpeedCPUAdam
+            if torch_adam:
+                if adam_w_mode:
+                    optimizer = torch.optim.AdamW(model_parameters,
+                                                  **optimizer_parameters)
+                else:
+                    optimizer = torch.optim.Adam(model_parameters,
+                                                 **optimizer_parameters)
+            elif self.zero_cpu_offload():
                 optimizer = DeepSpeedCPUAdam(model_parameters,
                                              **optimizer_parameters,
                                              adamw_mode=adam_w_mode)
-            elif not self.zero_cpu_offload() and not torch_adam:
-                from apex.optimizers.fused_adam import FusedAdam
+            else:
                 optimizer_parameters[ADAM_W_MODE_PARAM] = adam_w_mode
                 optimizer = FusedAdam(model_parameters, **optimizer_parameters)
@@ -614,8 +619,7 @@ class DeepSpeedEngine(Module):
             dynamic_loss_args = self.dynamic_loss_scale_args()
             clip_grad = self.gradient_clipping()
             if isinstance(optimizer,
-                          apex.optimizers.FusedAdam) or self.optimizer_name(
-                          ) == ONEBIT_ADAM_OPTIMIZER:
+                          FusedAdam) or self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER:
                 if self.dynamic_loss_scale():
                     logger.info('Creating fp16 optimizer with dynamic loss scale')
                     timers = self.timers if self.wall_clock_breakdown() else None
@@ -1072,7 +1076,7 @@ class DeepSpeedEngine(Module):
                 ranks=[0])

     def allreduce_bucket(self, bucket):
-        tensor = flatten(bucket)
+        tensor = self.flatten(bucket)

         tensor_to_allreduce = tensor
@@ -1100,7 +1104,7 @@ class DeepSpeedEngine(Module):
     def allreduce_and_copy(self, small_bucket):
         allreduced = self.allreduce_bucket(small_bucket)
-        for buf, synced in zip(small_bucket, unflatten(allreduced, small_bucket)):
+        for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)):
             buf.copy_(synced)

     def allreduce_no_retain(self, bucket, numel_per_bucket=500000000):
...
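The apex_C flatten/unflatten kernels replaced above are now provided by DeepSpeed's own utils op. A minimal sketch of the new pattern, assuming an environment where the op is either pre-installed or can be JIT compiled with ninja:

```python
import torch
from deepspeed.ops.op_builder import UtilsBuilder

# Load the pre-built extension if available, otherwise JIT compile it on first use.
util_ops = UtilsBuilder().load()

bucket = [torch.ones(3), torch.zeros(5)]
flat = util_ops.flatten(bucket)              # one contiguous 1-D tensor
restored = util_ops.unflatten(flat, bucket)  # tensors shaped like the originals
```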
@@ -15,26 +15,15 @@ import collections
 from deepspeed.runtime.fp16.loss_scaler import LossScaler, DynamicLossScaler
 from deepspeed.runtime.utils import see_memory_usage, is_model_parallel_parameter
 from deepspeed.runtime.zero.config import ZERO_OPTIMIZATION_GRADIENTS
+from deepspeed.ops.adam import DeepSpeedCPUAdam
 from deepspeed.utils import logger
+from ...ops.op_builder import UtilsBuilder

 #Toggle this to true to enable correctness test
 #with gradient partitioning and without
 pg_correctness_test = False

-try:
-    from apex_C import flatten
-    from apex_C import unflatten
-except ImportError:
-    try:
-        _ = warned_flatten
-    except NameError:
-        logger.warning(
-            "apex was installed without --cpp_ext. Falling back to Python flatten and unflatten."
-        )
-        warned_flatten = True
-    from torch._utils import _flatten_dense_tensors as flatten
-    from torch._utils import _unflatten_dense_tensors as unflatten

 def input(msg):
     return
@@ -132,6 +121,11 @@ class FP16_DeepSpeedZeroOptimizer(object):
                  gradient_predivide_factor=1.0,
                  gradient_accumulation_steps=1):

+        # Load pre-installed or JIT compile (un)flatten ops
+        util_ops = UtilsBuilder().load()
+        self.flatten = util_ops.flatten
+        self.unflatten = util_ops.unflatten
+
         if dist.get_rank() == 0:
             logger.info(f"Reduce bucket size {reduce_bucket_size}")
             logger.info(f"Allgather bucket size {allgather_bucket_size}")
@@ -1053,7 +1047,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
     def allreduce_bucket(self, bucket, allreduce_always_fp32=False, rank=None, log=None):
         rank = None
-        tensor = flatten(bucket)
+        tensor = self.flatten(bucket)

         tensor_to_allreduce = tensor
@@ -1095,7 +1089,7 @@ class FP16_DeepSpeedZeroOptimizer(object):
         with torch.cuda.stream(stream):
             allreduced = self.allreduce_bucket(small_bucket, rank=rank, log=log)
             if rank is None or rank == dist.get_rank(group=self.dp_process_group):
-                for buf, synced in zip(small_bucket, unflatten(allreduced, small_bucket)):
+                for buf, synced in zip(small_bucket, self.unflatten(allreduced, small_bucket)):
                     buf.copy_(synced)

     def allreduce_no_retain(self,
...
 import torch
 import torch.distributed as dist
-import apex
 from deepspeed.utils import logger
 from deepspeed.ops.adam import DeepSpeedCPUAdam
+from deepspeed.ops.adam import FusedAdam

 def _initialize_parameter_parallel_groups(parameter_parallel_size=None):
@@ -23,11 +23,14 @@ def _initialize_parameter_parallel_groups(parameter_parallel_size=None):
     return my_group

-ZERO_SUPPORTED_OPTIMIZERS = [
-    torch.optim.Adam,
-    apex.optimizers.FusedAdam,
-    DeepSpeedCPUAdam
-]
+ZERO_SUPPORTED_OPTIMIZERS = [torch.optim.Adam, FusedAdam, DeepSpeedCPUAdam]
+
+# Add apex FusedAdam to supported list if apex is installed
+try:
+    import apex
+    ZERO_SUPPORTED_OPTIMIZERS.append(apex.optimizers.FusedAdam)
+except ImportError:
+    pass

 def is_zero_supported_optimizer(optimizer):
...
@@ -30,6 +30,7 @@ collections:
     output: true
     permalink: /:collection/:path/
     order:
+      - advanced-install.md
       - getting-started.md
       - azure.md
       - cifar-10.md
...
---
title: "Installation Details"
date: 2020-10-28
---
The quickest way to get started with DeepSpeed is via pip; this installs the
latest release of DeepSpeed, which is not tied to specific PyTorch or CUDA
versions. DeepSpeed includes several C++/CUDA extensions that we commonly refer
to as our 'ops'. By default, all of these extensions/ops are built
just-in-time (JIT) using [torch's JIT C++ extension loader, which relies on
ninja](https://pytorch.org/docs/stable/cpp_extension.html), to build and
dynamically link them at runtime.
```bash
pip install deepspeed
```
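Nothing is compiled at `pip install` time on this path; each op is built the first time it is requested. As a rough sketch of what happens under the hood, using the op builder API introduced in this release (the exact call site varies by op):

```python
from deepspeed.ops.op_builder import FusedAdamBuilder

# Returns the pre-installed extension module if it was built at install time,
# otherwise JIT compiles the C++/CUDA sources with ninja and imports the result.
fused_adam_module = FusedAdamBuilder().load()
```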
After installation, you can validate your install and see which ops your machine
is compatible with via the DeepSpeed environment report, `ds_report` or
`python -m deepspeed.env_report`. We have found this report useful when debugging
DeepSpeed install or compatibility issues.
```bash
ds_report
```
## Install DeepSpeed from source
After cloning the DeepSpeed repo from github you can install DeepSpeed in
JIT mode via pip (see below). This install should complete
quickly since it is not compiling any C++/CUDA source files.
```bash
pip install .
```
For installs spanning multiple nodes, we find it useful to install DeepSpeed
using the
[install.sh](https://github.com/microsoft/DeepSpeed/blob/master/install.sh)
script in the repo. This builds a Python wheel locally and copies it to all
the nodes listed in your hostfile (either given via `--hostfile`, or defaulting to
`/job/hostfile`).
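The hostfile here is the same MPI-style hostfile used by the DeepSpeed launcher. As a hedged example, where the hostnames and slot counts are placeholders for your own cluster:

```
worker-1 slots=8
worker-2 slots=8
```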
## Pre-install DeepSpeed Ops
Sometimes we have found it useful to pre-install either some or all DeepSpeed
C++/CUDA ops instead of using the JIT compiled path. In order to support
pre-installation we introduce build environment flags to turn on/off building
specific ops.
You can indicate to our installer (either install.sh or pip install) that you
want to attempt to install all of our ops by setting the `DS_BUILD_OPS`
environment variable to 1, for example:
```bash
DS_BUILD_OPS=1 pip install .
```
We will only install ops that are compatible with your machine; for more
details on which ops are compatible with your system, please try our `ds_report`
tool described above.
If you want to install only a specific op (e.g., FusedLamb), you can view the
op-specific build environment variable (set as `BUILD_VAR`) in the corresponding
op builder class in the
[op\_builder](https://github.com/microsoft/DeepSpeed/tree/master/op_builder)
directory. For example, to install only the FusedLamb op you would install via:
```bash
DS_BUILD_FUSED_LAMB=1 pip install .
```
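The `BUILD_VAR` for each op is defined on its builder class. For example, the CPU Adam builder added in this release (shown in full further below) declares:

```python
class CPUAdamBuilder(CUDAOpBuilder):
    BUILD_VAR = "DS_BUILD_CPU_ADAM"
    NAME = "cpu_adam"
```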
## Feature specific dependencies
Some DeepSpeed features require specific dependencies outside of the general
dependencies of DeepSpeed.
* Python package dependencies per feature/op please
see our [requirements
directory](https://github.com/microsoft/DeepSpeed/tree/master/requirements).
* We attempt to keep the system level dependencies to a minimum, however some features do require special system-level packages. Please see our `ds_report` tool output to see if you are missing any system-level packages for a given feature.
## Pre-compiled DeepSpeed builds from PyPI
Coming soon
@@ -7,9 +7,9 @@ date: 2020-05-15
 ## Installation
+* Installing is as simple as `pip install deepspeed`, [see more details](/tutorials/advanced-install/).
 * Please see our [Azure tutorial](/tutorials/azure/) to get started with DeepSpeed on Azure!
 * If you're not on Azure, we recommend using our docker image via `docker pull deepspeed/deepspeed:latest` which contains a pre-installed version of DeepSpeed and all the necessary dependencies.
-* If you want to install DeepSpeed manually, we provide an install script `install.sh` to help install on a local machine or across an entire cluster.

 ## Writing DeepSpeed Models
 DeepSpeed model training is accomplished using the DeepSpeed engine. The engine
...
@@ -28,8 +28,9 @@ initiative to enable next-generation AI capabilities at scale, where you can fin
 information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale).

 # What's New?
+* [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation)
-* [2020/10/28] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html)
+* [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html)
-* [DeepSpeed: Extreme-scale model training for everyone]({{ site.press_release_v3 }})
+* [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone]({{ site.press_release_v3 }})
 * [Powering 10x longer sequences and 6x faster execution through DeepSpeed Sparse Attention](https://www.deepspeed.ai/news/2020/09/08/sparse-attention-news.html)
 * [Training a trillion parameters with pipeline parallelism](https://www.deepspeed.ai/news/2020/09/08/pipeline-parallelism.html)
 * [Up to 5x less communication and 3.4x faster training through 1-bit Adam](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-news.html)
...
@@ -15,16 +15,13 @@ By default will install deepspeed and all third party dependecies accross all ma
 hostfile (hostfile: /job/hostfile). If no hostfile exists, will only install locally
 [optional]
-    -d, --deepspeed_only    Install only deepspeed and no third party dependencies
-    -t, --third_party_only  Install only third party dependencies and not deepspeed
     -l, --local_only        Install only on local machine
     -s, --pip_sudo          Run pip install with sudo (default: no sudo)
     -r, --allow_sudo        Allow script to be run by root (probably don't want this, instead use --pip_sudo)
     -n, --no_clean          Do not clean prior build state, by default prior build files are removed before building wheels
     -m, --pip_mirror        Use the specified pip mirror (default: the default pip mirror)
     -H, --hostfile          Path to MPI-style hostfile (default: /job/hostfile)
-    -a, --apex_commit       Install a specific commit hash of apex, instead of the one deepspeed points to
-    -k, --skip_requirements Skip installing DeepSpeed requirements
+    -v, --verbose           Verbose logging
     -h, --help              This help text
 """
 }
@@ -42,27 +39,12 @@ apex_commit=""
 skip_requirements=0
 allow_sudo=0
 no_clean=0
+verbose=0

 while [[ $# -gt 0 ]]
 do
 key="$1"
 case $key in
-    -d|--deepspeed_only)
-    deepspeed_install=1;
-    third_party_install=0;
-    ds_only=1;
-    shift
-    ;;
-    -t|--third_party_only)
-    deepspeed_install=0;
-    third_party_install=1;
-    tp_only=1;
-    shift
-    ;;
     -l|--local_only)
     local_only=1;
     shift
     ;;
     -s|--pip_sudo)
     pip_sudo=1;
     shift
@@ -72,13 +54,8 @@ case $key in
     shift
     shift
     ;;
-    -a|--apex_commit)
-    apex_commit=$2;
-    shift
-    shift
-    ;;
-    -k|--skip_requirements)
-    skip_requirements=1;
+    -v|--verbose)
+    verbose=1;
     shift
     ;;
     -r|--allow_sudo)
@@ -126,12 +103,18 @@ if [ "$ds_only" == "1" ] && [ "$tp_only" == "1" ]; then
     exit 1
 fi

+if [ "$verbose" == "1" ]; then
+    VERBOSE="-v"
+else
+    VERBOSE=""
+fi

 rm_if_exist() {
     echo "Attempting to remove $1"
     if [ -f $1 ]; then
-        rm -v $1
+        rm $VERBOSE $1
     elif [ -d $1 ]; then
-        rm -vr $1
+        rm -r $VERBOSE $1
     fi
 }
@@ -141,10 +124,6 @@ if [ "$no_clean" == "0" ]; then
     rm_if_exist dist
     rm_if_exist build
     rm_if_exist deepspeed.egg-info
-    # remove apex build files
-    rm_if_exist third_party/apex/dist
-    rm_if_exist third_party/apex/build
-    rm_if_exist third_party/apex/apex.egg-info
 fi

 if [ "$pip_sudo" == "1" ]; then
@@ -154,60 +133,25 @@ else
 fi

 if [ "$pip_mirror" != "" ]; then
-    PIP_INSTALL="pip install -v -i $pip_mirror"
+    PIP_INSTALL="pip install $VERBOSE -i $pip_mirror"
 else
-    PIP_INSTALL="pip install -v"
+    PIP_INSTALL="pip install $VERBOSE"
 fi

 if [ ! -f $hostfile ]; then
     echo "No hostfile exists at $hostfile, installing locally"
     local_only=1
 fi

-if [ "$skip_requirements" == "0" ]; then
-    # Ensure dependencies are installed locally
-    $PIP_SUDO $PIP_INSTALL -r requirements/requirements.txt
-fi
-
-# Build wheels
-if [ "$third_party_install" == "1" ]; then
-    echo "Checking out sub-module(s)"
-    git submodule update --init --recursive
-
-    echo "Building apex wheel"
-    cd third_party/apex
-    if [ "$apex_commit" != "" ]; then
-        echo "Installing a non-standard version of apex at commit: $apex_commit"
-        git fetch
-        git checkout $apex_commit
-    fi
-    python setup.py -v --cpp_ext --cuda_ext bdist_wheel
-    cd -
-
-    echo "Installing apex locally so that deepspeed will build"
-    $PIP_SUDO pip uninstall -y apex
-    $PIP_SUDO $PIP_INSTALL third_party/apex/dist/apex*.whl
-fi
-
-if [ "$deepspeed_install" == "1" ]; then
-    echo "Building deepspeed wheel"
-    python setup.py -v bdist_wheel
-fi
+echo "Building deepspeed wheel"
+python setup.py $VERBOSE bdist_wheel

 if [ "$local_only" == "1" ]; then
-    if [ "$deepspeed_install" == "1" ]; then
-        echo "Installing deepspeed"
-        $PIP_SUDO pip uninstall -y deepspeed
-        $PIP_SUDO $PIP_INSTALL dist/deepspeed*.whl
-        # -I to exclude local directory files
-        python -I basic_install_test.py
-        if [ $? == 0 ]; then
-            echo "Installation is successful"
-        else
-            echo "Installation failed"
-        fi
-    fi
+    echo "Installing deepspeed"
+    $PIP_SUDO pip uninstall -y deepspeed
+    $PIP_SUDO $PIP_INSTALL dist/deepspeed*.whl
+    ds_report
 else
     local_path=`pwd`
     if [ -f $hostfile ]; then
@@ -216,28 +160,16 @@ else
         echo "hostfile not found, cannot proceed"
         exit 1
     fi
-    export PDSH_RCMD_TYPE=ssh;
+    export PDSH_RCMD_TYPE=ssh
     tmp_wheel_path="/tmp/deepspeed_wheels"

     pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl; else mkdir -pv $tmp_wheel_path; fi"
     pdcp -w $hosts requirements/requirements.txt ${tmp_wheel_path}/
-    if [ "$skip_requirements" == "0" ]; then
-        pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL -r ${tmp_wheel_path}/requirements.txt"
-    fi
-    if [ "$third_party_install" == "1" ]; then
-        pdsh -w $hosts "$PIP_SUDO pip uninstall -y apex"
-        pdcp -w $hosts third_party/apex/dist/apex*.whl $tmp_wheel_path/
-        pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/apex*.whl"
-        pdsh -w $hosts 'python -c "import apex"'
-    fi
-    if [ "$deepspeed_install" == "1" ]; then
-        echo "Installing deepspeed"
-        pdsh -w $hosts "$PIP_SUDO pip uninstall -y deepspeed"
-        pdcp -w $hosts dist/deepspeed*.whl $tmp_wheel_path/
-        pdcp -w $hosts basic_install_test.py $tmp_wheel_path/
-        pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/deepspeed*.whl"
-        pdsh -w $hosts "python $tmp_wheel_path/basic_install_test.py"
-        echo "Installation is successful"
-    fi
-    pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl $tmp_wheel_path/basic_install_test.py; rmdir $tmp_wheel_path; fi"
+    echo "Installing deepspeed"
+    pdsh -w $hosts "$PIP_SUDO pip uninstall -y deepspeed"
+    pdcp -w $hosts dist/deepspeed*.whl $tmp_wheel_path/
+    pdsh -w $hosts "$PIP_SUDO $PIP_INSTALL $tmp_wheel_path/deepspeed*.whl"
+    pdsh -w $hosts "ds_report"
+    pdsh -w $hosts "if [ -d $tmp_wheel_path ]; then rm $tmp_wheel_path/*.whl; rmdir $tmp_wheel_path; fi"
 fi
from .cpu_adam import CPUAdamBuilder
from .fused_adam import FusedAdamBuilder
from .fused_lamb import FusedLambBuilder
from .sparse_attn import SparseAttnBuilder
from .transformer import TransformerBuilder
from .stochastic_transformer import StochasticTransformerBuilder
from .utils import UtilsBuilder
# TODO: infer this list instead of hard coded
# List of all available ops
__op_builders__ = [
    CPUAdamBuilder(),
    FusedAdamBuilder(),
    FusedLambBuilder(),
    SparseAttnBuilder(),
    TransformerBuilder(),
    StochasticTransformerBuilder(),
    UtilsBuilder()
]

ALL_OPS = {op.name: op for op in __op_builders__}
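A small sketch of how this registry can be consumed, e.g. by an environment report; `name` and `is_compatible()` are part of the `OpBuilder` interface defined below:

```python
from deepspeed.ops.op_builder import ALL_OPS

# Report which ops could be built on this machine.
for op_name, builder in ALL_OPS.items():
    print(f"{op_name}: compatible={builder.is_compatible()}")
```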
import os
import time
import torch
import importlib
from pathlib import Path
import subprocess
from abc import ABC, abstractmethod

YELLOW = '\033[93m'
END = '\033[0m'
WARNING = f"{YELLOW} [WARNING] {END}"

DEFAULT_TORCH_EXTENSION_PATH = "/tmp/torch_extensions"


def assert_no_cuda_mismatch():
    import torch.utils.cpp_extension
    cuda_home = torch.utils.cpp_extension.CUDA_HOME
    assert cuda_home is not None, "CUDA_HOME does not exist, unable to compile CUDA op(s)"
    # Ensure there is not a cuda version mismatch between torch and nvcc compiler
    output = subprocess.check_output([cuda_home + "/bin/nvcc", "-V"],
                                     universal_newlines=True)
    output_split = output.split()
    release_idx = output_split.index("release")
    release = output_split[release_idx + 1].replace(',', '').split(".")
    # Ignore patch versions, only look at major + minor
    installed_cuda_version = ".".join(release[:2])
    torch_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
    # This is a show-stopping error, should probably not proceed past this
    if installed_cuda_version != torch_cuda_version:
        raise Exception(
            f"Installed CUDA version {installed_cuda_version} does not match the "
            f"version torch was compiled with {torch.version.cuda}, unable to compile "
            "cuda/cpp extensions without a matching cuda version.")


def assert_torch_info(torch_info):
    install_torch_version = torch_info['version']
    install_cuda_version = torch_info['cuda_version']

    current_cuda_version = ".".join(torch.version.cuda.split('.')[:2])
    current_torch_version = ".".join(torch.__version__.split('.')[:2])

    if install_cuda_version != current_cuda_version or install_torch_version != current_torch_version:
        raise RuntimeError(
            "PyTorch and CUDA version mismatch! DeepSpeed ops were compiled and installed "
            "with a different version than what is being used at runtime. Please re-install "
            f"DeepSpeed or switch torch versions. DeepSpeed install versions: "
            f"torch={install_torch_version}, cuda={install_cuda_version}, runtime versions:"
            f"torch={current_torch_version}, cuda={current_cuda_version}")
class OpBuilder(ABC):
    def __init__(self, name):
        self.name = name
        self.jit_mode = False

    @abstractmethod
    def absolute_name(self):
        '''
        Returns absolute build path for cases where the op is pre-installed, e.g., deepspeed.ops.adam.cpu_adam
        will be installed as something like: deepspeed/ops/adam/cpu_adam.so
        '''
        pass

    @abstractmethod
    def sources(self):
        '''
        Returns list of source files for your op, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)
        '''
        pass

    def include_paths(self):
        '''
        Returns list of include paths, relative to root of deepspeed package (i.e., DeepSpeed/deepspeed)
        '''
        return []

    def nvcc_args(self):
        '''
        Returns optional list of compiler flags to forward to nvcc when building CUDA sources
        '''
        return []

    def cxx_args(self):
        '''
        Returns optional list of compiler flags to forward to the build
        '''
        return []

    def is_compatible(self):
        '''
        Check if all non-python dependencies are satisfied to build this op
        '''
        return True

    def python_requirements(self):
        '''
        Override if op wants to define special dependencies, otherwise will
        take self.name and load requirements-<op-name>.txt if it exists.
        '''
        path = f'requirements/requirements-{self.name}.txt'
        requirements = []
        if os.path.isfile(path):
            with open(path, 'r') as fd:
                requirements = [r.strip() for r in fd.readlines()]
        return requirements

    def command_exists(self, cmd):
        if '|' in cmd:
            cmds = cmd.split("|")
        else:
            cmds = [cmd]
        valid = False
        for cmd in cmds:
            result = subprocess.Popen(f'type {cmd}', stdout=subprocess.PIPE, shell=True)
            valid = valid or result.wait() == 0

        if not valid and len(cmds) > 1:
            print(
                f"{WARNING} {self.name} requires one of the following commands '{cmds}', but it does not exist!"
            )
        elif not valid and len(cmds) == 1:
            print(
                f"{WARNING} {self.name} requires the '{cmd}' command, but it does not exist!"
            )
        return valid

    def warning(self, msg):
        print(f"{WARNING} {msg}")

    def deepspeed_src_path(self, code_path):
        if os.path.isabs(code_path):
            return code_path
        else:
            return os.path.join(Path(__file__).parent.parent.absolute(), code_path)

    def builder(self):
        from torch.utils.cpp_extension import CppExtension
        return CppExtension(name=self.absolute_name(),
                            sources=self.sources(),
                            include_dirs=self.include_paths(),
                            extra_compile_args={'cxx': self.cxx_args()})

    def load(self, verbose=True):
        from ...git_version_info import installed_ops, torch_info
        if installed_ops[self.name]:
            # Ensure the op we're about to load was compiled with the same
            # torch/cuda versions we are currently using at runtime.
            if isinstance(self, CUDAOpBuilder):
                assert_torch_info(torch_info)
            return importlib.import_module(self.absolute_name())
        else:
            return self.jit_load(verbose)

    def jit_load(self, verbose=True):
        if not self.is_compatible():
            raise RuntimeError(
                f"Unable to JIT load the {self.name} op due to it not being compatible due to hardware/software issue."
            )
        try:
            import ninja
        except ImportError:
            raise RuntimeError(
                f"Unable to JIT load the {self.name} op due to ninja not being installed."
            )

        if isinstance(self, CUDAOpBuilder):
            assert_no_cuda_mismatch()

        self.jit_mode = True
        from torch.utils.cpp_extension import load

        # Ensure directory exists to prevent race condition in some cases
        ext_path = os.path.join(
            os.environ.get('TORCH_EXTENSIONS_DIR',
                           DEFAULT_TORCH_EXTENSION_PATH),
            self.name)
        os.makedirs(ext_path, exist_ok=True)

        start_build = time.time()
        op_module = load(
            name=self.name,
            sources=[self.deepspeed_src_path(path) for path in self.sources()],
            extra_include_paths=[
                self.deepspeed_src_path(path) for path in self.include_paths()
            ],
            extra_cflags=self.cxx_args(),
            extra_cuda_cflags=self.nvcc_args(),
            verbose=verbose)
        build_duration = time.time() - start_build
        if verbose:
            print(f"Time to load {self.name} op: {build_duration} seconds")
        return op_module
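For reference, a concrete builder is used the same way regardless of op; a minimal sketch mirroring the FusedLamb change earlier in this commit:

```python
from deepspeed.ops.op_builder import FusedLambBuilder

# Imports the pre-installed lamb extension if present,
# otherwise JIT compiles it with ninja on first use.
fused_lamb_cuda = FusedLambBuilder().load()
```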
class CUDAOpBuilder(OpBuilder):
    def compute_capability_args(self, cross_compile_archs=['60', '61', '70']):
        args = []
        if self.jit_mode:
            # Compile for underlying architecture since we know it at runtime
            CC_MAJOR, CC_MINOR = torch.cuda.get_device_capability()
            compute_capability = f"{CC_MAJOR}{CC_MINOR}"
            args.append('-gencode')
            args.append(
                f'arch=compute_{compute_capability},code=compute_{compute_capability}')
        else:
            # Cross-compile mode, compile for various architectures
            for compute_capability in cross_compile_archs:
                args.append('-gencode')
                args.append(
                    f'arch=compute_{compute_capability},code=compute_{compute_capability}'
                )
        return args

    def version_dependent_macros(self):
        # Fix from apex that might be relevant for us as well, related to https://github.com/NVIDIA/apex/issues/456
        TORCH_MAJOR = int(torch.__version__.split('.')[0])
        TORCH_MINOR = int(torch.__version__.split('.')[1])
        version_ge_1_1 = []
        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 0):
            version_ge_1_1 = ['-DVERSION_GE_1_1']
        version_ge_1_3 = []
        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 2):
            version_ge_1_3 = ['-DVERSION_GE_1_3']
        version_ge_1_5 = []
        if (TORCH_MAJOR > 1) or (TORCH_MAJOR == 1 and TORCH_MINOR > 4):
            version_ge_1_5 = ['-DVERSION_GE_1_5']
        return version_ge_1_1 + version_ge_1_3 + version_ge_1_5

    def is_compatible(self):
        return super().is_compatible()

    def builder(self):
        from torch.utils.cpp_extension import CUDAExtension
        assert_no_cuda_mismatch()
        return CUDAExtension(name=self.absolute_name(),
                             sources=self.sources(),
                             include_dirs=self.include_paths(),
                             extra_compile_args={
                                 'cxx': self.cxx_args(),
                                 'nvcc': self.nvcc_args()
                             })
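As a hedged illustration of what `compute_capability_args` produces (actual values depend on your GPU and on the `cross_compile_archs` default above):

```python
from deepspeed.ops.op_builder import CPUAdamBuilder

builder = CPUAdamBuilder()   # any CUDAOpBuilder subclass works here
builder.jit_mode = False     # cross-compile mode (the default); no GPU query needed
print(builder.compute_capability_args())
# ['-gencode', 'arch=compute_60,code=compute_60',
#  '-gencode', 'arch=compute_61,code=compute_61',
#  '-gencode', 'arch=compute_70,code=compute_70']
```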
import os
import torch
import warnings
from .builder import CUDAOpBuilder


class CPUAdamBuilder(CUDAOpBuilder):
    BUILD_VAR = "DS_BUILD_CPU_ADAM"
    NAME = "cpu_adam"

    def __init__(self):
        super().__init__(name=self.NAME)

    def absolute_name(self):
        return f'deepspeed.ops.adam.{self.NAME}_op'

    def sources(self):
        return ['csrc/adam/cpu_adam.cpp', 'csrc/adam/custom_cuda_kernel.cu']

    def include_paths(self):
        CUDA_INCLUDE = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "include")
        return ['csrc/includes', CUDA_INCLUDE]

    def available_vector_instructions(self):
        try:
            import cpufeature
        except ImportError:
            warnings.warn(
                'import cpufeature failed - CPU vector optimizations are not available for CPUAdam'
            )
            return {}

        cpu_vector_instructions = {}
        try:
            cpu_vector_instructions = cpufeature.CPUFeature
        except Exception:
            warnings.warn(
                'cpufeature.CPUFeature failed - CPU vector optimizations are not available for CPUAdam'
            )
            return {}
        return cpu_vector_instructions

    def cxx_args(self):
        CUDA_LIB64 = os.path.join(torch.utils.cpp_extension.CUDA_HOME, "lib64")
        cpu_info = self.available_vector_instructions()
        SIMD_WIDTH = ''
        if 'Intel' in cpu_info.get('VendorId', ''):
            if cpu_info.get('AVX512f', False):
                SIMD_WIDTH = '-D__AVX512__'
            elif cpu_info.get('AVX2', False):
                SIMD_WIDTH = '-D__AVX256__'
        return [
            '-O3',
            '-std=c++14',
            f'-L{CUDA_LIB64}',
            '-lcudart',
            '-lcublas',
            '-g',
            '-Wno-reorder',
            '-march=native',
            '-fopenmp',
            SIMD_WIDTH
        ]

    def nvcc_args(self):
        args = [
            '-O3',
            '--use_fast_math',
            '-std=c++14',
            '-U__CUDA_NO_HALF_OPERATORS__',
            '-U__CUDA_NO_HALF_CONVERSIONS__',
            '-U__CUDA_NO_HALF2_OPERATORS__'
        ]
        args += self.compute_capability_args()
        return args