"git@developer.sourcefind.cn:OpenDAS/torch-sparce.git" did not exist on "582e0def1ff20f52ca1d3530a9dfbe65faf23e34"
Unverified commit 65c2f974 authored by Shaden Smith, committed by GitHub

Pipeline parallel training engine. (#392)


Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
parent 41db1c2f
@@ -8,11 +8,14 @@ from . import ops
from .runtime.engine import DeepSpeedEngine
from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER, DEEPSPEED_ADAM
+from .runtime.pipe.engine import PipelineEngine
from .runtime.lr_schedules import add_tuning_arguments
from .runtime.config import DeepSpeedConfig
from .runtime.activation_checkpointing import checkpointing
from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
-from .utils import logger
+from .utils import log_dist
+from .pipe import PipelineModule

try:
    from .git_version_info import version, git_hash, git_branch
@@ -99,23 +102,35 @@ def initialize(args,
    * ``lr_scheduler``: Wrapped lr scheduler if user ``lr_scheduler`` is passed, or
      if ``lr_scheduler`` specified in JSON configuration. Otherwise ``None``.
    """
-    logger.info(
-        "DeepSpeed info: version={}, git-hash={}, git-branch={}".format(
-            __version__,
-            __git_hash__,
-            __git_branch__),
-    )
+    log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format(
+        __version__,
+        __git_hash__,
+        __git_branch__),
+             ranks=[0])

-    engine = DeepSpeedEngine(args=args,
-                             model=model,
-                             optimizer=optimizer,
-                             model_parameters=model_parameters,
-                             training_data=training_data,
-                             lr_scheduler=lr_scheduler,
-                             mpu=mpu,
-                             dist_init_required=dist_init_required,
-                             collate_fn=collate_fn,
-                             config_params=config_params)
+    if not isinstance(model, PipelineModule):
+        engine = DeepSpeedEngine(args=args,
+                                 model=model,
+                                 optimizer=optimizer,
+                                 model_parameters=model_parameters,
+                                 training_data=training_data,
+                                 lr_scheduler=lr_scheduler,
+                                 mpu=mpu,
+                                 dist_init_required=dist_init_required,
+                                 collate_fn=collate_fn,
+                                 config_params=config_params)
+    else:
+        assert mpu is None, "mpu must be None with pipeline parallelism"
+        engine = PipelineEngine(args=args,
+                                model=model,
+                                optimizer=optimizer,
+                                model_parameters=model_parameters,
+                                training_data=training_data,
+                                lr_scheduler=lr_scheduler,
+                                mpu=model.mpu(),
+                                dist_init_required=dist_init_required,
+                                collate_fn=collate_fn,
+                                config_params=config_params)

    return_items = [
        engine,
......
from ..runtime.pipe import PipelineModule, LayerSpec, TiedLayerSpec
@@ -480,6 +480,10 @@ class CheckpointFunction(torch.autograd.Function):
                timers.log(['forward'])
            if SYNCHRONIZE:
                torch.cuda.synchronize()
+
+        # Tensors returned from forward() may not be differentiable, e.g., attention mask
+        non_grad_outputs = [o for o in outputs if not o.is_floating_point()]
+        ctx.mark_non_differentiable(*non_grad_outputs)
        return outputs

    @staticmethod
@@ -548,7 +552,20 @@ class CheckpointFunction(torch.autograd.Function):
        if isinstance(outputs, torch.Tensor):
            outputs = (outputs, )
-        torch.autograd.backward(outputs, args)
+
+        # Go over args and build the list of gradient tensors. This is usually just args,
+        # but if the forward pass returns tensors that do not require_grad then we should
+        # adjust the arguments to autograd.backward() too. This happens when forward()
+        # returns indices or a mask (such as an attention mask).
+        # We skip the first needs_input_grad because it corresponds to run_function.
+        output_tensors = []
+        grad_tensors = []
+        for idx, need_grad in enumerate(ctx.needs_input_grad[1:]):
+            if need_grad:
+                output_tensors.append(outputs[idx])
+                grad_tensors.append(args[idx])
+
+        torch.autograd.backward(output_tensors, grad_tensors)

        if PROFILE_TIME:
            timers('backward').stop()
......
@@ -324,6 +324,20 @@ def get_sparse_attention_type(param_dict):
        return SPARSE_ATTENTION_TYPE_DEFAULT

+def get_pipeline_config(param_dict):
+    '''Parses pipeline engine configuration. '''
+    default_pipeline = {
+        'stages': 'auto',
+        'partition': 'best',
+        'seed_layers': False,
+        'activation_checkpoint_interval': 0
+    }
+    config = default_pipeline
+    for key, val in param_dict.get('pipeline', {}).items():
+        config[key] = val
+    return config
+
def get_optimizer_name(param_dict):
    if OPTIMIZER in param_dict.keys() and \
            TYPE in param_dict[OPTIMIZER].keys():
@@ -523,6 +537,7 @@ class DeepSpeedConfig(object):
        self.tensorboard_job_name = get_tensorboard_job_name(param_dict)

        self.sparse_attention = get_sparse_attention(param_dict)
+        self.pipeline = get_pipeline_config(param_dict)

    def _batch_assertion(self):
@@ -592,10 +607,6 @@ class DeepSpeedConfig(object):
            assert False, \
                'Either train_batch_size or micro_batch_per_gpu needs to be provided'

-        logger.info(
-            f' After Train batch {self.train_batch_size} micro_batch {self.train_micro_batch_size_per_gpu} and grad_acc {self.gradient_accumulation_steps}'
-        )
-
    def _configure_train_batch_size(self):
        self._set_batch_related_parameters()
        self._batch_assertion()
@@ -646,12 +657,14 @@ class DeepSpeedConfig(object):
                MAX_GRAD_NORM in self.optimizer_params.keys() and \
                self.optimizer_params[MAX_GRAD_NORM] > 0:
            if fp16_enabled:
-                logger.warning(
-                    'DeepSpeedConfig: In FP16 mode, DeepSpeed will pass {}:{} to FP16 wrapper'
-                    .format(MAX_GRAD_NORM,
-                            self.optimizer_params[MAX_GRAD_NORM]))
+                if self.global_rank == 0:
+                    logger.warning(
+                        'DeepSpeedConfig: In FP16 mode, DeepSpeed will pass {}:{} to FP16 wrapper'
+                        .format(MAX_GRAD_NORM,
+                                self.optimizer_params[MAX_GRAD_NORM]))
            else:
-                logger.warning(
-                    'DeepSpeedConfig: In FP32 mode, DeepSpeed does not permit MAX_GRAD_NORM ({}) > 0, setting to zero'
-                    .format(self.optimizer_params[MAX_GRAD_NORM]))
+                if self.global_rank == 0:
+                    logger.warning(
+                        'DeepSpeedConfig: In FP32 mode, DeepSpeed does not permit MAX_GRAD_NORM ({}) > 0, setting to zero'
+                        .format(self.optimizer_params[MAX_GRAD_NORM]))
                self.optimizer_params[MAX_GRAD_NORM] = 0.0
@@ -7,6 +7,29 @@ from torch.utils.data import DataLoader, RandomSampler
from torch.utils.data.distributed import DistributedSampler

+class RepeatingLoader:
+    def __init__(self, loader):
+        """Wraps an iterator to allow for infinite iteration. This is especially useful
+        for DataLoader types that we wish to automatically restart upon completion.
+
+        Args:
+            loader (iterator): The data loader to repeat.
+        """
+        self.loader = loader
+        self.data_iter = iter(self.loader)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        try:
+            batch = next(self.data_iter)
+        except StopIteration:
+            self.data_iter = iter(self.loader)
+            batch = next(self.data_iter)
+        return batch
+
class DeepSpeedDataLoader(object):
    def __init__(self,
                 dataset,
......
@@ -97,7 +97,7 @@ class FP16_Optimizer(object):
            self.clip_grad_norm = torch.nn.utils.clip_grad_norm_

        #model parallel object
-        self.mpu = None
+        self.mpu = mpu

        self.overflow = False
        self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu)
@@ -237,8 +237,8 @@ class FP16_Optimizer(object):
        if self.overflow:
            if self.verbose:
                print("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
-                      "scale: {}, reducing to {}".format(prev_scale,
-                                                         self.cur_scale))
+                      "scale: {}, reducing to {} ".format(prev_scale,
+                                                          self.cur_scale))
            self.log_timers(OVERFLOW_TIMERS)
            grads_groups_flat = None
            return self.overflow
......
@@ -93,11 +93,13 @@ class FP16_UnfusedOptimizer(object):
        else:
            self.clip_grad_norm = torch.nn.utils.clip_grad_norm_

-        self.mpu = None
+        self.mpu = mpu

        self.overflow = False
        self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu)

+        self.initialize_optimizer_states()
+
    def zero_grad(self, set_grads_to_None=True):
        """
        Zero FP16 parameter grads.
@@ -349,3 +351,26 @@ class FP16_UnfusedOptimizer(object):

    def __repr__(self):
        return repr(self.optimizer)
+
+    def initialize_optimizer_states(self):
+        for i, group in enumerate(self.fp16_groups):
+            for param in group:
+                param.grad = torch.zeros(param.size(),
+                                         dtype=param.dtype,
+                                         device=torch.cuda.current_device())
+
+        for i, group in enumerate(self.fp32_groups):
+            for param in group:
+                param.grad = torch.zeros(param.size(),
+                                         dtype=param.dtype,
+                                         device=torch.cuda.current_device())
+
+        self.optimizer.step()
+
+        for i, group in enumerate(self.fp16_groups):
+            for param in group:
+                param.grad = None
+
+        for i, group in enumerate(self.fp32_groups):
+            for param in group:
+                param.grad = None
from .module import PipelineModule, LayerSpec, TiedLayerSpec
'''
Copyright 2019 The Microsoft DeepSpeed Team
'''
import torch.distributed as dist
_groups = None
_grid = None
#initializes adjacent process groups
#run this only after torch.distributed.init_process_group() has been called
def init_process_groups(grid):
global _groups, _grid
_grid = grid
assert _grid.pipe_parallel_size > 1, "There is no model parallelism"
_groups = [dist.new_group(ranks=group) for group in _grid.p2p_groups]
def _is_valid_send_recv(src_stage, dest_stage):
first_stage = 0
last_stage = _grid.pipe_parallel_size - 1
assert abs(src_stage-dest_stage) == 1 or \
(src_stage == first_stage and dest_stage == last_stage) or \
(src_stage == last_stage and dest_stage == first_stage), \
"Functionality currently limited to send and receive between adjacent ranks only"
def send(tensor, dest_stage, async_op=False):
global _groups
async_op = False
src_stage = _grid.get_stage_id()
_is_valid_send_recv(src_stage, dest_stage)
group = _get_send_recv_group(src_stage, dest_stage)
src_rank = _grid.stage_to_global(stage_id=src_stage)
return dist.broadcast(tensor, src_rank, group=group, async_op=async_op)
def recv(tensor, src_stage, async_op=False):
global _groups
async_op = False
dest_stage = _grid.get_stage_id()
_is_valid_send_recv(src_stage, dest_stage)
group = _get_send_recv_group(src_stage, dest_stage)
src_rank = _grid.stage_to_global(stage_id=src_stage)
return dist.broadcast(tensor, src_rank, group=group, async_op=async_op)
def barrier(stage_id):
global _groups, _grid
group_id = _grid.stage_to_global(stage_id=stage_id)
if (dist.get_rank() >= 0):
print("Barrier Group ID", group_id)
print("Barrier Group", _grid.p2p_groups[group_id])
dist.barrier(group=_groups[group_id])
if (dist.get_rank() >= 0):
print("Exiting Barrier ", group_id)
def _get_send_recv_group(src_stage, dest_stage):
'''The group id is always the smaller rank unless it's a wrap-around.'''
stage_id = None
first_stage = 0
last_stage = _grid.pipe_parallel_size - 1
if (src_stage == first_stage and dest_stage == last_stage
or dest_stage == first_stage and src_stage == last_stage):
stage_id = last_stage
elif src_stage > dest_stage:
stage_id = dest_stage
else:
stage_id = src_stage
'''group_id corresponds to the group [group_id, group_id+1],
unless group_id is the rank of the last stage,
in which case group_id corresponds to the group [group_id-num_stages+1, group_id]
'''
group_id = _grid.stage_to_global(stage_id=stage_id)
return _groups[group_id]
from ..utils import call_to_str
from abc import ABC, abstractmethod
class PipeSchedule(ABC):
"""Directs the execution of a pipeline engine by generating sequences of
:class:`PipeInstruction`.
Schedules are generators that yield sequences of
:class:`PipeInstruction` to process the micro-batches in one batch.
Each yielded step is atomic in the sense that a barrier
synchronization can be placed between successive steps without
deadlock.
Below is an example schedule that implements data parallelism with gradient accumulation:
.. code-block:: python
class DataParallelSchedule(PipeSchedule):
def steps(self):
for step_id in range(self.micro_batches):
cmds = [
LoadMicroBatch(buffer_id=0),
ForwardPass(buffer_id=0),
BackwardPass(buffer_id=0),
]
if step_id == self.micro_batches - 1:
cmds.extend([
ReduceGrads(),
OptimizerStep(),
])
yield cmds
def num_pipe_buffers(self):
return 1
Args:
micro_batches (int): The number of micro-batches that comprise a batch.
stages (int): The number of pipeline stages.
stage_id (int): The pipe stage that will execute the generated schedule.
"""
def __init__(self, micro_batches, stages, stage_id):
super().__init__()
self.micro_batches = micro_batches
self.stages = stages
self.stage_id = stage_id
self.prev_stage = self.stage_id - 1
self.next_stage = self.stage_id + 1
@abstractmethod
def steps(self):
"""Yield a list of :class:`PipeInstruction` for each step in the schedule.
.. note::
Schedules must implement ``steps()`` to define the schedule.
Returns:
Instructions to be executed as one step of the pipeline
"""
pass
def num_pipe_buffers(self):
"""The number of pipeline buffers that will be used by this stage.
.. note::
Schedules should specialize ``num_pipe_buffers()`` for memory savings at scale.
Returns:
The number of buffers for the engine to allocate.
"""
return self.micro_batches
def _valid_micro_batch(self, micro_batch_id):
return 0 <= micro_batch_id < self.micro_batches
def _valid_stage(self, stage_id):
return 0 <= stage_id < self.stages
@property
def stage(self):
"""Stage index used to configure this schedule."""
return self.stage_id
@property
def num_stages(self):
"""The number of total pipeline stages used to configure this schedule."""
return self.stages
@property
def num_micro_batches(self):
"""The number of total micro_batches used to configure this schedule."""
return self.micro_batches
@property
def is_first_stage(self):
"""True if the configured ``stage_id`` is the first stage in the pipeline."""
return self.stage_id == 0
@property
def is_last_stage(self):
"""True if the configured ``stage_id`` is the last stage in the pipeline."""
return self.stage_id == self.stages - 1
def _buffer_idx(self, micro_batch_id):
"""Map a micro-batch index to a pipeline buffer index.
This method uses a cyclic allocation strategy.
Args:
micro_batch_id (int): The micro-batch index relative to the beginning of the schedule.
Returns:
int: The index of the buffer that should store data.
"""
assert self._valid_micro_batch(micro_batch_id)
return micro_batch_id % self.num_pipe_buffers()
def __iter__(self):
self.it = None
return self
def __next__(self):
if self.it is None:
self.it = self.steps()
return next(self.it)
class InferenceSchedule(PipeSchedule):
"""A schedule for inferencing batches using pipeline parallelism.
"""
def steps(self):
""""""
prev_micro_batch_id = -1
total_steps = self.micro_batches + self.stages - 1
for step_id in range(total_steps):
cmds = []
micro_batch_id = step_id - self.stage_id
# Alternate send/recv buffers
if _is_even(self.stage_id):
recv_buf = step_id % 2
send_buf = (step_id + 1) % 2
else:
recv_buf = (step_id + 1) % 2
send_buf = step_id % 2
if self.is_first_stage or self.is_last_stage:
if self._valid_micro_batch(micro_batch_id):
cmds.append(LoadMicroBatch(recv_buf))
if _is_even(self.stage_id):
if self._valid_stage(self.next_stage):
if self._valid_micro_batch(micro_batch_id - 1):
cmds.append(SendActivation(send_buf))
if self._valid_stage(self.prev_stage):
if self._valid_micro_batch(micro_batch_id):
cmds.append(RecvActivation(recv_buf))
else:
if self._valid_stage(self.prev_stage):
if self._valid_micro_batch(micro_batch_id):
cmds.append(RecvActivation(recv_buf))
if self._valid_stage(self.next_stage):
if self._valid_micro_batch(micro_batch_id - 1):
cmds.append(SendActivation(send_buf))
if self._valid_micro_batch(micro_batch_id):
cmds.append(ForwardPass(recv_buf))
yield cmds
def num_pipe_buffers(self):
"""Only two pipeline buffers are required for inferencing.
Returns:
``2``
"""
return 2
class TrainSchedule(PipeSchedule):
"""A schedule for training a batch using hybrid parallelism.
Pipeline parallelism is extracted through gradient accumulation and thus
convergence follows that of a data parallel approach with the same batch
size.
"""
def steps(self):
""""""
prev_micro_batch_id = -1
total_steps = 2 * (self.micro_batches + self.stages - 1)
for step_id in range(total_steps):
# Map the step of the pipeline to the micro-batch id and also whether it is a
# forward or backward pass step.
micro_batch_id, is_forward = self._step_to_micro_batch(step_id)
if self._valid_micro_batch(prev_micro_batch_id):
prev_buffer = self._buffer_idx(prev_micro_batch_id)
if self._valid_micro_batch(micro_batch_id):
curr_buffer = self._buffer_idx(micro_batch_id)
cmds = []
# Exchange activations
if is_forward:
if self._valid_micro_batch(micro_batch_id) and self._valid_stage(
self.prev_stage):
cmds.append(RecvActivation(curr_buffer))
if self._valid_micro_batch(prev_micro_batch_id) and self._valid_stage(
self.prev_stage):
cmds.append(SendGrad(prev_buffer))
else:
if self._valid_micro_batch(prev_micro_batch_id) and self._valid_stage(
self.next_stage):
cmds.append(SendActivation(prev_buffer))
if self._valid_micro_batch(micro_batch_id) and self._valid_stage(
self.next_stage):
cmds.append(RecvGrad(curr_buffer))
# First/last stage loads
if self.stage_id == 0 or self.stage_id == self.stages - 1:
if is_forward and self._valid_micro_batch(micro_batch_id):
cmds.append(LoadMicroBatch(curr_buffer))
# Computation
if self._valid_micro_batch(micro_batch_id):
if is_forward:
cmds.append(ForwardPass(curr_buffer))
else:
cmds.append(BackwardPass(curr_buffer))
# Model step at the end of the batch
if step_id == total_steps - 1:
cmds.append(ReduceTiedGrads())
cmds.append(ReduceGrads())
cmds.append(OptimizerStep())
# Prepare state for next time
prev_micro_batch_id = micro_batch_id
yield cmds
def num_pipe_buffers(self):
"""As many buffers as the distance from this stage to the last stage.
"""
buffers = min(self.stages - self.stage_id + 1, self.micro_batches)
return max(2, buffers)
def _step_to_micro_batch(self, step_id):
if _is_even(step_id) and _is_even(self.stage_id):
micro_batch_id = self._even_step_forward_id(step_id)
is_forward = True
elif _is_odd(step_id) and _is_odd(self.stage_id):
micro_batch_id = self._odd_step_forward_id(step_id)
is_forward = True
elif _is_even(step_id) and _is_odd(self.stage_id):
micro_batch_id = self._even_step_backward_id(step_id)
is_forward = False
elif _is_odd(step_id) and _is_even(self.stage_id):
micro_batch_id = self._odd_step_backward_id(step_id)
is_forward = False
else:
assert False
return micro_batch_id, is_forward
def _even_step_forward_id(self, step_id):
base = step_id // 2
micro_batch_id = int(base - self.stage_id // 2)
return micro_batch_id
def _odd_step_forward_id(self, step_id):
base = (step_id - 1) // 2
micro_batch_id = int(base - self.stage_id // 2)
return micro_batch_id
def _even_step_backward_id(self, step_id):
base = step_id // 2
micro_batch_id = int(base - self.stages + (self.stage_id + 1) // 2)
return micro_batch_id
def _odd_step_backward_id(self, step_id):
base = ((step_id - 1) // 2) - self.stages + 1
micro_batch_id = int(base + self.stage_id // 2)
return micro_batch_id
class DataParallelSchedule(PipeSchedule):
"""An example schedule that trains using traditional data parallelism with gradient
accumulation.
"""
def steps(self):
""""""
for step_id in range(self.micro_batches):
cmds = [
LoadMicroBatch(buffer_id=0),
ForwardPass(buffer_id=0),
BackwardPass(buffer_id=0),
]
if step_id == self.micro_batches - 1:
cmds.extend([
ReduceGrads(),
OptimizerStep(),
])
yield cmds
def num_pipe_buffers(self):
"""Only one pipeline buffer needed.
"""
return 1
class PipeInstruction:
"""Base class for all instructions to be executed by the pipeline engine.
All keyword arguments are stored as members similar to a ``namedtuple``. These are
then accessible to the :class:`PipeEngine` during execution.
Args:
kwargs (optional): keyword arguments to store as members
"""
def __init__(self, **kwargs):
self.name = self.__class__.__name__
self.kwargs = kwargs
for key, val in kwargs.items():
setattr(self, key, val)
def __repr__(self):
return call_to_str(self.name, **self.kwargs)
class OptimizerStep(PipeInstruction):
"""Performs one step with the optimizer and zeros gradients.
.. note:: Should be issued after :class:`ReduceGrads` and :class:`ReduceTiedGrads`.
.. note:: Can be a synchronization point among data-parallel ranks.
"""
pass
class ReduceGrads(PipeInstruction):
"""Reduce the computed gradients among data-parallel processes within the stage.
"""
pass
class ReduceTiedGrads(PipeInstruction):
"""Reduce the computed gradients of tied modules within a pipeline-parallel group.
.. warning::
The stages included in this synchronization point are not known until
the model is partitioned among pipeline stages. In the worst case, it
includes all pipeline stages. This instruction should be scheduled
carefully to avoid deadlocks.
"""
pass
class BufferOpInstruction(PipeInstruction):
"""A pipeline instruction that operates on pipeline buffer(s).
Args:
buffer_id (int): the index of the pipeline buffer to modify.
"""
def __init__(self, buffer_id, **kwargs):
super().__init__(buffer_id=buffer_id, **kwargs)
# IO
class LoadMicroBatch(BufferOpInstruction):
"""Load a micro-batch into a buffer.
Roughly:
.. code-block:: python
buffers['inputs'][buffer_id] = next(data_iter)
"""
pass
# Compute
class ForwardPass(BufferOpInstruction):
"""Compute a forward pass.
Roughly:
.. code-block:: python
buffers['outputs'][buffer_id] = forward(buffers['inputs'][buffer_id])
"""
pass
class BackwardPass(BufferOpInstruction):
"""Compute a backward pass and accumulate gradients.
Roughly:
.. code-block:: python
outputs = buffers['outputs'][buffer_id]
gradients = buffers['gradients'][buffer_id]
torch.autograd.backward(tensors=outputs,
grad_tensors=gradients)
"""
pass
# Communication
class SendActivation(BufferOpInstruction):
"""Send activations to the next stage in the pipeline.
Roughly:
.. code-block:: python
send(buffers['outputs'][buffer_id])
.. note::
The communication is blocking and must be paired with a :class:`RecvActivation`
on the next pipeline stage to avoid deadlock.
"""
pass
class RecvActivation(BufferOpInstruction):
"""Receive activations from the previous stage in the pipeline.
Roughly:
.. code-block:: python
buffers['inputs'][buffer_id] = recv()
.. note::
The communication is blocking and must be paired with a :class:`SendActivation`
on the previous pipeline stage to avoid deadlock.
"""
pass
class SendGrad(BufferOpInstruction):
"""Send computed gradients to the previous pipeline stage.
with respect to the received activations
.. note::
Only received tensors with ``requires_grad==True`` will produce gradients.
Missing gradients will be replaced with ``None`` on the receiving stage.
.. note::
The communication is blocking and must be paired with a :class:`RecvGrad`
on the previous pipeline stage to avoid deadlock.
"""
pass
class RecvGrad(BufferOpInstruction):
"""Receive computed gradients the next pipeline stage.
.. note::
Only activations with ``requires_grad==True`` will produce gradients.
Missing gradients will be replaced with ``None``.
.. note::
The communication is blocking and must be paired with a :class:`SendGrad`
on the next pipeline stage to avoid deadlock.
"""
pass
def _is_even(x):
return x % 2 == 0
def _is_odd(x):
return x % 2 != 0
# Copyright 2019 The Microsoft DeepSpeed Team
from deepspeed.utils import logger
import torch.distributed as dist
import sys
from collections import namedtuple
from itertools import product as cartesian_product
class ProcessTopology:
""" Manages the mapping of n-dimensional Cartesian coordinates to linear
indices. This mapping is used to map the rank of processes to the grid
for various forms of parallelism.
Each axis of the tensor is accessed by its name. The provided ordering
of the axes defines the layout of the topology. ProcessTopology uses a "row-major"
layout of the tensor axes, and so axes=['x', 'y'] would map coordinates (x,y) and
(x,y+1) to adjacent linear indices. If instead axes=['y', 'x'] was used, coordinates
(x,y) and (x+1,y) would be adjacent.
Some methods return ProcessCoord namedtuples.
"""
def __init__(self, axes, dims):
"""Create a mapping of n-dimensional tensor coordinates to linear indices.
Arguments:
axes (list): the names of the tensor axes
dims (list): the dimension (length) of each axis of the topology tensor
"""
self.axes = axes # names of each topology axis
self.dims = dims # length of each topology axis
# This is actually a class that lets us hash {'row':3, 'col':2} mappings
self.ProcessCoord = namedtuple('ProcessCoord', axes)
self.mapping = {}
ranges = [range(d) for d in dims]
# example: 1, (0,0,1)
for global_rank, coord in enumerate(cartesian_product(*ranges)):
key = {axis: coord[self.axes.index(axis)] for axis in self.axes}
key = self.ProcessCoord(**key)
# for example, {ProcessCoord(row=0, col=1) : 1}
self.mapping[key] = global_rank
def get_rank(self, **coord_kwargs):
"""Return the global rank of a process via its coordinates.
Coordinates are specified as kwargs. For example:
>>> X = ProcessTopology(axes=['x', 'y'], dims=[2,3])
>>> X.get_rank(x=0, y=1)
1
"""
if len(coord_kwargs) != len(self.axes):
raise ValueError('get_rank() does not support slices. Use filter_match().')
key = self.ProcessCoord(**coord_kwargs)
assert key in self.mapping, f'key {coord_kwargs} invalid'
return self.mapping[key]
def get_axis_names(self):
"""Return a list of the axis names in the ordering of the topology. """
return self.axes
def get_rank_repr(self,
rank,
omit_axes=['data',
'pipe'],
inner_sep='_',
outer_sep='-'):
"""Return a string representation of a rank.
This method is primarily used for checkpointing model data.
For example:
>>> topo = Topo(axes=['a', 'b'], dims=[2, 2])
>>> topo.get_rank_repr(rank=3)
'a_01-b_01'
>>> topo.get_rank_repr(rank=3, omit_axes=['a'])
'b_01'
Args:
rank (int): A rank in the topology.
omit_axes (list, optional): Axes that should not be in the representation. Defaults to ['data', 'pipe'].
inner_sep (str, optional): Separator between an axis name and its coordinate. Defaults to '_'.
outer_sep (str, optional): Separator between axis entries. Defaults to '-'.
Returns:
str: A string representation of the coordinate owned by ``rank``.
"""
omit_axes = frozenset(omit_axes)
axes = [a for a in self.get_axis_names() if a not in omit_axes]
names = []
for ax in axes:
ax_rank = getattr(self.get_coord(rank=rank), ax)
names.append(f'{ax}{inner_sep}{ax_rank:02d}')
return outer_sep.join(names)
def get_dim(self, axis):
"""Return the number of processes along the given axis.
For example:
>>> X = ProcessTopology(axes=['x', 'y'], dims=[2,3])
>>> X.get_dim('y')
3
"""
if axis not in self.axes:
return 0
return self.dims[self.axes.index(axis)]
def get_coord(self, rank):
"""Return the coordinate owned by a process rank.
The axes of the returned namedtuple can be directly accessed as members. For
example:
>>> X = ProcessTopology(axes=['x', 'y'], dims=[2,3])
>>> coord = X.get_coord(rank=1)
>>> coord.x
0
>>> coord.y
1
"""
for coord, idx in self.mapping.items():
if idx == rank:
return coord
raise ValueError(f'rank {rank} not found in topology.')
def get_axis_comm_lists(self, axis):
""" Construct lists suitable for a communicator group along axis ``axis``.
Example:
>>> topo = Topo(axes=['pipe', 'data', 'model'], dims=[2, 2, 2])
>>> topo.get_axis_comm_lists('pipe')
[
[0, 4], # data=0, model=0
[1, 5], # data=0, model=1
[2, 6], # data=1, model=0
[3, 7], # data=1, model=1
]
Returns:
A list of lists whose coordinates match in all axes *except* ``axis``.
"""
# We don't want to RuntimeError because it allows us to write more generalized
# code for hybrid parallelisms.
if axis not in self.axes:
return []
# Grab all axes but `axis`
other_axes = [a for a in self.axes if a != axis]
lists = []
# Construct all combinations of coords with other_axes
ranges = [range(self.get_dim(a)) for a in other_axes]
for coord in cartesian_product(*ranges):
other_keys = {a: coord[other_axes.index(a)] for a in other_axes}
# now go over all ranks in `axis`.
sub_list = []
for axis_key in range(self.get_dim(axis)):
key = self.ProcessCoord(**other_keys, **{axis: axis_key})
sub_list.append(self.mapping[key])
lists.append(sub_list)
return lists
def filter_match(self, **filter_kwargs):
"""Return the list of ranks whose coordinates match the provided criteria.
Example:
>>> X = ProcessTopology(axes=['pipe', 'data', 'model'], dims=[2, 2, 2])
>>> X.filter_match(pipe=0, data=1)
[2, 3]
>>> [X.get_coord(rank) for rank in X.filter_match(pipe=0, data=1)]
[ProcessCoord(pipe=0, data=1, model=0), ProcessCoord(pipe=0, data=1, model=1)]
Arguments:
**filter_kwargs (dict): criteria used to select coordinates.
Returns:
The list of ranks whose coordinates match filter_kwargs.
"""
def _filter_helper(x):
for key, val in filter_kwargs.items():
if getattr(x, key) != val:
return False
return True
coords = filter(_filter_helper, self.mapping.keys())
return [self.mapping[coo] for coo in coords]
def get_axis_list(self, axis, idx):
"""Returns the list of global ranks whose coordinate in an axis is idx.
For example:
>>> X = ProcessTopology(axes=['x', 'y'], dims=[2,3])
>>> X.get_axis_list(axis='x', idx=0)
[0, 1, 2]
>>> X.get_axis_list(axis='y', idx=0)
[0, 3]
"""
# This could be faster by generating the desired keys directly instead of
# filtering.
axis_num = self.axes.index(axis)
ranks = [self.mapping[k] for k in self.mapping.keys() if k[axis_num] == idx]
return ranks
def world_size(self):
return len(self.mapping)
def __str__(self):
return str(self.mapping)
def _prime_factors(N):
""" Returns the prime factorization of positive integer N. """
if N <= 0:
raise ValueError("Values must be strictly positive.")
primes = []
while N != 1:
for candidate in range(2, N + 1):
if N % candidate == 0:
primes.append(candidate)
N //= candidate
break
return primes
class PipeDataParallelTopology(ProcessTopology):
""" A topology specialiation for hybrid data and pipeline parallelism.
Uses data parallelism on the last dimension to encourage gradient
reductions to use high-bandwidth intra-node links and lower-volume
pipeline communications to use low-bandwidth inter-node links.
"""
def __init__(self, num_pp, num_dp):
super().__init__(axes=['pipe', 'data'], dims=[num_pp, num_dp])
class PipeModelDataParallelTopology(ProcessTopology):
""" A topology for hybrid pipeline, model, and data parallelism. """
def __init__(self, num_pp, num_mp, num_dp):
super().__init__(axes=['pipe', 'data', 'model'], dims=[num_pp, num_dp, num_mp])
class PipelineParallelGrid:
"""Implements a grid object that stores the data parallel ranks
corresponding to each o the model parallel stages
The grid object organizes the processes in a distributed pytorch job
into a 2D grid, of stage_id and data_parallel_id.
self.stage_id and self.data_parallel_id stores the stage id
and the data parallel id of current process.
self.dp_group groups the processes by stage_id.
self.dp_group[i], is a list containing all process ranks whose
stage_id is i.
self.p2p_groups stores a list of tuple, where each tuple
stores process ranks of adjacent stages for a given data_parallel_id.
For example if num_stage is 5 then a tuple [7,8] represents stages [3, 4],
with data_parallel id = 1. A stage wrap around will appear as non-adjacent ranks,
for example tuple [4,0] with representing wrap-around stage 4 and 0, for
data_parallel_id = 0, or similarly [9,5] represents wrapped around stages [4,0]
for data_parallel_id = 1.
"""
def __init__(self, topology=None, process_group=None):
# TODO use process_group if provided
self.global_rank = dist.get_rank()
self.world_size = dist.get_world_size()
if topology is not None:
if self.global_rank == 0:
print('Using topology:', topology)
self._topo = topology
else:
num_pp = 1
num_dp = 1
for idx, prime in enumerate(_prime_factors(self.world_size)):
if idx % 2 == 0:
num_pp *= prime
else:
num_dp *= prime
self._topo = PipeDataParallelTopology(num_dp=num_dp, num_pp=num_pp)
self.data_parallel_size = max(self._topo.get_dim('data'), 1)
self.pipe_parallel_size = max(self._topo.get_dim('pipe'), 1)
self.model_parallel_size = max(self._topo.get_dim('model'), 1)
assert self._is_grid_valid(), "Invalid Grid"
self.stage_id = self.get_stage_id()
self.data_parallel_id = self.get_data_parallel_id()
# Create new ProcessGroups for all model parallelism. DeepSpeedLight uses these
# to detect overflow, etc.
self.ds_model_proc_group = None
self.ds_model_rank = -1
for dp in range(self.data_parallel_size):
ranks = sorted(self._topo.get_axis_list(axis='data', idx=dp))
if self.global_rank == 0:
#print(f'RANK={self.global_rank} building DeepSpeed model group: {ranks}')
pass
proc_group = dist.new_group(ranks=ranks)
if self.global_rank in ranks:
self.ds_model_proc_group = proc_group
self.ds_model_world_size = len(ranks)
self.ds_model_rank = ranks.index(self.global_rank)
assert self.ds_model_rank > -1
assert self.ds_model_proc_group is not None
# Create new ProcessGroup for gradient all-reduces - these are the data parallel groups
self.dp_group = []
self.dp_groups = self._topo.get_axis_comm_lists('data')
for g in self.dp_groups:
proc_group = dist.new_group(ranks=g)
if self.global_rank in g:
self.dp_group = g
self.dp_proc_group = proc_group
self.is_first_stage = (self.stage_id == 0)
self.is_last_stage = (self.stage_id == (self.pipe_parallel_size - 1))
self.p2p_groups = self._build_p2p_groups()
# Create new ProcessGroup for pipeline collectives - these are pipe parallel groups
self.pp_group = []
self.pp_proc_group = None
self.pipe_groups = self._topo.get_axis_comm_lists('pipe')
for ranks in self.pipe_groups:
if self.global_rank == 0:
#print(f'RANK={self.global_rank} building pipeline group: {ranks}')
pass
proc_group = dist.new_group(ranks=ranks)
if self.global_rank in ranks:
self.pp_group = ranks
self.pp_proc_group = proc_group
assert self.pp_proc_group is not None
# Create new ProcessGroup for model (tensor-slicing) collectives
# Short circuit case without model parallelism.
# TODO: it would be nice if topology had bcast semantics to avoid this branching
# case?
if self.model_parallel_size == 1:
for group_rank in range(self.world_size):
group_rank = [group_rank]
group = dist.new_group(ranks=group_rank)
if group_rank[0] == self.global_rank:
self.slice_group = group_rank
self.slice_proc_group = group
return
else:
self.mp_group = []
self.model_groups = self._topo.get_axis_comm_lists('model')
for g in self.model_groups:
proc_group = dist.new_group(ranks=g)
if self.global_rank in g:
self.slice_group = g
self.slice_proc_group = proc_group
def get_stage_id(self):
return self._topo.get_coord(rank=self.global_rank).pipe
def get_data_parallel_id(self):
return self._topo.get_coord(rank=self.global_rank).data
def _build_p2p_groups(self):
"""Groups for sending and receiving activations and gradients across model
parallel stages.
"""
comm_lists = self._topo.get_axis_comm_lists('pipe')
p2p_lists = []
for rank in range(self.world_size):
for l in comm_lists:
assert len(l) == self.pipe_parallel_size
if rank in l:
idx = l.index(rank)
buddy_rank = l[(idx + 1) % self.pipe_parallel_size]
p2p_lists.append([rank, buddy_rank])
break # next global rank
assert len(p2p_lists) == self.world_size
return p2p_lists
def _is_grid_valid(self):
ranks = 1
for ax in self._topo.get_axis_names():
ranks *= self._topo.get_dim(ax)
return ranks == dist.get_world_size()
#returns the global rank of the process with the provided stage id
#which has the same data_parallel_id as caller process
def stage_to_global(self, stage_id, **kwargs):
me = self._topo.get_coord(self.global_rank)
transform = me._replace(pipe=stage_id, **kwargs)._asdict()
return self._topo.get_rank(**transform)
def topology(self):
return self._topo
# MPU functions for DeepSpeed integration
def get_global_rank(self):
return self.global_rank
def get_pipe_parallel_rank(self):
""" The stage of the pipeline this rank resides in. """
return self.get_stage_id()
def get_pipe_parallel_world_size(self):
""" The number of stages in the pipeline. """
return self.pipe_parallel_size
def get_pipe_parallel_group(self):
""" The group of ranks within the same pipeline. """
return self.pp_proc_group
def get_data_parallel_rank(self):
""" Which pipeline this rank resides in. """
return self.data_parallel_id
def get_data_parallel_world_size(self):
""" The number of pipelines. """
return self.data_parallel_size
def get_data_parallel_group(self):
""" The group of ranks within the same stage of all pipelines. """
return self.dp_proc_group
# These are model parallel groups across all types of model parallelism.
# Deepspeed uses them to detect overflow, etc.
def get_model_parallel_rank(self):
return self.ds_model_rank
def get_model_parallel_world_size(self):
return self.ds_model_world_size
def get_model_parallel_group(self):
return self.ds_model_proc_group
# For Megatron-style tensor slicing
def get_slice_parallel_rank(self):
if 'model' in self._topo.get_axis_names():
return self._topo.get_coord(rank=self.global_rank).model
else:
return 0
def get_slice_parallel_world_size(self):
    # Size of the 'model' (tensor-slicing) axis of the topology.
    return self.model_parallel_size
def get_slice_parallel_group(self):
return self.slice_proc_group
@@ -6,11 +6,36 @@ Copyright NVIDIA/Megatron
Helper functions and classes from multiple sources.
'''

+import os
+from math import ceil
+from math import floor
+from bisect import bisect_left, bisect_right
+
import torch
+import torch.distributed as dist
from torch._six import inf
import torch.distributed as dist

from deepspeed.utils import logger
+from numpy import prod
+
+
+def ensure_directory_exists(filename):
+    """Create the directory path to ``filename`` if it does not already exist.
+
+    Args:
+        filename (str): A file path.
+    """
+    dirname = os.path.dirname(filename)
+    os.makedirs(dirname, exist_ok=True)
+
+
+def set_random_seed(seed):
+    import numpy
+    import random
+    random.seed(seed)
+    numpy.random.seed(seed)
+    torch.manual_seed(seed)
+
+
class CheckOverflow(object):
@@ -85,6 +110,7 @@ class CheckOverflow(object):
            torch.distributed.all_reduce(overflow_gpu,
                                         op=torch.distributed.ReduceOp.MAX,
                                         group=self.mpu.get_model_parallel_group())
+
        overflow = overflow_gpu[0].item()
        return bool(overflow)
@@ -160,9 +186,8 @@ def get_grad_norm(parameters, norm_type=2, mpu=None):
        total_norm = 0.
        for p in parameters:
            if mpu is not None:
-                if (mpu.get_model_parallel_rank() == 0) or (hasattr(p,
-                                                                    'model_parallel')
-                                                            and p.model_parallel):
+                if (mpu.get_model_parallel_rank() == 0
+                    ) or is_model_parallel_parameter(p):
                    param_norm = p.grad.data.float().norm(norm_type)
                    total_norm += param_norm.item()**norm_type
            else:
@@ -218,9 +243,8 @@ def get_weight_norm(parameters, norm_type=2, mpu=None):
        total_norm = 0.
        for p in parameters:
            if mpu is not None:
-                if (mpu.get_model_parallel_rank() == 0) or (hasattr(p,
-                                                                    'model_parallel')
-                                                            and p.model_parallel):
+                if (mpu.get_model_parallel_rank() == 0
+                    ) or is_model_parallel_parameter(p):
                    try:
                        param_norm = float(torch.norm(p, norm_type, dtype=torch.float32))
                    except TypeError as err:
@@ -255,6 +279,255 @@ def is_model_parallel_parameter(p):
    return hasattr(p, 'model_parallel') and p.model_parallel
def prefix_sum_inc(weights):
""" Compute an inclusive prefix sum.
Example:
>>> prefix_sum_inc([3,4,5])
[3, 7, 12]
"""
weights_ = [w for w in weights]
for x in range(1, len(weights_)):
weights_[x] += weights_[x - 1]
return weights_
def partition_uniform(num_items, num_parts):
parts = [0] * (num_parts + 1)
# First check for the trivial edge case
if num_items <= num_parts:
for p in range(num_parts + 1):
parts[p] = min(p, num_items)
return parts
chunksize = floor(num_items / num_parts)
for p in range(num_parts):
parts[p] = min(chunksize * p, num_items)
parts[num_parts] = num_items
return parts
def _lprobe(weights, num_parts, bottleneck):
num_items = len(weights)
total_weight = weights[-1]
# initialize partitioning
parts = [0] * (num_parts + 1)
for p in range(1, num_parts + 1):
parts[p] = num_items
bsum = bottleneck # running sum of target weight for pth partition
chunksize = num_items // num_parts
step = chunksize
for p in range(1, num_parts):
# Jump to the next bucket
while (step < num_items) and (weights[step] < bsum):
step += chunksize
# Find the end index of partition p
parts[p] = bisect_left(weights,
bsum,
lo=step - chunksize,
hi=min(step,
num_items))
# Nothing more to partition, return early
if parts[p] == num_items:
# See if the current partition is overweight.
part_size = weights[-1] - weights[parts[p - 1]]
return parts, part_size < bottleneck
# Next partition target
bsum = weights[parts[p] - 1] + bottleneck
return parts, bsum >= total_weight
def _rb_partition_balanced(weights, num_parts, eps):
total_weight = weights[-1]
lower = total_weight / num_parts # best case heaviest partition
upper = total_weight # worst case heaviest partition
# Do a binary search for the best partitioning
while upper > lower + eps:
mid = lower + ((upper - lower) / 2)
parts, success = _lprobe(weights, num_parts, mid)
if success:
upper = mid
else:
lower = mid + eps
return upper
def partition_balanced(weights, num_parts, eps=1e-3):
num_items = len(weights)
# First check for the trivial edge case
if num_items <= num_parts:
return partition_uniform(num_items, num_parts)
weights_ = prefix_sum_inc(weights)
# Find the smallest bottleneck (weight of heaviest partition)
bottleneck = _rb_partition_balanced(weights_, num_parts, eps=eps)
# Now compute that partitioning
parts, success = _lprobe(weights_, num_parts, bottleneck)
assert success
return parts
class PartitionedTensor:
def __init__(self, tensor, group, partition_meta=None):
super().__init__()
self.group = group
self.num_parts = dist.get_world_size(group=self.group)
self.rank = dist.get_rank(group=self.group)
self.orig_size = list(tensor.size())
self.orig_device = tensor.device
self.local_data, self.partition = self._partition_tensor(tensor)
@classmethod
def from_meta(cls, meta, local_part, group, device='cuda'):
assert meta.dtype == torch.long
dummy = torch.ones(dist.get_world_size(group=group))
part_obj = cls(tensor=dummy, group=group)
meta = meta.tolist()
# [N, list0, ..., listN-1]
part_obj.orig_size = meta[1:(1 + meta[0])]
meta = meta[1 + meta[0]:]
part_obj.orig_device = device
part_obj.local_data = local_part.detach()
part_obj.group = group
# Partition is encoded like the rowptr of a CSR matrix:
# [num_parts, rank, 0, part_1, ..., part_num_parts]
# TODO: support shuffle between different partition granularities
assert part_obj.num_parts == meta[0]
assert part_obj.rank == meta[1]
part_obj.partition = meta[2:] # length num_parts+1
return part_obj
def _partition_tensor(self, tensor):
partition = partition_uniform(num_items=tensor.numel(), num_parts=self.num_parts)
start = partition[self.rank]
length = partition[self.rank + 1] - start
tensor_part = tensor.detach().contiguous().view(-1).narrow(
0,
start=start,
length=length).clone()
return tensor_part, partition
def full(self, device=None):
if device is None:
device = self.orig_device
# Allocate the full tensor as a flat buffer.
full_numel = prod(self.full_size())
flat_tensor = torch.zeros([full_numel],
dtype=self.local_data.dtype,
device=device)
# Prepare all-gather buffer
partition_tensors = []
for part_id in range(self.num_parts):
part_size = self.partition[part_id + 1] - self.partition[part_id]
buf = flat_tensor.narrow(0, start=self.partition[part_id], length=part_size)
if part_id == self.rank:
buf.copy_(self.local_data)
partition_tensors.append(buf)
# Collect the full tensor
dist.all_gather(partition_tensors,
partition_tensors[self.rank],
group=self.group)
for i in range(len(partition_tensors)):
partition_tensors[i].data = torch.zeros(1)
partition_tensors[i] = None
return flat_tensor.view(self.full_size()).clone().detach()
def to_meta(self):
"""Returns a torch.LongTensor that encodes partitioning information.
Can be used along with ``data()`` to serialize a ``PartitionedTensor`` for
communication.
Returns:
torch.LongTensor: a tensor encoding the meta-information for the partitioning
"""
meta = []
meta.append(len(self.orig_size))
meta += list(self.orig_size)
meta.append(self.num_parts)
meta.append(self.rank)
meta += self.partition
return torch.LongTensor(data=meta).to(self.orig_device)
def data(self):
return self.local_data
def local_size(self):
return self.local_data.size()
def full_size(self):
return self.orig_size
mem_alloced = 0
mem_cached = 0
def memory_status(msg, print_rank=-1, reset_max=False):
global mem_alloced, mem_cached
rank = dist.get_rank()
if print_rank != -1 and rank != print_rank:
return
torch.cuda.synchronize()
if reset_max:
torch.cuda.reset_max_memory_cached()
torch.cuda.reset_max_memory_allocated()
new_alloced = torch.cuda.memory_allocated()
new_cached = torch.cuda.memory_cached()
delta_alloced = new_alloced - mem_alloced
delta_cached = new_cached - mem_cached
mem_cached = new_cached
mem_alloced = new_alloced
max_alloced = torch.cuda.max_memory_allocated()
max_cached = torch.cuda.max_memory_cached()
# convert to GB for printing
new_alloced /= 1024**3
new_cached /= 1024**3
delta_alloced /= 1024**3
delta_cached /= 1024**3
max_alloced /= 1024**3
max_cached /= 1024**3
print(
f'RANK={rank} MEMSTATS',
msg,
f'device={torch.cuda.current_device()} '
f'current alloc={new_alloced:0.4f}GB (delta={delta_alloced:0.4f}GB max={max_alloced:0.4f}GB) '
f'current cache={new_cached:0.4f}GB (delta={delta_cached:0.4f}GB max={max_cached:0.4f}GB)'
)
def see_memory_usage(message): def see_memory_usage(message):
return return
if torch.distributed.is_initialized() and not torch.distributed.get_rank() == 0: if torch.distributed.is_initialized() and not torch.distributed.get_rank() == 0:
@@ -278,3 +551,25 @@ def see_memory_usage(message):
        "Max cache Allocated %s GigaBytes",
        torch.cuda.max_memory_cached() / (1024 * 1024 * 1024),
    )
def call_to_str(base, *args, **kwargs):
"""Construct a string representation of a call.
Args:
base (str): name of the call
args (tuple, optional): args to ``base``
kwargs (dict, optional): kwargs supplied to ``base``
Returns:
str: A string representation of base(*args, **kwargs)
"""
name = f'{base}('
if args:
name += ', '.join(repr(arg) for arg in args)
if kwargs:
name += ', '
if kwargs:
name += ', '.join(f'{key}={repr(arg)}' for key, arg in kwargs.items())
name += ')'
return name
@@ -26,13 +26,6 @@ def get_group_alignment_padding(tensor_list, sub_partition_size, sub_partition_c
        padding = get_alignment_padding(flattened_size, i, sub_partition_size)
        group_paddings.append(padding)

-    logger.info("****Padding information*****")
-    logger.info(f"tensor_size = {flattened_size}")
-    logger.info(f"sub_partition_size = {sub_partition_size}")
-    logger.info(f"sub_partition_count = {sub_partition_count}")
-    for i, padding in enumerate(group_paddings):
-        logger.info(f"padding[{i}] = {padding}")
-
    return group_paddings
......
from deepspeed.utils.logging import logger, log_dist
from deepspeed.runtime.dataloader import RepeatingLoader
@@ -12,9 +12,9 @@ from deepspeed.utils import logger
def print_rank_0(message):
    if torch.distributed.is_initialized():
        if torch.distributed.get_rank() == 0:
-            logger.info(message)
+            print(message)
    else:
-        logger.info(message)
+        print(message)
class SynchronizedWallClockTimer:
@@ -85,12 +85,15 @@ class SynchronizedWallClockTimer:
    def log(self, names, normalizer=1.0, reset=True, memory_breakdown=False):
        """Log a group of timers."""
        assert normalizer > 0.0
-        string = 'time (ms)'
+        string = f'rank={torch.distributed.get_rank()} time (ms)'
        for name in names:
-            elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer
-            string += ' | {}: {:.2f}'.format(name, elapsed_time)
-        if memory_breakdown:
-            string += self.memory_usage()
+            if name in self.timers:
+                elapsed_time = self.timers[name].elapsed(
+                    reset=reset) * 1000.0 / normalizer
+                string += ' | {}: {:.2f}'.format(name, elapsed_time)
+        # TODO: use our logging utilities to selectively print. Useful for model
+        # parallelism because rank=0 is too restrictive.
        print_rank_0(string)
......
@@ -66,5 +66,7 @@ lnav:
    url: /tutorials/lrrt/
  - title: "DeepSpeed Sparse Attention"
    url: /tutorials/sparse-attention/
+  - title: "Pipeline Parallelism"
+    url: /tutorials/pipeline/
  - title: "Contributing"
    url: /contributing/
---
title: "Pipeline Parallelism"
---
DeepSpeed v0.3 includes new support for pipeline parallelism! Pipeline
parallelism improves both the memory and compute efficiency of deep learning
training by partitioning the layers of a model into stages that can be
processed in parallel.
DeepSpeed's training engine provides hybrid data and pipeline parallelism and
can be further combined with model parallelism such as
[Megatron-LM](https://github.com/NVIDIA/Megatron-LM).
An illustration of
3D parallelism is shown below. Our latest [results](linklinklink)
demonstrate that this 3D parallelism enables training models with over a
**trillion** parameters.
![3D parallelism in DeepSpeed](/assets/images/3d-parallelism.png)
DeepSpeed uses *gradient accumulation* to extract pipeline parallelism (shown
below). Each batch of training data is divided into micro-batches that can be
processed in parallel by the pipeline stages. Once a stage completes the
forward pass for a micro-batch, the activation memory is communicated to the
next stage in the pipeline. Similarly, as the next stage completes its
backward pass on a micro-batch, the gradient with respect to the activation
is communicated backwards through the pipeline. Each backward pass
accumulates gradients locally. Next, all data parallel groups perform
reductions of the gradients in parallel. Lastly, the optimizer updates the
model weights.
Below is an illustration of how DeepSpeed will train a batch with eight
micro-batches using hybrid two-way data parallelism and two-stage pipeline
parallelism. GPUs 0 and 2 are arranged in a pipeline and will alternate
forward (F) and backward (B) passes. They will then all-reduce (AR) gradients
with their data parallel counterparts, GPUs 1 and 3, respectively. Finally,
the two pipeline stages update their model weights.
![Pipeline Schedule](/assets/images/pipe-schedule.png)
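The rank layout in this example comes directly from the process topology introduced in this PR. The snippet below is an illustrative sketch only; it assumes the topology class is importable as `deepspeed.runtime.pipe.topology.PipeDataParallelTopology`, matching this PR's file layout. With two stages and two data-parallel replicas it reproduces the groups described above:

```python
# Illustrative sketch; the import path is an assumption based on this PR.
from deepspeed.runtime.pipe.topology import PipeDataParallelTopology

topo = PipeDataParallelTopology(num_pp=2, num_dp=2)

# Each inner list is one pipeline: GPU 0 feeds GPU 2, and GPU 1 feeds GPU 3.
print(topo.get_axis_comm_lists('pipe'))   # [[0, 2], [1, 3]]

# Each inner list is one data-parallel group used for the gradient all-reduce.
print(topo.get_axis_comm_lists('data'))   # [[0, 1], [2, 3]]
```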
## Getting Started with Pipeline Parallelism
DeepSpeed strives to accelerate *and* simplify the process of pipeline
parallel training. This section provides first steps with hybrid data and
pipeline parallel training by preparing `torchvision`'s
[AlexNet](https://pytorch.org/docs/1.2.0/_modules/torchvision/models/alexnet.html)
model.
### Expressing Pipeline Models
Pipeline parallelism requires models to be expressed as a sequence of layers.
In the forward pass, each layer consumes the output of the previous
layer. In fact, there is no need to specify a `forward()` for a pipeline
parallel model! The forward pass of a pipeline parallel model implicitly
takes the form:
```python
def forward(self, inputs):
x = inputs
for layer in self.layers:
x = layer(x)
return x
```
PyTorch's
[`torch.nn.Sequential`](https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html)
is a convenient container for expressing pipeline parallel models and can be
parallelized by DeepSpeed with no modification:
```python
net = nn.Sequential(
nn.Linear(in_features, hidden_dim),
nn.ReLU(inplace=True),
nn.Linear(hidden_dim, out_features)
)
from deepspeed.pipe import PipelineModule
net = PipelineModule(layers=net, num_stages=2)
```
`PipelineModule` uses its `layers` argument as the sequence of layers that
comprise the model. After initialization, `net` is divided into two pipeline
stages and its layers moved to the corresponding GPUs. If more than two GPUs
are present, DeepSpeed will also use hybrid data parallelism.
**Note:** The total number of GPUs must be divisible by the number of pipeline
stages.
{: .notice--info}
**Note:** For large model training, see [memory-efficient model construction](#memory-efficient-module-initialization).
{: .notice--info}
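Once the model is wrapped in a `PipelineModule`, it is handed to `deepspeed.initialize()` as usual; the change to `deepspeed/__init__.py` in this PR constructs a `PipelineEngine` instead of the standard `DeepSpeedEngine` in that case. A minimal sketch, assuming `args` has been parsed elsewhere and references a DeepSpeed configuration:

```python
import deepspeed

# Sketch only: `net` is the PipelineModule constructed above, and `args` is
# assumed to point at a DeepSpeed config (e.g., via --deepspeed_config).
engine, _, _, _ = deepspeed.initialize(
    args=args,
    model=net,
    model_parameters=[p for p in net.parameters() if p.requires_grad])
```

The returned `engine` is the pipeline engine whose training loop is described below.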
### AlexNet
Let's look at an abbreviated implementation of `torchvision`'s
[AlexNet](https://pytorch.org/docs/1.2.0/_modules/torchvision/models/alexnet.html):
```python
class AlexNet(nn.Module):
def __init__(self, num_classes=1000):
super(AlexNet, self).__init__()
self.features = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
...
nn.MaxPool2d(kernel_size=3, stride=2),
)
self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
self.classifier = nn.Sequential(
nn.Dropout(),
...
nn.Linear(4096, num_classes),
)
def forward(self, x):
x = self.features(x)
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.classifier(x)
return x
```
`AlexNet` is mostly a composition of several `Sequential` submodules. We can
turn this into a `PipelineModule` by flattening its submodules into a single
sequence of layers:
```python
class AlexNetPipe(AlexNet):
def to_layers(self):
layers = [
*self.features,
self.avgpool,
lambda x: torch.flatten(x, 1),
*self.classifier
]
return layers
from deepspeed.pipe import PipelineModule
net = AlexNetPipe()
net = PipelineModule(layers=net.to_layers(), num_stages=2)
```
**Note:**
the `lambda` in the middle of `layers` above is not a `torch.nn.Module`
type. Any object that implements `__call__()` can be a layer in a
`PipelineModule`: this allows for convenient data transformations in the
pipeline.
{: .notice--info}
### Inputs and Outputs
Following `torch.nn.Sequential`, the inputs and outputs of each layer must be
either a single `torch.Tensor` or a `tuple` of tensors. In practice, some
models may need to modify their forward pass to pack and unpack arguments to
`forward()`. Consider an abbreviated implementation of a stack of Transformer
blocks:
```python
class TransformerBlock(nn.Module):
    ...
    def forward(self, hidden, mask):
        output = self.compute(hidden, mask)
        return output
    ...

stack = [ TransformerBlock() for _ in range(num_layers) ]
```
Two modifications to `TransformerBlock` are required:
1. The arguments must be collected into a `tuple`.
2. `mask` must also be returned from `forward()` to pass to the next layer.
These modifications can be accomplished with a short subclass:
```python
class TransformerBlockPipe(TransformerBlock):
    def forward(self, inputs):
        hidden, mask = inputs
        output = super().forward(hidden, mask)
        return (output, mask)

stack = [ TransformerBlockPipe() for _ in range(num_layers) ]
```
### Training Loops
Pipeline parallelism interleaves forward and backward passes, and thus the
training loop cannot be divided into separate stages of `forward()`,
`backward()` and `step()`.
Instead, DeepSpeed's pipeline engine provides a `train_batch()` method that
advances the pipeline engine until the next batch of training data is
consumed and the model weights updated.
```python
train_iter = iter(train_loader)
loss = engine.train_batch(data_iter=train_iter)
```
The above `train_batch()` example is equivalent to the following with
traditional data parallel DeepSpeed:
```python
train_iter = iter(train_loader)
for micro_batch in range(engine.gradient_accumulation_steps()):
    batch = next(train_iter)
    loss = engine(batch)
    engine.backward(loss)
    engine.step()
```
### Dealing with Data
Data parallel training typically has each worker perform IO independently at
the start of each batch. However, in a pipeline parallel environment, only the
first stage uses the input data, and only the last stage uses labels for loss
calculation.
**Note:**
The pipeline engine expects data loaders to return a `tuple` of two items. The
first returned item is the input batch data, and the second item is the data
to be used in the loss calculation. As before, inputs and labels should be
either `torch.Tensor` type or a `tuple` of tensors.
{: .notice--info}
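As an illustration, here is a minimal, hypothetical dataset whose items follow this `(input, label)` contract; the shapes, sample count, and class count are placeholders rather than part of the tutorial's CIFAR example:

```python
import torch
from torch.utils.data import Dataset

class RandomCifarLikeDataset(Dataset):
    """Toy dataset: every item is an (input, label) pair of tensors,
    which is the format the pipeline engine expects."""
    def __init__(self, num_samples=1024, num_classes=10):
        self.num_samples = num_samples
        self.num_classes = num_classes

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        image = torch.randn(3, 32, 32)               # consumed by the first pipeline stage
        label = torch.randint(self.num_classes, ())  # consumed by the last stage's loss
        return image, label
```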
For convenience, the DeepSpeed pipeline engine can construct a distributed
data loader when a dataset is provided to `deepspeed.initialize()`. DeepSpeed
handles the rest of the complexity of data loading, and so the pipeline
training loop becomes:
```python
engine, _, _, _ = deepspeed.initialize(
    args=args,
    model=net,
    model_parameters=[p for p in net.parameters() if p.requires_grad],
    training_data=cifar_trainset())

for step in range(args.steps):
    loss = engine.train_batch()
```
Of course, DeepSpeed will work with any data loader that you wish to use.
Data loaders should be constructed by the first and last stages in the
pipeline. Each worker should load micro-batches of size
`engine.train_micro_batch_size_per_gpu()` and will be queried
a total of `engine.gradient_accumulation_steps()` times per `train_batch()`.
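A minimal sketch of constructing such a loader by hand is below. It assumes the pipeline engine exposes `is_first_stage()` and `is_last_stage()` helpers (check your DeepSpeed version for the exact API) and uses a hypothetical `train_dataset` object:

```python
from torch.utils.data import DataLoader

# Only the first and last pipeline stages consume data, so only they need a loader.
# is_first_stage()/is_last_stage() are assumed engine helpers; train_dataset is hypothetical.
train_iter = None
if engine.is_first_stage() or engine.is_last_stage():
    loader = DataLoader(train_dataset,
                        batch_size=engine.train_micro_batch_size_per_gpu(),
                        shuffle=False)
    train_iter = iter(loader)

# Each call pulls engine.gradient_accumulation_steps() micro-batches from train_iter.
loss = engine.train_batch(data_iter=train_iter)
```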
**Watch out!**
The pipeline engine *pulls* data from an iterator instead of iterating over
it. It's critical that the data stream does not empty in the middle of a
training batch. Each invocation of `train_batch()` will pull
a total of `engine.gradient_accumulation_steps()` micro-batches of data from
the data iterator.
{: .notice--warning}
DeepSpeed provides a convenience class `deepspeed.utils.RepeatingLoader` that
simply wraps an iterable such as a data loader and restarts it whenever the
end is reached:
```python
train_loader = deepspeed.utils.RepeatingLoader(train_loader)
train_iter = iter(train_loader)
for step in range(args.steps):
    loss = engine.train_batch(data_iter=train_iter)
```
## Advanced Topics
### Load Balancing Pipeline Modules
The performance of pipeline parallel training strongly relies on load
balance. DeepSpeed provides several mechanisms for partitioning the model
across GPUs. These strategies can be set with the `partition_method` keyword
argument to `PipelineModule`. Here are partitioning methods currently provided
by DeepSpeed:
* `partition_method="parameters"` (**default**)
  balances the number of trainable parameters on each pipeline stage. This is
especially useful in memory-constrained environments and when the size of a
layer is proportional to the computation time.
* `partition_method="type:[regex]"`
balances layers whose class names match `[regex]`. The regular expression
is not case sensitive. For example, `partition_method="type:transformer"`
would balance the number of transformer layers per stage.
* `partition_method="uniform"` balances the number of layers per stage.
### Memory-Efficient Model Construction
Building a `Sequential` container and providing it to `PipelineModule` is a
convenient way of specifying a pipeline parallel model. However, this approach
does not scale to massive models: every worker redundantly allocates the whole
model in CPU memory before partitioning, so a machine with 16 GPUs must have 16
times the model size in local CPU memory.
DeepSpeed provides a `LayerSpec` class that delays the construction of
modules until the model layers have been partitioned across workers. Then,
the modules are built on the GPU that owns the layer.
Here's an example of the abbreviated AlexNet model, but expressed only
with `LayerSpec`s. Note that the syntax is almost unchanged: `nn.ReLU(inplace=True)`
simply becomes `LayerSpec(nn.ReLU, inplace=True)`.
```python
from deepspeed.pipe import PipelineModule, LayerSpec

class AlexNetPipe(PipelineModule):
    def __init__(self, num_classes=10, **kwargs):
        self.num_classes = num_classes
        specs = [
            LayerSpec(nn.Conv2d, 3, 64, kernel_size=11, stride=4, padding=2),
            LayerSpec(nn.ReLU, inplace=True),
            ...
            LayerSpec(nn.ReLU, inplace=True),
            LayerSpec(nn.Linear, 4096, self.num_classes),
        ]
        super().__init__(layers=specs, loss_fn=nn.CrossEntropyLoss(), **kwargs)
```
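Since `AlexNetPipe` above subclasses `PipelineModule` and forwards `**kwargs`, it can be constructed directly with the usual pipeline arguments and passed to `deepspeed.initialize()` as before. A brief sketch, mirroring the earlier initialization call (with `args` and `cifar_trainset()` as in that example):

```python
# Layers described by LayerSpec are only materialized on the GPUs that own them.
net = AlexNetPipe(num_classes=10, num_stages=2)
engine, _, _, _ = deepspeed.initialize(
    args=args,
    model=net,
    model_parameters=[p for p in net.parameters() if p.requires_grad],
    training_data=cifar_trainset())
```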
### Tied Layers
Some models cannot be entirely expressed as pipeline parallel models because
some layers are reused in the pipeline. For example, Transformer based
language models commonly use an embedding layer early in the pipeline to map
vocabulary to hidden states, and then reuse the same embedding at the end of
the pipeline to map hidden states back to vocabulary. A pure sequence of layers
cannot express this reuse, because the embedding would need to live on two
different pipeline stages.
DeepSpeed provides a `TiedLayerSpec` that is an extension of
`LayerSpec`. `TiedLayerSpec` requires an additional argument: `key`.
Each reuse of a layer is specified with a `TiedLayerSpec`, and the `key` field
is used to identify where a layer is reused.
Tied layers are replicated on every pipeline stage that uses them. Training
then proceeds as normal, but an additional all-reduce of the tied gradients is
performed after all backward passes complete. This all-reduce ensures that the
weights of the tied layer remain in sync across pipeline stages.
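Below is a sketch of tying an embedding at both ends of a GPT-style pipeline. The vocabulary and hidden sizes, the reuse of `TransformerBlockPipe` from earlier, and the `forward_fn` keyword are assumptions for illustration (attention masks and positional embeddings are omitted); consult your DeepSpeed version for the exact `TiedLayerSpec` signature.

```python
import torch.nn as nn
import torch.nn.functional as F
from deepspeed.pipe import PipelineModule, LayerSpec, TiedLayerSpec

def embedding_as_head(embedding, hidden):
    # Reuse the tied embedding weight to project hidden states back to vocabulary logits.
    return F.linear(hidden, embedding.weight)

vocab_size, hidden_dim, num_layers = 50257, 1024, 24   # placeholder sizes

specs = [
    # Input side: a normal embedding lookup. key='embed' names the tied group.
    TiedLayerSpec('embed', nn.Embedding, vocab_size, hidden_dim),
    *[LayerSpec(TransformerBlockPipe) for _ in range(num_layers)],
    # Output side: same key, so the same weights are reused; the assumed forward_fn
    # turns the embedding into an output projection instead of a lookup.
    TiedLayerSpec('embed', nn.Embedding, vocab_size, hidden_dim,
                  forward_fn=embedding_as_head),
]
net = PipelineModule(layers=specs, num_stages=4)
```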