"git@developer.sourcefind.cn:OpenDAS/torch-sparce.git" did not exist on "582e0def1ff20f52ca1d3530a9dfbe65faf23e34"
Unverified commit 65c2f974 authored by Shaden Smith, committed by GitHub

Pipeline parallel training engine. (#392)


Co-authored-by: Jeff Rasley <jerasley@microsoft.com>
parent 41db1c2f
@@ -8,11 +8,14 @@ from . import ops
from .runtime.engine import DeepSpeedEngine
from .runtime.engine import ADAM_OPTIMIZER, LAMB_OPTIMIZER, DEEPSPEED_ADAM
+from .runtime.pipe.engine import PipelineEngine
from .runtime.lr_schedules import add_tuning_arguments
from .runtime.config import DeepSpeedConfig
from .runtime.activation_checkpointing import checkpointing
from .ops.transformer import DeepSpeedTransformerLayer, DeepSpeedTransformerConfig
-from .utils import logger
+from .utils import log_dist
+from .pipe import PipelineModule

try:
    from .git_version_info import version, git_hash, git_branch
@@ -99,23 +102,35 @@ def initialize(args,
    * ``lr_scheduler``: Wrapped lr scheduler if user ``lr_scheduler`` is passed, or
      if ``lr_scheduler`` specified in JSON configuration. Otherwise ``None``.
    """
-    logger.info(
-        "DeepSpeed info: version={}, git-hash={}, git-branch={}".format(
-            __version__,
-            __git_hash__,
-            __git_branch__),
-    )
+    log_dist("DeepSpeed info: version={}, git-hash={}, git-branch={}".format(
+        __version__,
+        __git_hash__,
+        __git_branch__),
+             ranks=[0])

-    engine = DeepSpeedEngine(args=args,
-                             model=model,
-                             optimizer=optimizer,
-                             model_parameters=model_parameters,
-                             training_data=training_data,
-                             lr_scheduler=lr_scheduler,
-                             mpu=mpu,
-                             dist_init_required=dist_init_required,
-                             collate_fn=collate_fn,
-                             config_params=config_params)
+    if not isinstance(model, PipelineModule):
+        engine = DeepSpeedEngine(args=args,
+                                 model=model,
+                                 optimizer=optimizer,
+                                 model_parameters=model_parameters,
+                                 training_data=training_data,
+                                 lr_scheduler=lr_scheduler,
+                                 mpu=mpu,
+                                 dist_init_required=dist_init_required,
+                                 collate_fn=collate_fn,
+                                 config_params=config_params)
+    else:
+        assert mpu is None, "mpu must be None with pipeline parallelism"
+        engine = PipelineEngine(args=args,
+                                model=model,
+                                optimizer=optimizer,
+                                model_parameters=model_parameters,
+                                training_data=training_data,
+                                lr_scheduler=lr_scheduler,
+                                mpu=model.mpu(),
+                                dist_init_required=dist_init_required,
+                                collate_fn=collate_fn,
+                                config_params=config_params)

    return_items = [
        engine,
......
from ..runtime.pipe import PipelineModule, LayerSpec, TiedLayerSpec
@@ -480,6 +480,10 @@ class CheckpointFunction(torch.autograd.Function):
                timers.log(['forward'])
            if SYNCHRONIZE:
                torch.cuda.synchronize()
+
+        # Tensors returned from forward() may not be differentiable, e.g., attention mask
+        non_grad_outputs = [o for o in outputs if not o.is_floating_point()]
+        ctx.mark_non_differentiable(*non_grad_outputs)
        return outputs

    @staticmethod
@@ -548,7 +552,20 @@ class CheckpointFunction(torch.autograd.Function):
        if isinstance(outputs, torch.Tensor):
            outputs = (outputs, )
-        torch.autograd.backward(outputs, args)
+
+        # Go over args and build the list of gradient tensors. This is usually just args,
+        # but if the forward pass returns tensors that do not require_grad then we should
+        # adjust the arguments to autograd.backward() too. This happens when forward()
+        # returns indices or a mask (such as an attention mask).
+        # We skip the first needs_input_grad because it corresponds to run_function.
+        output_tensors = []
+        grad_tensors = []
+        for idx, need_grad in enumerate(ctx.needs_input_grad[1:]):
+            if need_grad:
+                output_tensors.append(outputs[idx])
+                grad_tensors.append(args[idx])
+
+        torch.autograd.backward(output_tensors, grad_tensors)

        if PROFILE_TIME:
            timers('backward').stop()
......
@@ -324,6 +324,20 @@ def get_sparse_attention_type(param_dict):
        return SPARSE_ATTENTION_TYPE_DEFAULT

+def get_pipeline_config(param_dict):
+    '''Parses pipeline engine configuration. '''
+    default_pipeline = {
+        'stages': 'auto',
+        'partition': 'best',
+        'seed_layers': False,
+        'activation_checkpoint_interval': 0
+    }
+    config = default_pipeline
+    for key, val in param_dict.get('pipeline', {}).items():
+        config[key] = val
+    return config
+
def get_optimizer_name(param_dict):
    if OPTIMIZER in param_dict.keys() and \
            TYPE in param_dict[OPTIMIZER].keys():
@@ -523,6 +537,7 @@ class DeepSpeedConfig(object):
        self.tensorboard_job_name = get_tensorboard_job_name(param_dict)

        self.sparse_attention = get_sparse_attention(param_dict)
+        self.pipeline = get_pipeline_config(param_dict)

    def _batch_assertion(self):
@@ -592,10 +607,6 @@ class DeepSpeedConfig(object):
            assert False, \
                'Either train_batch_size or micro_batch_per_gpu needs to be provided'

-        logger.info(
-            f' After Train batch {self.train_batch_size} micro_batch {self.train_micro_batch_size_per_gpu} and grad_acc {self.gradient_accumulation_steps}'
-        )
-
    def _configure_train_batch_size(self):
        self._set_batch_related_parameters()
        self._batch_assertion()
@@ -646,12 +657,14 @@ class DeepSpeedConfig(object):
                MAX_GRAD_NORM in self.optimizer_params.keys() and \
                self.optimizer_params[MAX_GRAD_NORM] > 0:
            if fp16_enabled:
-                logger.warning(
-                    'DeepSpeedConfig: In FP16 mode, DeepSpeed will pass {}:{} to FP16 wrapper'
-                    .format(MAX_GRAD_NORM,
-                            self.optimizer_params[MAX_GRAD_NORM]))
+                if self.global_rank == 0:
+                    logger.warning(
+                        'DeepSpeedConfig: In FP16 mode, DeepSpeed will pass {}:{} to FP16 wrapper'
+                        .format(MAX_GRAD_NORM,
+                                self.optimizer_params[MAX_GRAD_NORM]))
            else:
-                logger.warning(
-                    'DeepSpeedConfig: In FP32 mode, DeepSpeed does not permit MAX_GRAD_NORM ({}) > 0, setting to zero'
-                    .format(self.optimizer_params[MAX_GRAD_NORM]))
+                if self.global_rank == 0:
+                    logger.warning(
+                        'DeepSpeedConfig: In FP32 mode, DeepSpeed does not permit MAX_GRAD_NORM ({}) > 0, setting to zero'
+                        .format(self.optimizer_params[MAX_GRAD_NORM]))
                self.optimizer_params[MAX_GRAD_NORM] = 0.0
@@ -7,6 +7,29 @@ from torch.utils.data import DataLoader, RandomSampler
from torch.utils.data.distributed import DistributedSampler

+class RepeatingLoader:
+    def __init__(self, loader):
+        """Wraps an iterator to allow for infinite iteration. This is especially useful
+        for DataLoader types that we wish to automatically restart upon completion.
+
+        Args:
+            loader (iterator): The data loader to repeat.
+        """
+        self.loader = loader
+        self.data_iter = iter(self.loader)
+
+    def __iter__(self):
+        return self
+
+    def __next__(self):
+        try:
+            batch = next(self.data_iter)
+        except StopIteration:
+            self.data_iter = iter(self.loader)
+            batch = next(self.data_iter)
+        return batch
+
class DeepSpeedDataLoader(object):
    def __init__(self,
                 dataset,
......
@@ -97,7 +97,7 @@ class FP16_Optimizer(object):
            self.clip_grad_norm = torch.nn.utils.clip_grad_norm_

        #model parallel object
-        self.mpu = None
+        self.mpu = mpu

        self.overflow = False
        self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu)
@@ -237,8 +237,8 @@ class FP16_Optimizer(object):
        if self.overflow:
            if self.verbose:
                print("[deepspeed] OVERFLOW! Skipping step. Attempted loss "
-                      "scale: {}, reducing to {}".format(prev_scale,
-                                                         self.cur_scale))
+                      "scale: {}, reducing to {} ".format(prev_scale,
+                                                          self.cur_scale))
            self.log_timers(OVERFLOW_TIMERS)
            grads_groups_flat = None
            return self.overflow
......
@@ -93,11 +93,13 @@ class FP16_UnfusedOptimizer(object):
        else:
            self.clip_grad_norm = torch.nn.utils.clip_grad_norm_

-        self.mpu = None
+        self.mpu = mpu

        self.overflow = False
        self.overflow_checker = CheckOverflow(self.fp16_groups, mpu=self.mpu)

+        self.initialize_optimizer_states()
+
    def zero_grad(self, set_grads_to_None=True):
        """
        Zero FP16 parameter grads.
@@ -349,3 +351,26 @@ class FP16_UnfusedOptimizer(object):

    def __repr__(self):
        return repr(self.optimizer)
+
+    def initialize_optimizer_states(self):
+        for i, group in enumerate(self.fp16_groups):
+            for param in group:
+                param.grad = torch.zeros(param.size(),
+                                         dtype=param.dtype,
+                                         device=torch.cuda.current_device())
+
+        for i, group in enumerate(self.fp32_groups):
+            for param in group:
+                param.grad = torch.zeros(param.size(),
+                                         dtype=param.dtype,
+                                         device=torch.cuda.current_device())
+
+        self.optimizer.step()
+
+        for i, group in enumerate(self.fp16_groups):
+            for param in group:
+                param.grad = None
+
+        for i, group in enumerate(self.fp32_groups):
+            for param in group:
+                param.grad = None
from .module import PipelineModule, LayerSpec, TiedLayerSpec
'''
Copyright 2019 The Microsoft DeepSpeed Team
'''
import torch.distributed as dist
_groups = None
_grid = None
#initializes adjacent process groups
#run this only after torch.distributed.init_process_group() has been called
def init_process_groups(grid):
global _groups, _grid
_grid = grid
assert _grid.pipe_parallel_size > 1, "There is no model parallelism"
_groups = [dist.new_group(ranks=group) for group in _grid.p2p_groups]
def _is_valid_send_recv(src_stage, dest_stage):
first_stage = 0
last_stage = _grid.pipe_parallel_size - 1
assert abs(src_stage-dest_stage) == 1 or \
(src_stage == first_stage and dest_stage == last_stage) or \
(src_stage == last_stage and dest_stage == first_stage), \
"Functionality currently limited to send and receive between adjacent ranks only"
def send(tensor, dest_stage, async_op=False):
global _groups
async_op = False
src_stage = _grid.get_stage_id()
_is_valid_send_recv(src_stage, dest_stage)
group = _get_send_recv_group(src_stage, dest_stage)
src_rank = _grid.stage_to_global(stage_id=src_stage)
return dist.broadcast(tensor, src_rank, group=group, async_op=async_op)
def recv(tensor, src_stage, async_op=False):
global _groups
async_op = False
dest_stage = _grid.get_stage_id()
_is_valid_send_recv(src_stage, dest_stage)
group = _get_send_recv_group(src_stage, dest_stage)
src_rank = _grid.stage_to_global(stage_id=src_stage)
return dist.broadcast(tensor, src_rank, group=group, async_op=async_op)
def barrier(stage_id):
global _groups, _grid
group_id = _grid.stage_to_global(stage_id=stage_id)
if (dist.get_rank() >= 0):
print("Barrier Group ID", group_id)
print("Barrier Group", _grid.p2p_groups[group_id])
dist.barrier(group=_groups[group_id])
if (dist.get_rank() >= 0):
print("Exiting Barrier ", group_id)
def _get_send_recv_group(src_stage, dest_stage):
'''The group id is always the smaller rank unless it's a wrap-around.'''
stage_id = None
first_stage = 0
last_stage = _grid.pipe_parallel_size - 1
if (src_stage == first_stage and dest_stage == last_stage
or dest_stage == first_stage and src_stage == last_stage):
stage_id = last_stage
elif src_stage > dest_stage:
stage_id = dest_stage
else:
stage_id = src_stage
'''group_id corresponds to the group [group_id, group_id+1],
unless group_id is the rank of the last stage,
in which case group_id corresponds to the group [group_id-num_stages+1, group_id]
'''
group_id = _grid.stage_to_global(stage_id=stage_id)
return _groups[group_id]
from ..utils import call_to_str
from abc import ABC, abstractmethod
class PipeSchedule(ABC):
"""Directs the execution of a pipeline engine by generating sequences of
:class:`PipeInstruction`.
Schedules are generators that yield sequences of
:class:`PipeInstruction` to process the micro-batches in one batch.
Each yielded step is atomic in the sense that a barrier
synchronization can be placed between successive steps without
deadlock.
Below is an example schedule that implements data parallelism with gradient accumulation:
.. code-block:: python
class DataParallelSchedule(PipeSchedule):
def steps(self):
for step_id in range(self.micro_batches):
cmds = [
LoadMicroBatch(buffer_id=0),
ForwardPass(buffer_id=0),
BackwardPass(buffer_id=0),
]
if step_id == self.micro_batches - 1:
cmds.extend([
ReduceGrads(),
OptimizerStep(),
])
yield cmds
def num_pipe_buffers(self):
return 1
Args:
micro_batches (int): The number of micro-batches that comprise a batch.
stages (int): The number of pipeline stages.
stage_id (int): The pipe stage that will execute the generated schedule.
"""
def __init__(self, micro_batches, stages, stage_id):
super().__init__()
self.micro_batches = micro_batches
self.stages = stages
self.stage_id = stage_id
self.prev_stage = self.stage_id - 1
self.next_stage = self.stage_id + 1
@abstractmethod
def steps(self):
"""Yield a list of :class:`PipeInstruction` for each step in the schedule.
.. note::
Schedules must implement ``steps()`` to define the schedule.
Returns:
Instructions to be executed as one step of the pipeline
"""
pass
def num_pipe_buffers(self):
"""The number of pipeline buffers that will be used by this stage.
.. note::
Schedules should specialize ``num_pipe_buffers()`` for memory savings at scale.
Returns:
The number of buffers for the engine to allocate.
"""
return self.micro_batches
def _valid_micro_batch(self, micro_batch_id):
return 0 <= micro_batch_id < self.micro_batches
def _valid_stage(self, stage_id):
return 0 <= stage_id < self.stages
@property
def stage(self):
"""Stage index used to configure this schedule."""
return self.stage_id
@property
def num_stages(self):
"""The number of total pipeline stages used to configure this schedule."""
return self.stages
@property
def num_micro_batches(self):
"""The number of total micro_batches used to configure this schedule."""
return self.micro_batches
@property
def is_first_stage(self):
"""True if the configured ``stage_id`` is the first stage in the pipeline."""
return self.stage_id == 0
@property
def is_last_stage(self):
"""True if the configured ``stage_id`` is the last stage in the pipeline."""
return self.stage_id == self.stages - 1
def _buffer_idx(self, micro_batch_id):
"""Map a micro-batch index to a pipeline buffer index.
This method uses a cyclic allocation strategy.
Args:
micro_batch_id (int): The micro-batch index relative to the beginning of the schedule.
Returns:
int: The index of the buffer that should store data.
"""
assert self._valid_micro_batch(micro_batch_id)
return micro_batch_id % self.num_pipe_buffers()
def __iter__(self):
self.it = None
return self
def __next__(self):
if self.it is None:
self.it = self.steps()
return next(self.it)
class InferenceSchedule(PipeSchedule):
"""A schedule for inferencing batches using pipeline parallelism.
"""
def steps(self):
""""""
prev_micro_batch_id = -1
total_steps = self.micro_batches + self.stages - 1
for step_id in range(total_steps):
cmds = []
micro_batch_id = step_id - self.stage_id
# Alternate send/recv buffers
if _is_even(self.stage_id):
recv_buf = step_id % 2
send_buf = (step_id + 1) % 2
else:
recv_buf = (step_id + 1) % 2
send_buf = step_id % 2
if self.is_first_stage or self.is_last_stage:
if self._valid_micro_batch(micro_batch_id):
cmds.append(LoadMicroBatch(recv_buf))
if _is_even(self.stage_id):
if self._valid_stage(self.next_stage):
if self._valid_micro_batch(micro_batch_id - 1):
cmds.append(SendActivation(send_buf))
if self._valid_stage(self.prev_stage):
if self._valid_micro_batch(micro_batch_id):
cmds.append(RecvActivation(recv_buf))
else:
if self._valid_stage(self.prev_stage):
if self._valid_micro_batch(micro_batch_id):
cmds.append(RecvActivation(recv_buf))
if self._valid_stage(self.next_stage):
if self._valid_micro_batch(micro_batch_id - 1):
cmds.append(SendActivation(send_buf))
if self._valid_micro_batch(micro_batch_id):
cmds.append(ForwardPass(recv_buf))
yield cmds
def num_pipe_buffers(self):
"""Only two pipeline buffers are required for inferencing.
Returns:
``2``
"""
return 2
class TrainSchedule(PipeSchedule):
"""A schedule for training a batch using hybrid parallelism.
Pipeline parallelism is extracted through gradient accumulation and thus
convergence follows that of a data parallel approach with the same batch
size.
"""
def steps(self):
""""""
prev_micro_batch_id = -1
total_steps = 2 * (self.micro_batches + self.stages - 1)
for step_id in range(total_steps):
# Map the step of the pipeline to the micro-batch id and also whether it is a
# forward or backward pass step.
micro_batch_id, is_forward = self._step_to_micro_batch(step_id)
if self._valid_micro_batch(prev_micro_batch_id):
prev_buffer = self._buffer_idx(prev_micro_batch_id)
if self._valid_micro_batch(micro_batch_id):
curr_buffer = self._buffer_idx(micro_batch_id)
cmds = []
# Exchange activations
if is_forward:
if self._valid_micro_batch(micro_batch_id) and self._valid_stage(
self.prev_stage):
cmds.append(RecvActivation(curr_buffer))
if self._valid_micro_batch(prev_micro_batch_id) and self._valid_stage(
self.prev_stage):
cmds.append(SendGrad(prev_buffer))
else:
if self._valid_micro_batch(prev_micro_batch_id) and self._valid_stage(
self.next_stage):
cmds.append(SendActivation(prev_buffer))
if self._valid_micro_batch(micro_batch_id) and self._valid_stage(
self.next_stage):
cmds.append(RecvGrad(curr_buffer))
# First/last stage loads
if self.stage_id == 0 or self.stage_id == self.stages - 1:
if is_forward and self._valid_micro_batch(micro_batch_id):
cmds.append(LoadMicroBatch(curr_buffer))
# Computation
if self._valid_micro_batch(micro_batch_id):
if is_forward:
cmds.append(ForwardPass(curr_buffer))
else:
cmds.append(BackwardPass(curr_buffer))
# Model step at the end of the batch
if step_id == total_steps - 1:
cmds.append(ReduceTiedGrads())
cmds.append(ReduceGrads())
cmds.append(OptimizerStep())
# Prepare state for next time
prev_micro_batch_id = micro_batch_id
yield cmds
def num_pipe_buffers(self):
"""As many buffers as the distance from this stage to the last stage.
"""
buffers = min(self.stages - self.stage_id + 1, self.micro_batches)
return max(2, buffers)
def _step_to_micro_batch(self, step_id):
if _is_even(step_id) and _is_even(self.stage_id):
micro_batch_id = self._even_step_forward_id(step_id)
is_forward = True
elif _is_odd(step_id) and _is_odd(self.stage_id):
micro_batch_id = self._odd_step_forward_id(step_id)
is_forward = True
elif _is_even(step_id) and _is_odd(self.stage_id):
micro_batch_id = self._even_step_backward_id(step_id)
is_forward = False
elif _is_odd(step_id) and _is_even(self.stage_id):
micro_batch_id = self._odd_step_backward_id(step_id)
is_forward = False
else:
assert False
return micro_batch_id, is_forward
def _even_step_forward_id(self, step_id):
base = step_id // 2
micro_batch_id = int(base - self.stage_id // 2)
return micro_batch_id
def _odd_step_forward_id(self, step_id):
base = (step_id - 1) // 2
micro_batch_id = int(base - self.stage_id // 2)
return micro_batch_id
def _even_step_backward_id(self, step_id):
base = step_id // 2
micro_batch_id = int(base - self.stages + (self.stage_id + 1) // 2)
return micro_batch_id
def _odd_step_backward_id(self, step_id):
base = ((step_id - 1) // 2) - self.stages + 1
micro_batch_id = int(base + self.stage_id // 2)
return micro_batch_id
class DataParallelSchedule(PipeSchedule):
"""An example schedule that trains using traditional data parallelism with gradient
accumulation.
"""
def steps(self):
""""""
for step_id in range(self.micro_batches):
cmds = [
LoadMicroBatch(buffer_id=0),
ForwardPass(buffer_id=0),
BackwardPass(buffer_id=0),
]
if step_id == self.micro_batches - 1:
cmds.extend([
ReduceGrads(),
OptimizerStep(),
])
yield cmds
def num_pipe_buffers(self):
"""Only one pipeline buffer needed.
"""
return 1
class PipeInstruction:
"""Base class for all instructions to be executed by the pipeline engine.
All keyword arguments are stored as members similar to a ``namedtuple``. These are
then accessible to the :class:`PipeEngine` during execution.
Args:
kwargs (optional): keyword arguments to store as members
"""
def __init__(self, **kwargs):
self.name = self.__class__.__name__
self.kwargs = kwargs
for key, val in kwargs.items():
setattr(self, key, val)
def __repr__(self):
return call_to_str(self.name, **self.kwargs)
class OptimizerStep(PipeInstruction):
"""Performs one step with the optimizer and zeros gradients.
.. note:: Should be issued after :class:`ReduceGrads` and :class:`ReduceTiedGrads`.
.. note:: Can be a synchronization point among data-parallel ranks.
"""
pass
class ReduceGrads(PipeInstruction):
"""Reduce the computed gradients among data-parallel processes within the stage.
"""
pass
class ReduceTiedGrads(PipeInstruction):
"""Reduce the computed gradients of tied modules within a pipeline-parallel group.
.. warning::
The stages included in this synchronization point are not known until
the model is partitioned among pipeline stages. In the worst case, it
includes all pipeline stages. This instruction should be scheduled
carefully to avoid deadlocks.
"""
pass
class BufferOpInstruction(PipeInstruction):
"""A pipeline instruction that operates on pipeline buffer(s).
Args:
buffer_id (int): the index of the pipeline buffer to modify.
"""
def __init__(self, buffer_id, **kwargs):
super().__init__(buffer_id=buffer_id, **kwargs)
# IO
class LoadMicroBatch(BufferOpInstruction):
"""Load a micro-batch into a buffer.
Roughly:
.. code-block:: python
buffers['inputs'][buffer_id] = next(data_iter)
"""
pass
# Compute
class ForwardPass(BufferOpInstruction):
"""Compute a forward pass.
Roughly:
.. code-block:: python
buffers['outputs'][buffer_id] = forward(buffers['inputs'][buffer_id])
"""
pass
class BackwardPass(BufferOpInstruction):
"""Compute a backward pass and accumulate gradients.
Roughly:
.. code-block:: python
outputs = buffers['outputs'][buffer_id]
gradients = buffers['gradients'][buffer_id]
torch.autograd.backward(tensors=outputs,
grad_tensors=gradients)
"""
pass
# Communication
class SendActivation(BufferOpInstruction):
"""Send activations to the next stage in the pipeline.
Roughly:
.. code-block:: python
send(buffers['outputs'][buffer_id])
.. note::
The communication is blocking and must be paired with a :class:`RecvActivation`
on the next pipeline stage to avoid deadlock.
"""
pass
class RecvActivation(BufferOpInstruction):
"""Receive activations from the previous stage in the pipeline.
Roughly:
.. code-block:: python
buffers['inputs'][buffer_id] = recv()
.. note::
The communication is blocking and must be paired with a :class:`SendActivation`
on the previous pipeline stage to avoid deadlock.
"""
pass
class SendGrad(BufferOpInstruction):
"""Send computed gradients to the previous pipeline stage.
with respect to the received activations
.. note::
Only received tensors with ``requires_grad==True`` will produce gradients.
Missing gradients will be replaced with ``None`` on the receiving stage.
.. note::
The communication is blocking and must be paired with a :class:`RecvGrad`
on the previous pipeline stage to avoid deadlock.
"""
pass
class RecvGrad(BufferOpInstruction):
"""Receive computed gradients the next pipeline stage.
.. note::
Only activations with ``requires_grad==True`` will produce gradients.
Missing gradients will be replaced with ``None``.
.. note::
The communication is blocking and must be paired with a :class:`SendGrad`
on the next pipeline stage to avoid deadlock.
"""
pass
def _is_even(x):
return x % 2 == 0
def _is_odd(x):
return x % 2 != 0
# Copyright 2019 The Microsoft DeepSpeed Team
from deepspeed.utils import logger
import torch.distributed as dist
import sys
from collections import namedtuple
from itertools import product as cartesian_product
class ProcessTopology:
""" Manages the mapping of n-dimensional Cartesian coordinates to linear
indices. This mapping is used to map the rank of processes to the grid
for various forms of parallelism.
Each axis of the tensor is accessed by its name. The provided ordering
of the axes defines the layout of the topology. ProcessTopology uses a "row-major"
layout of the tensor axes, and so axes=['x', 'y'] would map coordinates (x,y) and
(x,y+1) to adjacent linear indices. If instead axes=['y', 'x'] was used, coordinates
(x,y) and (x+1,y) would be adjacent.
Some methods return ProcessCoord namedtuples.
"""
def __init__(self, axes, dims):
"""Create a mapping of n-dimensional tensor coordinates to linear indices.
Arguments:
axes (list): the names of the tensor axes
dims (list): the dimension (length) of each axis of the topology tensor
"""
self.axes = axes # names of each topology axis
self.dims = dims # length of each topology axis
# This is actually a class that lets us hash {'row':3, 'col':2} mappings
self.ProcessCoord = namedtuple('ProcessCoord', axes)
self.mapping = {}
ranges = [range(d) for d in dims]
# example: 1, (0,0,1)
for global_rank, coord in enumerate(cartesian_product(*ranges)):
key = {axis: coord[self.axes.index(axis)] for axis in self.axes}
key = self.ProcessCoord(**key)
# for example, {ProcessCoord(row=0, col=1) : 1}
self.mapping[key] = global_rank
def get_rank(self, **coord_kwargs):
"""Return the global rank of a process via its coordinates.
Coordinates are specified as kwargs. For example:
>>> X = ProcessTopology(axes=['x', 'y'], dims=[2,3])
>>> X.get_rank(x=0, y=1)
1
"""
if len(coord_kwargs) != len(self.axes):
raise ValueError('get_rank() does not support slices. Use filter_match().')
key = self.ProcessCoord(**coord_kwargs)
assert key in self.mapping, f'key {coord_kwargs} invalid'
return self.mapping[key]
def get_axis_names(self):
"""Return a list of the axis names in the ordering of the topology. """
return self.axes
def get_rank_repr(self,
rank,
omit_axes=['data',
'pipe'],
inner_sep='_',
outer_sep='-'):
"""Return a string representation of a rank.
This method is primarily used for checkpointing model data.
For example:
>>> topo = Topo(axes=['a', 'b'], dims=[2, 2])
>>> topo.get_rank_repr(rank=3)
'a_01-b_01'
>>> topo.get_rank_repr(rank=3, omit_axes=['a'])
'b_01'
Args:
rank (int): A rank in the topology.
omit_axes (list, optional): Axes that should not be in the representation. Defaults to ['data', 'pipe'].
inner_sep (str, optional): Separator between an axis name and its coordinate. Defaults to '_'.
outer_sep (str, optional): Separator between axis entries. Defaults to '-'.
Returns:
str: A string representation of the coordinate owned by ``rank``.
"""
omit_axes = frozenset(omit_axes)
axes = [a for a in self.get_axis_names() if a not in omit_axes]
names = []
for ax in axes:
ax_rank = getattr(self.get_coord(rank=rank), ax)
names.append(f'{ax}{inner_sep}{ax_rank:02d}')
return outer_sep.join(names)
def get_dim(self, axis):
"""Return the number of processes along the given axis.
For example:
>>> X = ProcessTopology(axes=['x', 'y'], dims=[2,3])
>>> X.get_dim('y')
3
"""
if axis not in self.axes:
return 0
return self.dims[self.axes.index(axis)]
def get_coord(self, rank):
"""Return the coordinate owned by a process rank.
The axes of the returned namedtuple can be directly accessed as members. For
example:
>>> X = ProcessTopology(axes=['x', 'y'], dims=[2,3])
>>> coord = X.get_coord(rank=1)
>>> coord.x
0
>>> coord.y
1
"""
for coord, idx in self.mapping.items():
if idx == rank:
return coord
raise ValueError(f'rank {rank} not found in topology.')
def get_axis_comm_lists(self, axis):
""" Construct lists suitable for a communicator group along axis ``axis``.
Example:
>>> topo = Topo(axes=['pipe', 'data', 'model'], dims=[2, 2, 2])
>>> topo.get_axis_comm_lists('pipe')
[
[0, 4], # data=0, model=0
[1, 5], # data=0, model=1
[2, 6], # data=1, model=0
[3, 7], # data=1, model=1
]
Returns:
A list of lists whose coordinates match in all axes *except* ``axis``.
"""
# We don't want to RuntimeError because it allows us to write more generalized
# code for hybrid parallelisms.
if axis not in self.axes:
return []
# Grab all axes but `axis`
other_axes = [a for a in self.axes if a != axis]
lists = []
# Construct all combinations of coords with other_axes
ranges = [range(self.get_dim(a)) for a in other_axes]
for coord in cartesian_product(*ranges):
other_keys = {a: coord[other_axes.index(a)] for a in other_axes}
# now go over all ranks in `axis`.
sub_list = []
for axis_key in range(self.get_dim(axis)):
key = self.ProcessCoord(**other_keys, **{axis: axis_key})
sub_list.append(self.mapping[key])
lists.append(sub_list)
return lists
def filter_match(self, **filter_kwargs):
"""Return the list of ranks whose coordinates match the provided criteria.
Example:
>>> X = ProcessTopology(axes=['pipe', 'data', 'model'], dims=[2, 2, 2])
>>> X.filter_match(pipe=0, data=1)
[2, 3]
>>> [X.get_coord(rank) for rank in X.filter_match(pipe=0, data=1)]
[ProcessCoord(pipe=0, data=1, model=0), ProcessCoord(pipe=0, data=1, model=1)]
Arguments:
**filter_kwargs (dict): criteria used to select coordinates.
Returns:
The list of ranks whose coordinates match filter_kwargs.
"""
def _filter_helper(x):
for key, val in filter_kwargs.items():
if getattr(x, key) != val:
return False
return True
coords = filter(_filter_helper, self.mapping.keys())
return [self.mapping[coo] for coo in coords]
def get_axis_list(self, axis, idx):
"""Returns the list of global ranks whose coordinate in an axis is idx.
For example:
>>> X = ProcessTopology(axes=['x', 'y'], dims=[2,3])
>>> X.get_axis_list(axis='x', idx=0)
[0, 1, 2]
>>> X.get_axis_list(axis='y', idx=0)
[0, 3]
"""
# This could be faster by generating the desired keys directly instead of
# filtering.
axis_num = self.axes.index(axis)
ranks = [self.mapping[k] for k in self.mapping.keys() if k[axis_num] == idx]
return ranks
def world_size(self):
return len(self.mapping)
def __str__(self):
return str(self.mapping)
def _prime_factors(N):
""" Returns the prime factorization of positive integer N. """
if N <= 0:
raise ValueError("Values must be strictly positive.")
primes = []
while N != 1:
for candidate in range(2, N + 1):
if N % candidate == 0:
primes.append(candidate)
N //= candidate
break
return primes
class PipeDataParallelTopology(ProcessTopology):
""" A topology specialiation for hybrid data and pipeline parallelism.
Uses data parallelism on the last dimension to encourage gradient
reductions to use high-bandwidth intra-node links and lower-volume
pipeline communications to use low-bandwidth inter-node links.
"""
def __init__(self, num_pp, num_dp):
super().__init__(axes=['pipe', 'data'], dims=[num_pp, num_dp])
class PipeModelDataParallelTopology(ProcessTopology):
""" A topology for hybrid pipeline, model, and data parallelism. """
def __init__(self, num_pp, num_mp, num_dp):
super().__init__(axes=['pipe', 'data', 'model'], dims=[num_pp, num_dp, num_mp])
class PipelineParallelGrid:
"""Implements a grid object that stores the data parallel ranks
corresponding to each o the model parallel stages
The grid object organizes the processes in a distributed pytorch job
into a 2D grid, of stage_id and data_parallel_id.
self.stage_id and self.data_parallel_id stores the stage id
and the data parallel id of current process.
self.dp_group groups the processes by stage_id.
self.dp_group[i], is a list containing all process ranks whose
stage_id is i.
self.p2p_groups stores a list of tuple, where each tuple
stores process ranks of adjacent stages for a given data_parallel_id.
For example if num_stage is 5 then a tuple [7,8] represents stages [3, 4],
with data_parallel id = 1. A stage wrap around will appear as non-adjacent ranks,
for example tuple [4,0] with representing wrap-around stage 4 and 0, for
data_parallel_id = 0, or similarly [9,5] represents wrapped around stages [4,0]
for data_parallel_id = 1.
"""
def __init__(self, topology=None, process_group=None):
# TODO use process_group if provided
self.global_rank = dist.get_rank()
self.world_size = dist.get_world_size()
if topology is not None:
if self.global_rank == 0:
print('Using topology:', topology)
self._topo = topology
else:
num_pp = 1
num_dp = 1
for idx, prime in enumerate(_prime_factors(self.world_size)):
if idx % 2 == 0:
num_pp *= prime
else:
num_dp *= prime
self._topo = PipeDataParallelTopology(num_dp=num_dp, num_pp=num_pp)
self.data_parallel_size = max(self._topo.get_dim('data'), 1)
self.pipe_parallel_size = max(self._topo.get_dim('pipe'), 1)
self.model_parallel_size = max(self._topo.get_dim('model'), 1)
assert self._is_grid_valid(), "Invalid Grid"
self.stage_id = self.get_stage_id()
self.data_parallel_id = self.get_data_parallel_id()
# Create new ProcessGroups for all model parallelism. DeepSpeedLight uses these
# to detect overflow, etc.
self.ds_model_proc_group = None
self.ds_model_rank = -1
for dp in range(self.data_parallel_size):
ranks = sorted(self._topo.get_axis_list(axis='data', idx=dp))
if self.global_rank == 0:
#print(f'RANK={self.global_rank} building DeepSpeed model group: {ranks}')
pass
proc_group = dist.new_group(ranks=ranks)
if self.global_rank in ranks:
self.ds_model_proc_group = proc_group
self.ds_model_world_size = len(ranks)
self.ds_model_rank = ranks.index(self.global_rank)
assert self.ds_model_rank > -1
assert self.ds_model_proc_group is not None
# Create new ProcessGroup for gradient all-reduces - these are the data parallel groups
self.dp_group = []
self.dp_groups = self._topo.get_axis_comm_lists('data')
for g in self.dp_groups:
proc_group = dist.new_group(ranks=g)
if self.global_rank in g:
self.dp_group = g
self.dp_proc_group = proc_group
self.is_first_stage = (self.stage_id == 0)
self.is_last_stage = (self.stage_id == (self.pipe_parallel_size - 1))
self.p2p_groups = self._build_p2p_groups()
# Create new ProcessGroup for pipeline collectives - these are pipe parallel groups
self.pp_group = []
self.pp_proc_group = None
self.pipe_groups = self._topo.get_axis_comm_lists('pipe')
for ranks in self.pipe_groups:
if self.global_rank == 0:
#print(f'RANK={self.global_rank} building pipeline group: {ranks}')
pass
proc_group = dist.new_group(ranks=ranks)
if self.global_rank in ranks:
self.pp_group = ranks
self.pp_proc_group = proc_group
assert self.pp_proc_group is not None
# Create new ProcessGroup for model (tensor-slicing) collectives
# Short circuit case without model parallelism.
# TODO: it would be nice if topology had bcast semantics to avoid this branching
# case?
if self.model_parallel_size == 1:
for group_rank in range(self.world_size):
group_rank = [group_rank]
group = dist.new_group(ranks=group_rank)
if group_rank[0] == self.global_rank:
self.slice_group = group_rank
self.slice_proc_group = group
return
else:
self.mp_group = []
self.model_groups = self._topo.get_axis_comm_lists('model')
for g in self.model_groups:
proc_group = dist.new_group(ranks=g)
if self.global_rank in g:
self.slice_group = g
self.slice_proc_group = proc_group
def get_stage_id(self):
return self._topo.get_coord(rank=self.global_rank).pipe
def get_data_parallel_id(self):
return self._topo.get_coord(rank=self.global_rank).data
def _build_p2p_groups(self):
"""Groups for sending and receiving activations and gradients across model
parallel stages.
"""
comm_lists = self._topo.get_axis_comm_lists('pipe')
p2p_lists = []
for rank in range(self.world_size):
for l in comm_lists:
assert len(l) == self.pipe_parallel_size
if rank in l:
idx = l.index(rank)
buddy_rank = l[(idx + 1) % self.pipe_parallel_size]
p2p_lists.append([rank, buddy_rank])
break # next global rank
assert len(p2p_lists) == self.world_size
return p2p_lists
def _is_grid_valid(self):
ranks = 1
for ax in self._topo.get_axis_names():
ranks *= self._topo.get_dim(ax)
return ranks == dist.get_world_size()
#returns the global rank of the process with the provided stage id
#which has the same data_parallel_id as caller process
def stage_to_global(self, stage_id, **kwargs):
me = self._topo.get_coord(self.global_rank)
transform = me._replace(pipe=stage_id, **kwargs)._asdict()
return self._topo.get_rank(**transform)
def topology(self):
return self._topo
# MPU functions for DeepSpeed integration
def get_global_rank(self):
return self.global_rank
def get_pipe_parallel_rank(self):
""" The stage of the pipeline this rank resides in. """
return self.get_stage_id()
def get_pipe_parallel_world_size(self):
""" The number of stages in the pipeline. """
return self.pipe_parallel_size
def get_pipe_parallel_group(self):
""" The group of ranks within the same pipeline. """
return self.pp_proc_group
def get_data_parallel_rank(self):
""" Which pipeline this rank resides in. """
return self.data_parallel_id
def get_data_parallel_world_size(self):
""" The number of pipelines. """
return self.data_parallel_size
def get_data_parallel_group(self):
""" The group of ranks within the same stage of all pipelines. """
return self.dp_proc_group
# These are model parallel groups across all types of model parallelism.
# Deepspeed uses them to detect overflow, etc.
def get_model_parallel_rank(self):
return self.ds_model_rank
def get_model_parallel_world_size(self):
return self.ds_model_world_size
def get_model_parallel_group(self):
return self.ds_model_proc_group
# For Megatron-style tensor slicing
def get_slice_parallel_rank(self):
if 'model' in self._topo.get_axis_names():
return self._topo.get_coord(rank=self.global_rank).model
else:
return 0
def get_slice_parallel_world_size(self):
    # Size of the 'model' (tensor-slicing) axis of the topology.
    return self.model_parallel_size
def get_slice_parallel_group(self):
return self.slice_proc_group
@@ -6,11 +6,36 @@ Copyright NVIDIA/Megatron
Helper functions and classes from multiple sources.
'''

+import os
+from math import ceil
+from math import floor
+from bisect import bisect_left, bisect_right
+
import torch
+import torch.distributed as dist
from torch._six import inf
import torch.distributed as dist

from deepspeed.utils import logger
+from numpy import prod
+
+
+def ensure_directory_exists(filename):
+    """Create the directory path to ``filename`` if it does not already exist.
+
+    Args:
+        filename (str): A file path.
+    """
+    dirname = os.path.dirname(filename)
+    os.makedirs(dirname, exist_ok=True)
+
+
+def set_random_seed(seed):
+    import numpy
+    import random
+    random.seed(seed)
+    numpy.random.seed(seed)
+    torch.manual_seed(seed)
+
+
class CheckOverflow(object):
@@ -85,6 +110,7 @@ class CheckOverflow(object):
            torch.distributed.all_reduce(overflow_gpu,
                                         op=torch.distributed.ReduceOp.MAX,
                                         group=self.mpu.get_model_parallel_group())
+
        overflow = overflow_gpu[0].item()
        return bool(overflow)
@@ -160,9 +186,8 @@ def get_grad_norm(parameters, norm_type=2, mpu=None):
        total_norm = 0.
        for p in parameters:
            if mpu is not None:
-                if (mpu.get_model_parallel_rank() == 0) or (hasattr(p,
-                                                                    'model_parallel')
-                                                            and p.model_parallel):
+                if (mpu.get_model_parallel_rank() == 0
+                    ) or is_model_parallel_parameter(p):
                    param_norm = p.grad.data.float().norm(norm_type)
                    total_norm += param_norm.item()**norm_type
            else:
@@ -218,9 +243,8 @@ def get_weight_norm(parameters, norm_type=2, mpu=None):
        total_norm = 0.
        for p in parameters:
            if mpu is not None:
-                if (mpu.get_model_parallel_rank() == 0) or (hasattr(p,
-                                                                    'model_parallel')
-                                                            and p.model_parallel):
+                if (mpu.get_model_parallel_rank() == 0
+                    ) or is_model_parallel_parameter(p):
                    try:
                        param_norm = float(torch.norm(p, norm_type, dtype=torch.float32))
                    except TypeError as err:
@@ -255,6 +279,255 @@ def is_model_parallel_parameter(p):
    return hasattr(p, 'model_parallel') and p.model_parallel
def prefix_sum_inc(weights):
""" Compute an inclusive prefix sum.
Example:
>>> prefix_sum_inc([3,4,5])
[3, 7, 12]
"""
weights_ = [w for w in weights]
for x in range(1, len(weights_)):
weights_[x] += weights_[x - 1]
return weights_
def partition_uniform(num_items, num_parts):
parts = [0] * (num_parts + 1)
# First check for the trivial edge case
if num_items <= num_parts:
for p in range(num_parts + 1):
parts[p] = min(p, num_items)
return parts
chunksize = floor(num_items / num_parts)
for p in range(num_parts):
parts[p] = min(chunksize * p, num_items)
parts[num_parts] = num_items
return parts
def _lprobe(weights, num_parts, bottleneck):
num_items = len(weights)
total_weight = weights[-1]
# initialize partitioning
parts = [0] * (num_parts + 1)
for p in range(1, num_parts + 1):
parts[p] = num_items
bsum = bottleneck # running sum of target weight for pth partition
chunksize = num_items // num_parts
step = chunksize
for p in range(1, num_parts):
# Jump to the next bucket
while (step < num_items) and (weights[step] < bsum):
step += chunksize
# Find the end index of partition p
parts[p] = bisect_left(weights,
bsum,
lo=step - chunksize,
hi=min(step,
num_items))
# Nothing more to partition, return early
if parts[p] == num_items:
# See if the current partition is overweight.
part_size = weights[-1] - weights[parts[p - 1]]
return parts, part_size < bottleneck
# Next partition target
bsum = weights[parts[p] - 1] + bottleneck
return parts, bsum >= total_weight
def _rb_partition_balanced(weights, num_parts, eps):
total_weight = weights[-1]
lower = total_weight / num_parts # best case heaviest partition
upper = total_weight # worst case heaviest partition
# Do a binary search for the best partitioning
while upper > lower + eps:
mid = lower + ((upper - lower) / 2)
parts, success = _lprobe(weights, num_parts, mid)
if success:
upper = mid
else:
lower = mid + eps
return upper
def partition_balanced(weights, num_parts, eps=1e-3):
num_items = len(weights)
# First check for the trivial edge case
if num_items <= num_parts:
return partition_uniform(num_items, num_parts)
weights_ = prefix_sum_inc(weights)
# Find the smallest bottleneck (weight of heaviest partition)
bottleneck = _rb_partition_balanced(weights_, num_parts, eps=eps)
# Now compute that partitioning
parts, success = _lprobe(weights_, num_parts, bottleneck)
assert success
return parts
class PartitionedTensor:
def __init__(self, tensor, group, partition_meta=None):
super().__init__()
self.group = group
self.num_parts = dist.get_world_size(group=self.group)
self.rank = dist.get_rank(group=self.group)
self.orig_size = list(tensor.size())
self.orig_device = tensor.device
self.local_data, self.partition = self._partition_tensor(tensor)
@classmethod
def from_meta(cls, meta, local_part, group, device='cuda'):
assert meta.dtype == torch.long
dummy = torch.ones(dist.get_world_size(group=group))
part_obj = cls(tensor=dummy, group=group)
meta = meta.tolist()
# [N, list0, ..., listN-1]
part_obj.orig_size = meta[1:(1 + meta[0])]
meta = meta[1 + meta[0]:]
part_obj.orig_device = device
part_obj.local_data = local_part.detach()
part_obj.group = group
# Partition is encoded like the rowptr of a CSR matrix:
# [num_parts, rank, 0, part_1, ..., part_num_parts]
# TODO: support shuffle between different partition granularities
assert part_obj.num_parts == meta[0]
assert part_obj.rank == meta[1]
part_obj.partition = meta[2:] # length num_parts+1
return part_obj
def _partition_tensor(self, tensor):
partition = partition_uniform(num_items=tensor.numel(), num_parts=self.num_parts)
start = partition[self.rank]
length = partition[self.rank + 1] - start
tensor_part = tensor.detach().contiguous().view(-1).narrow(
0,
start=start,
length=length).clone()
return tensor_part, partition
def full(self, device=None):
if device is None:
device = self.orig_device
# Allocate the full tensor as a flat buffer.
full_numel = prod(self.full_size())
flat_tensor = torch.zeros([full_numel],
dtype=self.local_data.dtype,
device=device)
# Prepare all-gather buffer
partition_tensors = []
for part_id in range(self.num_parts):
part_size = self.partition[part_id + 1] - self.partition[part_id]
buf = flat_tensor.narrow(0, start=self.partition[part_id], length=part_size)
if part_id == self.rank:
buf.copy_(self.local_data)
partition_tensors.append(buf)
# Collect the full tensor
dist.all_gather(partition_tensors,
partition_tensors[self.rank],
group=self.group)
for i in range(len(partition_tensors)):
partition_tensors[i].data = torch.zeros(1)
partition_tensors[i] = None
return flat_tensor.view(self.full_size()).clone().detach()
def to_meta(self):
"""Returns a torch.LongTensor that encodes partitioning information.
Can be used along with ``data()`` to serialize a ``PartitionedTensor`` for
communication.
Returns:
torch.LongTensor: a tensor encoding the meta-information for the partitioning
"""
meta = []
meta.append(len(self.orig_size))
meta += list(self.orig_size)
meta.append(self.num_parts)
meta.append(self.rank)
meta += self.partition
return torch.LongTensor(data=meta).to(self.orig_device)
def data(self):
return self.local_data
def local_size(self):
return self.local_data.size()
def full_size(self):
return self.orig_size
mem_alloced = 0
mem_cached = 0
def memory_status(msg, print_rank=-1, reset_max=False):
global mem_alloced, mem_cached
rank = dist.get_rank()
if print_rank != -1 and rank != print_rank:
return
torch.cuda.synchronize()
if reset_max:
torch.cuda.reset_max_memory_cached()
torch.cuda.reset_max_memory_allocated()
new_alloced = torch.cuda.memory_allocated()
new_cached = torch.cuda.memory_cached()
delta_alloced = new_alloced - mem_alloced
delta_cached = new_cached - mem_cached
mem_cached = new_cached
mem_alloced = new_alloced
max_alloced = torch.cuda.max_memory_allocated()
max_cached = torch.cuda.max_memory_cached()
# convert to GB for printing
new_alloced /= 1024**3
new_cached /= 1024**3
delta_alloced /= 1024**3
delta_cached /= 1024**3
max_alloced /= 1024**3
max_cached /= 1024**3
print(
f'RANK={rank} MEMSTATS',
msg,
f'device={torch.cuda.current_device()} '
f'current alloc={new_alloced:0.4f}GB (delta={delta_alloced:0.4f}GB max={max_alloced:0.4f}GB) '
f'current cache={new_cached:0.4f}GB (delta={delta_cached:0.4f}GB max={max_cached:0.4f}GB)'
)
def see_memory_usage(message): def see_memory_usage(message):
return return
if torch.distributed.is_initialized() and not torch.distributed.get_rank() == 0: if torch.distributed.is_initialized() and not torch.distributed.get_rank() == 0:
@@ -278,3 +551,25 @@ def see_memory_usage(message):
        "Max cache Allocated %s GigaBytes",
        torch.cuda.max_memory_cached() / (1024 * 1024 * 1024),
    )
def call_to_str(base, *args, **kwargs):
"""Construct a string representation of a call.
Args:
base (str): name of the call
args (tuple, optional): args to ``base``
kwargs (dict, optional): kwargs supplied to ``base``
Returns:
str: A string representation of base(*args, **kwargs)
"""
name = f'{base}('
if args:
name += ', '.join(repr(arg) for arg in args)
if kwargs:
name += ', '
if kwargs:
name += ', '.join(f'{key}={repr(arg)}' for key, arg in kwargs.items())
name += ')'
return name
@@ -26,13 +26,6 @@ def get_group_alignment_padding(tensor_list, sub_partition_size, sub_partition_c
        padding = get_alignment_padding(flattened_size, i, sub_partition_size)
        group_paddings.append(padding)

-    logger.info("****Padding information*****")
-    logger.info(f"tensor_size = {flattened_size}")
-    logger.info(f"sub_partition_size = {sub_partition_size}")
-    logger.info(f"sub_partition_count = {sub_partition_count}")
-    for i, padding in enumerate(group_paddings):
-        logger.info(f"padding[{i}] = {padding}")
-
    return group_paddings
......
from deepspeed.utils.logging import logger, log_dist
from deepspeed.runtime.dataloader import RepeatingLoader
@@ -12,9 +12,9 @@ from deepspeed.utils import logger
def print_rank_0(message):
    if torch.distributed.is_initialized():
        if torch.distributed.get_rank() == 0:
-            logger.info(message)
+            print(message)
    else:
-        logger.info(message)
+        print(message)
class SynchronizedWallClockTimer:
@@ -85,12 +85,15 @@ class SynchronizedWallClockTimer:
    def log(self, names, normalizer=1.0, reset=True, memory_breakdown=False):
        """Log a group of timers."""
        assert normalizer > 0.0
-        string = 'time (ms)'
+        string = f'rank={torch.distributed.get_rank()} time (ms)'
        for name in names:
-            elapsed_time = self.timers[name].elapsed(reset=reset) * 1000.0 / normalizer
-            string += ' | {}: {:.2f}'.format(name, elapsed_time)
-        if memory_breakdown:
-            string += self.memory_usage()
+            if name in self.timers:
+                elapsed_time = self.timers[name].elapsed(
+                    reset=reset) * 1000.0 / normalizer
+                string += ' | {}: {:.2f}'.format(name, elapsed_time)
+        # TODO: use our logging utilities to selectively print. Useful for model
+        # parallelism because rank=0 is too restrictive.
        print_rank_0(string)
......
@@ -66,5 +66,7 @@ lnav:
    url: /tutorials/lrrt/
  - title: "DeepSpeed Sparse Attention"
    url: /tutorials/sparse-attention/
+  - title: "Pipeline Parallelism"
+    url: /tutorials/pipeline/
  - title: "Contributing"
    url: /contributing/
---
title: "Pipeline Parallelism"
---
DeepSpeed v0.3 includes new support for pipeline parallelism! Pipeline
parallelism improves both the memory and compute efficiency of deep learning
training by partitioning the layers of a model into stages that can be
processed in parallel.
DeepSpeed's training engine provides hybrid data and pipeline parallelism and
can be further combined with model parallelism such as
[Megatron-LM](https://github.com/NVIDIA/Megatron-LM).
An illustration of
3D parallelism is shown below. Our latest [results](linklinklink)
demonstrate that this 3D parallelism enables training models with over a
**trillion** parameters.
![3D parallelism in DeepSpeed](/assets/images/3d-parallelism.png)
DeepSpeed uses *gradient accumulation* to extract pipeline parallelism (shown
below). Each batch of training data is divided into micro-batches that can be
processed in parallel by the pipeline stages. Once a stage completes the
forward pass for a micro-batch, the activation memory is communicated to the
next stage in the pipeline. Similarly, as the next stage completes its
backward pass on a micro-batch, the gradient with respect to the activation
is communicated backwards through the pipeline. Each backward pass
accumulates gradients locally. Next, all data parallel groups perform
reductions of the gradients in parallel. Lastly, the optimizer updates the
model weights.
Below is an illustration of how DeepSpeed will train a batch with eight
micro-batches using hybrid two-way data parallelism and two-stage pipeline
parallelism. GPUs 0 and 2 are arranged in a pipeline and will alternate
forward (F) and backward (B) passes. They will then all-reduce (AR) gradients
with their data parallel counterparts, GPUs 1 and 3, respectively. Finally,
the two pipeline stages update their model weights.
![Pipeline Schedule](/assets/images/pipe-schedule.png)
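The rank layout in this example comes directly from the process topology introduced in this PR. The snippet below is an illustrative sketch only; it assumes the topology class is importable as `deepspeed.runtime.pipe.topology.PipeDataParallelTopology`, matching this PR's file layout. With two stages and two data-parallel replicas it reproduces the groups described above:

```python
# Illustrative sketch; the import path is an assumption based on this PR.
from deepspeed.runtime.pipe.topology import PipeDataParallelTopology

topo = PipeDataParallelTopology(num_pp=2, num_dp=2)

# Each inner list is one pipeline: GPU 0 feeds GPU 2, and GPU 1 feeds GPU 3.
print(topo.get_axis_comm_lists('pipe'))   # [[0, 2], [1, 3]]

# Each inner list is one data-parallel group used for the gradient all-reduce.
print(topo.get_axis_comm_lists('data'))   # [[0, 1], [2, 3]]
```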
## Getting Started with Pipeline Parallelism
DeepSpeed strives to accelerate *and* simplify the process of pipeline
parallel training. This section provides first steps with hybrid data and
pipeline parallel training by preparing `torchvision`'s
[AlexNet](https://pytorch.org/docs/1.2.0/_modules/torchvision/models/alexnet.html)
model.
### Expressing Pipeline Models
Pipeline parallelism requires models to be expressed as a sequence of layers.
In the forward pass, each layer consumes the output of the previous
layer. In fact, there is no need to specify a `forward()` for a pipeline
parallel model! The forward pass of a pipeline parallel model implicitly
takes the form:
```python
def forward(self, inputs):
x = inputs
for layer in self.layers:
x = layer(x)
return x
```
PyTorch's
[`torch.nn.Sequential`](https://pytorch.org/docs/stable/generated/torch.nn.Sequential.html)
is a convenient container for expressing pipeline parallel models and can be
parallelized by DeepSpeed with no modification:
```python
net = nn.Sequential(
nn.Linear(in_features, hidden_dim),
nn.ReLU(inplace=True),
nn.Linear(hidden_dim, out_features)
)
from deepspeed.pipe import PipelineModule
net = PipelineModule(layers=net, num_stages=2)
```
`PipelineModule` uses its `layers` argument as the sequence of layers that
comprise the model. After initialization, `net` is divided into two pipeline
stages and its layers moved to the corresponding GPUs. If more than two GPUs
are present, DeepSpeed will also use hybrid data parallelism.
**Note:** The total number of GPUs must be divisible by the number of pipeline
stages.
{: .notice--info}
**Note:** For large model training, see [memory-efficient model construction](#memory-efficient-module-initialization).
{: .notice--info}
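Once the model is wrapped in a `PipelineModule`, it is handed to `deepspeed.initialize()` as usual; the change to `deepspeed/__init__.py` in this PR constructs a `PipelineEngine` instead of the standard `DeepSpeedEngine` in that case. A minimal sketch, assuming `args` has been parsed elsewhere and references a DeepSpeed configuration:

```python
import deepspeed

# Sketch only: `net` is the PipelineModule constructed above, and `args` is
# assumed to point at a DeepSpeed config (e.g., via --deepspeed_config).
engine, _, _, _ = deepspeed.initialize(
    args=args,
    model=net,
    model_parameters=[p for p in net.parameters() if p.requires_grad])
```

The returned `engine` is the pipeline engine whose training loop is described below.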
### AlexNet
Let's look at an abbreviated implementation of `torchvision`'s
[AlexNet](https://pytorch.org/docs/1.2.0/_modules/torchvision/models/alexnet.html):
```python
class AlexNet(nn.Module):
def __init__(self, num_classes=1000):
super(AlexNet, self).__init__()
self.features = nn.Sequential(
nn.Conv2d(3, 64, kernel_size=11, stride=4, padding=2),
...
nn.MaxPool2d(kernel_size=3, stride=2),
)
self.avgpool = nn.AdaptiveAvgPool2d((6, 6))
self.classifier = nn.Sequential(
nn.Dropout(),
...
nn.Linear(4096, num_classes),
)
def forward(self, x):
x = self.features(x)
x = self.avgpool(x)
x = torch.flatten(x, 1)
x = self.classifier(x)
return x
```
`AlexNet` is mostly a composition of several `Sequential` submodules. We can
turn this into a `PipelineModule` by flattening its submodules into a single
sequence of layers:
```python
class AlexNetPipe(AlexNet):
def to_layers(self):
layers = [
*self.features,
self.avgpool,
lambda x: torch.flatten(x, 1),
*self.classifier
]
return layers
from deepspeed.pipe import PipelineModule
net = AlexNetPipe()
net = PipelineModule(layers=net.to_layers(), num_stages=2)
```
**Note:**
the `lambda` in the middle of `layers` above is not a `torch.nn.Module`
type. Any object that implements `__call__()` can be a layer in a
`PipelineModule`: this allows for convenient data transformations in the
pipeline.
{: .notice--info}
### Inputs and Outputs
Following `torch.nn.Sequential`, the inputs and outputs of each layer must be
either a single `torch.Tensor` or a `tuple` of tensors. In practice, some
models may need to modify their forward pass to pack and unpack arguments to
`forward()`. Consider an abbreviated implementation of a stack of Transformer
blocks:
```python
class TransformerBlock(nn.Module):
    ...
    def forward(self, hidden, mask):
        output = self.compute(hidden, mask)
        return output
    ...

stack = [ TransformerBlock() for _ in range(num_layers) ]
```
Two modifications to `TransformerBlock` are required:
1. The arguments must be collected into a `tuple`.
2. `mask` must also be returned from `forward()` to pass to the next layer.
These modifications can be accomplished with a short subclass:
```python
class TransformerBlockPipe(TransformerBlock):
    def forward(self, inputs):
        hidden, mask = inputs
        output = super().forward(hidden, mask)
        return (output, mask)

stack = [ TransformerBlockPipe() for _ in range(num_layers) ]
```
### Training Loops
Pipeline parallelism interleaves forward and backward passes, and thus the
training loop cannot be divided into separate stages of `forward()`,
`backward()` and `step()`.
Instead, DeepSpeed's pipeline engine provides a `train_batch()` method that
advances the pipeline engine until the next batch of training data is
consumed and the model weights updated.
```python
train_iter = iter(train_loader)
loss = engine.train_batch(data_iter=train_iter)
```
The above `train_batch()` example is equivalent to the following with
traditional data parallel DeepSpeed:
```python
train_iter = iter(train_loader)
for micro_batch in range(engine.gradient_accumulation_steps()):
    batch = next(train_iter)
    loss = engine(batch)
    engine.backward(loss)
    engine.step()
```
### Dealing with Data
Data parallel training typically has each worker perform IO independently at
the start of each batch. However, in a pipeline parallel environment, only the
first stage uses the input data, and only the last stage uses labels for loss
calculation.
**Note:**
The pipeline engine expects data loaders to return a `tuple` of two items. The
first returned item is the input batch data, and the second item is the data
to be used in the loss calculation. As before, inputs and labels should be
either `torch.Tensor` type or a `tuple` of tensors.
{: .notice--info}
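As an illustration, here is a minimal, hypothetical dataset whose items follow this `(input, label)` contract; the shapes, sample count, and class count are placeholders rather than part of the tutorial's CIFAR example:

```python
import torch
from torch.utils.data import Dataset

class RandomCifarLikeDataset(Dataset):
    """Toy dataset: every item is an (input, label) pair of tensors,
    which is the format the pipeline engine expects."""
    def __init__(self, num_samples=1024, num_classes=10):
        self.num_samples = num_samples
        self.num_classes = num_classes

    def __len__(self):
        return self.num_samples

    def __getitem__(self, idx):
        image = torch.randn(3, 32, 32)               # consumed by the first pipeline stage
        label = torch.randint(self.num_classes, ())  # consumed by the last stage's loss
        return image, label
```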
For convenience, the DeepSpeed pipeline engine can construct a distributed
data loader when a dataset is provided to `deepspeed.initialize()`. DeepSpeed
handles the rest of the complexity of data loading, and so the pipeline
training loop becomes:
```python
engine, _, _, _ = deepspeed.initialize(
    args=args,
    model=net,
    model_parameters=[p for p in net.parameters() if p.requires_grad],
    training_data=cifar_trainset())

for step in range(args.steps):
    loss = engine.train_batch()
```
Of course, DeepSpeed will work with any data loader that you wish to use.
Data loaders should be constructed by the first and last stages in the
pipeline. Each worker should load micro-batches of size
`engine.train_micro_batch_size_per_gpu()` and will be queried
a total of `engine.gradient_accumulation_steps()` times per `train_batch()`.
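A minimal sketch of constructing such a loader by hand is below. It assumes the pipeline engine exposes `is_first_stage()` and `is_last_stage()` helpers (check your DeepSpeed version for the exact API) and uses a hypothetical `train_dataset` object:

```python
from torch.utils.data import DataLoader

# Only the first and last pipeline stages consume data, so only they need a loader.
# is_first_stage()/is_last_stage() are assumed engine helpers; train_dataset is hypothetical.
train_iter = None
if engine.is_first_stage() or engine.is_last_stage():
    loader = DataLoader(train_dataset,
                        batch_size=engine.train_micro_batch_size_per_gpu(),
                        shuffle=False)
    train_iter = iter(loader)

# Each call pulls engine.gradient_accumulation_steps() micro-batches from train_iter.
loss = engine.train_batch(data_iter=train_iter)
```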
**Watch out!**
The pipeline engine *pulls* data from an iterator instead of iterating over
it. It's critical that the data stream does not empty in the middle of a
training batch. Each invocation of `train_batch()` will pull
a total of `engine.gradient_accumulation_steps()` micro-batches of data from
the data iterator.
{: .notice--warning}
DeepSpeed provides a convenience class `deepspeed.utils.RepeatingLoader` that
simply wraps an iterable such as a data loader and restarts it whenever the
end is reached:
```python
train_loader = deepspeed.utils.RepeatingLoader(train_loader)
train_iter = iter(train_loader)
for step in range(args.steps):
    loss = engine.train_batch(data_iter=train_iter)
```
## Advanced Topics
### Load Balancing Pipeline Modules
The performance of pipeline parallel training strongly relies on load
balance. DeepSpeed provides several mechanisms for partitioning the model
across GPUs. These strategies can be set with the `partition_method` keyword
argument to `PipelineModule`. Here are partitioning methods currently provided
by DeepSpeed:
* `partition_method="parameters"` (**default**)
  balances the number of trainable parameters on each pipeline stage. This is
especially useful in memory-constrained environments and when the size of a
layer is proportional to the computation time.
* `partition_method="type:[regex]"`
balances layers whose class names match `[regex]`. The regular expression
is not case sensitive. For example, `partition_method="type:transformer"`
would balance the number of transformer layers per stage.
* `partition_method="uniform"` balances the number of layers per stage.
### Memory-Efficient Model Construction
Building a `Sequential` container and providing it to `PipelineModule` is a
convenient way of specifying a pipeline parallel model. However, this approach
does not scale to massive models: every worker redundantly allocates the whole
model in CPU memory before partitioning, so a machine with 16 GPUs must have 16
times the model size in local CPU memory.
DeepSpeed provides a `LayerSpec` class that delays the construction of
modules until the model layers have been partitioned across workers. Then,
the modules are built on the GPU that owns the layer.
Here's an example of the abbreviated AlexNet model, but expressed only
with `LayerSpec`s. Note that the syntax is almost unchanged: `nn.ReLU(inplace=True)`
simply becomes `LayerSpec(nn.ReLU, inplace=True)`.
```python
from deepspeed.pipe import PipelineModule, LayerSpec

class AlexNetPipe(PipelineModule):
    def __init__(self, num_classes=10, **kwargs):
        self.num_classes = num_classes
        specs = [
            LayerSpec(nn.Conv2d, 3, 64, kernel_size=11, stride=4, padding=2),
            LayerSpec(nn.ReLU, inplace=True),
            ...
            LayerSpec(nn.ReLU, inplace=True),
            LayerSpec(nn.Linear, 4096, self.num_classes),
        ]
        super().__init__(layers=specs, loss_fn=nn.CrossEntropyLoss(), **kwargs)
```
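Since `AlexNetPipe` above subclasses `PipelineModule` and forwards `**kwargs`, it can be constructed directly with the usual pipeline arguments and passed to `deepspeed.initialize()` as before. A brief sketch, mirroring the earlier initialization call (with `args` and `cifar_trainset()` as in that example):

```python
# Layers described by LayerSpec are only materialized on the GPUs that own them.
net = AlexNetPipe(num_classes=10, num_stages=2)
engine, _, _, _ = deepspeed.initialize(
    args=args,
    model=net,
    model_parameters=[p for p in net.parameters() if p.requires_grad],
    training_data=cifar_trainset())
```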
### Tied Layers
Some models cannot be entirely expressed as pipeline parallel models because
some layers are reused in the pipeline. For example, Transformer based
language models commonly use an embedding layer early in the pipeline to map
vocabulary to hidden states, and then reuse the same embedding at the end of
the pipeline to map hidden states back to vocabulary. A pure sequence of layers
cannot express this reuse, because the embedding would need to live on two
different pipeline stages.
DeepSpeed provides a `TiedLayerSpec` that is an extension of
`LayerSpec`. `TiedLayerSpec` requires an additional argument: `key`.
Each reuse of a layer is specified with a `TiedLayerSpec`, and the `key` field
is used to identify where a layer is reused.
Tied layers are replicated on every pipeline stage that uses them. Training
then proceeds as normal, but an additional all-reduce of the tied gradients is
performed after all backward passes complete. This all-reduce ensures that the
weights of the tied layer remain in sync across pipeline stages.
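Below is a sketch of tying an embedding at both ends of a GPT-style pipeline. The vocabulary and hidden sizes, the reuse of `TransformerBlockPipe` from earlier, and the `forward_fn` keyword are assumptions for illustration (attention masks and positional embeddings are omitted); consult your DeepSpeed version for the exact `TiedLayerSpec` signature.

```python
import torch.nn as nn
import torch.nn.functional as F
from deepspeed.pipe import PipelineModule, LayerSpec, TiedLayerSpec

def embedding_as_head(embedding, hidden):
    # Reuse the tied embedding weight to project hidden states back to vocabulary logits.
    return F.linear(hidden, embedding.weight)

vocab_size, hidden_dim, num_layers = 50257, 1024, 24   # placeholder sizes

specs = [
    # Input side: a normal embedding lookup. key='embed' names the tied group.
    TiedLayerSpec('embed', nn.Embedding, vocab_size, hidden_dim),
    *[LayerSpec(TransformerBlockPipe) for _ in range(num_layers)],
    # Output side: same key, so the same weights are reused; the assumed forward_fn
    # turns the embedding into an output projection instead of a lookup.
    TiedLayerSpec('embed', nn.Embedding, vocab_size, hidden_dim,
                  forward_fn=embedding_as_head),
]
net = PipelineModule(layers=specs, num_stages=4)
```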