Commit eadbbe09 authored by 401qingkong

push rocm deepspeed v0.3.13

parent ab5534fc
"""
Copyright 2020 The Microsoft DeepSpeed Team
"""
import torch
from .transformer import TransformerBuilder
class StochasticTransformerBuilder(TransformerBuilder):
BUILD_VAR = "DS_BUILD_STOCHASTIC_TRANSFORMER"
NAME = "stochastic_transformer"
def __init__(self):
super().__init__(name=self.NAME)
def absolute_name(self):
return f'deepspeed.ops.transformer.{self.NAME}_op'
def nvcc_args(self):
args = super().nvcc_args()
args.append('-D__STOCHASTIC_MODE__')
return args
"""
Copyright 2020 The Microsoft DeepSpeed Team
"""
import torch
from .builder import CUDAOpBuilder
class TransformerBuilder(CUDAOpBuilder):
BUILD_VAR = "DS_BUILD_TRANSFORMER"
NAME = "transformer"
def __init__(self, name=None):
name = self.NAME if name is None else name
super().__init__(name=name)
def absolute_name(self):
return f'deepspeed.ops.transformer.{self.NAME}_op'
def sources(self):
return [
'csrc/transformer/ds_transformer_cuda.cpp',
'csrc/transformer/cublas_wrappers.cu',
'csrc/transformer/transform_kernels.cu',
'csrc/transformer/gelu_kernels.cu',
'csrc/transformer/dropout_kernels.cu',
'csrc/transformer/normalize_kernels.cu',
'csrc/transformer/softmax_kernels.cu',
'csrc/transformer/general_kernels.cu'
]
def include_paths(self):
return ['csrc/includes']
def nvcc_args(self):
args = [
'-O3',
'--use_fast_math',
'-std=c++14',
'-U__CUDA_NO_HALF_OPERATORS__',
'-U__CUDA_NO_HALF_CONVERSIONS__',
'-U__CUDA_NO_HALF2_OPERATORS__'
]
return args + self.compute_capability_args()
def cxx_args(self):
return ['-O3', '-std=c++14', '-g', '-Wno-reorder']
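# Usage sketch: these builder classes are normally consumed through DeepSpeed's
# JIT mechanism rather than instantiated directly by end users. Assuming the
# upstream OpBuilder.load() helper is available, the transformer kernels could
# be compiled and loaded lazily on first use, e.g.:
#
#     from deepspeed.ops.op_builder import TransformerBuilder
#     transformer_cuda = TransformerBuilder().load()  # JIT-builds sources() with nvcc_args()/cxx_args()
#
# Setting the BUILD_VAR above (DS_BUILD_TRANSFORMER=1) pre-builds the op at
# install time instead.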
"""
Copyright 2020 The Microsoft DeepSpeed Team
"""
from .builder import OpBuilder
class UtilsBuilder(OpBuilder):
BUILD_VAR = "DS_BUILD_UTILS"
NAME = "utils"
def __init__(self):
super().__init__(name=self.NAME)
def absolute_name(self):
return f'deepspeed.ops.{self.NAME}_op'
def sources(self):
return ['csrc/utils/flatten_unflatten.cpp']
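# Usage sketch: the utils op exposes the flatten/unflatten helpers used to pack
# a list of tensors into one contiguous buffer for communication. Assuming the
# upstream OpBuilder.load() JIT helper, a typical pattern might look like:
#
#     from deepspeed.ops.op_builder import UtilsBuilder
#     util_ops = UtilsBuilder().load()
#     flat = util_ops.flatten(tensor_list)           # one contiguous buffer
#     views = util_ops.unflatten(flat, tensor_list)  # views shaped like the originals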
'''
Copyright 2019 The Microsoft DeepSpeed Team
'''
from mpi4py import MPI
import numpy as np
import cupy
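# my_igather: nonblocking gather to `root`. The root rank posts an Irecv from
# every other rank into recbuf[idx] and copies its own contribution in place,
# while non-root ranks Isend their buffer to the root. The outstanding MPI
# requests are returned so the caller can Waitall on them later.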
def my_igather(rank, size, comm, sendbuf, recbuf, root):
req = []
if rank == root:
for idx in range(size):
if idx != rank:
req.append(comm.Irecv(recbuf[idx], source=idx))
else:
recbuf[rank] = sendbuf
else:
req.append(comm.Isend(sendbuf, dest=root))
return req
def gather_cuda(rank,
world_size,
comm,
cupy_sign_list_packed,
cupy_recvbuf_sign,
cupy_worker_scale,
cupy_recvbuf_scale):
# We do in-place operations on cupy buffers so we do not return any buffers
requests = []
for idx in range(world_size):
req_sign = my_igather(rank,
world_size,
comm,
cupy_sign_list_packed[idx],
cupy_recvbuf_sign,
root=idx)
requests += req_sign
for idx in range(world_size):
req_scale = my_igather(rank,
world_size,
comm,
cupy_worker_scale,
cupy_recvbuf_scale,
root=idx)
requests += req_scale
MPI.Request.Waitall(requests)
def gather_host(rank,
world_size,
comm,
cupy_sign_list_packed,
cupy_recvbuf_sign,
cupy_worker_scale,
cupy_recvbuf_scale):
# In-place operations are not possible for newly created cupy arrays
# so we need to return the new buffers
numpy_recvbuf_sign = np.zeros([world_size,
cupy_sign_list_packed[rank].size],
dtype=cupy_sign_list_packed[0].dtype)
numpy_recvbuf_scale = np.zeros([world_size, 1], dtype=cupy_worker_scale.dtype)
# 1. convert from cupy to numpy
numpy_sign_list_packed = cupy_sign_list_packed
for idx in range(world_size):
numpy_sign_list_packed[idx] = cupy.asnumpy(cupy_sign_list_packed[idx])
numpy_worker_scale = cupy.asnumpy(cupy_worker_scale)
numpy_recvbuf_scale = cupy.asnumpy(cupy_recvbuf_scale)
cupy.cuda.get_current_stream().synchronize()
# 2. use numpy buffers for communication
requests = []
for idx in range(world_size):
req_sign = my_igather(rank,
world_size,
comm,
numpy_sign_list_packed[idx],
numpy_recvbuf_sign,
root=idx)
requests += req_sign
for idx in range(world_size):
req_scale = my_igather(rank,
world_size,
comm,
numpy_worker_scale,
numpy_recvbuf_scale,
root=idx)
requests += req_scale
MPI.Request.Waitall(requests)
# 3. Convert back from numpy to cupy
cupy_recvbuf_sign = cupy.asarray(numpy_recvbuf_sign)
for idx in range(world_size):
cupy_sign_list_packed[idx] = cupy.asarray(numpy_sign_list_packed[idx])
cupy_worker_scale = cupy.asarray(numpy_worker_scale)
cupy_recvbuf_scale = cupy.asarray(numpy_recvbuf_scale)
cupy.cuda.get_current_stream().synchronize()
return cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale
def allgather_cuda(comm,
cupy_server_sign_packed,
cupy_recvbuf_sign_server,
cupy_server_scale,
cupy_recvbuf_scale_server):
comm.Allgather(cupy_server_sign_packed, cupy_recvbuf_sign_server)
comm.Allgather(cupy_server_scale, cupy_recvbuf_scale_server)
def allgather_host(comm,
cupy_server_sign_packed,
cupy_recvbuf_sign_server,
cupy_server_scale,
cupy_recvbuf_scale_server):
# 1. Convert cupy to numpy
numpy_recvbuf_sign_server = np.zeros([comm.Get_size(),
cupy_server_sign_packed.size],
dtype=cupy_server_sign_packed.dtype)
numpy_recvbuf_scale_server = np.zeros([comm.Get_size(),
1],
dtype=cupy_server_scale.dtype)
numpy_server_sign_packed = cupy.asnumpy(cupy_server_sign_packed)
numpy_recvbuf_sign_server = cupy.asnumpy(cupy_recvbuf_sign_server)
numpy_server_scale = cupy.asnumpy(cupy_server_scale)
numpy_recvbuf_scale_server = cupy.asnumpy(cupy_recvbuf_scale_server)
cupy.cuda.get_current_stream().synchronize()
# 2. Communicate numpy buffers
comm.Allgather(numpy_server_sign_packed, numpy_recvbuf_sign_server)
comm.Allgather(numpy_server_scale, numpy_recvbuf_scale_server)
comm.Barrier()
# 3. Convert numpy back to cupy
cupy_server_sign_packed = cupy.asarray(numpy_server_sign_packed)
cupy_recvbuf_sign_server = cupy.asarray(numpy_recvbuf_sign_server)
cupy_server_scale = cupy.asarray(numpy_server_scale)
cupy_recvbuf_scale_server = cupy.asarray(numpy_recvbuf_scale_server)
cupy.cuda.get_current_stream().synchronize()
return cupy_server_sign_packed, cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server
......@@ -3,13 +3,10 @@ Copyright 2019 The Microsoft DeepSpeed Team
'''
import os
import stat
import torch
import warnings
import hashlib
import torch.distributed as dist
from collections import OrderedDict
from shutil import copyfile
from torch.nn.modules import Module
from torch.distributed.distributed_c10d import _get_global_rank
......@@ -388,9 +385,6 @@ class DeepSpeedEngine(Module):
def zero_param_persistence_threshold(self):
return self._config.zero_config.param_persistence_threshold
def zero_gather_fp16_weights_on_model_save(self):
return self._config.zero_config.gather_fp16_weights_on_model_save
def fp16_enabled(self):
return self._config.fp16_enabled
......@@ -681,12 +675,8 @@ class DeepSpeedEngine(Module):
from deepspeed.ops.lamb import FusedLamb
optimizer = FusedLamb(model_parameters, **optimizer_parameters)
elif self.optimizer_name() == ONEBIT_ADAM_OPTIMIZER:
from deepspeed.runtime.fp16.onebit.adam import OnebitAdam
from deepspeed.runtime.fp16.onebit_adam import OnebitAdam
optimizer = OnebitAdam(model_parameters, self, **optimizer_parameters)
if not self.fp16_enabled():
logger.warning(
f'Currently the convergence of 1-bit Adam is only verified under FP16'
)
else:
torch_optimizer = getattr(torch.optim, self.optimizer_name())
optimizer = torch_optimizer(model_parameters, **optimizer_parameters)
......@@ -955,7 +945,7 @@ class DeepSpeedEngine(Module):
Arguments:
loss: Torch tensor on which to execute backward propagation
allreduce_gradients: is deprecated, ignored, and will soon be removed
allreduce_gradients: If this is False, then gradient averaging will be skipped. Default is True.
"""
if not allreduce_gradients:
......@@ -1358,7 +1348,7 @@ class DeepSpeedEngine(Module):
zero_ckpt_name = os.path.join(
checkpoints_path,
str(tag),
filename + '_mp_rank_{:02d}'.format(mp_rank) + '_optim_states.pt')
filename + '_mp_rank_{:02d}'.format(mp_rank) + 'optim_states.pt')
return zero_ckpt_name
def _get_zero_ckpt_name(self, checkpoints_path, tag):
......@@ -1535,20 +1525,13 @@ class DeepSpeedEngine(Module):
mp_rank=mp_rank,
dp_world_size=self.loaded_checkpoint_dp_world_size)
invalid_zero_ckpt_paths = []
for i, ckpt_name in enumerate(zero_ckpt_names):
for ckpt_name in zero_ckpt_names:
if not os.path.exists(ckpt_name):
# transparently handle the old file pattern for optim_states
if 'optim_states.pt' in ckpt_name:
ckpt_name_try = ckpt_name.replace("_optim_states.pt",
"optim_states.pt")
if os.path.exists(ckpt_name_try):
zero_ckpt_names[i] = ckpt_name_try
continue
invalid_zero_ckpt_paths.append(ckpt_name)
if len(invalid_zero_ckpt_paths) > 0:
logger.warn(
f"The following zero checkpoints paths are missing: {invalid_zero_ckpt_paths}"
f"Client provided zero checkpoint load paths: {invalid_zero_ckpt_paths} does not exist"
)
return None
......@@ -1690,125 +1673,8 @@ class DeepSpeedEngine(Module):
torch.save(state, save_path)
self._curr_save_path = None
def _get_param_shapes(self):
param_shapes = OrderedDict()
for name, param in self.module.named_parameters():
param_shapes[name] = param.ds_shape if hasattr(param,
"ds_shape") else param.shape
# print(f"saving param {name} {param_shapes[name]}")
return param_shapes
def _copy_recovery_script(self, save_path):
base_dir = os.path.dirname(os.path.dirname(__file__))
script = "zero_to_fp32.py"
src = os.path.join(base_dir, "utils", script)
dst = os.path.join(save_path, script)
logger.info(f"creating recovery script {dst}")
copyfile(src, dst)
# make executable
os.chmod(dst, os.stat(dst).st_mode | stat.S_IEXEC)
def _save_zero_checkpoint(self, save_path, tag):
zero_checkpoint_name = self._get_zero_ckpt_name(save_path, tag)
zero_sd = dict(
optimizer_state_dict=self.optimizer.state_dict(),
param_shapes=self._get_param_shapes(),
)
zero_sd = {'optimizer_state_dict': self.optimizer.state_dict()}
torch.save(zero_sd, zero_checkpoint_name)
self._copy_recovery_script(save_path)
logger.info('zero checkpoint saved {}'.format(zero_checkpoint_name))
def _zero3_consolidated_fp16_state_dict(self):
"""
Get a full non-partitioned state_dict with fp16 weights on cpu.
This is similar to nn.Module.state_dict (modelled after _save_to_state_dict), but:
1. consolidates the weights from different partitions on gpu0
2. works on one layer at a time to require as little gpu0 memory as possible, by
moving the already consolidated weights to cpu
3. takes care to keep the shared params shared when gradually copying the params to cpu
Returns:
a consolidated fp16 ``state_dict`` on cpu on rank 0, ``None`` on other ranks
"""
import deepspeed
if not self.zero_optimization_partition_weights():
raise ValueError("this function requires ZeRO-3 mode")
state_dict = OrderedDict() if torch.distributed.get_rank() == 0 else None
shared_weights = {}
def get_layer_state_dict(module, prefix=""):
# gather one layer at a time to be memory-efficient
with deepspeed.zero.GatheredParameters(list(
module.parameters(recurse=False))):
if torch.distributed.get_rank() == 0:
for name, param in module.named_parameters(recurse=False):
if param is None:
continue
key = prefix + name
# for shared weights we want to make sure not to unshare them when copying to cpu
data_ptr_id = param.storage().data_ptr()
if data_ptr_id in shared_weights:
# shared weights
# print(f"`{key}` is shared with `{shared_weights[data_ptr_id]}`")
state_dict[key] = state_dict[shared_weights[data_ptr_id]]
else:
state_dict[key] = param.detach().cpu()
shared_weights[data_ptr_id] = key
#print(f"param {name} {param.shape}")
#print(f"param {key} {param.shape} {state_dict[key].storage().data_ptr()}")
# now buffers - not sure if need to take care of potentially shared weights here
for name, buf in module.named_buffers(recurse=False):
if buf is not None and name not in module._non_persistent_buffers_set:
state_dict[prefix + name] = buf.detach().cpu()
for name, child in module.named_children():
if child is not None:
get_layer_state_dict(child, prefix + name + ".")
see_memory_usage("before get_layer_state_dict", force=False)
get_layer_state_dict(self.module, prefix="")
see_memory_usage("after get_layer_state_dict", force=False)
return state_dict
def save_fp16_model(self, save_dir, save_filename="pytorch_model.bin"):
r"""Save fp16 model weights
This method saves the fp16 model weights at the desired destination.
Arguments:
save_dir: Required. Directory for saving the model
save_filename: Optional. Filename to save to. Defaults to ``pytorch_model.bin``
Important: all processes must call this method and not just the process with rank 0. It is
because the processes need to work in sync to gather the weights. This method will hang
waiting to synchronize with other processes if it's called just for the process with rank 0.
"""
path = os.path.join(save_dir, save_filename)
if self.zero_optimization_partition_weights():
if self.zero_gather_fp16_weights_on_model_save():
# consolidation is expensive in time and memory and therefore isn't a default
state_dict = self._zero3_consolidated_fp16_state_dict()
else:
# the model will be bogus if not consolidated so don't confuse the user by saving it
logger.info(
f"Did not save the model {path} because `stage3_gather_fp16_weights_on_model_save` is False"
)
return
else:
state_dict = self.module.state_dict()
if torch.distributed.get_rank() == 0:
os.makedirs(save_dir, exist_ok=True)
logger.info(f"Saving model weights to {path}")
torch.save(state_dict, path)
'''
Copyright 2020 The Microsoft DeepSpeed Team
'''
import types
import torch
import importlib
import numpy as np
import time
import cupy
from torch.utils.dlpack import to_dlpack
from torch.utils.dlpack import from_dlpack
from deepspeed.utils.logging import logger
from mpi4py import MPI
from deepspeed.runtime.custom_collectives import gather_cuda, gather_host, allgather_cuda, allgather_host
class OnebitAdam(torch.optim.Optimizer):
"""Implements the 1-bit Adam algorithm. Currently GPU-only.
For usage example please see, TODO DeepSpeed Tutorial
It has been proposed in APMSqueeze (https://arxiv.org/abs/2008.11343)
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
freeze_step (int, optional): Number of steps for warmup (uncompressed)
stage before we start using compressed communication. (default 100000)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
max_coeff(float, optional): maximum value of the lamb coefficient (default: 10.0)
min_coeff(float, optional): minimum value of the lamb coefficient (default: 0.01)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False) NOT SUPPORTED in 1-bit Adam!
eps_inside_sqrt (boolean, optional): in the 'update parameters' step,
adds eps to the bias-corrected second moment estimate before
evaluating square root instead of adding it to the square root of
second moment estimate as in the original paper. (default: False)
cuda_aware (boolean, required): Set True if the underlying MPI implementation
supports CUDA-Aware communication. (default: False)
.. _Adam\: A Method for Stochastic Optimization:
https://arxiv.org/abs/1412.6980
.. _On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
def __init__(self,
params,
deepspeed=None,
lr=1e-3,
freeze_step=100000,
bias_correction=True,
betas=(0.9,
0.999),
eps=1e-8,
eps_inside_sqrt=False,
weight_decay=0.,
max_grad_norm=0.,
amsgrad=False,
cuda_aware=False):
if amsgrad:
raise RuntimeError('1-bit Adam does not support the AMSGrad variant.')
defaults = dict(lr=lr,
bias_correction=bias_correction,
betas=betas,
eps=eps,
weight_decay=weight_decay,
max_grad_norm=max_grad_norm)
super(OnebitAdam, self).__init__(params, defaults)
from mpi4py import MPI
self.eps_mode = 0 if eps_inside_sqrt else 1
self.comm = MPI.COMM_WORLD
self.rank = self.comm.Get_rank()
self.size = self.comm.Get_size()
self.comm_time = 0.0
self.step_time = 0.0
self.ave_step = 1
self.bk_time = 0.0
self.divider = int(self.size * 8 / np.gcd(self.size, 8))
self.deepspeed = deepspeed
self.adam_freeze_key = False
self.initialize = False
self.freeze_step = freeze_step
self.cuda_aware = cuda_aware
def torch2cupy(self, tensor):
return cupy.fromDlpack(to_dlpack(tensor))
def cupy2torch(self, cupy_tensor):
return from_dlpack(cupy_tensor.toDlpack())
def compress_by_chunk(self, cupy_bool_tensor, num_chunks):
packed_sign = cupy.packbits(cupy_bool_tensor)
sign_list_packed = cupy.split(packed_sign, num_chunks)
cupy.cuda.get_current_stream().synchronize()
return sign_list_packed
def Compressed_Allreduce(self,
buffer_m: torch.tensor,
worker_error,
server_error,
rank,
world_size,
comm,
local_rank):
all_start_time = time.time()
original_size = buffer_m.numel()
cupy.cuda.Device(local_rank).use()
if torch.numel(buffer_m) != torch.numel(worker_error):
empty_tensor = torch.zeros(torch.numel(worker_error) - torch.numel(buffer_m),
device=buffer_m.device)
buffer_m = torch.cat([buffer_m, empty_tensor])
buffer_m.add_(worker_error)
worker_scale = torch.norm(buffer_m) / np.sqrt(torch.numel(buffer_m))
sign_buffer_m = buffer_m.sign().add_(1).bool()
sign_buffer_m = sign_buffer_m.float()
sign_buffer_m.add_(-0.5).mul_(2.0)
worker_error.set_((buffer_m - worker_scale * sign_buffer_m))
sign_buffer_m = None
compensated_buffer_m = buffer_m
compensated_buffer_m.sign_()
compensated_buffer_m = compensated_buffer_m.add_(1).bool()
cupy_worker_scale = self.torch2cupy(worker_scale)
cupy_compensated_buffer_m = self.torch2cupy(compensated_buffer_m)
compensated_buffer_m = None
cupy_sign_list_packed = self.compress_by_chunk(cupy_compensated_buffer_m,
world_size)
cupy_compensated_buffer_m = None
cupy_recvbuf_sign = cupy.zeros([world_size,
cupy_sign_list_packed[rank].size],
dtype=cupy_sign_list_packed[0].dtype)
cupy_recvbuf_scale = cupy.zeros([world_size, 1], dtype=cupy_worker_scale.dtype)
# Communication Phase 1
gather_start = time.time()
if self.cuda_aware:
gather_cuda(rank,
world_size,
comm,
cupy_sign_list_packed,
cupy_recvbuf_sign,
cupy_worker_scale,
cupy_recvbuf_scale)
else:
cupy_sign_list_packed, cupy_recvbuf_sign, cupy_worker_scale, cupy_recvbuf_scale = gather_host(rank,
world_size,
comm,
cupy_sign_list_packed,
cupy_recvbuf_sign,
cupy_worker_scale,
cupy_recvbuf_scale)
gather_end = time.time()
cupy_unpacked_sign = (cupy.unpackbits(cupy_recvbuf_sign.flatten())).reshape(
world_size,
-1)
cupy_recvbuf_sign = None
unpacked_sign = self.cupy2torch(cupy_unpacked_sign).float()
cupy_unpacked_sign = None
unpacked_sign = unpacked_sign.add_(-0.5).mul_(2.0)
worker_scale = self.cupy2torch(cupy_recvbuf_scale).mul_(1 / world_size)
compensated_server_m = unpacked_sign.mul_(worker_scale).sum(0)
unpacked_sign = None
compensated_server_m.add_(server_error)
server_scale = torch.norm(compensated_server_m) / np.sqrt(
compensated_server_m.numel())
sign_server_m = compensated_server_m.sign().add_(1).bool()
sign_server_m = sign_server_m.float()
sign_server_m.add_(-0.5).mul_(2.0)
server_error.set_(compensated_server_m - server_scale * sign_server_m)
sign_server_m = None
compensated_server_m.sign_()
compensated_server_m = compensated_server_m.add_(1).bool()
cupy_server_scale = self.torch2cupy(server_scale)
cupy_compensated_server_m = self.torch2cupy(compensated_server_m)
compensated_server_m = None
cupy_server_sign_packed = self.compress_by_chunk(cupy_compensated_server_m, 1)
cupy_recvbuf_sign_server = cupy.zeros(
[world_size,
cupy_server_sign_packed[0].size],
dtype=cupy_sign_list_packed[0].dtype)
cupy_recvbuf_scale_server = cupy.zeros([world_size,
1],
dtype=cupy_worker_scale.dtype)
# Communication Phase 2
if self.cuda_aware:
allgather_cuda(comm,
cupy_server_sign_packed[0],
cupy_recvbuf_sign_server,
cupy_server_scale,
cupy_recvbuf_scale_server)
else:
cupy_server_sign_packed[0], cupy_recvbuf_sign_server, cupy_server_scale, cupy_recvbuf_scale_server = allgather_host(comm,
cupy_server_sign_packed[0],
cupy_recvbuf_sign_server,
cupy_server_scale,
cupy_recvbuf_scale_server)
cupy_server_unpacked_sign = (cupy.unpackbits(
cupy_recvbuf_sign_server.flatten())).reshape(world_size,
-1)
cupy_recvbuf_sign_server = None
server_unpacked_sign = self.cupy2torch(cupy_server_unpacked_sign)
cupy_server_unpacked_sign = None
server_unpacked_sign = server_unpacked_sign.float().add_(-0.5).mul_(2.0)
server_scale = self.cupy2torch(cupy_recvbuf_scale_server)
buffer_m = server_unpacked_sign.mul_(server_scale).flatten()[0:original_size]
return buffer_m
def step(self, closure=None, grads=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
grads (list of tensors, optional): weight gradient to use for the
optimizer update. If gradients have type torch.half, parameters
are expected to be in type torch.float. (default: None)
output params (list of tensors, optional): A reduced precision copy
of the updated weights written out in addition to the regular
updated weights. Must be of the same type as gradients. (default: None)
scale (float, optional): factor to divide gradient tensor values
by before applying to weights. (default: 1)
"""
loss = None
if closure is not None:
loss = closure()
gather_time = 0
allgather_time = 0
all_time = 0
if self.adam_freeze_key is False:
v_diff_buffer = 0.0
if grads is None:
grads_group = [None] * len(self.param_groups)
# backward compatibility
# assuming a list/generator of parameter means single group
elif isinstance(grads, types.GeneratorType):
grads_group = [grads]
elif type(grads[0]) != list:
grads_group = [grads]
else:
grads_group = grads
for group, grads_this_group in zip(self.param_groups, grads_group):
if grads_this_group is None:
grads_this_group = [None] * len(group['params'])
bias_correction = 1 if group['bias_correction'] else 0
for p, grad in zip(group['params'], grads_this_group):
if p.grad is None and grad is None:
continue
if grad is None:
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError(
'FusedAdam does not support sparse gradients, please consider SparseAdam instead'
)
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
state['tensor_size'] = torch.numel(p.data)
state['corrected_tensor_size'] = state['tensor_size']
if state['tensor_size'] % (self.size * self.divider) != 0:
state['corrected_tensor_size'] += ((self.size * self.divider) -
(state['tensor_size'] %
(self.size * self.divider)))
state['server_chunk_size'] = state[
'corrected_tensor_size'] // self.size
if not self.initialize or (self.adam_freeze_key
and 'worker_error' not in state.keys()):
torch.cuda.empty_cache()
state['worker_error'] = torch.zeros(state['corrected_tensor_size'],
device=p.device)
state['server_error'] = torch.zeros(state['server_chunk_size'],
device=p.device)
torch.cuda.empty_cache()
self.adam_freeze_key = True
if not self.initialize and torch.distributed.get_rank() == 0:
print("Cupy Buffers Initialized Successfully.")
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
state['step'] += 1
if self.adam_freeze_key is False:
exp_avg.mul_(beta1).add_(1 - beta1, grad)
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
grad = None
if self.initialize:
update = exp_avg / (exp_avg_sq.sqrt() + group['eps'])
else:
if 'non_freeze' in group.keys() and group['non_freeze'] is True:
dist.all_reduce(grad)
grad.mul_(1 / dist.get_world_size())
exp_avg.mul_(beta1).add_(1 - beta1, grad)
exp_avg_sq.mul_(beta2).addcmul_(1 - beta2, grad, grad)
grad = None
else:
if self.initialize is True:
exp_avg.mul_(beta1).add_(1 - beta1, grad)
grad = None
if self.size > 1:
exp_avg.set_(
self.Compressed_Allreduce(exp_avg,
state['worker_error'],
state['server_error'],
self.rank,
self.size,
self.comm,
self.deepspeed.local_rank))
if self.initialize:
update = exp_avg / (exp_avg_sq.sqrt() + group['eps'])
if self.initialize:
if group['weight_decay'] > 0.0:
update += group['weight_decay'] * p.data
with torch.no_grad():
p.add_(-group['lr'] * update)
if not self.initialize:
print('Pop out errors', flush=True)
state.pop('worker_error')
state.pop('server_error')
if not self.initialize:
self.adam_freeze_key = False
self.initialize = True
print(
f"Finished the initialization step at rank {torch.distributed.get_rank()}"
)
return loss
if self.adam_freeze_key is False:
if state['step'] >= self.freeze_step:
self.adam_freeze_key = True
self.deepspeed.enable_backward_allreduce = False
return loss
......@@ -8,7 +8,6 @@ Helper functions and classes from multiple sources.
import os
import psutil
import gc
from math import ceil
from math import floor
from bisect import bisect_left, bisect_right
......@@ -552,9 +551,6 @@ def see_memory_usage(message, force=False):
if torch.distributed.is_initialized() and not torch.distributed.get_rank() == 0:
return
# python doesn't do real-time garbage collection so do it explicitly to get the correct RAM reports
gc.collect()
# Print message except when distributed but not rank 0
logger.info(message)
logger.info(
......@@ -568,10 +564,6 @@ def see_memory_usage(message, force=False):
logger.info(
f'CPU Virtual Memory: used = {used_GB} GB, percent = {vm_stats.percent}%')
# get the peak memory to report correct data, so reset the counter for the next call
if hasattr(torch.cuda, "reset_peak_memory_stats"): # pytorch 1.4+
torch.cuda.reset_peak_memory_stats()
def call_to_str(base, *args, **kwargs):
"""Construct a string representation of a call.
......
......@@ -34,7 +34,6 @@ class DeepSpeedZeroConfig(DeepSpeedConfigObject):
self.param_persistence_threshold = None
self.max_live_parameters = None
self.max_reuse_distance = None
self.gather_fp16_weights_on_model_save = None
#Stage3 Specific Parameters
self.prefetch_bucket_size = None
......@@ -151,8 +150,3 @@ class DeepSpeedZeroConfig(DeepSpeedConfigObject):
zero_config_dict,
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD,
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT)
self.gather_fp16_weights_on_model_save = get_scalar_param(
zero_config_dict,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT)
......@@ -99,10 +99,6 @@ ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE_DEFAULT = 50000000
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD = 'stage3_param_persistence_threshold'
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT = 100000
# gathers params for saving a model - inefficient but is required in certain situations
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE = 'stage3_gather_fp16_weights_on_model_save'
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT = False
ZERO_OPTIMIZATION_DEFAULT = {
ZERO_OPTIMIZATION_STAGE:
ZERO_OPTIMIZATION_STAGE_DEFAULT,
......@@ -137,7 +133,5 @@ ZERO_OPTIMIZATION_DEFAULT = {
ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE:
ZERO_OPTIMIZATION_PREFETCH_BUCKET_SIZE_DEFAULT,
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD:
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT,
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE:
ZERO_OPTIMIZATION_GATHER_FP16_WEIGHTS_ON_MODEL_SAVE_DEFAULT
ZERO_OPTIMIZATION_PARAM_PERSISTENCE_THRESHOLD_DEFAULT
}
......@@ -279,9 +279,6 @@ class Init(InsertPostInitMethodToModuleSubClasses):
For example, if a node has 1TB of memory and 8 GPUs, we could fit a trillion
parameter model with 4 nodes and 32 GPUs.
Important: If the fp16 weights of the model can't fit into the memory of a
single GPU, this feature must be used.
.. note::
Initializes ``torch.distributed`` if it has not already been done so.
See :meth:`deepseed.init_distributed` for more information.
......
......@@ -883,12 +883,8 @@ class FP16_DeepSpeedZeroOptimizer(object):
for p in params:
if is_model_parallel_parameter(p) or (self.model_parallel_rank == 0):
param_id = self.get_param_id(p)
# as some model have trainable parameters but skipped in training,
# their backward hooks in self.create_reduce_and_remove_grad_hooks() will not run,
# so they have no norm_for_param_grads
if param_id in self.norm_for_param_grads:
param_norm = self.norm_for_param_grads[param_id]
total_norm += param_norm.item()**2
param_norm = self.norm_for_param_grads[param_id]
total_norm += param_norm.item()**2
# Sum across all model parallel GPUs.
total_norm_cuda = torch.cuda.FloatTensor([float(total_norm)])
......
......@@ -68,7 +68,7 @@ GEM
jekyll-theme-time-machine (= 0.1.1)
jekyll-titles-from-headings (= 0.5.3)
jemoji (= 0.12.0)
kramdown (= 2.3.1)
kramdown (= 2.3.0)
kramdown-parser-gfm (= 1.1.0)
liquid (= 4.0.3)
mercenary (~> 0.3)
......@@ -196,7 +196,7 @@ GEM
gemoji (~> 3.0)
html-pipeline (~> 2.2)
jekyll (>= 3.0, < 5.0)
kramdown (2.3.1)
kramdown (2.3.0)
rexml
kramdown-parser-gfm (1.1.0)
kramdown (~> 2.0)
......
......@@ -60,7 +60,7 @@ The Adam optimizer also supports the following two params keys/values in additio
| torch\_adam | Use torch's implementation of adam instead of our fused adam implementation | false |
| adam\_w\_mode | Apply L2 regularization (also known as AdamW) | true |
Another example of ***optimizer*** with 1-bit Adam
Another example of ***optimizer*** with 1-bit Adam specific parameters is as follows.
```json
"optimizer": {
......@@ -74,20 +74,11 @@ The Adam optimizer also supports the following two params keys/values in additio
"eps": 1e-8,
"weight_decay": 3e-7,
"freeze_step": 400,
"cuda_aware": false,
"comm_backend_name": "nccl"
"cuda_aware": true
}
}
```
The 1-bit Adam optimizer supports the following three params keys/values in addition to the standard Adam (learn more in our [tutorial](/tutorials/onebit-adam/)):
| "params" key | Description | Default |
| ------------- | --------------------------------------------------------------------------- | ------- |
| freeze\_step | Number of warm up steps before 1-bit compression gets applied to the communication | 100000 |
| cuda\_aware | To indicate that the underlying MPI library supports CUDA-Aware communication | false |
| comm\_backend\_name | To indicate which backend implementation to use | "nccl" |
### Scheduler Parameters
***scheduler***: [dictionary]
......
......@@ -10,8 +10,6 @@ benefit all your large model training jobs.
If you don't already have an Azure account please see more details here: [https://azure.microsoft.com/](https://azure.microsoft.com/).
To use DeepSpeed on [Azure ML](https://azure.microsoft.com/en-us/services/machine-learning/), please take a look at easy-to-use examples for Transformers and CIFAR training from [AzureML Examples GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed).
To help with launching Azure instances we suggest using the [Azure
CLI](https://docs.microsoft.com/en-us/cli/azure/?view=azure-cli-latest). We have created
several helper scripts to get you quickly started using DeepSpeed with Azure.
......
......@@ -9,7 +9,6 @@ date: 2020-05-15
* Installing is as simple as `pip install deepspeed`, [see more details](/tutorials/advanced-install/).
* Please see our [Azure tutorial](/tutorials/azure/) to get started with DeepSpeed on Azure!
* To get started with DeepSpeed on AzureML, please see the [AzureML Examples GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed)
* If you're not on Azure, we recommend using our docker image via `docker pull deepspeed/deepspeed:latest` which contains a pre-installed version of DeepSpeed and all the necessary dependencies.
## Writing DeepSpeed Models
......@@ -187,8 +186,8 @@ slots available.
The following command launches a PyTorch training job across all available nodes and GPUs
specified in `myhostfile`:
```bash
deepspeed --hostfile=myhostfile <client_entry.py> <client args> \
--deepspeed --deepspeed_config ds_config.json
deepspeed <client_entry.py> <client args> \
--deepspeed --deepspeed_config ds_config.json --hostfile=myhostfile
```
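For reference, a hostfile simply lists one host per line together with the number of GPU slots it exposes; a minimal `myhostfile` might look like the following (hostnames here are placeholders):
```
worker-1 slots=4
worker-2 slots=4
```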
Alternatively, DeepSpeed allows you to restrict distributed training of your model to a
......@@ -265,10 +264,3 @@ not detected or passed in then DeepSpeed will query the number of GPUs on the
local machine to discover the number of slots available. The `--include` and
`--exclude` arguments work as normal, but the user should specify 'localhost'
as the hostname.
Also note that `CUDA_VISIBLE_DEVICES` can't be used with DeepSpeed to control
which devices should be used. For example, to use only gpu1 of the current
node, do:
```bash
deepspeed --include localhost:1 ...
```
---
title: "1-bit Adam: Up to 5x less communication volume and up to 3.4x faster training"
title: "1-bit Adam: Up to 5x less communication volume and up to 2x faster training"
---
**Note:**
This tutorial was updated on 03/04/2021 to reflect 1-bit Adam v2. Changes include: 1) an NCCL-based implementation which provides better performance and usability compared to the MPI-based implementation; 2) support for momentum masks for parameters with constant zero gradients during training; 3) bug fixes. See details below.
{: .notice--info}
**Watch out!**
1) The NCCL-based implementation requires PyTorch >= 1.8 (and NCCL >= 2.8.3 when you have 64 or more GPUs). See details below. 2) Although 1-bit Adam is compatible with both FP16 and FP32, we have currently only verified convergence under mixed precision/FP16 training. 3) Currently 1-bit Adam is not compatible with pipeline parallelism. 4) Frequent checkpoint loading could hurt 1-bit Adam's convergence. See details below.
{: .notice--warning}
In this tutorial, we introduce the 1-bit Adam optimizer in DeepSpeed. 1-bit Adam can improve model training speed on communication-constrained clusters, especially for communication-intensive large models, by reducing the overall communication volume by up to 5x. A detailed description of the 1-bit Adam algorithm, its implementation in DeepSpeed, and a performance evaluation are available in our [blog post](https://www.deepspeed.ai/news/2020/09/08/onebit-adam-blog-post.html). We also have a [paper](https://arxiv.org/abs/2102.02888) which provides the most complete details, including the algorithm, system implementation, theoretical analysis, and more evaluations.
To illustrate the benefits and usage of 1-bit Adam optimizer in DeepSpeed, we use the following two training tasks as examples:
......@@ -21,7 +13,7 @@ For more details on these tasks, please refer to the tutorial posts on [BingBert
## 1. Overview
### 1.1 Pre-requisites for installing DeepSpeed
### Pre-requisites for installing DeepSpeed
If you don't already have a copy of the DeepSpeed repository, please clone it
now and check out the DeepSpeedExamples submodule that contains the BingBertSQuAD and BERT Pre-training examples.
......@@ -33,19 +25,9 @@ git submodule update --init --recursive
cd DeepSpeedExamples/
```
### 1.2 Pre-requisites for 1-bit Adam
#### 1.2.1 (New in v2) NCCL-based implementation
In 1-bit Adam v2, we introduce a new system implementation for compressed communication using the NCCL backend of PyTorch distributed. This significantly improves usability thanks to NCCL’s integration with PyTorch distributed. The performance of our new NCCL-based implementation is also better than our earlier MPI-based implementation for Ethernet-based systems and on par for InfiniBand-based systems. Thus we highly recommend this implementation.
**Watch out!**
This NCCL-based implementation requires PyTorch >= 1.8. It also requires NCCL >= 2.8.3 when you have 64 or more GPUs, to avoid certain NCCL runtime bugs. Currently (2021/03/16) NCCL 2.8.3 is not officially supported by PyTorch. The solution we used is to hack in NCCL 2.8.3 via `LD_PRELOAD`: 1) Install NCCL 2.8.3. This works for us on a CUDA 11 system: `apt-get install -y libnccl2=2.8.3-1+cuda11.0 libnccl-dev=2.8.3-1+cuda11.0`. 2) Set `LD_PRELOAD` to the library path. This works for us: `LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.8.3`. To confirm `LD_PRELOAD` is working, check the version reported in the NCCL logs when `NCCL_DEBUG=INFO` is set; it should say: NCCL version 2.8.3+cuda11.0.
{: .notice--warning}
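Putting the steps from this note together, the workaround might look as follows (the package versions and library path are the ones quoted above and will differ between systems):
```shell
# 1) install NCCL 2.8.3 (CUDA 11 example from the note above)
apt-get install -y libnccl2=2.8.3-1+cuda11.0 libnccl-dev=2.8.3-1+cuda11.0

# 2) preload the newer library and let NCCL print its version for confirmation
LD_PRELOAD=/usr/lib/x86_64-linux-gnu/libnccl.so.2.8.3 NCCL_DEBUG=INFO \
    deepspeed script.py
# the NCCL logs should report: NCCL version 2.8.3+cuda11.0
```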
### Pre-requisites for 1-bit Adam
#### 1.2.2 MPI-based implementation
For this implementation, we rely on Message Passing Interface (MPI) for advanced communication primitives.
1-bit Adam uses advanced communication schemes that are not yet supported by PyTorch distributed and NCCL. We rely on Message Passing Interface (MPI) for these advanced communication primitives.
We package the necessary dependencies in the DeepSpeed docker images. However, if you are using a different build system, please install MPI and mpi4py on your system. To install the prerequisites run:
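As an illustrative sketch only (the exact packages depend on your platform; prefer your cluster's own MPI, such as MVAPICH2-GDR, where available), on a Debian-based system this could be:
```shell
# install an MPI implementation plus headers, then the Python bindings
sudo apt-get install -y openmpi-bin libopenmpi-dev
pip install mpi4py
```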
......@@ -61,32 +43,31 @@ An example launch command for 1-bit Adam using the `deepspeed` launcher is as fo
deepspeed --launcher=[mvapich|openmpi] script.py
```
Please note that for MPI-based implementation of 1-bit Adam, the `--launcher=[mvapich|openmpi]` flag is required when using the `deepspeed` launcher.
Please note that because 1-bit Adam uses the MPI backend to communicate during the compression stage, the `--launcher=[mvapich|openmpi]` flag is required when using the `deepspeed` launcher.
Alternatively, the standard mpirun launcher can also be used as follows:
```shell
mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flags] python [training_script.py]
mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flags] bash [training_script.sh]
```
### 1.3 1-bit Algorithm
### 1-bit Algorithm
A detailed description of the 1-bit algorithm can be found in our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888).
A detailed description of the 1-bit algorithm can be found in our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html).
### 1.4 Configuration of 1-bit Adam
### Configuration of 1-bit Adam
The 1-bit Adam feature can be used by setting the optimizer configuration options as follows. An example json config file is shown below.
```json
{
"train_batch_size": 4096,
"train_micro_batch_size_per_gpu": 16,
"train_micro_batch_size_per_gpu": 64,
"optimizer": {
"type": "OneBitAdam",
"params": {
"lr": 4e-4,
"freeze_step": 23000,
"cuda_aware": false,
"comm_backend_name": "nccl"
"lr": 2e-4,
"freeze_step": 400,
"cuda_aware": true
}
},
"fp16": {
......@@ -94,20 +75,12 @@ The 1-bit Adam feature can be used by setting the optimizer configuration option
}
}
```
Please note three new parameters `freeze_step`, `cuda_aware`, and `comm_backend_name` that have been added to support the 1-bit Adam feature.
`freeze_step` is the number of warm-up steps before 1-bit compression gets applied to the communication. In order to determine the number of warm-up steps, one strategy is to set it to 15-25% of the total training steps for a given model (this is related to Adam's variance/second-moment term; see the detailed analysis in our [paper](https://arxiv.org/abs/2102.02888)). If this provides the desired outcome, one can try to extract more performance by reducing the steps systematically. In the future, we plan to introduce a threshold that can automatically search for and decide the number of warm-up steps for different models. The examples below have been tuned for the number of warm-up steps. The `freeze_step` parameter has already been set to the best number we found in the corresponding run scripts.
Please note two new parameters `freeze_step` and `cuda_aware` that have been added to support the 1-bit Adam feature.
`cuda_aware` is used for the MPI-based implementation to indicate that the underlying MPI library supports CUDA-Aware communication. This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet-based systems. However, the communication will happen using sender- as well as receiver-side memory copies between CPU and GPU buffers before and after communication.
`cuda_aware` is used to indicate that the underlying MPI library supports CUDA-Aware communication.
This feature is only supported on systems with InfiniBand interconnect and a CUDA-Aware MPI library like [MVAPICH2-GDR](http://mvapich.cse.ohio-state.edu/userguide/gdr/) or OpenMPI built with CUDA-Aware support. Setting `cuda_aware` to False will allow training on Ethernet based systems. However, the communication will happen using sender as well as receiver side memory copies between CPU and GPU buffers before and after communication.
(New in v2) `comm_backend_name` is used to indicate which backend implementation to use. You can choose between NCCL and MPI-based implementations by setting `comm_backend_name` to "nccl" and "mpi". When using NCCL-based implementation, there is no need to set `cuda_aware`.
#### 1.4.1 (New in v2) Momentum masks for parameters with constant zero gradients
Because 1-bit compression cannot represent exact zero, the compression error would keep accumulating in the momentum if a parameter has constant zero gradients during training. For example, for BERT pre-training with sequence length 128, `bert.embeddings.position_embeddings.weight` has constant zeros in its gradient and momentum for rows 129 to 512, because it only learns up to sequence length 128 while the model supports up to sequence length 512. Thus in 1-bit Adam v2 we added support for a momentum mask so users can specify those params that have constant exact zeros in their gradients. See the [example script](https://github.com/microsoft/DeepSpeedExamples/blob/master/bing_bert/deepspeed_train.py) for how to configure this momentum mask. One thing to note is that we don't use the momentum mask saved in checkpoints, since this mask could change during training (e.g., BERT seqlen 128 and 512 require different masks). So you have to provide this mask every time in your training script.
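The idea can be sketched as follows. This is a hypothetical illustration: the mask construction and the `exp_avg_mask` parameter-group key are assumptions here, and the linked example script is the authoritative reference for the exact wiring.
```python
import torch

def position_embedding_momentum_mask(pos_emb_weight, trained_seq_len=128):
    """Keep momentum only for the rows that actually receive gradients."""
    mask = torch.zeros_like(pos_emb_weight.data)
    mask[:trained_seq_len, :] = 1.0   # rows >= trained_seq_len stay exactly zero
    return mask

# Hypothetical usage: attach the mask to the parameter group that holds
# bert.embeddings.position_embeddings.weight before constructing the optimizer.
# param_group['exp_avg_mask'] = position_embedding_momentum_mask(pos_emb.weight)
```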
**Watch out!**
1-bit Adam relies on a compression error compensation mechanism to maintain the convergence speed during the compression stage. When loading checkpoints, we actually reset the compression errors for 3 reasons: 1) The worker and server errors at each GPU are distinct, so in the current implementation only rank 0's errors are saved in the checkpoint, and thus we have to reset the errors. If we wanted to save them correctly, we would need O(num_gpu*model_size) memory to gather all the errors, which is a very large memory requirement. It's possible to save them in a distributed way, but it would make checkpoint saving/loading much more complicated. 2) Even if we were able to save the compression errors correctly, you would need exactly the same number of GPUs in order to load them correctly. 3) We verified on BERT pre-training that occasionally resetting the compression error at checkpoint loading does not affect convergence. However, please avoid frequent checkpoint loading, which could break the error compensation mechanism and thus affect convergence.
{: .notice--warning}
`freeze_step` is the number of warm-up steps before 1-bit compression gets applied to the communication. In order to determine the number of warm-up steps, one strategy is to set it to 15-25% of the total training steps for a given model. If this provides the desired outcome, one can try to extract more performance by reducing the steps systematically. In the future, we plan to introduce a threshold that can automatically search for and decide the number of warm-up steps for different models. The examples below have been tuned for the number of warm-up steps. The `freeze_step` parameter has already been set to the best number we found in the corresponding run scripts.
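As a rough worked example of the 15-25% rule (the numbers are illustrative; the tuned values shipped in the run scripts take precedence):
```python
# a pre-training job planned for 115,000 total optimizer steps
total_training_steps = 115_000
freeze_step = int(0.20 * total_training_steps)  # 20% warm-up
print(freeze_step)  # 23000 -- the value used for BERT-large later in this tutorial
```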
## 2. BingBertSQuAD Fine-tuning with 1-bit Adam
......@@ -120,13 +93,9 @@ Because 1-bit compression cannot represent exact zero, the compression error wou
You can also use a pre-trained BERT model checkpoint from either DeepSpeed, [HuggingFace](https://github.com/huggingface/transformers), or [TensorFlow](https://github.com/google-research/bert#pre-trained-models) to run the fine-tuning.
**Note:** For details about loading checkpoint, argument parsing, initialization, forward pass, backward pass, weight update and evaluation, please refer to the [BingBertSQuAD Fine-tuning](/tutorials/bert-finetuning/) tutorial.
### 2.1 Running BingBertSQuAD with DeepSpeed and 1-bit Adam
We provide example scripts under [DeepSpeedExamples/BingBertSquad/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/BingBertSquad/1-bit_adam). There are 3 sets of scripts, corresponding to the NCCL-based implementation, the MPI-based implementation on Ethernet systems, and the MPI-based implementation on InfiniBand systems. For the MPI-based implementation, we provide example scripts for launching with either deepspeed or mpirun.
<!-- The main part of training is done in `nvidia_run_squad_deepspeed.py`, which has
The main part of training is done in `nvidia_run_squad_deepspeed.py`, which has
already been modified to use DeepSpeed. The `run_squad_deepspeed.sh` script
helps to invoke training and setup several different hyperparameters relevant
to the training process.
......@@ -163,7 +132,7 @@ For example, in order to use 32 GPUs (4GPUs/node, 8 nodes in total), with the su
```shell
mpirun -np 32 -ppn 4 -hostfile hosts -env MV2_USE_CUDA=1 -env MV2_SUPPORT_DL=1 -env MV2_ENABLE_AFFINITY=0 -env MV2_SMP_USE_CMA=0 bash run_squad_mpi_onebitadam.sh
``` -->
```
### 2.2 Configuration for BingBertSQuAD with DeepSpeed and 1-bit Adam enabled
......@@ -179,16 +148,18 @@ Table 1 shows the fine-tuning configuration we used in our experiments.
| ------------------------------ | ---------------------|
| Total batch size | 96 |
| Train micro batch size per GPU | 3 |
| Optimizer | **"OnebitAdam"** |
| Optimizer | **OnebitAdam** |
| Learning rate | 3e-5 |
| Sequence-length | 384 |
| Weight-decay | 0.0 |
| Epoch count | 2 |
| **freeze_step** | 400 |
| **comm_backend_name** | "nccl" |
| **cuda_aware** | True |
Table 1. Fine-tuning configuration
**Note:** For more details about loading checkpoint, argument parsing, initialization, forward pass, backward pass, weight update and evaluation, please refer to the [BingBertSQuAD Fine-tuning](/tutorials/bert-finetuning/) tutorial.
### 2.3 Performance Results for BingBertSQuAD Fine-tuning
***Accuracy:***
......@@ -203,24 +174,19 @@ We fixed the learning rate to 3e-5. The table below shows the F1 and the EM scor
***Training Speed and Scalability:***
<!-- 1-bit Adam enables up to 2.7x overall speedup in training speed for SQuAD fine-tuning. This is made possible by up to 6.2x faster throughput during the compressed stage of the algorithm as shown in Figure 1.
1-bit Adam enables up to 2.7x overall speedup in training speed for SQuAD fine-tuning. This is made possible by up to 6.2x faster throughput during the compressed stage of the algorithm as shown in Figure 1.
![SQuAD Finetuning](/assets/images/squad-scaling.png){: .align-center}
Figure 1: Scalability of 1-bit Adam for SQuAD Finetuning on V100 GPUs with batch size of 3/GPU. -->
Performance results for SQuAD fine-tuning can be found in our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888).
Figure 1: Scalability of 1-bit Adam for SQuAD Finetuning on V100 GPUs with batch size of 3/GPU.
## 3. BERT Pre-training with 1-bit Adam
For data downloading and pre-processing, please refer to the [BERT Pre-training](/tutorials/bert-pretraining/) tutorial.
For data downloading and pre-processing, please refer to the [BERT Pre-training](/tutorials/bert-pretraining/) post.
### 3.1 Running Pre-training with DeepSpeed and 1-bit Adam
We provide example scripts under [DeepSpeedExamples/bing_bert/1-bit_adam/](https://github.com/microsoft/DeepSpeedExamples/tree/master/bing_bert/1-bit_adam). There are 3 sets of scripts, corresponding to the NCCL-based implementation, the MPI-based implementation on Ethernet systems, and the MPI-based implementation on InfiniBand systems. For the MPI-based implementation, we provide example scripts for launching with either deepspeed or mpirun.
<!-- The main part of training is done in `deepspeed_train.py`, which has
The main part of training is done in `deepspeed_train.py`, which has
already been modified to use DeepSpeed. The `ds_train_bert_onebit_bsz4k_seq128.sh` and `ds_train_bert_bsz64k_seq128.sh`
are the shell scripts that help to invoke training and setup several different hyperparameters relevant
to the training process.
......@@ -252,11 +218,11 @@ mpirun -np [#processes] -ppn [#GPUs on each node] -hostfile [hostfile] [MPI flag
For example, in order to use 32 GPUs (4GPUs/node, 8 nodes in total), with the support of InfiniBand, you can use MVAPICH2 as the launcher and run the following command:
```shell
mpirun -np 32 -ppn 4 -hostfile hosts -env MV2_USE_CUDA=1 -env MV2_SUPPORT_DL=1 -env MV2_ENABLE_AFFINITY=0 -env MV2_SMP_USE_CMA=0 bash ds_train_bert_onebit_bsz4k_seq128.sh
``` -->
```
### 3.2 Configuration for BERT Pre-training with DeepSpeed and 1-bit Adam enabled
The `deepspeed_bsz4k_onebit_config_seq128_*.json` file gives the user the ability to specify DeepSpeed
The `deepspeed_bsz4k_onebit_config_seq128.json` file gives the user the ability to specify DeepSpeed
options in terms of batch size, micro batch size, optimizer, learning rate, and other parameters.
Below is the DeepSpeed configuration file for running BERT-large pre-training with sequence length of 128 using the 1-bit Adam optimizer.
......@@ -274,7 +240,7 @@ Below is the DeepSpeed configuration file for running BERT-large pre-training wi
"weight_decay": 0.01,
"bias_correction": false,
"freeze_step": 23000,
"comm_backend_name": "nccl"
"cuda_aware": true
}
},
"gradient_clipping": 1.0,
......@@ -285,8 +251,8 @@ Below is the DeepSpeed configuration file for running BERT-large pre-training wi
}
}
```
The above file is for BERT-large. For BERT-base training (sequence length 128), the suggested `freeze_step` is 16000. For sequence-length-512 pre-training, we suggest using a `freeze_step` of 1500 for both BERT-base and BERT-large. And make sure to set the `comm_backend_name` and `cuda_aware` correctly as described above.
The above file is for BERT-large, but for BERT-base training (sequence length 128) the suggested `freeze_step` will need to be changed to 16000. For the rest of the pre-training using sequence length 512, we suggest using a `freeze_step` of 1500. And make sure to set `cuda_aware` correctly as described above.
### 3.3 Performance Results for BERT Pre-training
Performance results for BERT pre-training can be found in our [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html) and our [paper](https://arxiv.org/abs/2102.02888).
Performance results for BERT pre-training can be found in our detailed [blog post](https://www.deepspeed.ai/news/2020/09/09/onebit-adam-blog-post.html).
......@@ -276,9 +276,9 @@ For example, a machine with 16 GPUs must have as much local CPU memory as 16 tim
DeepSpeed provides a `LayerSpec` class that delays the construction of
modules until the model layers have been partitioned across workers.
Then each worker will allocate only the layers it's assigned to. So, comparing to the
example from the previous paragraph, using `LayerSpec` a machine with 16 GPUs will need to
allocate a total of 1x model size on its CPU memory and not 16x.
Then each worker will allocate only the layers it's assigned to. So, continuing the
example from the previous paragraph, a machine with 16 GPUs will need to allocate a
total of 1x model size on its CPU, compared to 16x in the LayerSpec example.
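As a minimal sketch of this pattern (the layer sizes and stage count are illustrative; class names follow the `deepspeed.pipe` API, and the module is meant to be constructed under a distributed launch):
```python
import torch.nn as nn
from deepspeed.pipe import LayerSpec, PipelineModule

# Each LayerSpec only records the class and its constructor arguments; the
# module is instantiated later, on the worker that owns its pipeline stage.
layer_specs = [
    LayerSpec(nn.Linear, 1024, 4096),
    LayerSpec(nn.ReLU, inplace=True),
    LayerSpec(nn.Linear, 4096, 1024),
]
model = PipelineModule(layers=layer_specs, num_stages=2)
```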
Here is an example of the abbreviated AlexNet model, but expressed only
with `LayerSpec`s. Note that the syntax is almost unchanged: `nn.ReLU(inplace=True)`
......
......@@ -17,4 +17,4 @@ FusedLamb (GPU)
OneBitAdam (GPU)
----------------------------
.. autoclass:: deepspeed.runtime.fp16.onebit.adam.OneBitAdam
.. autoclass:: deepspeed.runtime.fp16.OneBitAdam
......@@ -4,8 +4,6 @@ toc: true
toc_label: "Contents"
---
<b>03/2021: DeepSpeed is hiring! Come join us: [SDE 2](https://careers.microsoft.com/us/en/job/1013160/Software-Engineer-2), [Sr. SDE](https://careers.microsoft.com/us/en/job/1017151/Senior-Software-Engineer), [Sr. Researcher](https://careers.microsoft.com/us/en/job/1016440/Senior-Researcher)</b>
DeepSpeed is a deep learning optimization library that makes distributed training easy,
efficient, and effective.
......@@ -30,11 +28,7 @@ initiative to enable next-generation AI capabilities at scale, where you can fin
information [here](https://innovation.microsoft.com/en-us/exploring-ai-at-scale).
# What's New?
* [2021/04/02] [[DeepSpeed on AzureML] Transformers and CIFAR examples are now available on AzureML GitHub](https://github.com/Azure/azureml-examples/tree/main/workflows/train/deepspeed)
* [2021/03/30] [[PyTorch Lightning Blog] Accessible Multi-Billion Parameter Model Training with PyTorch Lightning + DeepSpeed](https://medium.com/pytorch-lightning/accessible-multi-billion-parameter-model-training-with-pytorch-lightning-deepspeed-c9333ac3bb59)
* [2021/03/16] [1-bit Adam v2: NCCL-based implementation and more](https://www.deepspeed.ai/tutorials/onebit-adam/)
* [2021/03/08] [ZeRO-3 Offload: Scale your models to trillion parameters without code changes while leveraging both CPUs & GPUs](https://www.deepspeed.ai/news/2021/03/07/zero3-offload.html)
* [2021/01/19] [[🤗Hugging Face Blog] Fit More and Train Faster With ZeRO via DeepSpeed and FairScale](https://huggingface.co/blog/zero-deepspeed-fairscale)
* [2020/11/12] [Simplified install, JIT compiled ops, PyPI releases, and reduced dependencies](#installation)
* [2020/11/10] [Efficient and robust compressed training through progressive layer dropping](https://www.deepspeed.ai/news/2020/10/28/progressive-layer-dropping-news.html)
* [2020/09/10] [DeepSpeed v0.3: Extreme-scale model training for everyone]({{ site.press_release_v3 }})
......
include *.txt README.md
recursive-include requirements *.txt
recursive-include deepspeed *.cpp *.h *.cu *.tr *.cuh *.cc
recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc