"examples/others/tensorize_vllm_model.py" did not exist on "9553fdb41e63a2864684242f29851beb4afe75c1"
Commit da3f0934 authored by zhuwenwen's avatar zhuwenwen
Browse files

delete unused files

parent c4dd1fd4
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch
import torch.nn as nn
from torch import Tensor
from torch.optim import Optimizer
from colossalai.utils import clip_grad_norm_fp32
class ColossalaiOptimizer(Optimizer):
def __init__(self, optim: Optimizer):
self.optim = optim
@property
def param_groups(self):
return self.optim.param_groups
@property
def defaults(self):
return self.optim.defaults
def add_param_group(self, *args, **kwargs):
return self.optim.add_param_group(*args, **kwargs)
def step(self, *args, **kwargs):
return self.optim.step(*args, **kwargs)
def zero_grad(self, *args, **kwargs):
self.optim.zero_grad(*args, **kwargs)
def load_state_dict(self, *args, **kwargs):
self.optim.load_state_dict(*args, **kwargs)
def state_dict(self):
return self.optim.state_dict()
def backward(self, loss: Tensor):
loss.backward()
def backward_by_grad(self, tensor: Tensor, grad: Tensor):
torch.autograd.backward(tensors=tensor, grad_tensors=grad)
def clip_grad_norm(self, model: nn.Module, max_norm: float):
if max_norm > 0.0:
clip_grad_norm_fp32(model.parameters(), max_norm)
# modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_adam.py
import torch
from colossalai.registry import OPTIMIZERS
from colossalai.utils import multi_tensor_applier
@OPTIMIZERS.register_module
class FusedAdam(torch.optim.Optimizer):
"""Implements Adam algorithm.
Currently GPU-only. Requires ColossalAI to be installed via
``pip install -v --no-cache-dir --global-option="--cuda_ext" ./``.
This version of fused Adam implements 2 fusions.
* Fusion of the Adam update's elementwise operations
* A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
:class:`colossalai.nn.optimizer.FusedAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``,
or ``torch.optim.Adam`` with ``adam_w_mode=False``
:class:`colossalai.nn.optimizer.FusedAdam` may be used with or without Amp.
Adam was been proposed in `Adam: A Method for Stochastic Optimization`_.
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-8)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
(default: False) NOT SUPPORTED in FusedAdam!
adam_w_mode (boolean, optional): Apply L2 regularization or weight decay
True for decoupled weight decay(also known as AdamW) (default: True)
set_grad_none (bool, optional): whether set grad to None when zero_grad()
method is called. (default: True)
.. _Adam\: A Method for Stochastic Optimization:
https://arxiv.org/abs/1412.6980
.. _On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
def __init__(self, params, lr=1e-3, bias_correction=True,
betas=(0.9, 0.999), eps=1e-8, adam_w_mode=True,
weight_decay=0., amsgrad=False, set_grad_none=True):
if amsgrad:
raise RuntimeError(
'FusedAdam does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay)
super(FusedAdam, self).__init__(params, defaults)
self.adam_w_mode = 1 if adam_w_mode else 0
self.set_grad_none = set_grad_none
if multi_tensor_applier.available:
import colossal_C
# Skip buffer
self._dummy_overflow_buf = torch.cuda.IntTensor([0])
self.multi_tensor_adam = colossal_C.multi_tensor_adam
else:
raise RuntimeError('FusedAdam requires cuda extensions')
def zero_grad(self):
if self.set_grad_none:
for group in self.param_groups:
for p in group['params']:
p.grad = None
else:
super(FusedAdam, self).zero_grad()
def step(self, closure=None, grads=None, output_params=None, scale=None, grad_norms=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
The remaining arguments are deprecated, and are only retained (for the moment) for error-checking purposes.
"""
if any(p is not None for p in [grads, output_params, scale, grad_norms]):
raise RuntimeError(
'FusedAdam has been updated. Simply initialize it identically to torch.optim.Adam, and call step() with no arguments.')
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
bias_correction = 1 if group['bias_correction'] else 0
beta1, beta2 = group['betas']
# assume same step across group now to simplify things
# per parameter step can be easily support by making it tensor, or pass list into kernel
if 'step' in group:
group['step'] += 1
else:
group['step'] = 1
# create lists for multi-tensor apply
g_16, p_16, m_16, v_16 = [], [], [], []
g_32, p_32, m_32, v_32 = [], [], [], []
for p in group['params']:
if p.grad is None:
continue
if p.grad.data.is_sparse:
raise RuntimeError(
'FusedAdam does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
if p.dtype == torch.float16:
g_16.append(p.grad.data)
p_16.append(p.data)
m_16.append(state['exp_avg'])
v_16.append(state['exp_avg_sq'])
elif p.dtype == torch.float32:
g_32.append(p.grad.data)
p_32.append(p.data)
m_32.append(state['exp_avg'])
v_32.append(state['exp_avg_sq'])
else:
raise RuntimeError('FusedAdam only support fp16 and fp32.')
if (len(g_16) > 0):
multi_tensor_applier(self.multi_tensor_adam,
self._dummy_overflow_buf,
[g_16, p_16, m_16, v_16],
group['lr'],
beta1,
beta2,
group['eps'],
group['step'],
self.adam_w_mode,
bias_correction,
group['weight_decay'])
if (len(g_32) > 0):
multi_tensor_applier(self.multi_tensor_adam,
self._dummy_overflow_buf,
[g_32, p_32, m_32, v_32],
group['lr'],
beta1,
beta2,
group['eps'],
group['step'],
self.adam_w_mode,
bias_correction,
group['weight_decay'])
return loss
# modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_lamb.py
import torch
from colossalai.registry import OPTIMIZERS
from colossalai.utils import multi_tensor_applier
@OPTIMIZERS.register_module
class FusedLAMB(torch.optim.Optimizer):
"""Implements LAMB algorithm.
Currently GPU-only. Requires ColossalAI to be installed via
``pip install -v --no-cache-dir --global-option="--cuda_ext" ./``.
This version of fused LAMB implements 2 fusions.
* Fusion of the LAMB update's elementwise operations
* A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
:class:`colossalai.nn.optimizer.FusedLAMB`'s usage is identical to any ordinary Pytorch optimizer
:class:`colossalai.nn.optimizer.FusedLAMB` may be used with or without Amp.
LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups.
lr (float, optional): learning rate. (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its norm. (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability. (default: 1e-6)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0.01)
amsgrad (boolean, optional): whether to use the AMSGrad variant of this
algorithm from the paper `On the Convergence of Adam and Beyond`_
NOT SUPPORTED now! (default: False)
adam_w_mode (boolean, optional): Apply L2 regularization or weight decay
True for decoupled weight decay(also known as AdamW) (default: True)
grad_averaging (bool, optional): whether apply (1-beta2) to grad when
calculating running averages of gradient. (default: True)
set_grad_none (bool, optional): whether set grad to None when zero_grad()
method is called. (default: True)
max_grad_norm (float, optional): value used to clip global grad norm
(default: 1.0)
use_nvlamb (boolean, optional): Apply adaptive learning rate to 0.0
weight decay parameter (default: False)
.. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes:
https://arxiv.org/abs/1904.00962
.. _On the Convergence of Adam and Beyond:
https://openreview.net/forum?id=ryQu7f-RZ
"""
def __init__(self, params, lr=1e-3, bias_correction=True,
betas=(0.9, 0.999), eps=1e-6, weight_decay=0.01,
amsgrad=False, adam_w_mode=True,
grad_averaging=True, set_grad_none=True,
max_grad_norm=1.0, use_nvlamb=False):
if amsgrad:
raise RuntimeError(
'FusedLAMB does not support the AMSGrad variant.')
defaults = dict(lr=lr, bias_correction=bias_correction,
betas=betas, eps=eps, weight_decay=weight_decay,
grad_averaging=grad_averaging,
max_grad_norm=max_grad_norm)
super(FusedLAMB, self).__init__(params, defaults)
if multi_tensor_applier.available:
import colossal_C
self.multi_tensor_l2norm = colossal_C.multi_tensor_l2norm
# Skip buffer
self._dummy_overflow_buf = torch.tensor(
[0], dtype=torch.int, device=self.param_groups[0]["params"][0].device)
self.multi_tensor_lamb = colossal_C.multi_tensor_lamb
else:
raise RuntimeError('FusedLAMB requires cuda extensions')
self.adam_w_mode = 1 if adam_w_mode else 0
self.set_grad_none = set_grad_none
self.use_nvlamb = use_nvlamb
def zero_grad(self):
if self.set_grad_none:
for group in self.param_groups:
for p in group['params']:
p.grad = None
else:
super(FusedLAMB, self).zero_grad()
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
# create separate grad lists for fp32 and fp16 params
g_all_32, g_all_16 = [], []
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
if p.dtype == torch.float32:
g_all_32.append(p.grad.data)
elif p.dtype == torch.float16:
g_all_16.append(p.grad.data)
else:
raise RuntimeError('FusedLAMB only support fp16 and fp32.')
device = self.param_groups[0]["params"][0].device
g_norm_32, g_norm_16 = torch.zeros(
1, device=device), torch.zeros(1, device=device)
# compute grad norm for two lists
if len(g_all_32) > 0:
g_norm_32 = multi_tensor_applier(self.multi_tensor_l2norm,
self._dummy_overflow_buf,
[g_all_32], False)[0]
if len(g_all_16) > 0:
g_norm_16 = multi_tensor_applier(self.multi_tensor_l2norm,
self._dummy_overflow_buf,
[g_all_16], False)[0]
# blend two grad norms to get global grad norm
global_grad_norm = multi_tensor_applier(self.multi_tensor_l2norm,
self._dummy_overflow_buf,
[[g_norm_32, g_norm_16]],
False)[0]
max_grad_norm = self.defaults['max_grad_norm']
for group in self.param_groups:
bias_correction = 1 if group['bias_correction'] else 0
beta1, beta2 = group['betas']
grad_averaging = 1 if group['grad_averaging'] else 0
# assume same step across group now to simplify things
# per parameter step can be easily support by making it tensor, or pass list into kernel
if 'step' in group:
group['step'] += 1
else:
group['step'] = 1
# create lists for multi-tensor apply
g_16, p_16, m_16, v_16 = [], [], [], []
g_32, p_32, m_32, v_32 = [], [], [], []
for p in group['params']:
if p.grad is None:
continue
if p.grad.data.is_sparse:
raise RuntimeError(
'FusedLAMB does not support sparse gradients, please consider SparseAdam instead')
state = self.state[p]
# State initialization
if len(state) == 0:
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
if p.dtype == torch.float16:
g_16.append(p.grad.data)
p_16.append(p.data)
m_16.append(state['exp_avg'])
v_16.append(state['exp_avg_sq'])
elif p.dtype == torch.float32:
g_32.append(p.grad.data)
p_32.append(p.data)
m_32.append(state['exp_avg'])
v_32.append(state['exp_avg_sq'])
else:
raise RuntimeError('FusedLAMB only support fp16 and fp32.')
if (len(g_16) > 0):
multi_tensor_applier(self.multi_tensor_lamb,
self._dummy_overflow_buf,
[g_16, p_16, m_16, v_16],
group['lr'],
beta1,
beta2,
group['eps'],
group['step'],
bias_correction,
group['weight_decay'],
grad_averaging,
self.adam_w_mode,
global_grad_norm,
max_grad_norm,
self.use_nvlamb)
if (len(g_32) > 0):
multi_tensor_applier(self.multi_tensor_lamb,
self._dummy_overflow_buf,
[g_32, p_32, m_32, v_32],
group['lr'],
beta1,
beta2,
group['eps'],
group['step'],
bias_correction,
group['weight_decay'],
grad_averaging,
self.adam_w_mode,
global_grad_norm,
max_grad_norm,
self.use_nvlamb)
return loss
# modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_sgd.py
import torch
from torch.optim.optimizer import Optimizer, required
from colossalai.registry import OPTIMIZERS
from colossalai.utils import multi_tensor_applier
@OPTIMIZERS.register_module
class FusedSGD(Optimizer):
r"""Implements stochastic gradient descent (optionally with momentum).
Currently GPU-only. Requires ColossalAI to be installed via
``pip install -v --no-cache-dir --global-option="--cuda_ext" ./``.
This version of fused SGD implements 2 fusions.
* Fusion of the SGD update's elementwise operations
* A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
:class:`colossalai.nn.optimizer.FusedSGD` may be used as a drop-in replacement for ``torch.optim.SGD``
:class:`colossalai.nn.optimizer.FusedSGD` may be used with or without Amp.
Nesterov momentum is based on the formula from
`On the importance of initialization and momentum in deep learning`__.
Args:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
lr (float): learning rate
momentum (float, optional): momentum factor (default: 0)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
dampening (float, optional): dampening for momentum (default: 0)
nesterov (bool, optional): enables Nesterov momentum (default: False)
__ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf
.. note::
The implementation of SGD with Momentum/Nesterov subtly differs from
Sutskever et. al. and implementations in some other frameworks.
Considering the specific case of Momentum, the update can be written as
.. math::
v = \rho * v + g \\
p = p - lr * v
where p, g, v and :math:`\rho` denote the parameters, gradient,
velocity, and momentum respectively.
This is in contrast to Sutskever et. al. and
other frameworks which employ an update of the form
.. math::
v = \rho * v + lr * g \\
p = p - v
The Nesterov version is analogously modified.
"""
def __init__(self, params, lr=required, momentum=0, dampening=0,
weight_decay=0, nesterov=False,
wd_after_momentum=False,
materialize_master_grads=True,
set_grad_none=False):
if lr is not required and lr < 0.0:
raise ValueError("Invalid learning rate: {}".format(lr))
if momentum < 0.0:
raise ValueError("Invalid momentum value: {}".format(momentum))
if weight_decay < 0.0:
raise ValueError(
"Invalid weight_decay value: {}".format(weight_decay))
defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
weight_decay=weight_decay, nesterov=nesterov)
if nesterov and (momentum <= 0 or dampening != 0):
raise ValueError(
"Nesterov momentum requires a momentum and zero dampening")
super(FusedSGD, self).__init__(params, defaults)
self.wd_after_momentum = wd_after_momentum
self.materialize_master_grads = materialize_master_grads
self.most_recent_scale = 1.0
self.scale_set_by_backward = False
self.set_grad_none = set_grad_none
if multi_tensor_applier.available:
import colossal_C
# Skip buffer
self._dummy_overflow_buf = torch.tensor(
[0], dtype=torch.int, device=self.param_groups[0]["params"][0].device)
self.multi_tensor_sgd = colossal_C.multi_tensor_sgd
else:
raise RuntimeError('FusedSGD requires cuda extensions')
def __setstate__(self, state):
super(FusedSGD, self).__setstate__(state)
for group in self.param_groups:
group.setdefault('nesterov', False)
def zero_grad(self):
if self.set_grad_none:
for group in self.param_groups:
for p in group['params']:
p.grad = None
else:
super(FusedSGD, self).zero_grad()
def get_momentums(self, params):
momentums = []
first_run = True
for p in params:
param_state = self.state[p]
# torch.optim.SGD initializes momentum in the main loop, we have
# to do it here, and track whether or not we've done so, so that
# momentum application can be skipped in the main kernel.
if 'momentum_buffer' not in param_state:
first_run = True
buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
momentums.append(buf)
else:
first_run = False
momentums.append(param_state['momentum_buffer'])
return momentums, first_run
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
explicit_master_params = (hasattr(self, "_amp_stash") and
hasattr(self._amp_stash, "fp32_from_fp16_groups"))
for gid, group in enumerate(self.param_groups):
weight_decay = group['weight_decay']
momentum = group['momentum']
dampening = group['dampening']
nesterov = group['nesterov']
# For each group, there are 3 possible combinations we need to consider:
# grad_type, param_to_update_type, momentum_type, requires_fp16_model_copy
# 1. fp16, fp16, fp16, No
# 2. fp32, fp32, fp32, No
# 3. fp16, fp32, fp32, Yes
first_runs = [True, True]
# I think a bit of code divergence in exchange for naming clarity is worthwhile
if explicit_master_params:
stash = self._amp_stash
fp32_params = [
p for p in stash.fp32_from_fp32_groups[gid] if p.grad is not None]
fp32_grads = [
p.grad for p in stash.fp32_from_fp32_groups[gid] if p.grad is not None]
fp32_momentums, first_runs[1] = self.get_momentums(fp32_params)
if self.materialize_master_grads:
fp16_model_params = [p for i, p in enumerate(
stash.fp16_groups[gid]) if stash.fp32_from_fp16_groups[gid][i].grad is not None]
fp32_from_fp16_grads = [
p.grad for p in stash.fp32_from_fp16_groups[gid] if p.grad is not None]
fp32_from_fp16_params = [
p for p in stash.fp32_from_fp16_groups[gid] if p.grad is not None]
fp32_from_fp16_momentums, first_runs[0] = self.get_momentums(
fp32_from_fp16_params)
fp16_set = [fp32_from_fp16_grads, fp32_from_fp16_params,
fp32_from_fp16_momentums, fp16_model_params]
else:
fp16_model_params = [
p for p in stash.fp16_groups[gid] if p.grad is not None]
fp16_model_grads = [
p.grad for p in stash.fp16_groups[gid] if p.grad is not None]
fp32_from_fp16_params = [p for i, p in enumerate(
stash.fp32_from_fp16_groups[gid]) if stash.fp16_groups[gid][i].grad is not None]
fp32_from_fp16_momentums, first_runs[0] = self.get_momentums(
fp32_from_fp16_params)
fp16_set = [fp16_model_grads, fp32_from_fp16_params,
fp32_from_fp16_momentums, fp16_model_params]
launch_sets = [fp16_set, [
fp32_grads, fp32_params, fp32_momentums]]
else:
fp16_params = [p for p in group['params'] if (
p.dtype == torch.float16 and p.grad is not None)]
fp16_grads = [p.grad for p in group['params'] if (
p.dtype == torch.float16 and p.grad is not None)]
fp16_momentums, first_runs[0] = self.get_momentums(fp16_params)
fp32_params = [p for p in group['params'] if (
p.dtype == torch.float32 and p.grad is not None)]
fp32_grads = [p.grad for p in group['params'] if (
p.dtype == torch.float32 and p.grad is not None)]
fp32_momentums, first_runs[1] = self.get_momentums(fp32_params)
launch_sets = [[fp16_grads, fp16_params, fp16_momentums],
[fp32_grads, fp32_params, fp32_momentums]]
for s, (launch_set, first_run) in enumerate(zip(launch_sets, first_runs)):
assert len(launch_set[0]) == len(launch_set[1])
assert len(launch_set[0]) == len(launch_set[2])
if len(launch_set[0]) > 0:
multi_tensor_applier(
self.multi_tensor_sgd,
self._dummy_overflow_buf,
launch_set,
weight_decay,
momentum,
dampening,
group['lr'],
nesterov,
first_run,
self.wd_after_momentum,
1.0 / self.most_recent_scale)
self.most_recent_scale = 1.0
self.scale_set_by_backward = False
return loss
"""
Adapted from the pytorch-lamb library at https://github.com/cybertronai/pytorch-lamb
"""
import torch
from torch.optim import Optimizer
from colossalai.registry import OPTIMIZERS
@OPTIMIZERS.register_module
class Lamb(Optimizer):
r"""Implements Lamb algorithm.
It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.
Arguments:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
lr (float, optional): learning rate (default: 1e-3)
betas (Tuple[float, float], optional): coefficients used for computing
running averages of gradient and its square (default: (0.9, 0.999))
eps (float, optional): term added to the denominator to improve
numerical stability (default: 1e-6)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
adam (bool, optional): always use trust ratio = 1, which turns this into
Adam. Useful for comparison purposes.
.. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes:
https://arxiv.org/abs/1904.00962
"""
def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6,
weight_decay=0, adam=False):
if not 0.0 <= lr:
raise ValueError("Invalid learning rate: {}".format(lr))
if not 0.0 <= eps:
raise ValueError("Invalid epsilon value: {}".format(eps))
if not 0.0 <= betas[0] < 1.0:
raise ValueError(
"Invalid beta parameter at index 0: {}".format(betas[0]))
if not 0.0 <= betas[1] < 1.0:
raise ValueError(
"Invalid beta parameter at index 1: {}".format(betas[1]))
defaults = dict(lr=lr, betas=betas, eps=eps,
weight_decay=weight_decay)
self.adam = adam
super(Lamb, self).__init__(params, defaults)
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
loss = closure()
for group in self.param_groups:
for p in group['params']:
if p.grad is None:
continue
grad = p.grad.data
if grad.is_sparse:
raise RuntimeError(
'Lamb does not support sparse gradients, consider SparseAdam instad.')
state = self.state[p]
# State initialization
if len(state) == 0:
state['step'] = 0
# Exponential moving average of gradient values
state['exp_avg'] = torch.zeros_like(p.data)
# Exponential moving average of squared gradient values
state['exp_avg_sq'] = torch.zeros_like(p.data)
exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
beta1, beta2 = group['betas']
state['step'] += 1
# Decay the first and second moment running average coefficient
# m_t
exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
# v_t
exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
# Paper v3 does not use debiasing.
# bias_correction1 = 1 - beta1 ** state['step']
# bias_correction2 = 1 - beta2 ** state['step']
# Apply bias to lr to avoid broadcast.
# * math.sqrt(bias_correction2) / bias_correction1
step_size = group['lr']
weight_norm = p.data.pow(2).sum().sqrt()
adam_step = exp_avg / exp_avg_sq.sqrt().add(group['eps'])
if group['weight_decay'] != 0:
adam_step.add_(p.data, alpha=group['weight_decay'])
adam_norm = adam_step.pow(2).sum().sqrt()
if weight_norm == 0 or adam_norm == 0:
trust_ratio = 1
else:
trust_ratio = weight_norm / adam_norm
state['weight_norm'] = weight_norm
state['adam_norm'] = adam_norm
state['trust_ratio'] = trust_ratio
if self.adam:
trust_ratio = 1
p.data.add_(adam_step, alpha=-step_size * trust_ratio)
return loss
"""Adapted from https://github.com/NUS-HPC-AI-Lab/LARS-ImageNet-PyTorch/blob/main/lars.py"""
from typing import Iterable
import torch
from torch.optim import Optimizer
from colossalai.registry import OPTIMIZERS
@OPTIMIZERS.register_module
class Lars(Optimizer):
r"""Implements the LARS optimizer from `"Large batch training of convolutional networks"
<https://arxiv.org/pdf/1708.03888.pdf>`_.
Args:
params (iterable): iterable of parameters to optimize or dicts defining
parameter groups
lr (float, optional): learning rate (default: 1e-3)
momentum (float, optional): momentum factor (default: 0)
eeta (float, optional): LARS coefficient as used in the paper (default: 1e-3)
weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
"""
def __init__(
self,
params: Iterable[torch.nn.Parameter],
lr=1e-3,
momentum=0,
eeta=1e-3,
weight_decay=0,
epsilon=0.0
) -> None:
if not isinstance(lr, float) or lr < 0.0:
raise ValueError("Invalid learning rate: {}".format(lr))
if momentum < 0.0:
raise ValueError("Invalid momentum value: {}".format(momentum))
if weight_decay < 0.0:
raise ValueError(
"Invalid weight_decay value: {}".format(weight_decay))
if eeta <= 0 or eeta > 1:
raise ValueError("Invalid eeta value: {}".format(eeta))
if epsilon < 0:
raise ValueError("Invalid epsilon value: {}".format(epsilon))
defaults = dict(lr=lr, momentum=momentum,
weight_decay=weight_decay, eeta=eeta, epsilon=epsilon, lars=True)
super().__init__(params, defaults)
@torch.no_grad()
def step(self, closure=None):
"""Performs a single optimization step.
Arguments:
closure (callable, optional): A closure that reevaluates the model
and returns the loss.
"""
loss = None
if closure is not None:
with torch.enable_grad():
loss = closure()
for group in self.param_groups:
weight_decay = group['weight_decay']
momentum = group['momentum']
eeta = group['eeta']
lr = group['lr']
lars = group['lars']
eps = group['epsilon']
for p in group['params']:
if p.grad is None:
continue
decayed_grad = p.grad
scaled_lr = lr
if lars:
w_norm = torch.norm(p)
g_norm = torch.norm(p.grad)
trust_ratio = torch.where(
w_norm > 0 and g_norm > 0,
eeta * w_norm / (g_norm + weight_decay * w_norm + eps),
torch.ones_like(w_norm)
)
trust_ratio.clamp_(0.0, 50)
scaled_lr *= trust_ratio.item()
if weight_decay != 0:
decayed_grad = decayed_grad.add(p, alpha=weight_decay)
decayed_grad = torch.clamp(decayed_grad, -10.0, 10.0)
if momentum != 0:
param_state = self.state[p]
if 'momentum_buffer' not in param_state:
buf = param_state['momentum_buffer'] = torch.clone(
decayed_grad).detach()
else:
buf = param_state['momentum_buffer']
buf.mul_(momentum).add_(decayed_grad)
decayed_grad = buf
p.add_(decayed_grad, alpha=-scaled_lr)
return loss
import torch.distributed.optim as dist_optim
import torch.nn as nn
import torch.optim as optim
import torchvision.models as tv_models
import torchvision.datasets as tv_datasets
from torchvision import transforms
from .registry import Registry
LAYERS = Registry("layers", third_party_library=[nn])
LOSSES = Registry("losses")
MODELS = Registry("models", third_party_library=[tv_models])
OPTIMIZERS = Registry("optimizers", third_party_library=[optim, dist_optim])
DATASETS = Registry("datasets", third_party_library=[tv_datasets])
DIST_GROUP_INITIALIZER = Registry("dist_group_initializer")
GRADIENT_HANDLER = Registry("gradient_handler")
LOSSES = Registry("losses", third_party_library=[nn])
HOOKS = Registry("hooks")
TRANSFORMS = Registry("transforms", third_party_library=[transforms])
DATA_SAMPLERS = Registry("data_samplers")
LR_SCHEDULERS = Registry("lr_schedulers")
SCHEDULE = Registry("schedules")
OPHOOKS = Registry("ophooks")
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from types import ModuleType
from typing import List
class Registry:
"""This is a registry class used to register classes and modules so that a universal
object builder can be enabled.
:param name: The name of the registry
:type name: str
:param third_party_library: List of third party libraries which are used in the
initialization of the register module
:type third_party_library: list, optional
"""
def __init__(self, name: str, third_party_library: List[ModuleType] = None):
self._name = name
self._registry = dict()
self._third_party_lib = third_party_library
@property
def name(self):
return self._name
def register_module(self, module_class):
"""Registers a module represented in `module_class`.
:param module_class: The module to be registered
:type module_class: class
:raises AssertionError: Raises an AssertionError if the module has already been
registered before
:return: The module to be registered, so as to use it normally if via importing
:rtype: class
"""
module_name = module_class.__name__
assert module_name not in self._registry
self._registry[module_name] = module_class
# return so as to use it normally if via importing
return module_class
def get_module(self, module_name: str):
"""Retrieves a module with name `module_name` and returns the module if it has
already been registered before.
:param module_name: The name of the module to be retrieved
:type module_name: str
:raises NameError: Raises a NameError if the module to be retrieved has neither been
registered directly nor as third party modules before
:return: The retrieved module or None
:rtype: :class:`object`
"""
if module_name in self._registry:
return self._registry[module_name]
elif self._third_party_lib is not None:
for lib in self._third_party_lib:
if hasattr(lib, module_name):
return getattr(lib, module_name)
raise NameError(f'Module {module_name} not found in the registry {self.name}')
def has(self, module_name: str):
"""Searches for a module with name `module_name` and returns a boolean value indicating
whether the module has been registered directly or as third party modules before.
:param module_name: The name of the module to be searched for
:type module_name: str
:return: A boolean value indicating whether the module has been registered directly or
as third party modules before
:rtype: bool
"""
found_flag = module_name in self._registry
if self._third_party_lib:
for lib in self._third_party_lib:
if hasattr(lib, module_name):
found_flag = True
break
return found_flag
from ._trainer import Trainer
__all__ = ['Trainer']
from typing import Union, List
from colossalai.context.parallel_mode import ParallelMode
import torch
from torch import Tensor
from torch.utils.data import DataLoader
from tqdm import tqdm
from colossalai.core import global_context as gpc
from colossalai.engine import Engine
from colossalai.engine.schedule import NonPipelineSchedule, BaseSchedule
from colossalai.logging import DistributedLogger
from colossalai.utils import MultiTimer
from colossalai.utils import is_dp_rank_0, is_tp_rank_0, is_no_pp_or_last_stage
from colossalai.trainer.hooks import BaseHook
class Trainer:
"""This a class tending for easy deployments of users' training and evaluation instead of
writing their own scripts. It is similar with ``ignite.engine`` and ``keras.engine``, but is
called `Trainer`.
:param engine: Engine responsible for the process function
:type engine: :class:`Engine`
:param schedule: Schedule responsible for forward and backward steps
:type schedule: :class:`BaseSchedule`, optional
:param timer: Timer used to monitor the whole training
:type timer: :class:`MultiTimer`, optional
:param logger: Logger used to record the whole training
:type logger: :class:`colossalai.logging.DistributedLogger`, optional
"""
def __init__(
self,
engine: Engine,
schedule: BaseSchedule = None,
timer: MultiTimer = None,
logger: DistributedLogger = None,
):
# training-ralated params
self._engine = engine
self._max_epochs = 0
self._cur_epoch = 0
self._max_steps = 0
self._cur_step = 0
self._steps_per_epoch = 0
# misc params
self._logger = logger
self._verbose = logger is not None
# hooks can store states in this dict, and could be consumed by other hooks
self.states = dict()
# build hooks
self.hooks = list()
# multi-timer for time benchmarking
self._timer = timer
# set schedule which specifies the training iteration for the engine
if schedule is None:
schedule = NonPipelineSchedule()
if (gpc.is_initialized(ParallelMode.PIPELINE)
and gpc.get_world_size(ParallelMode.PIPELINE) > 1):
assert not isinstance(
schedule, NonPipelineSchedule
), "NonPipelineSchedule cannot be used for pipeline parallel training, please use PipelineSchedule instead."
self._schedule = schedule
self._schedule.pre_processing(engine)
@property
def cur_epoch(self):
"""Returns the index of the current epoch."""
return self._cur_epoch
@cur_epoch.setter
def cur_epoch(self, epoch: int):
"""Set how many epochs have been processed."""
# allow setter for training resumption
self._cur_epoch = epoch
@property
def cur_step(self):
"""Returns how many iteration steps have been processed."""
return self._cur_step
@property
def max_epochs(self):
return self._max_epochs
@property
def max_steps(self):
return self._max_steps
@property
def steps_per_epoch(self):
return self._steps_per_epoch
@property
def engine(self):
return self._engine
@property
def schedule(self):
return self._schedule
def _set_current_step(self, epoch: int):
"""Sets current step number.
:param epoch: Step number to be set
:type epoch: int
"""
self._cur_step = epoch * self._steps_per_epoch
def _call_timer(self, action: str, item: str, *args, **kwargs) -> None:
"""Call timer funciton with a given timer name.
:param action: Function to be called on timer
:type action: str
:param item: Name of the timer
:type item: str
:param args: args used for action function
:param kwargs: kwargs used for action function
"""
if self._timer is not None:
getattr(self._timer, action)(item, *args, **kwargs)
def _reset_states(self) -> None:
"""Clear trainer states"""
self.states = dict()
def _call_hooks(self, func, output=None):
"""Calls specific hooks in the current time point.
:param func: A string represents the time point
:param output: Output of the model after running a iteration or None in any other time points
:type func: str
:type output: optional
"""
# Only after iter hook will receive output
for hook in self.hooks:
if output is None:
getattr(hook, func)(self)
else:
getattr(hook, func)(self, *output)
@staticmethod
def _should_display_progress(display_progress: bool):
"""Only display progress on DP rank 0, TP rank 0 and PP last rank"""
return (display_progress and is_dp_rank_0() and is_tp_rank_0()
and is_no_pp_or_last_stage())
def _train_epoch(
self,
train_dataloader: DataLoader,
epoch: int = None,
display_progress: bool = False,
return_output_label: bool = True,
):
# set training state
self._engine.train()
data_iter = iter(train_dataloader)
progress = range(self._steps_per_epoch)
if display_progress:
if epoch is None:
progress = tqdm(progress, desc="[Train]")
else:
progress = tqdm(progress, desc=f"[Epoch {epoch} / Train]")
self._call_hooks("before_train_epoch")
self._call_timer(action="start", item="Train-epoch")
for i in progress:
self._call_hooks("before_train_iter")
self._call_timer(action="start", item="Train-step")
# run 1 training step
self.engine.zero_grad()
logits, label, loss = self.schedule.forward_backward_step(
self.engine,
data_iter,
forward_only=False,
return_loss=True,
return_output_label=return_output_label,
)
self.engine.step()
self._call_timer(action="stop",
item="Train-step",
keep_in_history=True)
self._call_hooks("after_train_iter", output=(logits, label, loss))
self._cur_step += 1
if display_progress:
if "step_metrics" in self.states:
progress.set_postfix(**self.states["step_metrics"])
# stop when max iter is reached
if self._exceed_max_step():
break
self._call_timer(action="stop",
item="Train-epoch",
keep_in_history=True)
self._call_hooks("after_train_epoch")
self._call_timer(action="reset", item="Train-epoch")
def _eval(
self,
test_dataloader: DataLoader,
epoch: int = None,
display_progress: bool = False,
return_output_label: bool = True,
):
# switch engine status
self._engine.eval()
data_iter = iter(test_dataloader)
num_steps = len(test_dataloader)
self._call_hooks("before_test")
# prepare progress bar
progress = range(num_steps)
if display_progress:
desc = "Evaluation"
if epoch is not None:
desc = "[Epoch %d / Test]" % epoch
progress = tqdm(progress, desc=desc)
self._call_hooks("before_test_epoch")
self._call_timer(action="start", item="Test-epoch")
with torch.no_grad():
for _ in progress:
self._call_hooks("before_test_iter")
self._call_timer(action="start", item="Test-step")
logits, label, loss = self.schedule.forward_backward_step(
self.engine,
data_iter,
forward_only=True,
return_loss=True,
return_output_label=return_output_label,
)
self._call_timer(action="stop",
item="Test-step",
keep_in_history=True)
self._call_hooks("after_test_iter",
output=(logits, label, loss))
if display_progress:
if "step_metrics" in self.states:
progress.set_postfix(**self.states["step_metrics"])
self._call_timer(action="stop",
item="Test-epoch",
keep_in_history=True)
self._call_hooks("after_test_epoch")
self._call_hooks("after_test")
self._call_timer(action="reset", item="Test-step")
self._call_timer(action="reset", item="Test-epoch")
def _exceed_max_step(self):
return self._max_steps is not None and self._cur_step >= self._max_steps
def fit(
self,
train_dataloader: DataLoader,
epochs: int,
max_steps: int = None,
test_dataloader: DataLoader = None,
test_interval: int = 1,
hooks: List[BaseHook] = None,
display_progress: bool = False,
return_output_label: bool = True,
):
"""Trains the model to fit training data.
:param train_dataloader: DataLoader in training
:param epochs: Maximum number of epoches
:param max_steps: Maximum number of running iterations
:param test_dataloader: DataLoader in testing
:param test_interval: Interval of testing
:param hooks: A list of hooks used in training
:param display_progress: If True, the training progress will be printed
:param return_output_label: If True, the output of model and the label will be returned
:type train_dataloader: DataLoader
:type epochs: int
:type max_steps: int, optional
:type test_dataloader: DataLoader, optional
:type test_interval: int, optional
:type hooks: list, optional
:type display_progress: bool, optional
:type return_output_label: bool, optional
"""
# set epochs and steps, consider gradient accumulation
self._steps_per_epoch = len(train_dataloader)
self._max_steps = max_steps
self._max_epochs = epochs
# check if testing is required
should_test = False
if test_dataloader is not None:
should_test = True
display_progress = self._should_display_progress(display_progress)
# reset hooks
self._reset_states()
if hooks is not None:
assert isinstance(
hooks, list
), f"expected argument hooks be to list, but got {type(hooks)}"
else:
hooks = []
self.hooks = hooks
self.hooks.sort(key=lambda hook: hook.priority)
if self._verbose:
for hook in self.hooks:
self._logger.info(
f"Using {hook.__class__.__name__} for training, priority = {hook.priority}",
ranks=[0],
)
self._logger.info(
"Lower value means higher priority for calling hook function",
ranks=[0])
self._call_hooks("after_hook_is_attached")
self._engine.train()
self._call_hooks("before_train")
# recover step value if resuming training
last_epoch = self._cur_epoch
if self.cur_epoch != 0:
self._set_current_step(last_epoch)
for epoch in range(last_epoch, epochs):
# train for one epoch
self._train_epoch(
train_dataloader=train_dataloader,
epoch=epoch,
display_progress=display_progress,
return_output_label=return_output_label,
)
# start eval
if should_test and epoch % test_interval == 0:
self._eval(
test_dataloader=test_dataloader,
display_progress=display_progress,
epoch=epoch,
return_output_label=return_output_label,
)
self._cur_epoch += 1
# check for termination
if self._exceed_max_step():
self._logger.info(
f"Max number of steps {max_steps} has been reached, training is stopped automatically",
ranks=[0],
)
break
self._call_hooks("after_train")
self._call_timer("reset", "Train-epoch")
def evaluate(
self,
test_dataloader: DataLoader,
hooks: List[BaseHook] = None,
display_progress: bool = False,
return_output_label: bool = True,
):
"""Evaluates the model with testing data.
:param test_dataloader: DataLoader in testing
:param hooks: A list of hooks used in evaluation
:param display_progress: If True, the evaluation progress will be printed
:param return_output_label: If True, the output of model and the label will be returned
:type test_dataloader: DataLoader
:type hooks: list, optional
:type display_progress: bool, optional
:type return_output_label: bool
"""
# set display
display_progress = self._should_display_progress(display_progress)
# reset hooks
self._reset_states()
if hooks is not None:
assert isinstance(
hooks, list
), f"expected argument hooks be to list, but got {type(hooks)}"
else:
hooks = []
self.hooks = hooks
self.hooks.sort(key=lambda hook: hook.priority)
if self._verbose:
for hook in self.hooks:
self._logger.info(
f"Using {hook.__class__.__name__} for training, priority = {hook.priority}",
ranks=[0],
)
self._logger.info(
"Lower value means higher priority for calling hook function",
ranks=[0])
self._call_hooks("after_hook_is_attached")
# eval
self._eval(
test_dataloader=test_dataloader,
display_progress=display_progress,
return_output_label=return_output_label,
)
def predict(self, data: Union[Tensor, List[Tensor]]):
"""Uses trained model to make a prediction for a tensor or a tensor list.
:param data: Data as the input
:type data: Union[Tensor, List[Tensor]
:return: The output of model as the prediction
:rtype: Tensor
"""
# predict without labels
if isinstance(data, (list, tuple)):
assert isinstance(data[0], Tensor)
else:
assert isinstance(data, Tensor)
self._engine.eval()
# prepare a list of (data, label) to make it iterable
# for compatibility with schedule
simple_dataloader = [(data, None)]
data_iter = iter(simple_dataloader)
output, _, _ = self.schedule.forward_backward_step(self.engine,
data_iter,
forward_only=True,
return_loss=False)
return output
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment