delete unused files

da3f0934 · zhuwenwen · c4dd1fd4 · c4dd1fd4 · c4dd1fd4 · c4dd1fd4
Commit da3f0934 authored Apr 23, 2023 by zhuwenwen
20 changed files
--- a/colossalai/nn/optimizer/__pycache__/lamb.cpython-36.pyc
+++ b/colossalai/nn/optimizer/__pycache__/lamb.cpython-36.pyc
--- a/colossalai/nn/optimizer/__pycache__/lamb.cpython-37.pyc
+++ b/colossalai/nn/optimizer/__pycache__/lamb.cpython-37.pyc
--- a/colossalai/nn/optimizer/__pycache__/lars.cpython-36.pyc
+++ b/colossalai/nn/optimizer/__pycache__/lars.cpython-36.pyc
--- a/colossalai/nn/optimizer/__pycache__/lars.cpython-37.pyc
+++ b/colossalai/nn/optimizer/__pycache__/lars.cpython-37.pyc
--- a/colossalai/nn/optimizer/colossalai_optimizer.py
+++ b/colossalai/nn/optimizer/colossalai_optimizer.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-import torch
-import torch.nn as nn
-from torch import Tensor
-from torch.optim import Optimizer
-from colossalai.utils import clip_grad_norm_fp32
-
-
-class ColossalaiOptimizer(Optimizer):
-
-    def __init__(self, optim: Optimizer):
-        self.optim = optim
-
-    @property
-    def param_groups(self):
-        return self.optim.param_groups
-
-    @property
-    def defaults(self):
-        return self.optim.defaults
-
-    def add_param_group(self, *args, **kwargs):
-        return self.optim.add_param_group(*args, **kwargs)
-
-    def step(self, *args, **kwargs):
-        return self.optim.step(*args, **kwargs)
-
-    def zero_grad(self, *args, **kwargs):
-        self.optim.zero_grad(*args, **kwargs)
-
-    def load_state_dict(self, *args, **kwargs):
-        self.optim.load_state_dict(*args, **kwargs)
-
-    def state_dict(self):
-        return self.optim.state_dict()
-
-    def backward(self, loss: Tensor):
-        loss.backward()
-
-    def backward_by_grad(self, tensor: Tensor, grad: Tensor):
-        torch.autograd.backward(tensors=tensor, grad_tensors=grad)
-
-    def clip_grad_norm(self, model: nn.Module, max_norm: float):
-        if max_norm > 0.0:
-            clip_grad_norm_fp32(model.parameters(), max_norm)
--- a/colossalai/nn/optimizer/fused_adam.py
+++ b/colossalai/nn/optimizer/fused_adam.py
-# modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_adam.py
-import torch
-
-from colossalai.registry import OPTIMIZERS
-from colossalai.utils import multi_tensor_applier
-
-
-@OPTIMIZERS.register_module
-class FusedAdam(torch.optim.Optimizer):
-    """Implements Adam algorithm.
-
-    Currently GPU-only.  Requires ColossalAI to be installed via
-    ``pip install -v --no-cache-dir --global-option="--cuda_ext" ./``.
-
-    This version of fused Adam implements 2 fusions.
-
-      * Fusion of the Adam update's elementwise operations
-      * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
-
-    :class:`colossalai.nn.optimizer.FusedAdam` may be used as a drop-in replacement for ``torch.optim.AdamW``,
-    or ``torch.optim.Adam`` with ``adam_w_mode=False``
-
-    :class:`colossalai.nn.optimizer.FusedAdam` may be used with or without Amp. 
-
-    Adam was proposed in `Adam: A Method for Stochastic Optimization`_.
-
-    Arguments:
-        params (iterable): iterable of parameters to optimize or dicts defining
-            parameter groups.
-        lr (float, optional): learning rate. (default: 1e-3)
-        betas (Tuple[float, float], optional): coefficients used for computing
-            running averages of gradient and its square. (default: (0.9, 0.999))
-        eps (float, optional): term added to the denominator to improve
-            numerical stability. (default: 1e-8)
-        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
-        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
-            algorithm from the paper `On the Convergence of Adam and Beyond`_
-            (default: False) NOT SUPPORTED in FusedAdam!
-        adam_w_mode (boolean, optional): whether to apply decoupled weight decay
-            (True, also known as AdamW) or L2 regularization (False). (default: True)
-        set_grad_none (bool, optional): whether to set grad to None when the
-            zero_grad() method is called. (default: True)
-
-    .. _Adam\: A Method for Stochastic Optimization:
-        https://arxiv.org/abs/1412.6980
-    .. _On the Convergence of Adam and Beyond:
-        https://openreview.net/forum?id=ryQu7f-RZ
-    """
-
-    def __init__(self, params, lr=1e-3, bias_correction=True,
-                 betas=(0.9, 0.999), eps=1e-8, adam_w_mode=True,
-                 weight_decay=0., amsgrad=False, set_grad_none=True):
-
-        if amsgrad:
-            raise RuntimeError(
-                'FusedAdam does not support the AMSGrad variant.')
-        defaults = dict(lr=lr, bias_correction=bias_correction,
-                        betas=betas, eps=eps, weight_decay=weight_decay)
-        super(FusedAdam, self).__init__(params, defaults)
-        self.adam_w_mode = 1 if adam_w_mode else 0
-        self.set_grad_none = set_grad_none
-        if multi_tensor_applier.available:
-            import colossal_C
-            # Skip buffer
-            self._dummy_overflow_buf = torch.cuda.IntTensor([0])
-            self.multi_tensor_adam = colossal_C.multi_tensor_adam
-        else:
-            raise RuntimeError('FusedAdam requires cuda extensions')
-
-    def zero_grad(self):
-        if self.set_grad_none:
-            for group in self.param_groups:
-                for p in group['params']:
-                    p.grad = None
-        else:
-            super(FusedAdam, self).zero_grad()
-
-    def step(self, closure=None, grads=None, output_params=None, scale=None, grad_norms=None):
-        """Performs a single optimization step.
-
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-
-        The remaining arguments are deprecated, and are only retained (for the moment) for error-checking purposes.
-        """
-        if any(p is not None for p in [grads, output_params, scale, grad_norms]):
-            raise RuntimeError(
-                'FusedAdam has been updated.  Simply initialize it identically to torch.optim.Adam, and call step() with no arguments.')
-        loss = None
-        if closure is not None:
-            loss = closure()
-
-        for group in self.param_groups:
-            bias_correction = 1 if group['bias_correction'] else 0
-            beta1, beta2 = group['betas']
-
-            # assume the same step across the group for now to simplify things;
-            # a per-parameter step could easily be supported by making it a tensor or passing a list into the kernel
-            if 'step' in group:
-                group['step'] += 1
-            else:
-                group['step'] = 1
-
-            # create lists for multi-tensor apply
-            g_16, p_16, m_16, v_16 = [], [], [], []
-            g_32, p_32, m_32, v_32 = [], [], [], []
-
-            for p in group['params']:
-                if p.grad is None:
-                    continue
-                if p.grad.data.is_sparse:
-                    raise RuntimeError(
-                        'FusedAdam does not support sparse gradients, please consider SparseAdam instead')
-
-                state = self.state[p]
-                # State initialization
-                if len(state) == 0:
-                    # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
-                    # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
-
-                if p.dtype == torch.float16:
-                    g_16.append(p.grad.data)
-                    p_16.append(p.data)
-                    m_16.append(state['exp_avg'])
-                    v_16.append(state['exp_avg_sq'])
-                elif p.dtype == torch.float32:
-                    g_32.append(p.grad.data)
-                    p_32.append(p.data)
-                    m_32.append(state['exp_avg'])
-                    v_32.append(state['exp_avg_sq'])
-                else:
-                    raise RuntimeError('FusedAdam only support fp16 and fp32.')
-
-            if (len(g_16) > 0):
-                multi_tensor_applier(self.multi_tensor_adam,
-                                     self._dummy_overflow_buf,
-                                     [g_16, p_16, m_16, v_16],
-                                     group['lr'],
-                                     beta1,
-                                     beta2,
-                                     group['eps'],
-                                     group['step'],
-                                     self.adam_w_mode,
-                                     bias_correction,
-                                     group['weight_decay'])
-            if (len(g_32) > 0):
-                multi_tensor_applier(self.multi_tensor_adam,
-                                     self._dummy_overflow_buf,
-                                     [g_32, p_32, m_32, v_32],
-                                     group['lr'],
-                                     beta1,
-                                     beta2,
-                                     group['eps'],
-                                     group['step'],
-                                     self.adam_w_mode,
-                                     bias_correction,
-                                     group['weight_decay'])
-
-        return loss
--- a/colossalai/nn/optimizer/fused_lamb.py
+++ b/colossalai/nn/optimizer/fused_lamb.py
-# modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_lamb.py
-import torch
-
-from colossalai.registry import OPTIMIZERS
-from colossalai.utils import multi_tensor_applier
-
-
-@OPTIMIZERS.register_module
-class FusedLAMB(torch.optim.Optimizer):
-    """Implements LAMB algorithm.
-
-    Currently GPU-only.  Requires ColossalAI to be installed via
-    ``pip install -v --no-cache-dir --global-option="--cuda_ext" ./``.
-
-    This version of fused LAMB implements 2 fusions.
-
-      * Fusion of the LAMB update's elementwise operations
-      * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
-
-    :class:`colossalai.nn.optimizer.FusedLAMB`'s usage is identical to any ordinary Pytorch optimizer
-
-    :class:`colossalai.nn.optimizer.FusedLAMB` may be used with or without Amp.
-
-    LAMB was proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.
-
-    Arguments:
-        params (iterable): iterable of parameters to optimize or dicts defining
-            parameter groups.
-        lr (float, optional): learning rate. (default: 1e-3)
-        betas (Tuple[float, float], optional): coefficients used for computing
-            running averages of gradient and its norm. (default: (0.9, 0.999))
-        eps (float, optional): term added to the denominator to improve
-            numerical stability. (default: 1e-6)
-        weight_decay (float, optional): weight decay (L2 penalty) (default: 0.01)
-        amsgrad (boolean, optional): whether to use the AMSGrad variant of this
-            algorithm from the paper `On the Convergence of Adam and Beyond`_
-            NOT SUPPORTED now! (default: False)
-        adam_w_mode (boolean, optional): whether to apply decoupled weight decay
-            (True, also known as AdamW) or L2 regularization (False). (default: True)
-        grad_averaging (bool, optional): whether to apply (1-beta2) to grad when
-            calculating running averages of gradient. (default: True)
-        set_grad_none (bool, optional): whether to set grad to None when the
-            zero_grad() method is called. (default: True)
-        max_grad_norm (float, optional): value used to clip global grad norm
-            (default: 1.0)
-        use_nvlamb (boolean, optional): whether to apply the adaptive learning rate
-            to parameters with 0.0 weight decay (default: False)
-
-    .. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes:
-        https://arxiv.org/abs/1904.00962
-    .. _On the Convergence of Adam and Beyond:
-        https://openreview.net/forum?id=ryQu7f-RZ
-    """
-
-    def __init__(self, params, lr=1e-3, bias_correction=True,
-                 betas=(0.9, 0.999), eps=1e-6, weight_decay=0.01,
-                 amsgrad=False, adam_w_mode=True,
-                 grad_averaging=True, set_grad_none=True,
-                 max_grad_norm=1.0, use_nvlamb=False):
-        if amsgrad:
-            raise RuntimeError(
-                'FusedLAMB does not support the AMSGrad variant.')
-        defaults = dict(lr=lr, bias_correction=bias_correction,
-                        betas=betas, eps=eps, weight_decay=weight_decay,
-                        grad_averaging=grad_averaging,
-                        max_grad_norm=max_grad_norm)
-        super(FusedLAMB, self).__init__(params, defaults)
-        if multi_tensor_applier.available:
-            import colossal_C
-            self.multi_tensor_l2norm = colossal_C.multi_tensor_l2norm
-            # Skip buffer
-            self._dummy_overflow_buf = torch.tensor(
-                [0], dtype=torch.int, device=self.param_groups[0]["params"][0].device)
-            self.multi_tensor_lamb = colossal_C.multi_tensor_lamb
-        else:
-            raise RuntimeError('FusedLAMB requires cuda extensions')
-
-        self.adam_w_mode = 1 if adam_w_mode else 0
-        self.set_grad_none = set_grad_none
-        self.use_nvlamb = use_nvlamb
-
-    def zero_grad(self):
-        if self.set_grad_none:
-            for group in self.param_groups:
-                for p in group['params']:
-                    p.grad = None
-        else:
-            super(FusedLAMB, self).zero_grad()
-
-    def step(self, closure=None):
-        """Performs a single optimization step.
-
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            loss = closure()
-
-        # create separate grad lists for fp32 and fp16 params
-        g_all_32, g_all_16 = [], []
-        for group in self.param_groups:
-            for p in group['params']:
-                if p.grad is None:
-                    continue
-                if p.dtype == torch.float32:
-                    g_all_32.append(p.grad.data)
-                elif p.dtype == torch.float16:
-                    g_all_16.append(p.grad.data)
-                else:
-                    raise RuntimeError('FusedLAMB only support fp16 and fp32.')
-
-        device = self.param_groups[0]["params"][0].device
-        g_norm_32, g_norm_16 = torch.zeros(
-            1, device=device), torch.zeros(1, device=device)
-        # compute grad norm for two lists
-        if len(g_all_32) > 0:
-            g_norm_32 = multi_tensor_applier(self.multi_tensor_l2norm,
-                                             self._dummy_overflow_buf,
-                                             [g_all_32], False)[0]
-        if len(g_all_16) > 0:
-            g_norm_16 = multi_tensor_applier(self.multi_tensor_l2norm,
-                                             self._dummy_overflow_buf,
-                                             [g_all_16], False)[0]
-
-        # blend two grad norms to get global grad norm
-        global_grad_norm = multi_tensor_applier(self.multi_tensor_l2norm,
-                                                self._dummy_overflow_buf,
-                                                [[g_norm_32, g_norm_16]],
-                                                False)[0]
-        max_grad_norm = self.defaults['max_grad_norm']
-
-        for group in self.param_groups:
-            bias_correction = 1 if group['bias_correction'] else 0
-            beta1, beta2 = group['betas']
-            grad_averaging = 1 if group['grad_averaging'] else 0
-
-            # assume the same step across the group for now to simplify things;
-            # a per-parameter step could easily be supported by making it a tensor or passing a list into the kernel
-            if 'step' in group:
-                group['step'] += 1
-            else:
-                group['step'] = 1
-
-            # create lists for multi-tensor apply
-            g_16, p_16, m_16, v_16 = [], [], [], []
-            g_32, p_32, m_32, v_32 = [], [], [], []
-
-            for p in group['params']:
-                if p.grad is None:
-                    continue
-                if p.grad.data.is_sparse:
-                    raise RuntimeError(
-                        'FusedLAMB does not support sparse gradients, please consider SparseAdam instead')
-
-                state = self.state[p]
-                # State initialization
-                if len(state) == 0:
-                    # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
-                    # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
-
-                if p.dtype == torch.float16:
-                    g_16.append(p.grad.data)
-                    p_16.append(p.data)
-                    m_16.append(state['exp_avg'])
-                    v_16.append(state['exp_avg_sq'])
-                elif p.dtype == torch.float32:
-                    g_32.append(p.grad.data)
-                    p_32.append(p.data)
-                    m_32.append(state['exp_avg'])
-                    v_32.append(state['exp_avg_sq'])
-                else:
-                    raise RuntimeError('FusedLAMB only support fp16 and fp32.')
-
-            if (len(g_16) > 0):
-                multi_tensor_applier(self.multi_tensor_lamb,
-                                     self._dummy_overflow_buf,
-                                     [g_16, p_16, m_16, v_16],
-                                     group['lr'],
-                                     beta1,
-                                     beta2,
-                                     group['eps'],
-                                     group['step'],
-                                     bias_correction,
-                                     group['weight_decay'],
-                                     grad_averaging,
-                                     self.adam_w_mode,
-                                     global_grad_norm,
-                                     max_grad_norm,
-                                     self.use_nvlamb)
-            if (len(g_32) > 0):
-                multi_tensor_applier(self.multi_tensor_lamb,
-                                     self._dummy_overflow_buf,
-                                     [g_32, p_32, m_32, v_32],
-                                     group['lr'],
-                                     beta1,
-                                     beta2,
-                                     group['eps'],
-                                     group['step'],
-                                     bias_correction,
-                                     group['weight_decay'],
-                                     grad_averaging,
-                                     self.adam_w_mode,
-                                     global_grad_norm,
-                                     max_grad_norm,
-                                     self.use_nvlamb)
-
-        return loss
--- a/colossalai/nn/optimizer/fused_sgd.py
+++ b/colossalai/nn/optimizer/fused_sgd.py
-# modified from https://github.com/NVIDIA/apex/blob/master/apex/optimizers/fused_sgd.py
-import torch
-from torch.optim.optimizer import Optimizer, required
-
-from colossalai.registry import OPTIMIZERS
-from colossalai.utils import multi_tensor_applier
-
-
-@OPTIMIZERS.register_module
-class FusedSGD(Optimizer):
-    r"""Implements stochastic gradient descent (optionally with momentum).
-
-    Currently GPU-only.  Requires ColossalAI to be installed via
-    ``pip install -v --no-cache-dir --global-option="--cuda_ext" ./``.
-
-    This version of fused SGD implements 2 fusions.
-
-      * Fusion of the SGD update's elementwise operations
-      * A multi-tensor apply launch that batches the elementwise updates applied to all the model's parameters into one or a few kernel launches.
-
-    :class:`colossalai.nn.optimizer.FusedSGD` may be used as a drop-in replacement for ``torch.optim.SGD``
-
-    :class:`colossalai.nn.optimizer.FusedSGD` may be used with or without Amp. 
-
-    Nesterov momentum is based on the formula from
-    `On the importance of initialization and momentum in deep learning`__.
-
-    Args:
-        params (iterable): iterable of parameters to optimize or dicts defining
-            parameter groups
-        lr (float): learning rate
-        momentum (float, optional): momentum factor (default: 0)
-        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
-        dampening (float, optional): dampening for momentum (default: 0)
-        nesterov (bool, optional): enables Nesterov momentum (default: False)
-
-    __ http://www.cs.toronto.edu/%7Ehinton/absps/momentum.pdf
-
-    .. note::
-        The implementation of SGD with Momentum/Nesterov subtly differs from
-        Sutskever et al. and implementations in some other frameworks.
-        Considering the specific case of Momentum, the update can be written as
-
-        .. math::
-                  v = \rho * v + g \\
-                  p = p - lr * v
-
-        where p, g, v and :math:`\rho` denote the parameters, gradient,
-        velocity, and momentum respectively.
-        This is in contrast to Sutskever et al. and
-        other frameworks which employ an update of the form
-
-        .. math::
-             v = \rho * v + lr * g \\
-             p = p - v
-
-        The Nesterov version is analogously modified.
-    """
-
-    def __init__(self, params, lr=required, momentum=0, dampening=0,
-                 weight_decay=0, nesterov=False,
-                 wd_after_momentum=False,
-                 materialize_master_grads=True,
-                 set_grad_none=False):
-        if lr is not required and lr < 0.0:
-            raise ValueError("Invalid learning rate: {}".format(lr))
-        if momentum < 0.0:
-            raise ValueError("Invalid momentum value: {}".format(momentum))
-        if weight_decay < 0.0:
-            raise ValueError(
-                "Invalid weight_decay value: {}".format(weight_decay))
-
-        defaults = dict(lr=lr, momentum=momentum, dampening=dampening,
-                        weight_decay=weight_decay, nesterov=nesterov)
-        if nesterov and (momentum <= 0 or dampening != 0):
-            raise ValueError(
-                "Nesterov momentum requires a momentum and zero dampening")
-        super(FusedSGD, self).__init__(params, defaults)
-
-        self.wd_after_momentum = wd_after_momentum
-        self.materialize_master_grads = materialize_master_grads
-        self.most_recent_scale = 1.0
-        self.scale_set_by_backward = False
-        self.set_grad_none = set_grad_none
-
-        if multi_tensor_applier.available:
-            import colossal_C
-            # Skip buffer
-            self._dummy_overflow_buf = torch.tensor(
-                [0], dtype=torch.int, device=self.param_groups[0]["params"][0].device)
-            self.multi_tensor_sgd = colossal_C.multi_tensor_sgd
-        else:
-            raise RuntimeError('FusedSGD requires cuda extensions')
-
-    def __setstate__(self, state):
-        super(FusedSGD, self).__setstate__(state)
-        for group in self.param_groups:
-            group.setdefault('nesterov', False)
-
-    def zero_grad(self):
-        if self.set_grad_none:
-            for group in self.param_groups:
-                for p in group['params']:
-                    p.grad = None
-        else:
-            super(FusedSGD, self).zero_grad()
-
-    def get_momentums(self, params):
-        momentums = []
-        first_run = True
-        for p in params:
-            param_state = self.state[p]
-            # torch.optim.SGD initializes momentum in the main loop, we have
-            # to do it here, and track whether or not we've done so, so that
-            # momentum application can be skipped in the main kernel.
-            if 'momentum_buffer' not in param_state:
-                first_run = True
-                buf = param_state['momentum_buffer'] = torch.zeros_like(p.data)
-                momentums.append(buf)
-            else:
-                first_run = False
-                momentums.append(param_state['momentum_buffer'])
-        return momentums, first_run
-
-    def step(self, closure=None):
-        """Performs a single optimization step.
-
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-        """
-        loss = None
-        if closure is not None:
-            loss = closure()
-
-        explicit_master_params = (hasattr(self, "_amp_stash") and
-                                  hasattr(self._amp_stash, "fp32_from_fp16_groups"))
-
-        for gid, group in enumerate(self.param_groups):
-            weight_decay = group['weight_decay']
-            momentum = group['momentum']
-            dampening = group['dampening']
-            nesterov = group['nesterov']
-
-            # For each group, there are 3 possible combinations we need to consider:
-            # grad_type, param_to_update_type, momentum_type, requires_fp16_model_copy
-            # 1. fp16, fp16, fp16, No
-            # 2. fp32, fp32, fp32, No
-            # 3. fp16, fp32, fp32, Yes
-
-            first_runs = [True, True]
-
-            # I think a bit of code divergence in exchange for naming clarity is worthwhile
-            if explicit_master_params:
-                stash = self._amp_stash
-
-                fp32_params = [
-                    p for p in stash.fp32_from_fp32_groups[gid] if p.grad is not None]
-                fp32_grads = [
-                    p.grad for p in stash.fp32_from_fp32_groups[gid] if p.grad is not None]
-                fp32_momentums, first_runs[1] = self.get_momentums(fp32_params)
-
-                if self.materialize_master_grads:
-                    fp16_model_params = [p for i, p in enumerate(
-                        stash.fp16_groups[gid]) if stash.fp32_from_fp16_groups[gid][i].grad is not None]
-                    fp32_from_fp16_grads = [
-                        p.grad for p in stash.fp32_from_fp16_groups[gid] if p.grad is not None]
-                    fp32_from_fp16_params = [
-                        p for p in stash.fp32_from_fp16_groups[gid] if p.grad is not None]
-                    fp32_from_fp16_momentums, first_runs[0] = self.get_momentums(
-                        fp32_from_fp16_params)
-
-                    fp16_set = [fp32_from_fp16_grads, fp32_from_fp16_params,
-                                fp32_from_fp16_momentums, fp16_model_params]
-                else:
-                    fp16_model_params = [
-                        p for p in stash.fp16_groups[gid] if p.grad is not None]
-                    fp16_model_grads = [
-                        p.grad for p in stash.fp16_groups[gid] if p.grad is not None]
-                    fp32_from_fp16_params = [p for i, p in enumerate(
-                        stash.fp32_from_fp16_groups[gid]) if stash.fp16_groups[gid][i].grad is not None]
-                    fp32_from_fp16_momentums, first_runs[0] = self.get_momentums(
-                        fp32_from_fp16_params)
-
-                    fp16_set = [fp16_model_grads, fp32_from_fp16_params,
-                                fp32_from_fp16_momentums, fp16_model_params]
-
-                launch_sets = [fp16_set, [
-                    fp32_grads, fp32_params, fp32_momentums]]
-            else:
-                fp16_params = [p for p in group['params'] if (
-                    p.dtype == torch.float16 and p.grad is not None)]
-                fp16_grads = [p.grad for p in group['params'] if (
-                    p.dtype == torch.float16 and p.grad is not None)]
-                fp16_momentums, first_runs[0] = self.get_momentums(fp16_params)
-
-                fp32_params = [p for p in group['params'] if (
-                    p.dtype == torch.float32 and p.grad is not None)]
-                fp32_grads = [p.grad for p in group['params'] if (
-                    p.dtype == torch.float32 and p.grad is not None)]
-                fp32_momentums, first_runs[1] = self.get_momentums(fp32_params)
-
-                launch_sets = [[fp16_grads, fp16_params, fp16_momentums],
-                               [fp32_grads, fp32_params, fp32_momentums]]
-
-            for s, (launch_set, first_run) in enumerate(zip(launch_sets, first_runs)):
-                assert len(launch_set[0]) == len(launch_set[1])
-                assert len(launch_set[0]) == len(launch_set[2])
-                if len(launch_set[0]) > 0:
-                    multi_tensor_applier(
-                        self.multi_tensor_sgd,
-                        self._dummy_overflow_buf,
-                        launch_set,
-                        weight_decay,
-                        momentum,
-                        dampening,
-                        group['lr'],
-                        nesterov,
-                        first_run,
-                        self.wd_after_momentum,
-                        1.0 / self.most_recent_scale)
-
-        self.most_recent_scale = 1.0
-        self.scale_set_by_backward = False
-
-        return loss
--- a/colossalai/nn/optimizer/lamb.py
+++ b/colossalai/nn/optimizer/lamb.py
-"""
-Adapted from the pytorch-lamb library at https://github.com/cybertronai/pytorch-lamb
-"""
-
-import torch
-from torch.optim import Optimizer
-
-from colossalai.registry import OPTIMIZERS
-
-
-@OPTIMIZERS.register_module
-class Lamb(Optimizer):
-    r"""Implements Lamb algorithm.
-    It has been proposed in `Large Batch Optimization for Deep Learning: Training BERT in 76 minutes`_.
-
-    The update direction is the Adam direction (without bias correction) plus an
-    optional weight-decay term; it is then rescaled per parameter tensor by the
-    trust ratio ``||w|| / ||update||``.
-
-    Arguments:
-        params (iterable): iterable of parameters to optimize or dicts defining
-            parameter groups
-        lr (float, optional): learning rate (default: 1e-3)
-        betas (Tuple[float, float], optional): coefficients used for computing
-            running averages of gradient and its square (default: (0.9, 0.999))
-        eps (float, optional): term added to the denominator to improve
-            numerical stability (default: 1e-6)
-        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
-        adam (bool, optional): always use trust ratio = 1, which turns this into
-            Adam. Useful for comparison purposes. (default: False)
-
-    Raises:
-        ValueError: if ``lr`` or ``eps`` is negative, or a beta lies outside ``[0, 1)``.
-
-    .. _Large Batch Optimization for Deep Learning\: Training BERT in 76 minutes:
-        https://arxiv.org/abs/1904.00962
-    """
-
-    def __init__(self, params, lr=1e-3, betas=(0.9, 0.999), eps=1e-6,
-                 weight_decay=0, adam=False):
-        # Validate the hyper-parameters before handing them to the base class.
-        if not 0.0 <= lr:
-            raise ValueError("Invalid learning rate: {}".format(lr))
-        if not 0.0 <= eps:
-            raise ValueError("Invalid epsilon value: {}".format(eps))
-        if not 0.0 <= betas[0] < 1.0:
-            raise ValueError(
-                "Invalid beta parameter at index 0: {}".format(betas[0]))
-        if not 0.0 <= betas[1] < 1.0:
-            raise ValueError(
-                "Invalid beta parameter at index 1: {}".format(betas[1]))
-        defaults = dict(lr=lr, betas=betas, eps=eps,
-                        weight_decay=weight_decay)
-        # `adam` is stored on the optimizer itself, not in the per-group defaults,
-        # so it applies to every parameter group.
-        self.adam = adam
-        super(Lamb, self).__init__(params, defaults)
-
-    def step(self, closure=None):
-        """Performs a single optimization step.
-
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-
-        Returns:
-            The value returned by ``closure``, or ``None`` when no closure is given.
-
-        Raises:
-            RuntimeError: if a parameter has a sparse gradient.
-        """
-        loss = None
-        if closure is not None:
-            loss = closure()
-
-        for group in self.param_groups:
-            for p in group['params']:
-                # Parameters that received no gradient are left untouched.
-                if p.grad is None:
-                    continue
-                grad = p.grad.data
-                if grad.is_sparse:
-                    raise RuntimeError(
-                        'Lamb does not support sparse gradients, consider SparseAdam instad.')
-
-                state = self.state[p]
-
-                # State initialization
-                if len(state) == 0:
-                    state['step'] = 0
-                    # Exponential moving average of gradient values
-                    state['exp_avg'] = torch.zeros_like(p.data)
-                    # Exponential moving average of squared gradient values
-                    state['exp_avg_sq'] = torch.zeros_like(p.data)
-
-                exp_avg, exp_avg_sq = state['exp_avg'], state['exp_avg_sq']
-                beta1, beta2 = group['betas']
-
-                state['step'] += 1
-
-                # Decay the first and second moment running average coefficient
-                # (both buffers are updated in place inside `state`).
-                # m_t
-                exp_avg.mul_(beta1).add_(grad, alpha=1 - beta1)
-                # v_t
-                exp_avg_sq.mul_(beta2).addcmul_(grad, grad, value=1 - beta2)
-
-                # Paper v3 does not use debiasing.
-                # bias_correction1 = 1 - beta1 ** state['step']
-                # bias_correction2 = 1 - beta2 ** state['step']
-                # Apply bias to lr to avoid broadcast.
-                # * math.sqrt(bias_correction2) / bias_correction1
-                step_size = group['lr']
-
-                # L2 norm of the current weights (numerator of the trust ratio).
-                weight_norm = p.data.pow(2).sum().sqrt()
-
-                # Adam direction; weight decay is folded in before the norm is taken.
-                adam_step = exp_avg / exp_avg_sq.sqrt().add(group['eps'])
-                if group['weight_decay'] != 0:
-                    adam_step.add_(p.data, alpha=group['weight_decay'])
-
-                adam_norm = adam_step.pow(2).sum().sqrt()
-                # Fall back to a neutral ratio when either norm is zero to avoid
-                # a zero or undefined scaling factor.
-                if weight_norm == 0 or adam_norm == 0:
-                    trust_ratio = 1
-                else:
-                    trust_ratio = weight_norm / adam_norm
-                # Keep the per-parameter diagnostics in the optimizer state; note that
-                # the stored trust ratio is the computed one, even when `adam` is set.
-                state['weight_norm'] = weight_norm
-                state['adam_norm'] = adam_norm
-                state['trust_ratio'] = trust_ratio
-                if self.adam:
-                    trust_ratio = 1
-
-                p.data.add_(adam_step, alpha=-step_size * trust_ratio)
-
-        return loss
--- a/colossalai/nn/optimizer/lars.py
+++ b/colossalai/nn/optimizer/lars.py
-"""Adapted from https://github.com/NUS-HPC-AI-Lab/LARS-ImageNet-PyTorch/blob/main/lars.py"""
-
-from typing import Iterable
-
-import torch
-from torch.optim import Optimizer
-
-from colossalai.registry import OPTIMIZERS
-
-
-@OPTIMIZERS.register_module
-class Lars(Optimizer):
-    r"""Implements the LARS optimizer from `"Large batch training of convolutional networks"
-    <https://arxiv.org/pdf/1708.03888.pdf>`_.
-
-    Args:
-        params (iterable): iterable of parameters to optimize or dicts defining
-            parameter groups
-        lr (float, optional): learning rate (default: 1e-3)
-        momentum (float, optional): momentum factor (default: 0)
-        eeta (float, optional): LARS coefficient as used in the paper (default: 1e-3)
-        weight_decay (float, optional): weight decay (L2 penalty) (default: 0)
-        epsilon (float, optional): term added to the denominator of the trust
-            ratio (default: 0.0)
-
-    Raises:
-        ValueError: if any hyper-parameter is outside its valid range.
-    """
-
-    def __init__(
-            self,
-            params: Iterable[torch.nn.Parameter],
-            lr=1e-3,
-            momentum=0,
-            eeta=1e-3,
-            weight_decay=0,
-            epsilon=0.0
-    ) -> None:
-        # Integer learning rates (e.g. ``lr=1``) are valid numbers and are accepted;
-        # non-numeric values are still rejected with a ValueError.
-        if not isinstance(lr, (int, float)) or lr < 0.0:
-            raise ValueError("Invalid learning rate: {}".format(lr))
-        if momentum < 0.0:
-            raise ValueError("Invalid momentum value: {}".format(momentum))
-        if weight_decay < 0.0:
-            raise ValueError(
-                "Invalid weight_decay value: {}".format(weight_decay))
-        if eeta <= 0 or eeta > 1:
-            raise ValueError("Invalid eeta value: {}".format(eeta))
-        if epsilon < 0:
-            raise ValueError("Invalid epsilon value: {}".format(epsilon))
-        # ``lars=True`` is a per-group switch: a param group may override it to
-        # disable the layer-wise scaling for its parameters.
-        defaults = dict(lr=lr, momentum=momentum,
-                        weight_decay=weight_decay, eeta=eeta, epsilon=epsilon, lars=True)
-
-        super().__init__(params, defaults)
-
-    @torch.no_grad()
-    def step(self, closure=None):
-        """Performs a single optimization step.
-
-        Arguments:
-            closure (callable, optional): A closure that reevaluates the model
-                and returns the loss.
-
-        Returns:
-            The value returned by ``closure``, or ``None`` when no closure is given.
-        """
-        loss = None
-        if closure is not None:
-            # The method runs under no_grad, so gradients must be re-enabled
-            # while the closure recomputes the loss.
-            with torch.enable_grad():
-                loss = closure()
-
-        for group in self.param_groups:
-            weight_decay = group['weight_decay']
-            momentum = group['momentum']
-            eeta = group['eeta']
-            lr = group['lr']
-            lars = group['lars']
-            eps = group['epsilon']
-
-            for p in group['params']:
-                if p.grad is None:
-                    continue
-                decayed_grad = p.grad
-                scaled_lr = lr
-                if lars:
-                    w_norm = torch.norm(p)
-                    g_norm = torch.norm(p.grad)
-                    # Element-wise `&` builds the boolean condition tensor directly
-                    # instead of relying on Python truthiness of tensors.
-                    trust_ratio = torch.where(
-                        (w_norm > 0) & (g_norm > 0),
-                        eeta * w_norm / (g_norm + weight_decay * w_norm + eps),
-                        torch.ones_like(w_norm)
-                    )
-                    trust_ratio.clamp_(0.0, 50)
-                    scaled_lr *= trust_ratio.item()
-                    # Weight decay is only applied together with LARS scaling.
-                    if weight_decay != 0:
-                        decayed_grad = decayed_grad.add(p, alpha=weight_decay)
-                # Clip the (decayed) gradient element-wise to [-10, 10].
-                decayed_grad = torch.clamp(decayed_grad, -10.0, 10.0)
-
-                if momentum != 0:
-                    param_state = self.state[p]
-                    if 'momentum_buffer' not in param_state:
-                        # First step: the buffer starts as a copy of the gradient.
-                        buf = param_state['momentum_buffer'] = torch.clone(
-                            decayed_grad).detach()
-                    else:
-                        buf = param_state['momentum_buffer']
-                        buf.mul_(momentum).add_(decayed_grad)
-                    decayed_grad = buf
-
-                p.add_(decayed_grad, alpha=-scaled_lr)
-
-        return loss
--- a/colossalai/registry/__init__.py
+++ b/colossalai/registry/__init__.py
-import torch.distributed.optim as dist_optim
-import torch.nn as nn
-import torch.optim as optim
-import torchvision.models as tv_models
-import torchvision.datasets as tv_datasets
-from torchvision import transforms
-
-from .registry import Registry
-
-# Global registries used by the object builders. Each registry resolves names
-# registered directly on it and, where given, attributes of the listed
-# third-party modules.
-LAYERS = Registry("layers", third_party_library=[nn])
-MODELS = Registry("models", third_party_library=[tv_models])
-OPTIMIZERS = Registry("optimizers", third_party_library=[optim, dist_optim])
-DATASETS = Registry("datasets", third_party_library=[tv_datasets])
-DIST_GROUP_INITIALIZER = Registry("dist_group_initializer")
-GRADIENT_HANDLER = Registry("gradient_handler")
-# LOSSES is defined exactly once; it falls back to torch.nn for built-in losses.
-LOSSES = Registry("losses", third_party_library=[nn])
-HOOKS = Registry("hooks")
-TRANSFORMS = Registry("transforms", third_party_library=[transforms])
-DATA_SAMPLERS = Registry("data_samplers")
-LR_SCHEDULERS = Registry("lr_schedulers")
-SCHEDULE = Registry("schedules")
-OPHOOKS = Registry("ophooks")
--- a/colossalai/registry/__pycache__/__init__.cpython-36.pyc
+++ b/colossalai/registry/__pycache__/__init__.cpython-36.pyc
--- a/colossalai/registry/__pycache__/__init__.cpython-37.pyc
+++ b/colossalai/registry/__pycache__/__init__.cpython-37.pyc
--- a/colossalai/registry/__pycache__/registry.cpython-36.pyc
+++ b/colossalai/registry/__pycache__/registry.cpython-36.pyc
--- a/colossalai/registry/__pycache__/registry.cpython-37.pyc
+++ b/colossalai/registry/__pycache__/registry.cpython-37.pyc
--- a/colossalai/registry/registry.py
+++ b/colossalai/registry/registry.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-
-from types import ModuleType
-from typing import List
-
-
-class Registry:
-    """This is a registry class used to register classes and modules so that a universal
-    object builder can be enabled.
-
-    :param name: The name of the registry
-    :type name: str
-    :param third_party_library: List of third party libraries which are used in the
-        initialization of the register module
-    :type third_party_library: list, optional
-    """
-
-    def __init__(self, name: str, third_party_library: List[ModuleType] = None):
-        self._name = name
-        # Maps a class name to the class registered under it.
-        self._registry = dict()
-        self._third_party_lib = third_party_library
-
-    @property
-    def name(self):
-        """Returns the name of the registry."""
-        return self._name
-
-    def register_module(self, module_class):
-        """Registers a module represented in `module_class`.
-
-        :param module_class: The module to be registered
-        :type module_class: class
-        :raises AssertionError: Raises an AssertionError if the module has already been
-            registered before
-        :return: The module to be registered, so as to use it normally if via importing
-        :rtype: class
-        """
-        module_name = module_class.__name__
-        assert module_name not in self._registry
-        self._registry[module_name] = module_class
-
-        # return so as to use it normally if via importing
-        return module_class
-
-    def get_module(self, module_name: str):
-        """Retrieves a module with name `module_name` and returns the module if it has
-        already been registered before.
-
-        Directly registered modules take precedence over attributes of the third
-        party libraries, which are searched in the order they were given.
-
-        :param module_name: The name of the module to be retrieved
-        :type module_name: str
-        :raises NameError: Raises a NameError if the module to be retrieved has neither been
-            registered directly nor as third party modules before
-        :return: The retrieved module
-        :rtype: :class:`object`
-        """
-        if module_name in self._registry:
-            return self._registry[module_name]
-        if self._third_party_lib is not None:
-            for lib in self._third_party_lib:
-                if hasattr(lib, module_name):
-                    return getattr(lib, module_name)
-        # Raise for every miss, including registries without third party
-        # libraries, instead of silently returning None.
-        raise NameError(f'Module {module_name} not found in the registry {self.name}')
-
-    def has(self, module_name: str):
-        """Searches for a module with name `module_name` and returns a boolean value indicating
-        whether the module has been registered directly or as third party modules before.
-
-        :param module_name: The name of the module to be searched for
-        :type module_name: str
-        :return: A boolean value indicating whether the module has been registered directly or
-            as third party modules before
-        :rtype: bool
-        """
-        if module_name in self._registry:
-            return True
-        if self._third_party_lib:
-            return any(hasattr(lib, module_name) for lib in self._third_party_lib)
-        return False
--- a/colossalai/trainer/__init__.py
+++ b/colossalai/trainer/__init__.py
-from ._trainer import Trainer
-
-__all__ = ['Trainer']
--- a/colossalai/trainer/__pycache__/__init__.cpython-37.pyc
+++ b/colossalai/trainer/__pycache__/__init__.cpython-37.pyc
--- a/colossalai/trainer/__pycache__/_trainer.cpython-37.pyc
+++ b/colossalai/trainer/__pycache__/_trainer.cpython-37.pyc
--- a/colossalai/trainer/_trainer.py
+++ b/colossalai/trainer/_trainer.py
-from typing import Union, List
-from colossalai.context.parallel_mode import ParallelMode
-
-import torch
-from torch import Tensor
-from torch.utils.data import DataLoader
-from tqdm import tqdm
-
-from colossalai.core import global_context as gpc
-
-from colossalai.engine import Engine
-from colossalai.engine.schedule import NonPipelineSchedule, BaseSchedule
-from colossalai.logging import DistributedLogger
-from colossalai.utils import MultiTimer
-from colossalai.utils import is_dp_rank_0, is_tp_rank_0, is_no_pp_or_last_stage
-from colossalai.trainer.hooks import BaseHook
-
-
-class Trainer:
-    """This is a class intended to ease deployment of users' training and evaluation, so
-    that they do not have to write their own scripts. It is similar to ``ignite.engine``
-    and ``keras.engine``, but is called `Trainer`.
-
-    :param engine: Engine responsible for the process function
-    :type engine: :class:`Engine`
-    :param schedule: Schedule responsible for forward and backward steps
-    :type schedule: :class:`BaseSchedule`, optional
-    :param timer: Timer used to monitor the whole training
-    :type timer: :class:`MultiTimer`, optional
-    :param logger: Logger used to record the whole training
-    :type logger: :class:`colossalai.logging.DistributedLogger`, optional
-    """
-    def __init__(
-            self,
-            engine: Engine,
-            schedule: BaseSchedule = None,
-            timer: MultiTimer = None,
-            logger: DistributedLogger = None,
-    ):
-        # training-related params
-        self._engine = engine
-        self._max_epochs = 0
-        self._cur_epoch = 0
-        self._max_steps = 0
-        self._cur_step = 0
-        self._steps_per_epoch = 0
-
-        # misc params; logging is enabled only when a logger is supplied
-        self._logger = logger
-        self._verbose = logger is not None
-
-        # hooks can store states in this dict, and could be consumed by other hooks
-        self.states = dict()
-
-        # build hooks
-        self.hooks = list()
-
-        # multi-timer for time benchmarking
-        self._timer = timer
-
-        # set schedule which specifies the training iteration for the engine
-        if schedule is None:
-            schedule = NonPipelineSchedule()
-        if (gpc.is_initialized(ParallelMode.PIPELINE)
-                and gpc.get_world_size(ParallelMode.PIPELINE) > 1):
-            assert not isinstance(
-                schedule, NonPipelineSchedule
-            ), "NonPipelineSchedule cannot be used for pipeline parallel training, please use PipelineSchedule instead."
-        self._schedule = schedule
-        self._schedule.pre_processing(engine)
-
-    @property
-    def cur_epoch(self):
-        """Returns the index of the current epoch."""
-        return self._cur_epoch
-
-    @cur_epoch.setter
-    def cur_epoch(self, epoch: int):
-        """Set how many epochs have been processed."""
-        # allow setter for training resumption
-        self._cur_epoch = epoch
-
-    @property
-    def cur_step(self):
-        """Returns how many iteration steps have been processed."""
-        return self._cur_step
-
-    @property
-    def max_epochs(self):
-        """Returns the epoch limit passed to the latest :meth:`fit` call."""
-        return self._max_epochs
-
-    @property
-    def max_steps(self):
-        """Returns the step limit passed to the latest :meth:`fit` call."""
-        return self._max_steps
-
-    @property
-    def steps_per_epoch(self):
-        """Returns the number of steps per epoch (length of the training dataloader)."""
-        return self._steps_per_epoch
-
-    @property
-    def engine(self):
-        """Returns the engine driven by this trainer."""
-        return self._engine
-
-    @property
-    def schedule(self):
-        """Returns the schedule used for forward and backward steps."""
-        return self._schedule
-
-    def _set_current_step(self, epoch: int):
-        """Sets current step number.
-
-        :param epoch: Number of finished epochs; the step counter becomes
-            ``epoch * steps_per_epoch``
-        :type epoch: int
-        """
-        self._cur_step = epoch * self._steps_per_epoch
-
-    def _call_timer(self, action: str, item: str, *args, **kwargs) -> None:
-        """Call timer function with a given timer name. Does nothing when no timer is set.
-
-        :param action: Function to be called on timer
-        :type action: str
-        :param item: Name of the timer
-        :type item: str
-        :param args: args used for action function
-        :param kwargs: kwargs used for action function
-        """
-
-        if self._timer is not None:
-            getattr(self._timer, action)(item, *args, **kwargs)
-
-    def _reset_states(self) -> None:
-        """Clear trainer states"""
-        self.states = dict()
-
-    def _call_hooks(self, func, output=None):
-        """Calls specific hooks in the current time point.
-
-        :param func: A string represents the time point
-        :param output: Output of the model after running an iteration or None in any other time points
-        :type func: str
-        :type output: optional
-        """
-        # Only after iter hook will receive output
-        for hook in self.hooks:
-            if output is None:
-                getattr(hook, func)(self)
-            else:
-                getattr(hook, func)(self, *output)
-
-    def _attach_hooks(self, hooks) -> None:
-        """Validates, sorts and installs ``hooks``, then fires ``after_hook_is_attached``.
-
-        :param hooks: A list of hooks, or None for no hooks
-        :type hooks: list, optional
-        :raises AssertionError: If ``hooks`` is neither None nor a list
-        """
-        if hooks is not None:
-            assert isinstance(
-                hooks, list
-            ), f"expected argument hooks be to list, but got {type(hooks)}"
-        else:
-            hooks = []
-        self.hooks = hooks
-        # the caller's list is sorted in place by ascending priority value
-        self.hooks.sort(key=lambda hook: hook.priority)
-        if self._verbose:
-            for hook in self.hooks:
-                self._logger.info(
-                    f"Using {hook.__class__.__name__} for training, priority = {hook.priority}",
-                    ranks=[0],
-                )
-            self._logger.info(
-                "Lower value means higher priority for calling hook function",
-                ranks=[0])
-        self._call_hooks("after_hook_is_attached")
-
-    @staticmethod
-    def _should_display_progress(display_progress: bool):
-        """Only display progress on DP rank 0, TP rank 0 and PP last rank"""
-        return (display_progress and is_dp_rank_0() and is_tp_rank_0()
-                and is_no_pp_or_last_stage())
-
-    def _train_epoch(
-            self,
-            train_dataloader: DataLoader,
-            epoch: int = None,
-            display_progress: bool = False,
-            return_output_label: bool = True,
-    ):
-        """Runs one training epoch, stopping early once the max step count is reached.
-
-        :param train_dataloader: DataLoader in training
-        :param epoch: Epoch index, used only for the progress bar description
-        :param display_progress: If True, a progress bar is shown
-        :param return_output_label: Forwarded to the schedule's ``forward_backward_step``
-        """
-        # set training state
-        self._engine.train()
-        data_iter = iter(train_dataloader)
-        progress = range(self._steps_per_epoch)
-        if display_progress:
-            if epoch is None:
-                progress = tqdm(progress, desc="[Train]")
-            else:
-                progress = tqdm(progress, desc=f"[Epoch {epoch} / Train]")
-
-        self._call_hooks("before_train_epoch")
-        self._call_timer(action="start", item="Train-epoch")
-        for _ in progress:
-            self._call_hooks("before_train_iter")
-            self._call_timer(action="start", item="Train-step")
-
-            # run 1 training step
-            self.engine.zero_grad()
-            logits, label, loss = self.schedule.forward_backward_step(
-                self.engine,
-                data_iter,
-                forward_only=False,
-                return_loss=True,
-                return_output_label=return_output_label,
-            )
-            self.engine.step()
-            self._call_timer(action="stop",
-                             item="Train-step",
-                             keep_in_history=True)
-            self._call_hooks("after_train_iter", output=(logits, label, loss))
-
-            self._cur_step += 1
-
-            # hooks may publish per-step metrics through self.states
-            if display_progress and "step_metrics" in self.states:
-                progress.set_postfix(**self.states["step_metrics"])
-
-            # stop when max iter is reached
-            if self._exceed_max_step():
-                break
-
-        self._call_timer(action="stop",
-                         item="Train-epoch",
-                         keep_in_history=True)
-        self._call_hooks("after_train_epoch")
-        self._call_timer(action="reset", item="Train-epoch")
-
-    def _eval(
-            self,
-            test_dataloader: DataLoader,
-            epoch: int = None,
-            display_progress: bool = False,
-            return_output_label: bool = True,
-    ):
-        """Runs one pass over ``test_dataloader`` in evaluation mode without gradients.
-
-        :param test_dataloader: DataLoader in testing
-        :param epoch: Epoch index, used only for the progress bar description
-        :param display_progress: If True, a progress bar is shown
-        :param return_output_label: Forwarded to the schedule's ``forward_backward_step``
-        """
-        # switch engine status
-        self._engine.eval()
-
-        data_iter = iter(test_dataloader)
-        num_steps = len(test_dataloader)
-
-        self._call_hooks("before_test")
-        # prepare progress bar
-        progress = range(num_steps)
-        if display_progress:
-            desc = "Evaluation"
-            if epoch is not None:
-                desc = "[Epoch %d / Test]" % epoch
-            progress = tqdm(progress, desc=desc)
-
-        self._call_hooks("before_test_epoch")
-        self._call_timer(action="start", item="Test-epoch")
-        with torch.no_grad():
-            for _ in progress:
-                self._call_hooks("before_test_iter")
-                self._call_timer(action="start", item="Test-step")
-                logits, label, loss = self.schedule.forward_backward_step(
-                    self.engine,
-                    data_iter,
-                    forward_only=True,
-                    return_loss=True,
-                    return_output_label=return_output_label,
-                )
-                self._call_timer(action="stop",
-                                 item="Test-step",
-                                 keep_in_history=True)
-                self._call_hooks("after_test_iter",
-                                 output=(logits, label, loss))
-
-                if display_progress and "step_metrics" in self.states:
-                    progress.set_postfix(**self.states["step_metrics"])
-
-        self._call_timer(action="stop",
-                         item="Test-epoch",
-                         keep_in_history=True)
-        self._call_hooks("after_test_epoch")
-        self._call_hooks("after_test")
-        self._call_timer(action="reset", item="Test-step")
-        self._call_timer(action="reset", item="Test-epoch")
-
-    def _exceed_max_step(self):
-        """Returns True when a step limit is set and has been reached."""
-        return self._max_steps is not None and self._cur_step >= self._max_steps
-
-    def fit(
-            self,
-            train_dataloader: DataLoader,
-            epochs: int,
-            max_steps: int = None,
-            test_dataloader: DataLoader = None,
-            test_interval: int = 1,
-            hooks: List[BaseHook] = None,
-            display_progress: bool = False,
-            return_output_label: bool = True,
-    ):
-        """Trains the model to fit training data.
-
-        :param train_dataloader: DataLoader in training
-        :param epochs: Maximum number of epochs
-        :param max_steps: Maximum number of running iterations
-        :param test_dataloader: DataLoader in testing
-        :param test_interval: Interval of testing
-        :param hooks: A list of hooks used in training
-        :param display_progress: If True, the training progress will be printed
-        :param return_output_label: If True, the output of model and the label will be returned
-
-        :type train_dataloader: DataLoader
-        :type epochs: int
-        :type max_steps: int, optional
-        :type test_dataloader: DataLoader, optional
-        :type test_interval: int, optional
-        :type hooks: list, optional
-        :type display_progress: bool, optional
-        :type return_output_label: bool, optional
-        """
-
-        # set epochs and steps
-        self._steps_per_epoch = len(train_dataloader)
-        self._max_steps = max_steps
-        self._max_epochs = epochs
-
-        # check if testing is required
-        should_test = test_dataloader is not None
-
-        display_progress = self._should_display_progress(display_progress)
-
-        # reset hooks
-        self._reset_states()
-        self._attach_hooks(hooks)
-
-        self._engine.train()
-        self._call_hooks("before_train")
-
-        # recover step value if resuming training
-        last_epoch = self._cur_epoch
-        if self.cur_epoch != 0:
-            self._set_current_step(last_epoch)
-
-        for epoch in range(last_epoch, epochs):
-            # train for one epoch
-            self._train_epoch(
-                train_dataloader=train_dataloader,
-                epoch=epoch,
-                display_progress=display_progress,
-                return_output_label=return_output_label,
-            )
-
-            # start eval
-            if should_test and epoch % test_interval == 0:
-                self._eval(
-                    test_dataloader=test_dataloader,
-                    display_progress=display_progress,
-                    epoch=epoch,
-                    return_output_label=return_output_label,
-                )
-
-            self._cur_epoch += 1
-
-            # check for termination
-            if self._exceed_max_step():
-                # the logger is optional, so only log when one was supplied
-                if self._verbose:
-                    self._logger.info(
-                        f"Max number of steps {max_steps} has been reached, training is stopped automatically",
-                        ranks=[0],
-                    )
-                break
-        self._call_hooks("after_train")
-        self._call_timer("reset", "Train-epoch")
-
-    def evaluate(
-            self,
-            test_dataloader: DataLoader,
-            hooks: List[BaseHook] = None,
-            display_progress: bool = False,
-            return_output_label: bool = True,
-    ):
-        """Evaluates the model with testing data.
-
-        :param test_dataloader: DataLoader in testing
-        :param hooks: A list of hooks used in evaluation
-        :param display_progress: If True, the evaluation progress will be printed
-        :param return_output_label: If True, the output of model and the label will be returned
-
-        :type test_dataloader: DataLoader
-        :type hooks: list, optional
-        :type display_progress: bool, optional
-        :type return_output_label: bool
-        """
-        # set display
-        display_progress = self._should_display_progress(display_progress)
-
-        # reset hooks
-        self._reset_states()
-        self._attach_hooks(hooks)
-
-        # eval
-        self._eval(
-            test_dataloader=test_dataloader,
-            display_progress=display_progress,
-            return_output_label=return_output_label,
-        )
-
-    def predict(self, data: Union[Tensor, List[Tensor]]):
-        """Uses trained model to make a prediction for a tensor or a tensor list.
-
-        :param data: Data as the input
-        :type data: Union[Tensor, List[Tensor]]
-        :return: The output of model as the prediction
-        :rtype: Tensor
-        """
-        # predict without labels
-        if isinstance(data, (list, tuple)):
-            assert isinstance(data[0], Tensor)
-        else:
-            assert isinstance(data, Tensor)
-        self._engine.eval()
-
-        # prepare a list of (data, label) to make it iterable
-        # for compatibility with schedule
-        simple_dataloader = [(data, None)]
-        data_iter = iter(simple_dataloader)
-        output, _, _ = self.schedule.forward_backward_step(self.engine,
-                                                           data_iter,
-                                                           forward_only=True,
-                                                           return_loss=False)
-        return output