"examples/vscode:/vscode.git/clone" did not exist on "edf4cd46c5395899c795f43bdc3d4a8b16166531"
Unverified Commit 3defa32a authored by Frank Lee, committed by GitHub

Support TP-compatible Torch AMP and Update trainer API (#27)



* Add gradient accumulation, fix lr scheduler

* fixed FP16 optimizer and adapted torch amp with tensor parallel (#18)

* fixed bugs in compatibility between torch amp and tensor parallel and performed some minor fixes

* fixed trainer

* Revert "fixed trainer"

This reverts commit 2e0b0b76990e8d4e337add483d878c0f61cf5097.

* improved consistency between trainer, engine and schedule (#23)
Co-authored-by: 1SAA <c2h214748@gmail.com>
Co-authored-by: ver217 <lhx0217@gmail.com>
parent 2b05de4c
......@@ -42,26 +42,18 @@ pip install -v --no-cache-dir --global-option="--cuda_ext" .
```python
import colossalai
from colossalai.engine import Engine
from colossalai.trainer import Trainer
from colossalai.core import global_context as gpc
model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize()
engine = Engine(
model=model,
criterion=criterion,
optimizer=optimizer,
lr_scheduler=lr_scheduler,
schedule=schedule
)
engine, train_dataloader, test_dataloader = colossalai.initialize()
trainer = Trainer(engine=engine,
hooks_cfg=gpc.config.hooks,
verbose=True)
trainer.fit(
train_dataloader=train_dataloader,
test_dataloader=test_dataloader,
max_epochs=gpc.config.num_epochs,
epochs=gpc.config.num_epochs,
hooks_cfg=gpc.config.hooks,
display_progress=True,
test_interval=5
)
......
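Since the hunk above interleaves removed and added lines, here is a consolidated sketch of the updated README snippet after this change (reading the interleaved lines as removals followed by additions; in particular, `hooks_cfg` appears to move from the `Trainer` constructor into `fit`):

```python
import colossalai
from colossalai.trainer import Trainer
from colossalai.core import global_context as gpc

# initialize() now returns an Engine together with the dataloaders
engine, train_dataloader, test_dataloader = colossalai.initialize()

trainer = Trainer(engine=engine, verbose=True)
trainer.fit(
    train_dataloader=train_dataloader,
    test_dataloader=test_dataloader,
    epochs=gpc.config.num_epochs,
    hooks_cfg=gpc.config.hooks,
    display_progress=True,
    test_interval=5
)
```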
from .builder import *
from .builder import (build_schedule, build_lr_scheduler, build_model, build_optimizer, build_optimizer_wrapper,
build_layer, build_loss, build_hooks, build_dataset, build_transform, build_data_sampler,
build_gradient_handler)
from .pipeline import ModelInitializer
__all__ = [
'build_schedule', 'build_lr_scheduler', 'build_model', 'build_optimizer', 'build_optimizer_wrapper',
'build_layer', 'build_loss', 'build_hooks', 'build_dataset', 'build_transform', 'build_data_sampler',
'build_gradient_handler', 'ModelInitializer'
]
......@@ -181,18 +181,6 @@ def build_transform(config):
return build_from_registry(config, TRANSFORMS)
def build_pipe_alloc_policy(config):
"""Returns a pipeline allocation policy object constructed from `config`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:return: A pipeline allocation policy object
:rtype:
"""
return build_from_registry(config, PIPE_ALLOC_POLICY)
def build_data_sampler(config, dataset):
"""Returns a data sampler object of :class:`colossalai.nn.data.sampler.BaseSampler`
constructed from `config`.
......@@ -235,7 +223,7 @@ def build_optimizer_wrapper(config, optimizer, model=None):
return OPTIMIZER_WRAPPERS.get_module(mod_type)(optimizer, **config_)
def build_lr_scheduler(config, optimizer, total_steps, num_steps_per_epoch):
def build_lr_scheduler(config, optimizer):
"""Returns a learning rate scheduler object of :class:`torch.optim.lr_scheduler`
constructed from `config` and `optimizer`.
......@@ -254,9 +242,16 @@ def build_lr_scheduler(config, optimizer, total_steps, num_steps_per_epoch):
"""
config_ = config.copy()
mod_type = config_.pop('type')
# warmup epochs will overwrite warmup steps
if 'warmup_epochs' in config_:
warmup_epochs = config_.pop('warmup_epochs')
config_['warmup_steps'] = int(num_steps_per_epoch * warmup_epochs)
return LR_SCHEDULERS.get_module(mod_type)(optimizer, total_steps, num_steps_per_epoch=num_steps_per_epoch,
**config_)
return LR_SCHEDULERS.get_module(mod_type)(optimizer, **config_)
def build_schedule(config):
"""Returns a schedule of :class:`colossalai.engine.schedule.BaseSchedule`.
:param config: A python dict or a :class:`colossalai.context.Config` object
containing information used in the construction of the return object
:type config: dict or :class:`colossalai.context.Config`
:return: An object of :class:`colossalai.engine.schedule.BaseSchedule`
:rtype: :class:`colossalai.engine.schedule.BaseSchedule`
"""
return build_from_registry(config, SCHEDULE)
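With `build_lr_scheduler` reduced to `(config, optimizer)`, warmup and step counts are no longer derived for you; they now live entirely in the config, whose keys are forwarded verbatim to the registered scheduler. A minimal sketch of the new call (the scheduler type and its keyword arguments below are hypothetical and must match something registered in `LR_SCHEDULERS`):

```python
# hypothetical config: 'type' names a scheduler registered in LR_SCHEDULERS,
# and every remaining key is passed through as a keyword argument
lr_scheduler_cfg = dict(type='CosineAnnealingWarmupLR', total_steps=10_000, warmup_steps=500)
lr_scheduler = build_lr_scheduler(lr_scheduler_cfg, optimizer)
```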
from .amp_type import AMP_TYPE
from ._base_engine import Engine
from .gradient_handler import *
from .schedule import *
from .amp import *
__all__ = ['Engine']
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from typing import Optional
from torch.nn import Module
from torch.nn.modules.loss import _Loss
from torch.optim import Optimizer
from colossalai.builder import build_gradient_handler
from colossalai.context import ParallelMode
......@@ -9,162 +11,166 @@ from colossalai.core import global_context as gpc
from colossalai.logging import get_global_dist_logger
from colossalai.nn import (ZeroRedundancyOptimizer_Level_2,
ZeroRedundancyOptimizer_Level_3)
from torch.nn import Module
from torch.nn.modules.loss import _Loss
from torch.optim import Optimizer
from torch.optim.lr_scheduler import _LRScheduler
from torch.utils.data import DataLoader
from .schedule import BaseSchedule, NoPipelineSchedule
from .schedule import BaseSchedule
class Engine:
"""Basic engine class for training and evaluation. It runs a specific process method
:meth:`step` which is based on the given :attr:`schedule` over each batch of a dataset.
It controls an iteration in training.
:param train_dataloader: Dataloader in training
:param test_dataloader: Dataloader in evaluation
:param model: The neural network model
:param criterion: Criterion for calculating loss
:param optimizer: Optimizer for updating the parameters
:param lr_scheduler: Learning rate scheduler adjusting the learning rate during training or evaluation
:param schedule: Running schedule in :meth:`step`
:type train_dataloader: DataLoader, optional
:type test_dataloader: DataLoader, optional
:param step_schedule: Running schedule in :meth:`step`
:param gradient_accumulation: Steps of gradient accumulation
:param gradient_clipping: The norm of gradient clipping
:type model: Module
:type criterion: _Loss, optional
:type optimizer: Optimizer, optional
:type lr_scheduler: _LRScheduler, optional
:type schedule: BaseSchedule, optional
:type optimizer: Optimizer
:type step_schedule: BaseSchedule, optional
:type gradient_accumulation: int, optional
:type gradient_clipping: float, optional
"""
def __init__(self,
train_dataloader: Optional[DataLoader] = None,
test_dataloader: Optional[DataLoader] = None,
model: Module = None,
criterion: _Loss = None,
optimizer: Optimizer = None,
lr_scheduler: Optional[_LRScheduler] = None,
schedule: BaseSchedule = None):
self.train_dataloader = train_dataloader
self.test_dataloader = test_dataloader
assert model is not None, "Engine requires a model"
self.model = model
self.criterion = criterion
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
self.schedule = schedule if schedule is not None \
else NoPipelineSchedule()
model: Module,
optimizer: Optimizer,
criterion: _Loss,
step_schedule: BaseSchedule,
gradient_handlers: list = None,
gradient_accumulation: int = 1,
gradient_clipping: float = 0.0,
):
self._model = model
self._optimizer = optimizer
self._criterion = criterion
self._schedule = step_schedule
# schedule initialize
self._schedule.initialize(model, optimizer)
# state
self.training = True # default
# gradient accumulation
assert gradient_accumulation > 0, 'gradient accumulation size must be larger than 0'
self._grad_accum_size = gradient_accumulation
self._grad_clip = gradient_clipping
self._logger = get_global_dist_logger()
# build gradient handler
self._gradient_handlers = []
gradient_handler_cfg = []
if hasattr(gpc.config, 'gradient_handler'):
assert isinstance(gpc.config.gradient_handler, list), \
if gradient_handlers is not None:
assert isinstance(gradient_handlers, list), \
f'argument gradient_handler_cfg expected type list, ' \
f'but got type {type(gpc.config.gradient_handler)}'
gradient_handler_cfg = gpc.config.gradient_handler
elif isinstance(self.optimizer, (ZeroRedundancyOptimizer_Level_2,
ZeroRedundancyOptimizer_Level_3)):
gradient_handler_cfg = [dict(type='ZeROGradientHandler')]
f'but got type {type(gradient_handlers)}'
elif isinstance(optimizer, (ZeroRedundancyOptimizer_Level_2,
ZeroRedundancyOptimizer_Level_3)):
gradient_handlers = [dict(type='ZeROGradientHandler')]
self._logger.info(
"Training with zero is detected, ZeROGradientHandler is automatically "
"added even though not specified in the configuration",
ranks=[0])
elif gpc.is_initialized(ParallelMode.DATA) and gpc.get_world_size(
ParallelMode.DATA) > 1:
gradient_handler_cfg = [dict(type='DataParallelGradientHandler')]
gradient_handlers = [dict(type='DataParallelGradientHandler')]
self._logger.info(
"Data parallel training is detected, DataParallelGradientHandler is automatically "
"added even though not specified in the configuration",
ranks=[0])
if len(gradient_handler_cfg) == 0:
if gradient_handlers is None:
self._logger.warning(
"No gradient handler is set up, please make sure you do not need "
"to all-reduce the gradients after a training step.",
ranks=[0])
for cfg in gradient_handler_cfg:
handler = build_gradient_handler(cfg, self.model, self.optimizer)
self._gradient_handlers.append(handler)
else:
for cfg in gradient_handlers:
handler = build_gradient_handler(cfg, model, optimizer)
self._gradient_handlers.append(handler)
self.schedule.initialize(self.train_dataloader, self.model,
self.criterion, self.optimizer,
self.lr_scheduler)
self.forward_only = False
@property
def model(self):
return self._model
def handle_gradient(self):
"""Handles all-reduce operations of gradients across different parallel groups.
"""
for handler in self._gradient_handlers:
handler.handle_gradient()
@property
def optimizer(self):
return self._optimizer
def set_dataloader(self, data: DataLoader, train: bool = True):
"""Sets dataloader in training or evaluation.
@property
def criterion(self):
return self._criterion
:param data: Dataloader to be set
:param train: Set training dataloader if True, otherwise evaluation dataloader
:type data: DataLoader
:type train: bool
"""
if train:
self.train_dataloader = data
else:
self.test_dataloader = data
@property
def schedule(self):
return self._schedule
def get_model(self):
"""Returns the neural network model in the engine.
"""
return self.model
def get_optimizer(self):
"""Returns optimizier in the engine.
"""
return self.optimizer
@property
def gradient_accumulation(self):
return self._grad_accum_size
def get_lr_scheduler(self):
"""Returns the learning rate scheduler in the engine.
def handle_gradient(self):
"""Handles all-reduce operations of gradients across different parallel groups.
"""
return self.lr_scheduler
for handler in self._gradient_handlers:
handler.handle_gradient()
def train(self):
"""Sets the model to training mode.
"""
self.forward_only = False
self.schedule.train(dataloader=self.train_dataloader, mode=True)
self.training = True
self._model.train()
def eval(self):
"""Sets the model to evaluation mode.
"""
self.forward_only = True
self.schedule.train(dataloader=self.test_dataloader, mode=False)
self.training = False
self._model.eval()
def is_train(self):
"""Returns True if it is in training, otherwise False.
"""
return not self.forward_only
def get_lr(self):
"""Gets current learning rate.
"""
return self.schedule.get_lr()
def step(self, return_loss=True):
def step(self,
data_iter,
is_last_iteration: bool = False,
return_loss=True):
"""A running step based on the schedule. Usually, it runs a training or
evaluation over a batch of dataset.
:param data_iter: Data iterator of the dataset
:param is_last_iteration: If True, this iteration is the last iteration in the epoch
:param return_loss: loss will be returned if True
:type return_loss: bool
:type data_iter: Iterator
:type is_last_iteration: bool, optional
:type return_loss: bool, optional
:return: (output, label, loss)
"""
self.schedule.zero_grad(forward_only=self.forward_only)
output, label, loss = self.schedule.forward_backward_step(
forward_only=self.forward_only, return_loss=return_loss)
if not self.forward_only:
# all reduce gradients
self.handle_gradient()
self.schedule.step()
if self.training:
self._optimizer.zero_grad()
# differentiate training and eval with grad accum
if self.training:
for i in range(self._grad_accum_size):
output, label, loss = self._schedule.forward_backward_step(
data_iter, self._model, self._criterion, self._optimizer,
forward_only=False,
grad_accum_size=self._grad_accum_size,
return_loss=return_loss)
if i == self._grad_accum_size - 1:
# all reduce gradients
self.handle_gradient()
self._schedule.optimizer_step(self._model, self._optimizer, self._grad_clip)
else:
output, label, loss = self._schedule.forward_backward_step(
data_iter, self._model, self._criterion, self._optimizer,
forward_only=True,
grad_accum_size=1,
return_loss=return_loss)
# consume the remaining dataset left out due to gradient accumulation
if is_last_iteration:
while True:
try:
_ = next(data_iter)
except StopIteration:
break
return output, label, loss
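A minimal sketch of how the reworked `Engine` can be driven directly (normally the `Trainer` does this), assuming `engine` and `train_dataloader` come from `colossalai.initialize()`:

```python
engine.train()
data_iter = iter(train_dataloader)
# in training mode each engine.step() consumes gradient_accumulation batches
num_steps = len(train_dataloader) // engine.gradient_accumulation
for step in range(num_steps):
    # is_last_iteration lets step() drain any leftover batches of the epoch
    output, label, loss = engine.step(data_iter,
                                      is_last_iteration=(step == num_steps - 1),
                                      return_loss=True)
```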
from .grad_scaler import GradScaler
from .amp_type import AMP_TYPE
# modified from https://github.com/pytorch/pytorch/blob/master/torch/cuda/amp/grad_scaler.py
import torch
from collections import defaultdict, abc
import warnings
from enum import Enum
from typing import Any, Dict, List, Optional, Tuple
from colossalai.context import ParallelMode
import torch.distributed as dist
from colossalai.core import global_context as gpc
class _MultiDeviceReplicator(object):
"""
Lazily serves copies of a tensor to requested devices. Copies are cached per-device.
"""
def __init__(self, master_tensor: torch.Tensor) -> None:
assert master_tensor.is_cuda or master_tensor.device.type == 'xla'
self.master = master_tensor
self._per_device_tensors: Dict[torch.device, torch.Tensor] = {}
def get(self, device) -> torch.Tensor:
retval = self._per_device_tensors.get(device, None)
if retval is None:
retval = self.master.to(
device=device, non_blocking=True, copy=True)
self._per_device_tensors[device] = retval
return retval
# Defines default_factory for GradScaler's _per_optimizer_states defaultdict,
# as well as associated "enum" values. Prefers defining these at top level because
# - Lambdas can't be pickled, so we don't want to supply a lambda as the factory.
# - Defining READY, UNSCALED, STEPPED and _refresh_per_optimizer_state within GradScaler
# causes a circular reference, which we'd rather avoid.
class OptState(Enum):
READY = 0
UNSCALED = 1
STEPPED = 2
def _refresh_per_optimizer_state():
return {"stage": OptState.READY, "found_inf_per_device": {}}
class GradScaler(object):
_scale: Optional[torch.Tensor]
_growth_tracker: Optional[torch.Tensor]
_per_optimizer_states: Dict[int, Dict[str, Any]]
"""
An instance ``scaler`` of :class:`GradScaler` helps perform the steps of gradient scaling
conveniently.
* ``scaler.scale(loss)`` multiplies a given loss by ``scaler``'s current scale factor.
* ``scaler.step(optimizer)`` safely unscales gradients and calls ``optimizer.step()``.
* ``scaler.update()`` updates ``scaler``'s scale factor.
Example::
# Creates a GradScaler once at the beginning of training.
scaler = GradScaler()
for epoch in epochs:
for input, target in data:
optimizer.zero_grad()
output = model(input)
loss = loss_fn(output, target)
# Scales loss. Calls backward() on scaled loss to create scaled gradients.
scaler.scale(loss).backward()
# scaler.step() first unscales gradients of the optimizer's params.
# If gradients don't contain infs/NaNs, optimizer.step() is then called,
# otherwise, optimizer.step() is skipped.
scaler.step(optimizer)
# Updates the scale for next iteration.
scaler.update()
See the :ref:`Automatic Mixed Precision examples<amp-examples>` for usage
(along with autocasting) in more complex cases like gradient clipping, gradient accumulation, gradient penalty,
and multiple losses/optimizers.
``scaler`` dynamically estimates the scale factor each iteration. To minimize gradient underflow,
a large scale factor should be used. However, ``float16`` values can "overflow" (become inf or NaN) if
the scale factor is too large. Therefore, the optimal scale factor is the largest factor that can be used
without incurring inf or NaN gradient values.
``scaler`` approximates the optimal scale factor over time by checking the gradients for infs and NaNs during every
``scaler.step(optimizer)`` (or optional separate ``scaler.unscale_(optimizer)``, see :meth:`unscale_`).
* If infs/NaNs are found, ``scaler.step(optimizer)`` skips the underlying ``optimizer.step()`` (so the params
themselves remain uncorrupted) and ``update()`` multiplies the scale by ``backoff_factor``.
* If no infs/NaNs are found, ``scaler.step(optimizer)`` runs the underlying ``optimizer.step()`` as usual.
If ``growth_interval`` unskipped iterations occur consecutively, ``update()`` multiplies the scale by
``growth_factor``.
The scale factor often causes infs/NaNs to appear in gradients for the first few iterations as its
value calibrates. ``scaler.step`` will skip the underlying ``optimizer.step()`` for these
iterations. After that, step skipping should occur rarely (once every few hundred or thousand iterations).
Args:
init_scale (float, optional, default=2.**16): Initial scale factor.
growth_factor (float, optional, default=2.0): Factor by which the scale is multiplied during
:meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
backoff_factor (float, optional, default=0.5): Factor by which the scale is multiplied during
:meth:`update` if inf/NaN gradients occur in an iteration.
growth_interval (int, optional, default=2000): Number of consecutive iterations without inf/NaN gradients
that must occur for the scale to be multiplied by ``growth_factor``.
enabled (bool, optional, default=True): If ``False``, disables gradient scaling. :meth:`step` simply
invokes the underlying ``optimizer.step()``, and other methods become no-ops.
"""
def __init__(self,
init_scale=2.**16,
growth_factor=2.0,
backoff_factor=0.5,
growth_interval=2000,
enabled=True):
if enabled and not torch.cuda.is_available():
warnings.warn(
"torch.cuda.amp.GradScaler is enabled, but CUDA is not available. Disabling.")
self._enabled = False
else:
self._enabled = enabled
if self._enabled:
assert growth_factor > 1.0, "The growth factor must be > 1.0."
assert backoff_factor < 1.0, "The backoff factor must be < 1.0."
self._init_scale = init_scale
# self._scale will be lazily initialized during the first call to scale()
self._scale = None
self._growth_factor = growth_factor
self._backoff_factor = backoff_factor
self._growth_interval = growth_interval
self._init_growth_tracker = 0
# self._growth_tracker will be lazily initialized during the first call to scale()
self._growth_tracker = None
self._per_optimizer_states = defaultdict(
_refresh_per_optimizer_state)
def _check_scale_growth_tracker(self, funcname) -> Tuple[torch.Tensor, torch.Tensor]:
fix = "This may indicate your script did not use scaler.scale(loss or outputs) earlier in the iteration."
assert self._scale is not None, "Attempted {} but _scale is None. ".format(
funcname) + fix
assert self._growth_tracker is not None, "Attempted {} but _growth_tracker is None. ".format(
funcname) + fix
return (self._scale, self._growth_tracker)
def _lazy_init_scale_growth_tracker(self, dev):
assert self._growth_tracker is None, "_growth_tracker initialized before _scale"
self._scale = torch.full(
(1,), self._init_scale, dtype=torch.float32, device=dev)
self._growth_tracker = torch.full(
(1,), self._init_growth_tracker, dtype=torch.int32, device=dev)
def scale(self, outputs):
"""
Multiplies ('scales') a tensor or list of tensors by the scale factor.
Returns scaled outputs. If this instance of :class:`GradScaler` is not enabled, outputs are returned
unmodified.
Args:
outputs (Tensor or iterable of Tensors): Outputs to scale.
"""
if not self._enabled:
return outputs
# Short-circuit for the common case.
if isinstance(outputs, torch.Tensor):
assert outputs.is_cuda or outputs.device.type == 'xla'
if self._scale is None:
self._lazy_init_scale_growth_tracker(outputs.device)
assert self._scale is not None
return outputs * self._scale.to(device=outputs.device, non_blocking=True)
# Invoke the more complex machinery only if we're treating multiple outputs.
# holds a reference that can be overwritten by apply_scale
stash: List[_MultiDeviceReplicator] = []
def apply_scale(val):
if isinstance(val, torch.Tensor):
assert val.is_cuda or val.device.type == 'xla'
if len(stash) == 0:
if self._scale is None:
self._lazy_init_scale_growth_tracker(val.device)
assert self._scale is not None
stash.append(_MultiDeviceReplicator(self._scale))
return val * stash[0].get(val.device)
elif isinstance(val, abc.Iterable):
iterable = map(apply_scale, val)
if isinstance(val, list) or isinstance(val, tuple):
return type(val)(iterable)
else:
return iterable
else:
raise ValueError(
"outputs must be a Tensor or an iterable of Tensors")
return apply_scale(outputs)
def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16):
per_device_inv_scale = _MultiDeviceReplicator(inv_scale)
per_device_found_inf = _MultiDeviceReplicator(found_inf)
# To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype.
# There could be hundreds of grads, so we'd like to iterate through them just once.
# However, we don't know their devices or dtypes in advance.
# https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict
# Google says mypy struggles with defaultdicts type annotations.
per_device_and_dtype_grads = defaultdict(
lambda: defaultdict(list)) # type: ignore[var-annotated]
with torch.no_grad():
for group in optimizer.param_groups:
for param in group["params"]:
if param.grad is None:
continue
if (not allow_fp16) and param.grad.dtype == torch.float16:
raise ValueError(
"Attempting to unscale FP16 gradients.")
if param.grad.is_sparse:
# is_coalesced() == False means the sparse grad has values with duplicate indices.
# coalesce() deduplicates indices and adds all values that have the same index.
# For scaled fp16 values, there's a good chance coalescing will cause overflow,
# so we should check the coalesced _values().
if param.grad.dtype is torch.float16:
param.grad = param.grad.coalesce()
to_unscale = param.grad._values()
else:
to_unscale = param.grad
# TODO: is there a way to split by device and dtype without appending in the inner loop?
per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].append(
to_unscale)
for device, per_dtype_grads in per_device_and_dtype_grads.items():
for grads in per_dtype_grads.values():
torch._amp_foreach_non_finite_check_and_unscale_(grads,
per_device_found_inf.get(
device),
per_device_inv_scale.get(device))
# For tensor parallel parameters, found_inf should be all-reduced over the tensor parallel process group
if gpc.is_initialized(ParallelMode.TENSOR) and gpc.get_world_size(ParallelMode.TENSOR) > 1:
for tensor in per_device_found_inf._per_device_tensors.values():
dist.all_reduce(tensor, op=dist.ReduceOp.MAX,
group=gpc.get_group(ParallelMode.TENSOR))
return per_device_found_inf._per_device_tensors
def unscale_(self, optimizer):
"""
Divides ("unscales") the optimizer's gradient tensors by the scale factor.
:meth:`unscale_` is optional, serving cases where you need to
:ref:`modify or inspect gradients<working-with-unscaled-gradients>`
between the backward pass(es) and :meth:`step`.
If :meth:`unscale_` is not called explicitly, gradients will be unscaled automatically during :meth:`step`.
Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients::
...
scaler.scale(loss).backward()
scaler.unscale_(optimizer)
torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
scaler.step(optimizer)
scaler.update()
Args:
optimizer (torch.optim.Optimizer): Optimizer that owns the gradients to be unscaled.
.. note::
:meth:`unscale_` does not incur a CPU-GPU sync.
.. warning::
:meth:`unscale_` should only be called once per optimizer per :meth:`step` call,
and only after all gradients for that optimizer's assigned parameters have been accumulated.
Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError.
.. warning::
:meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute.
"""
if not self._enabled:
return
self._check_scale_growth_tracker("unscale_")
optimizer_state = self._per_optimizer_states[id(optimizer)]
if optimizer_state["stage"] is OptState.UNSCALED:
raise RuntimeError(
"unscale_() has already been called on this optimizer since the last update().")
elif optimizer_state["stage"] is OptState.STEPPED:
raise RuntimeError("unscale_() is being called after step().")
# FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64.
assert self._scale is not None
inv_scale = self._scale.double().reciprocal().float()
found_inf = torch.full(
(1,), 0.0, dtype=torch.float32, device=self._scale.device)
optimizer_state["found_inf_per_device"] = self._unscale_grads_(
optimizer, inv_scale, found_inf, False)
optimizer_state["stage"] = OptState.UNSCALED
def _maybe_opt_step(self, optimizer, optimizer_state, *args, **kwargs):
retval = None
if not sum(v.item() for v in optimizer_state["found_inf_per_device"].values()):
retval = optimizer.step(*args, **kwargs)
return retval
def step(self, optimizer, *args, **kwargs):
"""
:meth:`step` carries out the following two operations:
1. Internally invokes ``unscale_(optimizer)`` (unless :meth:`unscale_` was explicitly called for ``optimizer``
earlier in the iteration). As part of the :meth:`unscale_`, gradients are checked for infs/NaNs.
2. If no inf/NaN gradients are found, invokes ``optimizer.step()`` using the unscaled
gradients. Otherwise, ``optimizer.step()`` is skipped to avoid corrupting the params.
``*args`` and ``**kwargs`` are forwarded to ``optimizer.step()``.
Returns the return value of ``optimizer.step(*args, **kwargs)``.
Args:
optimizer (torch.optim.Optimizer): Optimizer that applies the gradients.
args: Any arguments.
kwargs: Any keyword arguments.
.. warning::
Closure use is not currently supported.
"""
if (not self._enabled):
return optimizer.step(*args, **kwargs)
if "closure" in kwargs:
raise RuntimeError(
"Closure use is not currently supported if GradScaler is enabled.")
self._check_scale_growth_tracker("step")
optimizer_state = self._per_optimizer_states[id(optimizer)]
if optimizer_state["stage"] is OptState.STEPPED:
raise RuntimeError(
"step() has already been called since the last update().")
retval = None
if (hasattr(optimizer, "_step_supports_amp_scaling") and optimizer._step_supports_amp_scaling):
# This optimizer has customized scale-handling logic, so we can call optimizer.step() directly.
# The contract with custom optimizers is that their step() should accept an additional,
# optional grad_scaler kwarg. We append self to the kwargs so the custom optimizer has full information:
# it can query its own state, invoke unscale_ on itself, etc
retval = optimizer.step(*args, **dict(kwargs, grad_scaler=self))
optimizer_state["stage"] = OptState.STEPPED
return retval
if optimizer_state["stage"] is OptState.READY:
self.unscale_(optimizer)
assert len(optimizer_state["found_inf_per_device"]
) > 0, "No inf checks were recorded for this optimizer."
retval = self._maybe_opt_step(
optimizer, optimizer_state, *args, **kwargs)
optimizer_state["stage"] = OptState.STEPPED
return retval
def update(self, new_scale=None):
"""
Updates the scale factor.
If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
the scale is multiplied by ``growth_factor`` to increase it.
Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
used directly, it's used to fill GradScaler's internal scale tensor. So if
``new_scale`` was a tensor, later in-place changes to that tensor will not further
affect the scale GradScaler uses internally.)
Args:
new_scale (float or :class:`torch.cuda.FloatTensor`, optional, default=None): New scale factor.
.. warning::
:meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
been invoked for all optimizers used this iteration.
"""
if not self._enabled:
return
_scale, _growth_tracker = self._check_scale_growth_tracker("update")
if new_scale is not None:
# Accept a new user-defined scale.
if isinstance(new_scale, float):
self._scale.fill_(new_scale) # type: ignore[union-attr]
else:
reason = "new_scale should be a float or a 1-element torch.cuda.FloatTensor with requires_grad=False."
# type: ignore[attr-defined]
assert isinstance(new_scale, torch.cuda.FloatTensor), reason
assert new_scale.numel() == 1, reason
assert new_scale.requires_grad is False, reason
self._scale.copy_(new_scale) # type: ignore[union-attr]
else:
# Consume shared inf/nan data collected from optimizers to update the scale.
# If all found_inf tensors are on the same device as self._scale, this operation is asynchronous.
found_infs = [found_inf.to(device=_scale.device, non_blocking=True)
for state in self._per_optimizer_states.values()
for found_inf in state["found_inf_per_device"].values()]
assert len(
found_infs) > 0, "No inf checks were recorded prior to update."
found_inf_combined = found_infs[0]
if len(found_infs) > 1:
for i in range(1, len(found_infs)):
found_inf_combined += found_infs[i]
torch._amp_update_scale_(_scale,
_growth_tracker,
found_inf_combined,
self._growth_factor,
self._backoff_factor,
self._growth_interval)
# To prepare for next iteration, clear the data collected from optimizers this iteration.
self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
def _get_scale_async(self):
return self._scale
def get_scale(self):
"""
Returns a Python float containing the current scale, or 1.0 if scaling is disabled.
.. warning::
:meth:`get_scale` incurs a CPU-GPU sync.
"""
if self._enabled:
return self._init_scale if self._scale is None else self._get_scale_async().item()
else:
return 1.0
def get_growth_factor(self):
r"""
Returns a Python float containing the scale growth factor.
"""
return self._growth_factor
def set_growth_factor(self, new_factor):
r"""
Args:
new_factor (float): Value to use as the new scale growth factor.
"""
self._growth_factor = new_factor
def get_backoff_factor(self):
r"""
Returns a Python float containing the scale backoff factor.
"""
return self._backoff_factor
def set_backoff_factor(self, new_factor):
r"""
Args:
new_factor (float): Value to use as the new scale backoff factor.
"""
self._backoff_factor = new_factor
def get_growth_interval(self):
r"""
Returns a Python int containing the growth interval.
"""
return self._growth_interval
def set_growth_interval(self, new_interval):
r"""
Args:
new_interval (int): Value to use as the new growth interval.
"""
self._growth_interval = new_interval
def _get_growth_tracker(self):
if self._enabled:
return self._init_growth_tracker if self._growth_tracker is None else self._growth_tracker.item()
else:
return 0
def is_enabled(self):
r"""
Returns a bool indicating whether this instance is enabled.
"""
return self._enabled
def state_dict(self):
r"""
Returns the state of the scaler as a :class:`dict`. It contains five entries:
* ``"scale"`` - a Python float containing the current scale
* ``"growth_factor"`` - a Python float containing the current growth factor
* ``"backoff_factor"`` - a Python float containing the current backoff factor
* ``"growth_interval"`` - a Python int containing the current growth interval
* ``"_growth_tracker"`` - a Python int containing the number of recent consecutive unskipped steps.
If this instance is not enabled, returns an empty dict.
.. note::
If you wish to checkpoint the scaler's state after a particular iteration, :meth:`state_dict`
should be called after :meth:`update`.
"""
return {"scale": self.get_scale(),
"growth_factor": self._growth_factor,
"backoff_factor": self._backoff_factor,
"growth_interval": self._growth_interval,
"_growth_tracker": self._get_growth_tracker()} if self._enabled else {}
def load_state_dict(self, state_dict):
r"""
Loads the scaler state. If this instance is disabled, :meth:`load_state_dict` is a no-op.
Args:
state_dict(dict): scaler state. Should be an object returned from a call to :meth:`state_dict`.
"""
if not self._enabled:
return
if len(state_dict) == 0:
raise RuntimeError("The source state dict is empty, possibly because it was saved "
"from a disabled instance of GradScaler.")
self._init_scale = state_dict["scale"]
if self._scale is not None:
self._scale.fill_(state_dict["scale"])
self._growth_factor = state_dict["growth_factor"]
self._backoff_factor = state_dict["backoff_factor"]
self._growth_interval = state_dict["growth_interval"]
self._init_growth_tracker = state_dict["_growth_tracker"]
if self._growth_tracker is not None:
self._growth_tracker.fill_(state_dict["_growth_tracker"])
def __getstate__(self):
state = self.__dict__.copy()
if self._enabled:
assert len(self._per_optimizer_states) == 0, "A GradScaler instance may only be pickled at the beginning "\
"of an iteration, or at the end after scaler.update()."
# Pickling _scale and _growth_tracker Tensors directly triggers
# "warnings.warn("pickle support for Storage will be removed in 1.5..."
# so instead, we set the unpickled instance up to reinitialize them lazily.
state['_init_scale'] = self.get_scale()
state['_init_growth_tracker'] = self._get_growth_tracker()
state['_scale'] = None
state['_growth_tracker'] = None
return state
def __setstate__(self, state):
self.__dict__.update(state)
def _check_inf_per_device(self, optimizer):
_scale, _ = self._check_scale_growth_tracker("_check_inf_per_device")
dummy_inv_scale = torch.full(
(1,), 1.0, dtype=torch.float32, device=_scale.device)
found_inf = torch.full(
(1,), 0.0, dtype=torch.float32, device=_scale.device)
self._per_optimizer_states[id(optimizer)]["found_inf_per_device"] = \
self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True)
return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]
def _found_inf_per_device(self, optimizer):
return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]
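The main functional change visible here relative to the upstream `torch.cuda.amp.GradScaler` is in `_unscale_grads_`: the per-device `found_inf` flags are additionally all-reduced (MAX) over the tensor-parallel process group, so every tensor-parallel rank agrees on whether `optimizer.step()` gets skipped. A minimal usage sketch mirroring the class docstring (`model`, `optimizer`, `loss_fn` and `dataloader` are placeholders):

```python
import torch
from colossalai.engine.amp import GradScaler

scaler = GradScaler()
for data, target in dataloader:
    optimizer.zero_grad()
    with torch.cuda.amp.autocast():
        loss = loss_fn(model(data), target)
    scaler.scale(loss).backward()
    scaler.step(optimizer)   # skipped on every tensor-parallel rank if any rank saw inf/NaN grads
    scaler.update()
```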
......@@ -5,125 +5,85 @@ from abc import ABC, abstractmethod
import torch
from colossalai.core import global_context as gpc
from colossalai.logging import get_global_dist_logger
from colossalai.utils import get_current_device
class BaseSchedule(ABC):
"""A basic helper class to control the process of training or evaluation.
It mainly consists of forward_backward_step for the gradient backward pass and
optimizer_step for the parameter update.
For the convenience of enabling FP16, we aggregate all code that controls
FP16 in this schedule class.
"""
def __init__(self):
self.initialized = False
self.logger = get_global_dist_logger()
@property
@abstractmethod
def num_steps(self):
"""The number of batches in training or evaluation.
"""
pass
@staticmethod
def _move_tensor(element):
if torch.is_tensor(element):
if not element.is_cuda:
return element.to(get_current_device()).detach()
return element
def initialize(self,
dataloader=None,
model=None,
criterion=None,
optimizer=None,
lr_scheduler=None):
"""Initializes the schedule and set parameters before running.
:param dataloader: DataLoader in training or evaluation
:param model: The neural network model
:param criterion: Criterion for calculating loss
:param optimizer: Optimizer for updating the parameters
:param lr_scheduler: Learning rate scheduler in the process
"""
self.dataloader = dataloader
assert model is not None, "Schedule requires a model"
self.model = model
assert criterion is not None, "Schedule requires a criterion"
self.criterion = criterion
assert optimizer is not None, "Schedule requires an optimizer"
self.optimizer = optimizer
self.lr_scheduler = lr_scheduler
self.initialized = True
def check_initialized(self):
"""Checks whether the schedule is initialized.
"""
assert self.initialized, \
'Schedule is not initialized. Call schedule.initialize(...) before using it.'
def _move_to_device(self, data):
if isinstance(data, (tuple, list)):
data = tuple([self._move_tensor(d) for d in data])
elif torch.is_tensor(data):
data = data.to(get_current_device()).detach()
return data
def load_batch(self):
"""Loads a batch of dataset. It returns the data and labels which are
def load_batch(self, data_iter):
"""Loads a batch from data iterator. It returns the data and labels which are
already on the same device as the model.
:return: (data, label)
:rtype: (Tensor, Tensor)
:rtype: (Tensor, Tensor)
"""
self.check_initialized()
if self.data_iter is None:
if data_iter is None:
raise RuntimeError('Dataloader is not defined.')
data, label = next(self.data_iter)
data, label = next(data_iter)
return self._move_to_device(data), self._move_to_device(label)
def _move_to_device(self, data):
if isinstance(data, (
tuple,
list,
)):
data = tuple([
d.to(get_current_device()).detach() for d in data
if torch.is_tensor(d)
])
elif torch.is_tensor(data):
data = data.to(get_current_device()).detach()
return data
def train(self, dataloader=None, mode=True):
"""Sets the dataloader to be used and turn the model to
training or evaluation mode.
def initialize(self, model, optimizer):
"""Initializes the model and the optimizer before training.
This is often used in FP16 training.
:param dataloader: Dataloader to be used
:param mode: If True, the model will set as training mode. Otherwise, evaluation mode.
"""
self.check_initialized()
if mode:
self.model.train()
else:
self.model.eval()
if dataloader is not None:
self.dataloader = dataloader
self.data_iter = iter(dataloader)
def zero_grad(self, forward_only=False):
"""Cleans gradients with the optimizer.
:param model: The neural network model
:param optimizer: Optimizer for updating the parameters
"""
if not forward_only:
self.check_initialized()
self.optimizer.zero_grad()
return model, optimizer
def get_lr(self):
"""Returns the current learning rate.
"""
if self.lr_scheduler is not None:
return self.lr_scheduler.get_lr()[0]
else:
return self.optimizer.param_groups[0]['lr']
@abstractmethod
def forward_backward_step(self,
data_iter,
model,
criterion,
optimizer=None,
forward_only=False,
grad_accum_size: int = 1,
return_loss=True):
"""The process function over a batch of dataset for training or evaluation.
def step(self):
"""Updates the parameters and learning rate with the optimizer.
:param data_iter: Data iterator of the dataset
:param model: Model used in training or evaluation
:param optimizer: Optimizer used in training or evaluation
:param criterion: Loss function
:param forward_only: If True, the process won't include backward
:param grad_accum_size: Steps of gradient accumulation
:param return_loss: If False, the loss won't be returned
"""
self.check_initialized()
self.optimizer.step()
# update lr scheduler
if self.lr_scheduler is not None:
self.lr_scheduler.step()
pass
@abstractmethod
def forward_backward_step(self, forward_only=False, return_loss=True):
"""The process function over a batch of dataset for training or evaluation.
def optimizer_step(self, model, optimizer, grad_clipping: float = 0.0):
"""Updates the parameters with the optimizer.
:param forward_only: If True, the process won't include backward.
:param return_loss: If False, the loss won't be returned.
:param model: The neural network model
:param optimizer: Optimizer for updating the parameters
:param grad_clipping: The norm of gradient clipping
:type grad_clipping: float, optional
"""
pass
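Since the reworked `colossalai.initialize()` (further down in this diff) can build a user-provided schedule from the `engine.schedule` config, a custom schedule only has to implement this interface. A skeleton sketch (the class and everything inside its methods are made up for illustration; a real schedule would presumably also be registered in the `SCHEDULE` registry so `build_schedule` can find it by its `type` name):

```python
from colossalai.engine.schedule import BaseSchedule

class MyCustomSchedule(BaseSchedule):
    def initialize(self, model, optimizer):
        # e.g. wrap model/optimizer for a custom precision scheme
        return model, optimizer

    def forward_backward_step(self, data_iter, model, criterion, optimizer=None,
                              forward_only=False, grad_accum_size=1, return_loss=True):
        data, label = self.load_batch(data_iter)
        output = model(*data)
        if not isinstance(output, (tuple, list)):
            output = (output,)
        loss = criterion(*output, *label) / grad_accum_size
        if not forward_only:
            loss.backward()
        return output, label, loss if return_loss else None

    def optimizer_step(self, model, optimizer, grad_clipping=0.0):
        optimizer.step()
```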
......@@ -4,19 +4,24 @@
try:
import apex.amp as apex_amp
except:
print('apex is required for mixed precision training')
pass
try:
import torch.cuda.amp as torch_amp
except:
print('PyTorch amp is not supported with the current PyTorch version')
pass
from typing import Iterable
import torch.nn as nn
from torch.optim import Optimizer
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.engine.amp_type import AMP_TYPE
from colossalai.nn import (ZeroRedundancyOptimizer_Level_2,
ZeroRedundancyOptimizer_Level_3)
from ._utils import convert_to_fp16
from colossalai.nn.optimizer._utils import clip_grad_norm_fp32
from ._base_schedule import BaseSchedule
from ._utils import convert_to_fp16, convert_to_fp32
from ..amp import AMP_TYPE, GradScaler
class NoPipelineSchedule(BaseSchedule):
......@@ -30,6 +35,7 @@ class NoPipelineSchedule(BaseSchedule):
:type amp_type: AMP_TYPE
:type amp_config: dict
"""
def __init__(
self,
amp_type: AMP_TYPE = None,
......@@ -41,12 +47,6 @@ class NoPipelineSchedule(BaseSchedule):
assert amp_type is None or isinstance(amp_type, AMP_TYPE), \
'unrecognised value for argument fp16, it can only be None, torch or apex'
# LSG: check compatibility
# LSG: torch.cuda.amp and apex.amp cannot be used for tensor parallel
if gpc.is_initialized(ParallelMode.TENSOR) and gpc.get_world_size(
ParallelMode.TENSOR) > 1:
assert amp_type != AMP_TYPE.TORCH and amp_type != AMP_TYPE.APEX, \
'You can only use AMP_TYPE.PARALLEL for tensor parallel training'
self.use_zero_level_2_3 = False
if amp_type is not None:
......@@ -79,107 +79,110 @@ class NoPipelineSchedule(BaseSchedule):
self.fp16 = False
self.amp_type = None
@property
def num_steps(self):
return len(self.dataloader)
def initialize(self,
dataloader,
model,
criterion,
optimizer,
lr_scheduler=None):
super().initialize(dataloader,
model,
criterion,
optimizer,
lr_scheduler=lr_scheduler)
if isinstance(self.optimizer, (ZeroRedundancyOptimizer_Level_2,
ZeroRedundancyOptimizer_Level_3)):
def initialize(self, model: nn.Module, optimizer: Optimizer):
if isinstance(optimizer, (ZeroRedundancyOptimizer_Level_2,
ZeroRedundancyOptimizer_Level_3)):
self.use_zero_level_2_3 = True
assert self.amp_type != AMP_TYPE.PARALLEL, 'ZeRO Level 2 and 3 are mutually exclusive with AMP_TYPE.PARALLEL'
assert self.amp_type != AMP_TYPE.PARALLEL, \
'ZeRO Level 2 and 3 are mutually exclusive with AMP_TYPE.PARALLEL'
if self.fp16:
if self.amp_type == AMP_TYPE.TORCH:
self._torch_amp_scaler = torch_amp.GradScaler(**self.amp_cfg)
self._torch_amp_scaler = GradScaler(**self.amp_cfg)
elif self.amp_type == AMP_TYPE.APEX:
self.model, self.optimizer = apex_amp.initialize(
self.model, self.optimizer, **self.amp_cfg)
def forward_backward_step(self, forward_only=False, return_loss=True):
model, optimizer = apex_amp.initialize(model, optimizer, **self.amp_cfg)
return model, optimizer
def forward_backward_step(self,
data_iter: Iterable,
model: nn.Module,
criterion: nn.modules.loss._Loss,
optimizer: Optimizer = None,
forward_only: bool = False,
grad_accum_size: int = 1,
return_loss: bool = True):
"""The process function that loads loads a batch of dataset and feeds it to the model.
The returned labels and loss will None if :attr:`return_loss` is False.
:param data_iter: Data iterator of the dataloader, e.g. iter(dataloader)
:param model: Model for training and inference
:param criterion: Loss function for training
:param optimizer: Optimizer used for training
:param forward_only: If True, the model is run for the forward pass, else back propagation will be executed
:param grad_accum_size: The number of iterations for gradient accumulation
:param return_loss: Loss will be returned if True
:type data_iter: Iterator
:type model: torch.nn.Module
:type criterion: torch.nn.modules.loss._Loss
:type optimizer: torch.optim.Optimizer
:type forward_only: bool, optional
:type grad_accum_size: int
:type return_loss: bool, optional
:return: (output, label, loss)
"""
assert forward_only or return_loss, \
'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.'
data, label = self.load_batch()
data, label = self.load_batch(data_iter)
loss = None
# LSG: leave for debug, make sure dataloader is deterministic
# if forward_only:
# img = data[0]
# rank = gpc.get_local_rank(ParallelMode.DATA)
# world_size = gpc.get_world_size(ParallelMode.DATA)
# group = gpc.get_group(ParallelMode.DATA)
# input_list = [img.clone() for _ in range(world_size)]
# output_list = [torch.empty_like(img) for _ in range(world_size)]
# output_list[rank] = img.clone()
# dist.all_to_all(output_tensor_list=output_list, input_tensor_list=input_list, group=group)
# assert torch.equal(output_list[0], output_list[1]) # and torch.equal(output_list[1], output_list[2])
# forward
if self.fp16 and self.amp_type == AMP_TYPE.TORCH:
with torch_amp.autocast():
output = self.model(*data)
output = model(*data)
if not isinstance(output, (tuple, list)):
output = (output,)
if return_loss:
loss = self.criterion(*output, *label)
loss = criterion(*output, *label)
else:
if self.use_zero_level_2_3 or self.amp_type == AMP_TYPE.PARALLEL:
data = convert_to_fp16(data)
output = self.model(*data)
output = model(*data)
if self.use_zero_level_2_3 or self.amp_type == AMP_TYPE.PARALLEL:
output = convert_to_fp32(output)
if not isinstance(output, (tuple, list)):
output = (output,)
if return_loss:
loss = self.criterion(*output, *label)
loss = criterion(*output, *label)
loss /= grad_accum_size
if not forward_only:
# backward
if self.use_zero_level_2_3:
self.optimizer.backward(loss)
optimizer.backward(loss)
elif self.fp16:
if self.amp_type == AMP_TYPE.APEX:
with apex_amp.scale_loss(loss,
self.optimizer) as scaled_loss:
with apex_amp.scale_loss(loss, optimizer) as scaled_loss:
scaled_loss.backward()
elif self.amp_type == AMP_TYPE.TORCH:
self._torch_amp_scaler.scale(loss).backward()
elif self.amp_type == AMP_TYPE.PARALLEL:
loss = self.optimizer.scale_loss(loss)
loss = optimizer.scale_loss(loss)
loss.backward()
# scale back to display the original value in logs
loss.div_(self.optimizer.grad_scaler.scale)
loss.div_(optimizer.grad_scaler.scale)
else:
loss.backward()
if return_loss:
return output, label, loss
return output, label, loss * grad_accum_size
else:
return output, None, None
def step(self):
def optimizer_step(self, model: nn.Module, optimizer: Optimizer, grad_clipping: float = 0.0):
# step optimizer
if self.fp16 and self.amp_type == AMP_TYPE.TORCH:
self._torch_amp_scaler.step(self.optimizer)
if grad_clipping > 0.0:
self._torch_amp_scaler.unscale_(optimizer)
clip_grad_norm_fp32(model.parameters(), grad_clipping)
self._torch_amp_scaler.step(optimizer)
self._torch_amp_scaler.update()
else:
self.optimizer.step()
# update lr scheduler
if self.lr_scheduler is not None:
self.lr_scheduler.step()
if not self.fp16 and not self.use_zero_level_2_3 and grad_clipping > 0.0:
clip_grad_norm_fp32(model.parameters(), grad_clipping)
optimizer.step()
......@@ -15,7 +15,7 @@ from colossalai.nn import (ZeroRedundancyOptimizer_Level_2,
from colossalai.utils import get_current_device
from ._base_schedule import BaseSchedule
from ._utils import convert_to_fp16
from ..amp_type import AMP_TYPE
from ..amp import AMP_TYPE
def squeeze(x: Union[Tensor, tuple, list]):
......@@ -93,12 +93,11 @@ class PipelineSchedule(BaseSchedule):
)
# Pipeline schedule just puts data in memory
def load_batch(self):
self.check_initialized()
if self.data_iter is None:
def load_batch(self, data_iter):
if data_iter is None:
raise RuntimeError('Dataloader is not defined.')
self.batch_pos = 0
data, label = next(self.data_iter)
data, label = next(data_iter)
self.batch_data, self.batch_label = \
self._move_to_device(data), self._move_to_device(label)
batch_size = self.batch_data.shape[0]
......@@ -117,23 +116,8 @@ class PipelineSchedule(BaseSchedule):
self.batch_pos += self.microbatch_size
return (data,), (label,)
@property
def num_steps(self):
return len(self.dataloader)
def initialize(self,
dataloader,
model,
criterion,
optimizer,
lr_scheduler=None):
super().initialize(dataloader,
model,
criterion,
optimizer,
lr_scheduler=lr_scheduler)
if isinstance(self.optimizer, (ZeroRedundancyOptimizer_Level_2,
ZeroRedundancyOptimizer_Level_3)):
def initialize(self, model, optimizer):
if isinstance(optimizer, (ZeroRedundancyOptimizer_Level_2, ZeroRedundancyOptimizer_Level_3)):
raise TypeError(
"Pipeline schedule is currently not compatible with ZeRO Level 2 and Level 3"
)
......@@ -145,7 +129,8 @@ class PipelineSchedule(BaseSchedule):
'default tensor dtype is set to torch.half for fp16 training',
ranks=[0])
def forward_step(self, input_tensor, return_tensors, return_loss=True):
def forward_step(self, model, criterion, input_tensor, return_tensors,
grad_accum_size, return_loss=True):
"""Forward step for passed-in model. If it is the first stage, the input tensor
is obtained from data_iterator, otherwise the passed-in input_tensor is used.
Returns output tensor. This is a helper function and can be ignored by users.
......@@ -156,14 +141,14 @@ class PipelineSchedule(BaseSchedule):
if self.amp_type == AMP_TYPE.PARALLEL:
input_tensor = convert_to_fp16(input_tensor)
input_tensor = squeeze(input_tensor)
output_tensor = self.model(input_tensor)
output_tensor = model(input_tensor)
output_tensor = squeeze(output_tensor)
if gpc.is_last_rank(ParallelMode.PIPELINE):
if return_loss:
input_tensor, label = self.load_micro_batch()
loss_reduced = self.criterion(output_tensor, *
label) / self.num_microbatches
loss_reduced = criterion(output_tensor, *label) \
/ (self.num_microbatches * grad_accum_size)
return_tensors.append(
tuple((output_tensor, label[0], loss_reduced)))
return loss_reduced
......@@ -174,7 +159,7 @@ class PipelineSchedule(BaseSchedule):
else:
return output_tensor
def backward_step(self, input_tensor, output_tensor, output_tensor_grad):
def backward_step(self, optimizer, input_tensor, output_tensor, output_tensor_grad):
"""Backward step through the passed-in output tensor. If it is the last stage, the
output_tensor_grad is None, otherwise it is the gradients with respect to stage's output tensor.
Returns the gradients with respect to the input tensor (None if first stage).
......@@ -187,7 +172,7 @@ class PipelineSchedule(BaseSchedule):
# Backward pass.
if output_tensor_grad is None and self.amp_type == AMP_TYPE.PARALLEL:
output_tensor = self.optimizer.scale_loss(output_tensor)
output_tensor = optimizer.scale_loss(output_tensor)
torch.autograd.backward(output_tensor, grad_tensors=output_tensor_grad)
# Collect the grad of the input_tensor.
......@@ -197,17 +182,24 @@ class PipelineSchedule(BaseSchedule):
return input_tensor_grad
def forward_backward_step(self, forward_only=True, return_loss=True):
def forward_backward_step(self,
data_iter,
model,
criterion,
optimizer=None,
forward_only=False,
grad_accum_size: int = 1,
return_loss=True):
"""Runs non-interleaved 1F1B schedule, with communication between pipeline stages.
Returns a tuple with losses if it is the last stage, an empty tuple otherwise.
:return: (output, label, loss)
"""
assert forward_only or return_loss, \
'The argument \'return_loss\' has to be True when \'forward_only\' is False, but got False.'
self.load_batch()
self.load_batch(data_iter)
num_warmup_microbatches = \
(gpc.get_world_size(ParallelMode.PIPELINE) -
gpc.get_local_rank(ParallelMode.PIPELINE) - 1)
......@@ -233,9 +225,11 @@ class PipelineSchedule(BaseSchedule):
if not gpc.is_first_rank(ParallelMode.PIPELINE):
ft_shape = recv_tensor_meta(ft_shape)
input_tensor = recv_forward(ft_shape)
output_tensor = self.forward_step(input_tensor,
return_tensors,
return_loss=return_loss)
output_tensor = self.forward_step(
model, criterion,
input_tensor, return_tensors,
grad_accum_size, return_loss=return_loss
)
if not gpc.is_last_rank(ParallelMode.PIPELINE):
bt_shape = output_tensor.shape
fs_checker = send_tensor_meta(output_tensor, fs_checker)
......@@ -257,9 +251,11 @@ class PipelineSchedule(BaseSchedule):
for i in range(num_microbatches_remaining):
last_iteration = (i == (num_microbatches_remaining - 1))
output_tensor = self.forward_step(input_tensor,
return_tensors,
return_loss=return_loss)
output_tensor = self.forward_step(
model, criterion,
input_tensor, return_tensors,
grad_accum_size, return_loss=return_loss
)
if forward_only:
send_forward(output_tensor)
......@@ -279,9 +275,11 @@ class PipelineSchedule(BaseSchedule):
input_tensor = input_tensors.pop(0)
output_tensor = output_tensors.pop(0)
input_tensor_grad = self.backward_step(input_tensor,
output_tensor,
output_tensor_grad)
input_tensor_grad = self.backward_step(
optimizer,
input_tensor, output_tensor,
output_tensor_grad
)
if last_iteration:
input_tensor = None
......@@ -298,9 +296,11 @@ class PipelineSchedule(BaseSchedule):
output_tensor_grad = recv_backward(bt_shape)
input_tensor_grad = self.backward_step(input_tensor,
output_tensor,
output_tensor_grad)
input_tensor_grad = self.backward_step(
optimizer,
input_tensor, output_tensor,
output_tensor_grad
)
send_backward(input_tensor_grad)
......@@ -309,8 +309,11 @@ class PipelineSchedule(BaseSchedule):
output, label, loss = tuple(map(list, zip(*return_tensors)))
return (torch.cat(output, dim=0),
torch.cat(label, dim=0),
sum(loss))
sum(loss) * grad_accum_size)
else:
return tuple((torch.cat(return_tensors, dim=0), None, None))
else:
return tuple((None, None, None))
def optimizer_step(self, model, optimizer, grad_clipping: float = 0.0):
optimizer.step()
......@@ -14,3 +14,14 @@ def convert_to_fp16(data: Union[Tensor, List[Tensor]]):
else:
raise TypeError(f"Expected argument 'data' to be a Tensor or a list/tuple of Tensor, but got {type(data)}")
return ret
def convert_to_fp32(data: Union[Tensor, List[Tensor]]):
if isinstance(data, Tensor):
ret = data.float()
elif isinstance(data, (list, tuple)):
ret = [val.float() for val in data]
else:
raise TypeError(f"Expected argument 'data' to be a Tensor or a list/tuple of Tensor, but got {type(data)}")
return ret
......@@ -6,18 +6,20 @@ import pprint
import random
from pathlib import Path
from typing import Callable, Iterable, Optional, Union
from typing import Tuple
import numpy as np
import torch
from torch.utils.data import DataLoader
from colossalai.engine import AMP_TYPE, NoPipelineSchedule, PipelineSchedule
from colossalai.engine import Engine
from colossalai.logging import get_global_dist_logger, init_global_dist_logger
from colossalai.nn import DataParallelSampler
from colossalai.nn.model.base_model import BaseModel
from .builder import (ModelInitializer, build_dataset, build_loss,
build_lr_scheduler, build_model, build_optimizer,
build_optimizer_wrapper)
build_model, build_optimizer,
build_optimizer_wrapper, build_schedule)
from .context import Config, ParallelMode
from .core import global_context as gpc
from .utils import get_current_device, sync_model_param_in_dp
......@@ -182,7 +184,7 @@ def initialize(config: Union[str, dict] = None,
backend: str = None,
train_dataloader: Optional[Union[Iterable, Callable]] = None,
test_dataloader: Optional[Union[Iterable, Callable]] = None,
):
) -> Tuple[Engine, DataLoader, DataLoader]:
'''Core function that initializes distributed environment, logger, cudnn, data, model, loss function, optimizer, and lr_scheduler (their configs are in gpc.config).
:param config: config file or config file path are both acceptable
......@@ -201,7 +203,7 @@ def initialize(config: Union[str, dict] = None,
:type train_dataloader: Optional[Union[Iterable, Callable]], optional
:param test_dataloader: If None, the config is used to build a dataloader; else, it should be a dataloader object or a function with no arguments which can build a dataloader, defaults to None
:type test_dataloader: Optional[Union[Iterable, Callable]], optional
:return: (model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler)
:return: (engine, train_dataloader, test_dataloader)
:rtype: tuple
'''
# initialize distributed environment
......@@ -337,21 +339,7 @@ def initialize(config: Union[str, dict] = None,
optimizer = build_optimizer_wrapper(fp16_cfg, optimizer)
logger.info('Optimizer is created', ranks=[0])
lr_scheduler = None
if hasattr(gpc.config, 'lr_scheduler'):
if hasattr(gpc.config, 'num_steps'):
total_steps = gpc.config.num_steps
elif hasattr(gpc.config, 'num_epochs'):
total_steps = int(gpc.config.num_epochs * len(train_dataloader))
else:
raise Exception(
'Please specify training stopping criterion num_steps or num_epochs in your configuration.'
)
lr_scheduler = build_lr_scheduler(gpc.config.lr_scheduler, optimizer,
total_steps, len(train_dataloader))
logger.info('Learning rate scheduler is created', ranks=[0])
# pipeline or no pipeline schedule
# build schedule and engine
if hasattr(gpc.config, 'fp16'):
amp_type = gpc.config.fp16.mode
amp_cfg = gpc.config.fp16.copy()
......@@ -360,12 +348,32 @@ def initialize(config: Union[str, dict] = None,
amp_type = None
amp_cfg = None
if gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1:
assert hasattr(gpc.config,
'schedule'), "Config 'schedule' not found in your configuration file for pipeline parallel training"
engine_cfg = gpc.config.get('engine', dict())
schedule_cfg = engine_cfg.pop('schedule', None)
schedule_type = None
if schedule_cfg is not None:
schedule_type = schedule_cfg.get('type', None)
if schedule_type is not None:
# run customized schedule
schedule_cfg['amp_type'] = amp_type
schedule_cfg['amp_config'] = amp_cfg
schedule = build_schedule(schedule_cfg)
elif gpc.is_initialized(ParallelMode.PIPELINE) and gpc.get_world_size(ParallelMode.PIPELINE) > 1:
assert schedule_cfg is not None, \
"Config 'engine.schedule' not found in your configuration file for pipeline parallel training"
schedule = PipelineSchedule(
amp_type=amp_type, amp_config=amp_cfg, **gpc.config.schedule.copy())
amp_type=amp_type, amp_config=amp_cfg, **schedule_cfg.copy())
else:
schedule = NoPipelineSchedule(amp_type=amp_type, amp_config=amp_cfg)
return model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler
engine = Engine(
model=model,
optimizer=optimizer,
criterion=criterion,
step_schedule=schedule,
**gpc.config.get('engine', dict())
)
return engine, train_dataloader, test_dataloader
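To illustrate the new `engine.schedule` lookup above, a hedged config sketch; `MyPipelineSchedule` is hypothetical and would need to be registered so that `build_schedule` can resolve its `type` key:

```python
from colossalai.engine import AMP_TYPE

fp16 = dict(
    mode=AMP_TYPE.TORCH,            # read as amp_type; remaining keys become amp_config
)

engine = dict(
    schedule=dict(
        type='MyPipelineSchedule',  # a 'type' key selects the customized-schedule branch
        # any extra keys here are forwarded to the schedule constructor by build_schedule
    ),
    # other 'engine' keys are passed straight into Engine(**...)
)
```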
......@@ -7,6 +7,7 @@ from torch import Tensor
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.utils import get_current_device
from torch.cuda.amp import custom_bwd, custom_fwd
def matmul_2d(a,
......@@ -60,6 +61,7 @@ class Matmul_AB_2D(torch.autograd.Function):
"""Matrix multiplication for :math:`C = AB`
"""
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any,
A: Tensor,
B: Tensor,
......@@ -120,32 +122,32 @@ class Matmul_AB_2D(torch.autograd.Function):
return out
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
A, B = ctx.saved_tensors
A_grad = Matmul_ABT_2D.forward(
None,
output_grad, B,
ctx.summa_dim, ctx.A_shape,
ctx.row_rank, ctx.col_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
B_grad = Matmul_ATB_2D.forward(
None,
A, output_grad,
ctx.summa_dim, ctx.B_shape,
ctx.row_rank, ctx.col_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
with torch.no_grad():
A_grad = Matmul_ABT_2D.apply(
output_grad, B,
ctx.summa_dim, ctx.A_shape,
ctx.row_rank, ctx.col_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
B_grad = Matmul_ATB_2D.apply(
A, output_grad,
ctx.summa_dim, ctx.B_shape,
ctx.row_rank, ctx.col_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
return A_grad, B_grad, None, None, None, None, None, None, None, None, None, None
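The backward passes above switch from calling another Function's `forward(None, ...)` to its `.apply(...)` wrapped in `torch.no_grad()`, which keeps the gradient computation off the autograd tape and lets `custom_fwd`/`custom_bwd` manage AMP casting. A minimal, self-contained sketch of that pattern (the `_Scale` class is a toy example, not part of the library):

```python
import torch
from torch.cuda.amp import custom_fwd, custom_bwd


class _Scale(torch.autograd.Function):
    @staticmethod
    @custom_fwd(cast_inputs=torch.float16)   # inputs cast to fp16 when autocast is active
    def forward(ctx, x, factor):
        ctx.factor = factor
        return x * factor

    @staticmethod
    @custom_bwd                              # restores the autocast state of the forward pass
    def backward(ctx, grad_output):
        # Reuse another Function via .apply under no_grad so no extra graph is recorded,
        # mirroring how Matmul_AB_2D.backward now delegates to Matmul_ABT_2D/Matmul_ATB_2D.
        with torch.no_grad():
            grad_input = _Scale.apply(grad_output, ctx.factor)
        return grad_input, None
```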
......@@ -153,6 +155,7 @@ class Matmul_ABT_2D(torch.autograd.Function):
"""Matrix multiplication for :math:`C = AB^T`
"""
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any,
A: Tensor,
B: Tensor,
......@@ -214,32 +217,33 @@ class Matmul_ABT_2D(torch.autograd.Function):
return out
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
A, B = ctx.saved_tensors
A_grad = Matmul_AB_2D.forward(
None,
output_grad, B,
ctx.summa_dim, ctx.A_shape,
ctx.row_rank, ctx.col_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
B_grad = Matmul_ATB_2D.forward(
None,
output_grad, A,
ctx.summa_dim, ctx.B_shape,
ctx.row_rank, ctx.col_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
with torch.no_grad():
A_grad = Matmul_AB_2D.apply(
output_grad, B,
ctx.summa_dim, ctx.A_shape,
ctx.row_rank, ctx.col_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
B_grad = Matmul_ATB_2D.apply(
output_grad, A,
ctx.summa_dim, ctx.B_shape,
ctx.row_rank, ctx.col_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
return A_grad, B_grad, None, None, None, None, None, None, None, None, None, None
......@@ -247,6 +251,7 @@ class Matmul_ATB_2D(torch.autograd.Function):
"""Matrix multiplication for :math:`C = A^TB`
"""
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any,
A: Tensor,
B: Tensor,
......@@ -308,32 +313,33 @@ class Matmul_ATB_2D(torch.autograd.Function):
return out
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
A, B = ctx.saved_tensors
A_grad = Matmul_ABT_2D.forward(
None,
B, output_grad,
ctx.summa_dim, ctx.A_shape,
ctx.row_rank, ctx.col_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
B_grad = Matmul_AB_2D.forward(
None,
A, output_grad,
ctx.summa_dim, ctx.B_shape,
ctx.row_rank, ctx.col_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
with torch.no_grad():
A_grad = Matmul_ABT_2D.apply(
B, output_grad,
ctx.summa_dim, ctx.A_shape,
ctx.row_rank, ctx.col_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
B_grad = Matmul_AB_2D.apply(
A, output_grad,
ctx.summa_dim, ctx.B_shape,
ctx.row_rank, ctx.col_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
return A_grad, B_grad, None, None, None, None, None, None, None, None, None, None
......@@ -341,6 +347,7 @@ class Add_Bias_2D(torch.autograd.Function):
"""Matrix add bias: :math:`C = A + b`
"""
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any,
input: Tensor,
bias: Tensor,
......@@ -384,6 +391,7 @@ class Add_Bias_2D(torch.autograd.Function):
return output
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
row_rank = ctx.row_rank
col_rank = ctx.col_rank
......@@ -423,6 +431,7 @@ class Add_Bias_2D(torch.autograd.Function):
class _LayerNorm_2D(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx: Any,
input: Tensor,
E_x: Tensor,
......@@ -440,6 +449,7 @@ class _LayerNorm_2D(torch.autograd.Function):
return output
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
row_parallel_mode = ctx.row_parallel_mode
col_parallel_mode = ctx.col_parallel_mode
......@@ -492,6 +502,7 @@ class _LayerNorm_2D(torch.autograd.Function):
class _ViT_Split_Input_2D(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any,
inputs: Tensor,
batch_size: int,
......@@ -509,6 +520,7 @@ class _ViT_Split_Input_2D(torch.autograd.Function):
return output
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
# output_grad: [b/q, s, h/q]
# grads: [b, s, h/q]
......
from .cosine import CosineAnnealingLR, CosineAnnealingWarmupLR, FlatAnnealingLR, FlatAnnealingWarmupLR
from .linear import LinearWarmupLR, LinearWarmupDecay
from .linear import LinearWarmupLR
from .multistep import MultiStepLR, MultiStepWarmupLR
from .onecycle import OneCycleLR
from .poly import PolynomialLR, PolynomialWarmupLR
......
......@@ -66,11 +66,10 @@ class CosineAnnealingWarmupLR(WarmupScheduler):
:type last_epoch: int, optional
"""
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, eta_min: int = 0, last_epoch: int = -1,
**kwargs):
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, eta_min: int = 0, last_epoch: int = -1):
base_scheduler = _CosineAnnealingLR(
optimizer, total_steps - warmup_steps, eta_min=eta_min)
super().__init__(optimizer, warmup_steps, base_scheduler, last_epoch=last_epoch)
optimizer, total_steps - warmup_steps, eta_min=eta_min, last_epoch=last_epoch)
super().__init__(optimizer, warmup_steps, base_scheduler)
@LR_SCHEDULERS.register_module
......
......@@ -55,7 +55,7 @@ class DelayerScheduler(_LRScheduler):
class WarmupScheduler(_LRScheduler):
""" Starts with a linear warmup lr schedule until it reaches N epochs the applies a scheduler
""" Starts with a linear warmup lr schedule until it reaches N epochs the applies a scheduler
:param optimizer: Wrapped optimizer.
:type optimizer: torch.optim.Optimizer
......@@ -66,11 +66,8 @@ class WarmupScheduler(_LRScheduler):
:param last_epoch: The index of last epoch, defaults to -1
:type last_epoch: int, optional
"""
def __init__(self, optimizer, warmup_epochs, after_scheduler, last_epoch=-1):
if warmup_epochs < 0:
raise ValueError(f'warmup_epochs must >= 0, got {warmup_epochs}')
self.warmup_epochs = warmup_epochs
self.warmup_epochs = int(warmup_epochs)
self.after_scheduler = after_scheduler
self.finished = False
super().__init__(optimizer, last_epoch)
......@@ -79,14 +76,10 @@ class WarmupScheduler(_LRScheduler):
if self.last_epoch >= self.warmup_epochs:
if not self.finished:
self.after_scheduler.base_lrs = self.base_lrs
# reset lr to base_lr
for group, base_lr in zip(self.optimizer.param_groups, self.base_lrs):
group['lr'] = base_lr
self.finished = True
with _enable_get_lr_call(self.after_scheduler):
return self.after_scheduler.get_lr()
return self.after_scheduler.get_lr()
return [(self.last_epoch + 1) / (self.warmup_epochs + 1) * lr for lr in self.base_lrs]
return [(self.last_epoch + 1) / self.warmup_epochs * lr for lr in self.base_lrs]
def step(self, epoch=None):
if self.finished:
......
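A quick worked example of the revised warmup factor above (numbers are illustrative): the lr now reaches `base_lr` exactly on the final warmup step instead of topping out just below it.

```python
warmup_steps, base_lr = 5, 0.1
new = [(step + 1) / warmup_steps * base_lr for step in range(warmup_steps)]
old = [(step + 1) / (warmup_steps + 1) * base_lr for step in range(warmup_steps)]
# new -> [0.02, 0.04, 0.06, 0.08, 0.10]        reaches base_lr at the end of warmup
# old -> [0.017, 0.033, 0.050, 0.067, 0.083]   never quite reaches base_lr
```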
......@@ -28,18 +28,3 @@ class LinearWarmupLR(_LRScheduler):
else:
return [(self.total_steps - self.last_epoch) / (self.total_steps - self.warmup_steps) * lr for lr in
self.base_lrs]
@LR_SCHEDULERS.register_module
class LinearWarmupDecay(_LRScheduler):
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, last_epoch: int = -1, **kwargs):
self.warmup_steps = int(warmup_steps)
self.total_steps = total_steps
super().__init__(optimizer, last_epoch=last_epoch)
def get_lr(self):
if self.last_epoch < self.warmup_steps:
return [(self.last_epoch + 1) / self.warmup_steps * lr for lr in self.base_lrs]
else:
return [(self.total_steps - self.last_epoch - 1) / (self.total_steps - self.warmup_steps) * lr for lr in
self.base_lrs]
......@@ -27,12 +27,7 @@ class MultiStepLR(_MultiStepLR):
:type last_epoch: int, optional
"""
def __init__(self, optimizer, total_steps: int, milestones: List[int] = None, gamma: float = 0.1,
num_steps_per_epoch: int = -1, last_epoch: int = -1, **kwargs):
if num_steps_per_epoch <= 0:
raise ValueError(
f'num_steps_per_epoch must > 0, got {num_steps_per_epoch}')
milestones = [v * num_steps_per_epoch for v in milestones]
def __init__(self, optimizer, total_steps: int, milestones: List[int] = None, gamma: float = 0.1, last_epoch: int = -1, **kwargs):
super().__init__(optimizer, milestones, gamma=gamma, last_epoch=last_epoch)
......@@ -57,14 +52,11 @@ class MultiStepWarmupLR(WarmupScheduler):
"""
def __init__(self, optimizer, total_steps: int, warmup_steps: int = 0, milestones: List[int] = None,
gamma: float = 0.1, num_steps_per_epoch: int = -1, last_epoch: int = -1, **kwargs):
gamma: float = 0.1, last_epoch: int = -1, **kwargs):
if len(milestones) == 0:
raise ValueError('milestones cannot be empty')
if num_steps_per_epoch <= 0:
raise ValueError(
f'num_steps_per_epoch must > 0, got {num_steps_per_epoch}')
milestones = [v * num_steps_per_epoch - warmup_steps for v in milestones if v *
num_steps_per_epoch >= warmup_steps]
milestones = [
v - warmup_steps for v in milestones if v >= warmup_steps]
base_scheduler = _MultiStepLR(optimizer, milestones=milestones,
gamma=gamma)
super().__init__(optimizer, warmup_steps, base_scheduler, last_epoch=last_epoch)
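With `num_steps_per_epoch` gone, `MultiStepWarmupLR` now expects milestones in steps and simply shifts them by the warmup length; a small sketch with illustrative numbers:

```python
warmup_steps = 10
milestones = [30, 60]   # absolute training steps at which to decay
shifted = [v - warmup_steps for v in milestones if v >= warmup_steps]
# shifted == [20, 50]: the inner _MultiStepLR counts steps from the end of warmup
```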
from torch.optim.lr_scheduler import LambdaLR as _LambdaLR
from torch.optim.lr_scheduler import MultiplicativeLR as _MultiplicativeLR
from torch.optim.lr_scheduler import StepLR as _StepLR
from torch.optim.lr_scheduler import _LRScheduler
from torch.optim.lr_scheduler import ExponentialLR as _ExponentialLR
from colossalai.registry import LR_SCHEDULERS
......@@ -25,11 +25,8 @@ class LambdaLR(_LambdaLR):
:type last_epoch: int, optional
"""
def __init__(self, optimizer, total_steps, lr_lambda=None, num_steps_per_epoch: int = -1,
last_epoch: int = -1) -> None:
def func(step): return lr_lambda(step // num_steps_per_epoch)
super().__init__(optimizer, func, last_epoch=last_epoch)
def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None:
super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)
@LR_SCHEDULERS.register_module
......@@ -51,11 +48,8 @@ class MultiplicativeLR(_MultiplicativeLR):
:type last_epoch: int, optional
"""
def __init__(self, optimizer, total_steps, lr_lambda=None, num_steps_per_epoch: int = -1,
last_epoch: int = -1) -> None:
def func(step): return lr_lambda(step // num_steps_per_epoch)
super().__init__(optimizer, func, last_epoch=last_epoch)
def __init__(self, optimizer, total_steps, lr_lambda=None, last_epoch: int = -1) -> None:
super().__init__(optimizer, lr_lambda, last_epoch=last_epoch)
@LR_SCHEDULERS.register_module
......@@ -79,14 +73,13 @@ class StepLR(_StepLR):
:type last_epoch: int, optional
"""
def __init__(self, optimizer, total_steps, step_size: int = 1, gamma: float = 0.1, num_steps_per_epoch: int = -1,
last_epoch: int = -1) -> None:
super().__init__(optimizer, step_size * num_steps_per_epoch,
def __init__(self, optimizer, total_steps, step_size: int = 1, gamma: float = 0.1, last_epoch: int = -1) -> None:
super().__init__(optimizer, step_size,
gamma=gamma, last_epoch=last_epoch)
@LR_SCHEDULERS.register_module
class ExponentialLR(_LRScheduler):
class ExponentialLR(_ExponentialLR):
"""Decays the learning rate of each parameter group by gamma every epoch.
When last_epoch=-1, sets initial lr as lr
......@@ -102,21 +95,6 @@ class ExponentialLR(_LRScheduler):
:type last_epoch: int, optional
"""
def __init__(self, optimizer, total_steps, gamma: float = 1.0, num_steps_per_epoch: int = -1,
def __init__(self, optimizer, total_steps, gamma: float = 1.0,
last_epoch: int = -1) -> None:
self.gamma = gamma
self.num_steps_per_epoch = num_steps_per_epoch
super().__init__(optimizer, last_epoch=last_epoch)
def get_lr(self):
if self.last_epoch == 0:
return self.base_lrs
elif (self.last_epoch + 1) % self.num_steps_per_epoch == 0:
return [group['lr'] * self.gamma
for group in self.optimizer.param_groups]
return [group['lr']
for group in self.optimizer.param_groups]
def _get_closed_form_lr(self):
return [base_lr * self.gamma ** (self.last_epoch // self.num_steps_per_epoch)
for base_lr in self.base_lrs]
super().__init__(optimizer, gamma, last_epoch=last_epoch)
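For context, the torch base class now wrapped above decays the lr on every `step()` call; a minimal sketch with illustrative values:

```python
import torch
from torch.optim import SGD
from torch.optim.lr_scheduler import ExponentialLR  # the _ExponentialLR wrapped above

opt = SGD([torch.nn.Parameter(torch.zeros(1))], lr=0.1)
sched = ExponentialLR(opt, gamma=0.9)
for _ in range(3):
    opt.step()
    sched.step()
print(opt.param_groups[0]['lr'])  # 0.1 * 0.9 ** 3 ~= 0.0729
```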