delete unused files

da3f0934 · zhuwenwen · c4dd1fd4 · c4dd1fd4 · c4dd1fd4 · c4dd1fd4
Commit da3f0934 authored Apr 23, 2023 by zhuwenwen
20 changed files
--- a/colossalai/amp/torch_amp/_grad_scaler.py
+++ b/colossalai/amp/torch_amp/_grad_scaler.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-# modified from https://github.com/pytorch/pytorch/blob/master/torch/cuda/amp/grad_scaler.py
-# to support tensor parallel
-import torch
-from collections import defaultdict, abc
-import warnings
-from enum import Enum
-from typing import Any, Dict, List, Optional, Tuple
-from colossalai.context import ParallelMode
-import torch.distributed as dist
-from colossalai.core import global_context as gpc
-from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
-class _MultiDeviceReplicator(object):
-    """
-    Lazily serves copies of a tensor to requested devices.  Copies are cached per-device.
-    """
-    def __init__(self, master_tensor: torch.Tensor) -> None:
-        # Only CUDA or XLA master tensors are accepted.
-        assert master_tensor.is_cuda or master_tensor.device.type == 'xla'
-        self.master = master_tensor
-        # Copies already handed out, keyed by the device that requested them.
-        self._per_device_tensors: Dict[torch.device, torch.Tensor] = {}
-    def get(self, device) -> torch.Tensor:
-        # Return the cached copy for `device`, creating and caching it on the first request.
-        retval = self._per_device_tensors.get(device, None)
-        if retval is None:
-            # copy=True is passed, so per the torch.Tensor.to documentation a new tensor is
-            # created even when `device` is already the master's device.
-            retval = self.master.to(
-                device=device, non_blocking=True, copy=True)
-            self._per_device_tensors[device] = retval
-        return retval
-# Defines default_factory for GradScaler's _per_optimizer_states defaultdict,
-# as well as associated "enum" values.  Prefers defining these at top level because
-# - Lambdas can't be pickled, so we don't want to supply a lambda as the factory.
-# - Defining READY, UNSCALED, STEPPED and _refresh_per_optimizer_state within GradScaler
-#   causes a circular reference, which we'd rather avoid.
-class OptState(Enum):
-    READY = 0
-    UNSCALED = 1
-    STEPPED = 2
-def _refresh_per_optimizer_state():
-    # Fresh per-optimizer record: stage READY and no inf checks recorded yet.
-    return {"stage": OptState.READY, "found_inf_per_device": {}}
-class GradScaler(object):
-    _scale: Optional[torch.Tensor]
-    _grows_tracker: Optional[torch.Tensor]
-    _per_optimizer_states: Dict[int, Dict[str, Any]]
-    """
-    An instance ``scaler`` of :class:`GradScaler` helps perform the steps of gradient scaling
-    conveniently.
-    * ``scaler.scale(loss)`` multiplies a given loss by ``scaler``'s current scale factor.
-    * ``scaler.step(optimizer)`` safely unscales gradients and calls ``optimizer.step()``.
-    * ``scaler.update()`` updates ``scaler``'s scale factor.
-    Example::
-        # Creates a GradScaler once at the beginning of training.
-        scaler = GradScaler()
-        for epoch in epochs:
-            for input, target in data:
-                optimizer.zero_grad()
-                output = model(input)
-                loss = loss_fn(output, target)
-                # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
-                scaler.scale(loss).backward()
-                # scaler.step() first unscales gradients of the optimizer's params.
-                # If gradients don't contain infs/NaNs, optimizer.step() is then called,
-                # otherwise, optimizer.step() is skipped.
-                scaler.step(optimizer)
-                # Updates the scale for next iteration.
-                scaler.update()
-    See the :ref:`Automatic Mixed Precision examples<amp-examples>` for usage
-    (along with autocasting) in more complex cases like gradient clipping, gradient accumulation, gradient penalty,
-    and multiple losses/optimizers.
-    ``scaler`` dynamically estimates the scale factor each iteration.  To minimize gradient underflow,
-    a large scale factor should be used.  However, ``float16`` values can "overflow" (become inf or NaN) if
-    the scale factor is too large.  Therefore, the optimal scale factor is the largest factor that can be used
-    without incurring inf or NaN gradient values.
-    ``scaler`` approximates the optimal scale factor over time by checking the gradients for infs and NaNs during every
-    ``scaler.step(optimizer)`` (or optional separate ``scaler.unscale_(optimizer)``, see :meth:`unscale_`).
-    * If infs/NaNs are found, ``scaler.step(optimizer)`` skips the underlying ``optimizer.step()`` (so the params
-      themselves remain uncorrupted) and ``update()`` multiplies the scale by ``backoff_factor``.
-    * If no infs/NaNs are found, ``scaler.step(optimizer)`` runs the underlying ``optimizer.step()`` as usual.
-      If ``growth_interval`` unskipped iterations occur consecutively, ``update()`` multiplies the scale by
-      ``growth_factor``.
-    The scale factor often causes infs/NaNs to appear in gradients for the first few iterations as its
-    value calibrates.  ``scaler.step`` will skip the underlying ``optimizer.step()`` for these
-    iterations.  After that, step skipping should occur rarely (once every few hundred or thousand iterations).
-    Args:
-        init_scale (float, optional, default=2.**16):  Initial scale factor.
-        growth_factor (float, optional, default=2.0):  Factor by which the scale is multiplied during
-            :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
-        backoff_factor (float, optional, default=0.5):  Factor by which the scale is multiplied during
-            :meth:`update` if inf/NaN gradients occur in an iteration.
-        growth_interval (int, optional, default=2000):  Number of consecutive iterations without inf/NaN gradients
-            that must occur for the scale to be multiplied by ``growth_factor``.
-        enabled (bool, optional, default=True):  If ``False``, disables gradient scaling. :meth:`step` simply
-            invokes the underlying ``optimizer.step()``, and other methods become no-ops.
-    """
-    def __init__(self,
-                 init_scale=2.**16,
-                 growth_factor=2.0,
-                 backoff_factor=0.5,
-                 growth_interval=2000,
-                 enabled=True):
-        if enabled and not torch.cuda.is_available():
-            warnings.warn(
-                "torch.cuda.amp.GradScaler is enabled, but CUDA is not available.  Disabling.")
-            self._enabled = False
-        else:
-            self._enabled = enabled
-        if self._enabled:
-            assert growth_factor > 1.0, "The growth factor must be > 1.0."
-            assert backoff_factor < 1.0, "The backoff factor must be < 1.0."
-            self._init_scale = init_scale
-            # self._scale will be lazily initialized during the first call to scale()
-            self._scale = None
-            self._growth_factor = growth_factor
-            self._backoff_factor = backoff_factor
-            self._growth_interval = growth_interval
-            self._init_growth_tracker = 0
-            # self._growth_tracker will be lazily initialized during the first call to scale()
-            self._growth_tracker = None
-            self._per_optimizer_states = defaultdict(
-                _refresh_per_optimizer_state)
-    def _check_scale_growth_tracker(self, funcname) -> Tuple[torch.Tensor, torch.Tensor]:
-        # Assert that both lazily-created tensors exist and return them as
-        # (scale, growth_tracker).  `funcname` is only used to name the calling
-        # method in the assertion message.
-        fix = "This may indicate your script did not use scaler.scale(loss or outputs) earlier in the iteration."
-        assert self._scale is not None, "Attempted {} but _scale is None.  ".format(
-            funcname) + fix
-        assert self._growth_tracker is not None, "Attempted {} but _growth_tracker is None.  ".format(
-            funcname) + fix
-        return (self._scale, self._growth_tracker)
-    def _lazy_init_scale_growth_tracker(self, dev):
-        # Create the 1-element scale (float32) and growth-tracker (int32) tensors on `dev`,
-        # filled from self._init_scale and self._init_growth_tracker respectively.
-        assert self._growth_tracker is None, "_growth_tracker initialized before _scale"
-        self._scale = torch.full(
-            (1,), self._init_scale, dtype=torch.float32, device=dev)
-        self._growth_tracker = torch.full(
-            (1,), self._init_growth_tracker, dtype=torch.int32, device=dev)
-    def scale(self, outputs):
-        """
-        Multiplies ('scales') a tensor or list of tensors by the scale factor.
-        Returns scaled outputs.  If this instance of :class:`GradScaler` is not enabled, outputs are returned
-        unmodified.
-        Args:
-            outputs (Tensor or iterable of Tensors):  Outputs to scale.
-        """
-        if not self._enabled:
-            return outputs
-        # Short-circuit for the common case.
-        if isinstance(outputs, torch.Tensor):
-            assert outputs.is_cuda or outputs.device.type == 'xla'
-            if self._scale is None:
-                self._lazy_init_scale_growth_tracker(outputs.device)
-            assert self._scale is not None
-            return outputs * self._scale.to(device=outputs.device, non_blocking=True)
-        # Invoke the more complex machinery only if we're treating multiple outputs.
-        # holds a reference that can be overwritten by apply_scale
-        stash: List[_MultiDeviceReplicator] = []
-        def apply_scale(val):
-            if isinstance(val, torch.Tensor):
-                assert val.is_cuda or val.device.type == 'xla'
-                if len(stash) == 0:
-                    if self._scale is None:
-                        self._lazy_init_scale_growth_tracker(val.device)
-                    assert self._scale is not None
-                    stash.append(_MultiDeviceReplicator(self._scale))
-                return val * stash[0].get(val.device)
-            elif isinstance(val, abc.Iterable):
-                iterable = map(apply_scale, val)
-                if isinstance(val, list) or isinstance(val, tuple):
-                    return type(val)(iterable)
-                else:
-                    return iterable
-            else:
-                raise ValueError(
-                    "outputs must be a Tensor or an iterable of Tensors")
-        return apply_scale(outputs)
-    def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16):
-        # Applies inv_scale to every gradient owned by `optimizer` and records inf/NaN findings.
-        #
-        # optimizer:  its param_groups are walked; params whose .grad is None are skipped.
-        # inv_scale:  tensor handed to the unscale kernel (replicated to each grad's device).
-        # found_inf:  tensor handed to the unscale kernel as the non-finite flag (replicated likewise).
-        # allow_fp16: when False, encountering a float16 gradient raises ValueError.
-        #
-        # Returns the replicator's {device: found_inf copy} dict.  Only devices that actually
-        # held gradients appear in it, because copies are created on demand inside the loop.
-        per_device_inv_scale = _MultiDeviceReplicator(inv_scale)
-        per_device_found_inf = _MultiDeviceReplicator(found_inf)
-        # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype.
-        # There could be hundreds of grads, so we'd like to iterate through them just once.
-        # However, we don't know their devices or dtypes in advance.
-        # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict
-        # Google says mypy struggles with defaultdicts type annotations.
-        per_device_and_dtype_grads = defaultdict(
-            lambda: defaultdict(list))  # type: ignore[var-annotated]
-        with torch.no_grad():
-            for group in optimizer.param_groups:
-                for param in group["params"]:
-                    if param.grad is None:
-                        continue
-                    if (not allow_fp16) and param.grad.dtype == torch.float16:
-                        raise ValueError(
-                            "Attempting to unscale FP16 gradients.")
-                    if param.grad.is_sparse:
-                        # is_coalesced() == False means the sparse grad has values with duplicate indices.
-                        # coalesce() deduplicates indices and adds all values that have the same index.
-                        # For scaled fp16 values, there's a good chance coalescing will cause overflow,
-                        # so we should check the coalesced _values().
-                        if param.grad.dtype is torch.float16:
-                            param.grad = param.grad.coalesce()
-                        to_unscale = param.grad._values()
-                    else:
-                        to_unscale = param.grad
-                    # TODO: is there a way to split by device and dtype without appending in the inner loop?
-                    per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].append(
-                        to_unscale)
-            # One kernel call per (device, dtype) bucket.
-            # NOTE(review): the private torch op presumably multiplies the grads by inv_scale in place
-            # and flags found_inf when a non-finite value is seen -- confirm against the PyTorch source.
-            for device, per_dtype_grads in per_device_and_dtype_grads.items():
-                for grads in per_dtype_grads.values():
-                    torch._amp_foreach_non_finite_check_and_unscale_(grads,
-                                                                     per_device_found_inf.get(
-                                                                         device),
-                                                                     per_device_inv_scale.get(device))
-        # For tensor parallel parameters it should be all-reduced over tensor parallel process group
-        if gpc.is_initialized(ParallelMode.MODEL) and gpc.get_world_size(ParallelMode.MODEL) > 1:
-            # Flatten the per-device flags into one buffer, MAX-reduce it across the MODEL group,
-            # then copy the reduced values back into the original per-device tensors.
-            vals = [val for val in per_device_found_inf._per_device_tensors.values()]
-            coalesced = _flatten_dense_tensors(vals)
-            dist.all_reduce(coalesced,
-                            op=dist.ReduceOp.MAX,
-                            group=gpc.get_group(ParallelMode.MODEL))
-            for buf, synced in zip(vals, _unflatten_dense_tensors(coalesced, vals)):
-                buf.copy_(synced)
-        return per_device_found_inf._per_device_tensors
-    def unscale_(self, optimizer):
-        """
-        Divides ("unscales") the optimizer's gradient tensors by the scale factor.
-        :meth:`unscale_` is optional, serving cases where you need to
-        :ref:`modify or inspect gradients<working-with-unscaled-gradients>`
-        between the backward pass(es) and :meth:`step`.
-        If :meth:`unscale_` is not called explicitly,  gradients will be unscaled  automatically during :meth:`step`.
-        Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients::
-            ...
-            scaler.scale(loss).backward()
-            scaler.unscale_(optimizer)
-            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
-            scaler.step(optimizer)
-            scaler.update()
-        Args:
-            optimizer (torch.optim.Optimizer):  Optimizer that owns the gradients to be unscaled.
-        .. note::
-            :meth:`unscale_` does not incur a CPU-GPU sync.
-        .. warning::
-            :meth:`unscale_` should only be called once per optimizer per :meth:`step` call,
-            and only after all gradients for that optimizer's assigned parameters have been accumulated.
-            Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError.
-        .. warning::
-            :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute.
-        """
-        if not self._enabled:
-            return
-        self._check_scale_growth_tracker("unscale_")
-        optimizer_state = self._per_optimizer_states[id(optimizer)]
-        if optimizer_state["stage"] is OptState.UNSCALED:
-            raise RuntimeError(
-                "unscale_() has already been called on this optimizer since the last update().")
-        elif optimizer_state["stage"] is OptState.STEPPED:
-            raise RuntimeError("unscale_() is being called after step().")
-        # FP32 division can be imprecise for certain compile options, so we carry out the reciprocal in FP64.
-        assert self._scale is not None
-        inv_scale = self._scale.double().reciprocal().float()
-        found_inf = torch.full(
-            (1,), 0.0, dtype=torch.float32, device=self._scale.device)
-        optimizer_state["found_inf_per_device"] = self._unscale_grads_(
-            optimizer, inv_scale, found_inf, False)
-        optimizer_state["stage"] = OptState.UNSCALED
-    def _maybe_opt_step(self, optimizer, optimizer_state, *args, **kwargs):
-        # Run optimizer.step(*args, **kwargs) only when every recorded found_inf value sums to zero;
-        # otherwise the step is skipped and None is returned.
-        retval = None
-        # .item() reads each per-device found_inf tensor back as a Python number.
-        if not sum(v.item() for v in optimizer_state["found_inf_per_device"].values()):
-            retval = optimizer.step(*args, **kwargs)
-        return retval
-    def step(self, optimizer, *args, **kwargs):
-        """
-        :meth:`step` carries out the following two operations:
-        1.  Internally invokes ``unscale_(optimizer)`` (unless :meth:`unscale_` was explicitly called for ``optimizer``
-            earlier in the iteration).  As part of the :meth:`unscale_`, gradients are checked for infs/NaNs.
-        2.  If no inf/NaN gradients are found, invokes ``optimizer.step()`` using the unscaled
-            gradients.  Otherwise, ``optimizer.step()`` is skipped to avoid corrupting the params.
-        ``*args`` and ``**kwargs`` are forwarded to ``optimizer.step()``.
-        Returns the return value of ``optimizer.step(*args, **kwargs)``.
-        Args:
-            optimizer (torch.optim.Optimizer):  Optimizer that applies the gradients.
-            args:  Any arguments.
-            kwargs:  Any keyword arguments.
-        .. warning::
-            Closure use is not currently supported.
-        """
-        if (not self._enabled):
-            return optimizer.step(*args, **kwargs)
-        if "closure" in kwargs:
-            raise RuntimeError(
-                "Closure use is not currently supported if GradScaler is enabled.")
-        self._check_scale_growth_tracker("step")
-        optimizer_state = self._per_optimizer_states[id(optimizer)]
-        if optimizer_state["stage"] is OptState.STEPPED:
-            raise RuntimeError(
-                "step() has already been called since the last update().")
-        retval = None
-        if (hasattr(optimizer, "_step_supports_amp_scaling") and optimizer._step_supports_amp_scaling):
-            # This optimizer has customized scale-handling logic, so we can call optimizer.step() directly.
-            # The contract with custom optimizers is that their step() should accept an additional,
-            # optional grad_scaler kwarg.  We append self to the kwargs so the custom optimizer has full information:
-            # it can query its own state, invoke unscale_ on itself, etc
-            retval = optimizer.step(*args, **dict(kwargs, grad_scaler=self))
-            optimizer_state["stage"] = OptState.STEPPED
-            return retval
-        if optimizer_state["stage"] is OptState.READY:
-            self.unscale_(optimizer)
-        assert len(optimizer_state["found_inf_per_device"]
-                   ) > 0, "No inf checks were recorded for this optimizer."
-        retval = self._maybe_opt_step(
-            optimizer, optimizer_state, *args, **kwargs)
-        optimizer_state["stage"] = OptState.STEPPED
-        return retval
-    def update(self, new_scale=None):
-        """
-        Updates the scale factor.
-        If any optimizer steps were skipped the scale is multiplied by ``backoff_factor``
-        to reduce it. If ``growth_interval`` unskipped iterations occurred consecutively,
-        the scale is multiplied by ``growth_factor`` to increase it.
-        Passing ``new_scale`` sets the new scale value manually. (``new_scale`` is not
-        used directly, it's used to fill GradScaler's internal scale tensor. So if
-        ``new_scale`` was a tensor, later in-place changes to that tensor will not further
-        affect the scale GradScaler uses internally.)
-        Args:
-            new_scale (float or :class:`torch.cuda.FloatTensor`, optional, default=None):  New scale factor.
-        .. warning::
-            :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
-            been invoked for all optimizers used this iteration.
-        """
-        if not self._enabled:
-            return
-        _scale, _growth_tracker = self._check_scale_growth_tracker("update")
-        if new_scale is not None:
-            # Accept a new user-defined scale.
-            if isinstance(new_scale, float):
-                self._scale.fill_(new_scale)  # type: ignore[union-attr]
-            else:
-                reason = "new_scale should be a float or a 1-element torch.cuda.FloatTensor with requires_grad=False."
-                # type: ignore[attr-defined]
-                assert isinstance(new_scale, torch.cuda.FloatTensor), reason
-                assert new_scale.numel() == 1, reason
-                assert new_scale.requires_grad is False, reason
-                self._scale.copy_(new_scale)  # type: ignore[union-attr]
-        else:
-            # Consume shared inf/nan data collected from optimizers to update the scale.
-            # If all found_inf tensors are on the same device as self._scale, this operation is asynchronous.
-            found_infs = [found_inf.to(device=_scale.device, non_blocking=True)
-                          for state in self._per_optimizer_states.values()
-                          for found_inf in state["found_inf_per_device"].values()]
-            assert len(
-                found_infs) > 0, "No inf checks were recorded prior to update."
-            found_inf_combined = found_infs[0]
-            if len(found_infs) > 1:
-                for i in range(1, len(found_infs)):
-                    found_inf_combined += found_infs[i]
-            torch._amp_update_scale_(_scale,
-                                     _growth_tracker,
-                                     found_inf_combined,
-                                     self._growth_factor,
-                                     self._backoff_factor,
-                                     self._growth_interval)
-        # To prepare for next iteration, clear the data collected from optimizers this iteration.
-        self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
-    def _get_scale_async(self):
-        return self._scale
-    def get_scale(self):
-        """
-        Returns a Python float containing the current scale, or 1.0 if scaling is disabled.
-        .. warning::
-            :meth:`get_scale` incurs a CPU-GPU sync.
-        """
-        if self._enabled:
-            return self._init_scale if self._scale is None else self._get_scale_async().item()
-        else:
-            return 1.0
-    def get_growth_factor(self):
-        r"""
-        Returns a Python float containing the scale growth factor.
-        """
-        return self._growth_factor
-    def set_growth_factor(self, new_factor):
-        r"""
-        Args:
-            new_factor (float):  Value to use as the new scale growth factor.
-        """
-        self._growth_factor = new_factor
-    def get_backoff_factor(self):
-        r"""
-        Returns a Python float containing the scale backoff factor.
-        """
-        return self._backoff_factor
-    def set_backoff_factor(self, new_factor):
-        r"""
-        Args:
-            new_factor (float):  Value to use as the new scale backoff factor.
-        """
-        self._backoff_factor = new_factor
-    def get_growth_interval(self):
-        r"""
-        Returns a Python int containing the growth interval.
-        """
-        return self._growth_interval
-    def set_growth_interval(self, new_interval):
-        r"""
-        Args:
-            new_interval (int):  Value to use as the new growth interval.
-        """
-        self._growth_interval = new_interval
-    def _get_growth_tracker(self):
-        # Current growth-tracker value as a Python number: the tensor's value once it has been
-        # lazily created, otherwise the pending initial value.  Returns 0 when scaling is disabled.
-        if self._enabled:
-            return self._init_growth_tracker if self._growth_tracker is None else self._growth_tracker.item()
-        else:
-            return 0
-    def is_enabled(self):
-        r"""
-        Returns a bool indicating whether this instance is enabled.
-        """
-        return self._enabled
-    def state_dict(self):
-        r"""
-        Returns the state of the scaler as a :class:`dict`.  It contains five entries:
-        * ``"scale"`` - a Python float containing the current scale
-        * ``"growth_factor"`` - a Python float containing the current growth factor
-        * ``"backoff_factor"`` - a Python float containing the current backoff factor
-        * ``"growth_interval"`` - a Python int containing the current growth interval
-        * ``"_growth_tracker"`` - a Python int containing the number of recent consecutive unskipped steps.
-        If this instance is not enabled, returns an empty dict.
-        .. note::
-           If you wish to checkpoint the scaler's state after a particular iteration, :meth:`state_dict`
-           should be called after :meth:`update`.
-        """
-        return {"scale": self.get_scale(),
-                "growth_factor": self._growth_factor,
-                "backoff_factor": self._backoff_factor,
-                "growth_interval": self._growth_interval,
-                "_growth_tracker": self._get_growth_tracker()} if self._enabled else {}
-    def load_state_dict(self, state_dict):
-        r"""
-        Loads the scaler state.  If this instance is disabled, :meth:`load_state_dict` is a no-op.
-        Args:
-           state_dict(dict): scaler state.  Should be an object returned from a call to :meth:`state_dict`.
-        """
-        if not self._enabled:
-            return
-        if len(state_dict) == 0:
-            raise RuntimeError("The source state dict is empty, possibly because it was saved "
-                               "from a disabled instance of GradScaler.")
-        self._init_scale = state_dict["scale"]
-        if self._scale is not None:
-            self._scale.fill_(state_dict["scale"])
-        self._growth_factor = state_dict["growth_factor"]
-        self._backoff_factor = state_dict["backoff_factor"]
-        self._growth_interval = state_dict["growth_interval"]
-        self._init_growth_tracker = state_dict["_growth_tracker"]
-        if self._growth_tracker is not None:
-            self._growth_tracker.fill_(state_dict["_growth_tracker"])
-    def __getstate__(self):
-        state = self.__dict__.copy()
-        if self._enabled:
-            assert len(self._per_optimizer_states) == 0, "A GradScaler instance may only be pickled at the beginning "\
-                                                         "of an iteration, or at the end after scaler.update()."
-            # Pickling _scale and _growth_tracker Tensors directly triggers
-            # "warnings.warn("pickle support for Storage will be removed in 1.5..."
-            # so instead, we set the unpickled instance up to reinitialize them lazily.
-            state['_init_scale'] = self.get_scale()
-            state['_init_growth_tracker'] = self._get_growth_tracker()
-            state['_scale'] = None
-            state['_growth_tracker'] = None
-        return state
-    def __setstate__(self, state):
-        self.__dict__.update(state)
-    def _check_inf_per_device(self, optimizer):
-        # Runs the inf/NaN check over `optimizer`'s gradients through _unscale_grads_, stores the
-        # resulting {device: found_inf} dict in this optimizer's state and returns it.
-        _scale, _ = self._check_scale_growth_tracker("_check_inf_per_device")
-        # An inverse scale of 1.0 is passed, and FP16 gradients are allowed (last argument True).
-        # NOTE(review): with a factor of 1.0 the gradient values are presumably left unchanged -- confirm.
-        dummy_inv_scale = torch.full(
-            (1,), 1.0, dtype=torch.float32, device=_scale.device)
-        found_inf = torch.full(
-            (1,), 0.0, dtype=torch.float32, device=_scale.device)
-        self._per_optimizer_states[id(optimizer)]["found_inf_per_device"] = \
-            self._unscale_grads_(optimizer, dummy_inv_scale, found_inf, True)
-        return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]
-    def _found_inf_per_device(self, optimizer):
-        # Returns the found_inf dict recorded for `optimizer`.  _per_optimizer_states is a
-        # defaultdict, so looking up an unseen optimizer creates a fresh READY entry with an empty dict.
-        return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]
--- a/colossalai/amp/torch_amp/torch_amp.py
+++ b/colossalai/amp/torch_amp/torch_amp.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-import torch.nn as nn
-import torch.cuda.amp as torch_amp
-from torch import Tensor
-from torch.nn.modules.loss import _Loss
-from torch.optim import Optimizer
-from ._grad_scaler import GradScaler
-from colossalai.nn.optimizer import ColossalaiOptimizer
-from colossalai.utils import clip_grad_norm_fp32
-class TorchAMPOptimizer(ColossalaiOptimizer):
-    """A wrapper class which integrates PyTorch AMP with an optimizer
-    :param optim: A normal optimizer like Adam or SGD
-    :param args: Args used to initialize gradient scaler
-    :param kwargs: Kwargs used to initialize gradient scaler
-    :type optim: torch.optim.Optimizer
-    """
-    def __init__(self, optim: Optimizer, *args, **kwargs):
-        super().__init__(optim)
-        # All extra positional/keyword arguments are forwarded to GradScaler.
-        self.scaler = GradScaler(*args, **kwargs)
-    def backward(self, loss: Tensor):
-        """Backward with torch amp gradient scaler
-        :param loss: Loss computed by a loss function
-        :type loss: torch.Tensor
-        """
-        # Scale the loss first so that backward() runs on the scaled loss.
-        self.scaler.scale(loss).backward()
-    def step(self):
-        """Update the parameters of the model
-        """
-        # The scaler steps the wrapped optimizer, then its scale factor is updated.
-        self.scaler.step(self.optim)
-        self.scaler.update()
-    def clip_grad_norm(self, model: nn.Module, max_norm: float):
-        """Apply gradient clipping to the model parameters
-        :param model: Your model object
-        :type model: torch.nn.Module
-        :param max_norm: Max norm value for gradient clipping
-        :type max_norm: float
-        """
-        # Nothing happens unless max_norm is positive.
-        if max_norm > 0.0:
-            # Gradients are unscaled before clipping so that max_norm applies to unscaled values.
-            self.scaler.unscale_(self.optim)
-            clip_grad_norm_fp32(model.parameters(), max_norm)
-class TorchAMPModel(nn.Module):
-    """A wrapper class for a model object which executes forward with values automatically
-    cast to fp16
-    """
-    def __init__(self, model: nn.Module) -> None:
-        super().__init__()
-        self.model = model
-    # The wrapped model's forward runs under torch.cuda.amp.autocast.
-    @torch_amp.autocast()
-    def forward(self, *args, **kwargs):
-        # Arguments are passed through to the wrapped model unchanged.
-        return self.model(*args, **kwargs)
-class TorchAMPLoss(nn.Module):
-    """A wrapper class for a criterion object which computes the loss in mixed-precision context
-    :param loss: A loss function object
-    :type loss: torch.nn.modules.loss._Loss
-    """
-    def __init__(self, loss: _Loss):
-        super().__init__()
-        self.loss = loss
-    # The wrapped criterion runs under torch.cuda.amp.autocast.
-    @torch_amp.autocast()
-    def forward(self, *args, **kwargs):
-        # Arguments are passed through to the wrapped criterion unchanged.
-        return self.loss(*args, **kwargs)
--- a/colossalai/builder/__init__.py
+++ b/colossalai/builder/__init__.py
-from .builder import (build_schedule, build_lr_scheduler, build_model,
-                      build_optimizer, build_layer, build_loss, build_hooks,
-                      build_dataset, build_transform, build_data_sampler,
-                      build_gradient_handler, build_ophooks)
-from .pipeline import build_pipeline_model, build_pipeline_model_from_cfg
-__all__ = [
-    'build_schedule', 'build_lr_scheduler', 'build_model', 'build_optimizer',
-    'build_layer', 'build_loss', 'build_hooks', 'build_dataset',
-    'build_transform', 'build_data_sampler', 'build_gradient_handler',
-    'build_pipeline_model', 'build_pipeline_model_from_cfg', 'build_ophooks'
-]
--- a/colossalai/builder/__pycache__/__init__.cpython-36.pyc
+++ b/colossalai/builder/__pycache__/__init__.cpython-36.pyc
--- a/colossalai/builder/__pycache__/__init__.cpython-37.pyc
+++ b/colossalai/builder/__pycache__/__init__.cpython-37.pyc
--- a/colossalai/builder/__pycache__/builder.cpython-36.pyc
+++ b/colossalai/builder/__pycache__/builder.cpython-36.pyc
--- a/colossalai/builder/__pycache__/builder.cpython-37.pyc
+++ b/colossalai/builder/__pycache__/builder.cpython-37.pyc
--- a/colossalai/builder/__pycache__/pipeline.cpython-36.pyc
+++ b/colossalai/builder/__pycache__/pipeline.cpython-36.pyc
--- a/colossalai/builder/__pycache__/pipeline.cpython-37.pyc
+++ b/colossalai/builder/__pycache__/pipeline.cpython-37.pyc
--- a/colossalai/builder/builder.py
+++ b/colossalai/builder/builder.py
-#!/usr/bin/env python
-# -*- encoding: utf-8 -*-
-import inspect
-from collections.abc import Iterable
-from colossalai.registry import *
-def build_from_config(module, config: dict):
-    """Returns an object of :class:`module` constructed from `config`.
-    :param module: A python or user-defined class
-    :type module: class
-    :param config: A python dict containing information used in the construction
-        of the return object
-    :type config: dict
-    :raises AssertionError: Raises an AssertionError if `module` is not a class
-    :return: An object of interest
-    :rtype: Object
-    """
-    assert inspect.isclass(module), 'module must be a class'
-    # Every key of `config` is passed to the constructor as a keyword argument.
-    return module(**config)
-def build_from_registry(config, registry: Registry):
-    """Returns an object constructed from `config`, the type of the object
-    is specified by `registry`.
-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.colossalai.context.Config`
-    :param registry: A registry specifying the type of the return object
-    :type registry: :class:`Registry`
-    :raises AssertionError: Raises an AssertionError if `registry` is not an object
-        of :class:`Registry` or `mod_type` in `config` is not found in `registry`
-    :raises Exception: Raises an Exception if an error occurred when building
-        from registry
-    :return: An object specified by `registry`
-    :rtype: Python object specified by `registry`
-    """
-    config_ = config.copy()  # keep the original config untouched
-    assert isinstance(
-        registry, Registry), f'Expected type Registry but got {type(registry)}'
-    mod_type = config_.pop('type')
-    assert registry.has(
-        mod_type), f'{mod_type} is not found in registry {registry.name}'
-    try:
-        obj = registry.get_module(mod_type)(**config_)
-    except Exception as e:
-        print(
-            f'An error occurred when building {mod_type} from registry {registry.name}',
-            flush=True)
-        raise e
-    return obj
-def build_layer(config):
-    """Returns a layer object of :class:`nn.Module` constructed from `config`.
-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`torch.nn.Module`
-    :rtype: :class:`torch.nn.Module`
-    """
-    return build_from_registry(config, LAYERS)
-def build_loss(config):
-    """Returns a loss function object of :class:`torch.autograd.Function` constructed
-    from `config`.
-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`torch.nn.modules.loss._Loss`
-    :rtype: :class:`torch.nn.modules.loss._Loss`
-    """
-    return build_from_registry(config, LOSSES)
-def build_model(config):
-    """Returns a model object of :class:`nn.Module` constructed from `config`.
-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`torch.nn.Module`
-    :rtype: :class:`torch.nn.Module`
-    """
-    return build_from_registry(config, MODELS)
-def build_dataset(config):
-    """Returns a dataset object of :class:`torch.utils.data.Dataset` constructed
-    from `config`.
-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`torch.utils.data.Dataset`
-    :rtype: :class:`torch.utils.data.Dataset`
-    """
-    return build_from_registry(config, DATASETS)
-def build_optimizer(config, model):
-    """Returns an optimizer object of :class:`torch.optim.Optimizer` constructed from `config`,
-    'model' and 'params'.
-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :param model: A model containing parameters for the optimizer
-    :type model: :class:`nn.Module`
-    :return: An object of :class:`torch.optim.Optimizer`
-    :rtype: :class:`torch.optim.Optimizer`
-    """
-    config_ = config.copy()
-    config_['params'] = model.parameters()
-    return build_from_registry(config_, OPTIMIZERS)
-def build_gradient_handler(config, model, optimizer):
-    """Returns a gradient handler object of :class:`BaseGradientHandler` constructed from `config`,
-    `model` and `optimizer`.
-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :param model: A model containing parameters for the gradient handler
-    :type model: :class:`nn.Module`
-    :param optimizer: An optimizer object containing parameters for the gradient handler
-    :type optimizer: :class:`torch.optim.Optimizer`
-    :return: An object of :class:`colossalai.engine.BaseGradientHandler`
-    :rtype: :class:`colossalai.engine.BaseGradientHandler`
-    """
-    config_ = config.copy()
-    config_['model'] = model
-    config_['optimizer'] = optimizer
-    return build_from_registry(config_, GRADIENT_HANDLER)
-def build_hooks(config, trainer):
-    """Returns a hook object of :class:`BaseHook` constructed from `config` and `trainer`.
-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :param trainer: A :class:`Trainer` object containing parameters for the hook
-    :type trainer: :class:`Trainer`
-    :return: An object of :class:`colossalai.trainer.hooks.BaseHook`
-    :rtype: :class:`colossalai.trainer.hooks.BaseHook`
-    """
-    config_ = config.copy()
-    config_['trainer'] = trainer
-    return build_from_registry(config_, HOOKS)
-def build_ophooks(config):
-    """Returns a hook object of :class:`BaseOpHook` constructed from `config`.
-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`colossalai.trainer.hooks.BaseOpHook`
-    :rtype: :class:`colossalai.trainer.hooks.BaseOpHook`
-    """
-    config_ = config.copy()
-    return build_from_registry(config_, OPHOOKS)
-def build_transform(config):
-    """Returns a transformation object of :class:`torchvision.transforms` constructed
-    from `config`.
-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`torchvision.transforms`
-    :rtype: :class:`torchvision.transforms`
-    """
-    return build_from_registry(config, TRANSFORMS)
-def build_data_sampler(config, dataset):
-    """Returns a data sampler object of :class:`colossalai.nn.data.sampler.BaseSampler`
-    constructed from `config`.
-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :param dataset: An object of :class:`torch.utils.data.Dataset` containing information
-        used in the construction of the return object
-    :type dataset: :class:`torch.utils.data.Dataset`
-    :return: An object of :class:`colossalai.utils.data_sampler.BaseSampler`
-    :rtype: :class:`colossalai.utils.data_sampler.BaseSampler`
-    """
-    config_ = config.copy()
-    config_['dataset'] = dataset
-    return build_from_registry(config_, DATA_SAMPLERS)
-def build_lr_scheduler(config, optimizer):
-    """Returns a learning rate scheduler object of :class:`torch.optim.lr_scheduler`
-    constructed from `config`, `optimizer`, `total_steps` and `num_steps_per_epoch`.
-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :param optimizer: An optimizer object containing parameters for the learning rate
-        scheduler
-    :type optimizer: :class:`torch.optim.Optimizer`
-    :return: An object of :class:`torch.optim.lr_scheduler`
-    :rtype: :class:`torch.optim.lr_scheduler`
-    """
-    config_ = config.copy()
-    config_['optimizer'] = optimizer
-    return build_from_registry(config_, LR_SCHEDULERS)
-def build_schedule(config):
-    """Returns a schedule of :class:`colossalai.engine.schedule.BaseSchedule`.
-    :param config: A python dict or a :class:`colossalai.context.Config` object
-        containing information used in the construction of the return object
-    :type config: dict or :class:`colossalai.context.Config`
-    :return: An object of :class:`colossalai.engine.schedule.BaseSchedule`
-    :rtype: :class:`colossalai.engine.schedule.BaseSchedule`
-    """
-    return build_from_registry(config, SCHEDULE)
--- a/colossalai/builder/pipeline.py
+++ b/colossalai/builder/pipeline.py
-import copy
-import heapq
-from colossalai.builder import build_model, build_layer
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.logging import get_dist_logger
-import torch.nn as nn
-def _binary_partition(weights, st, ed):
-    """Returns the binary partition position of `weights`, given the start
-    position `st` and the end position `ed`.
-    :param weights: A python list to be binary partitioned
-    :type weights: list
-    :param st: the start position of the binary partition
-    :type st: int
-    :param ed: the end postition of the binary partition
-    :type ed: int
-    :return: the binary partition position of `weights`
-    :rtype: int
-    """
-    w_sum = weights[ed - 1]
-    prefix = 0
-    if st > 0:
-        w_sum -= weights[st - 1]
-        prefix = weights[st - 1]
-    minimum = float("inf")
-    for idx in range(st + 1, ed):
-        front = weights[idx - 1] - prefix
-        diff = abs(w_sum - 2 * front)
-        if diff < minimum:
-            pos = idx
-            minimum = diff
-    return st, pos, ed
-def _heap_addition(weights, intervals, add_cnt):
-    """
-    """
-    def _heap_push(heap, st, ed):
-        value = weights[ed - 1]
-        if st > 0:
-            value -= weights[st - 1]
-        heapq.heappush(heap, (-value, st, ed))
-    ret_intervals = []
-    heap = []
-    for st, ed in intervals:
-        _heap_push(heap, st, ed)
-    while add_cnt > 0:
-        _, st, ed = heapq.heappop(heap)
-        if ed - st == 1:
-            ret_intervals.append((st, ed))
-        else:
-            l, m, r = _binary_partition(weights, st, ed)
-            _heap_push(heap, l, m)
-            _heap_push(heap, m, r)
-            add_cnt -= 1
-    while heap:
-        _, st, ed = heapq.heappop(heap)
-        ret_intervals.append((st, ed))
-    ret_intervals.sort()
-    return ret_intervals
-def _calc_partitions(weights, value):
-    prev = 0
-    prefix = 0
-    num_block = 0
-    intervals = []
-    for idx, w in enumerate(weights):
-        if weights[idx] - prefix > value:
-            intervals.append((prev, idx))
-            prev = idx
-            prefix = weights[idx - 1]
-            num_block += 1
-    intervals.append((prev, len(weights)))
-    return num_block + 1, intervals
-def _binary_search(weights, num):
-    length = len(weights)
-    prefix = [1 if w == 0 else w for w in weights]
-    for i in range(1, length):
-        prefix[i] += prefix[i - 1]
-    lower_bound = max(weights)
-    upper_bound = prefix[length - 1]
-    while upper_bound > lower_bound:
-        mid = (upper_bound + lower_bound) // 2
-        number, _ = _calc_partitions(prefix, mid)
-        if number <= num:
-            upper_bound = mid
-        else:
-            lower_bound = mid + 1
-    num_block, intervals = _calc_partitions(prefix, upper_bound)
-    if num_block < num:
-        intervals = _heap_addition(prefix, intervals, num - num_block)
-    return intervals
-def partition_uniform(num_items, pipeline_parallel_size, num_chunks):
-    assert num_items % num_chunks == 0, \
-        "Layer length should be divided by the number of chunks, otherwise parameter method is recomended"
-    logger = get_dist_logger()
-    parts = [[] for _ in range(pipeline_parallel_size)]
-    partition_items = num_items // num_chunks
-    for idx in range(num_chunks):
-        base_idx = idx * partition_items
-        chunk_size = partition_items // pipeline_parallel_size
-        left = pipeline_parallel_size - partition_items % pipeline_parallel_size
-        if chunk_size == 0:
-            logger.warning("Some nodes in Pipeline have no requests")
-        for p in range(pipeline_parallel_size):
-            st = base_idx
-            base_idx += chunk_size + (p >= left)
-            parts[p].append((st, base_idx))
-    return parts
-def partition_balanced(weights, pipeline_parallel_size, num_chunks):
-    num_total = pipeline_parallel_size * num_chunks
-    num_items = len(weights)
-    if num_items <= num_total:
-        return partition_uniform(num_items, pipeline_parallel_size, num_chunks)
-    intervals = _binary_search(weights, num_total)
-    current = 0
-    parts = [[] for _ in range(pipeline_parallel_size)]
-    for inter in intervals:
-        parts[current].append(inter)
-        current = (current + 1) % pipeline_parallel_size
-    return parts
-def count_layer_params(layers):
-    """Count the number of parameters in each layer
-    """
-    param_counts = [0] * len(layers)
-    for idx, cfg in enumerate(layers):
-        layer = build_layer(cfg)
-        params = filter(lambda p: p.requires_grad, layer.parameters())
-        param_counts[idx] = sum(p.numel() for p in params)
-    return param_counts
-def build_pipeline_model_from_cfg(config, num_chunks: int = 1, partition_method: str = 'parameter', verbose: bool = False):
-    """An intializer to split the model into different stages for pipeline parallelism.
-    An example for the model config is shown below. The class VisionTransformerFromConfig should
-    inherit colossalai.nn.model.ModelFromConfig to allow this initializer to build model from a sequence
-    of layer configurations.
-    model_config = dict(
-        type='VisionTransformerFromConfig',
-        embedding_cfg=dict(...),
-        ...
-    )
-    :param config: Configuration of the model
-    :type config: dict
-    :param num_chunks: The number of chunks you want to have on the current stage. This value should be 1
-                        in most cases unless you are using virutal pipeline parallelism.
-    :type num_chunks: int, optional
-    :param partition_method: This parameter determines how you want to split your model layers into stages,
-                                you can set it as 'layer' or 'parameter'
-    :type partition_method: str, optional
-    :param verbose: Whether to print the logs
-    :type verbose: bool, optional
-    """
-    ori_model = build_model(config)
-    layers = ori_model.layers_cfg
-    layer_length = len(layers)
-    logger = get_dist_logger()
-    if verbose:
-        logger.info(f"The total length of layers is {layer_length}", ranks=[0])
-    pipeline_parallel_size = gpc.get_world_size(ParallelMode.PIPELINE)
-    pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
-    method = partition_method.lower()
-    # Make a partition
-    if method == 'layer':
-        num_layers = len(layers)
-        parts = partition_uniform(num_layers, pipeline_parallel_size, num_chunks)
-    elif method == 'parameter':
-        param_counts = count_layer_params(layers)
-        # print_rank_0(param_counts)
-        parts = partition_balanced(param_counts, pipeline_parallel_size, num_chunks)
-    else:
-        raise ValueError("Method should be a pre-set string in [layer, parameter]")
-    # Display the partition
-    if verbose:
-        log_str = 'Layer allocation after partitioning: \n'
-        for stage in range(pipeline_parallel_size):
-            num_layers = 0
-            for st, ed in parts[stage]:
-                num_layers += ed - st
-            log_str += f'\n===== stage={stage}, layers={num_layers} =====\n'
-            for st, ed in parts[stage]:
-                for idx, layer in enumerate(layers[st: ed]):
-                    log_str += f'\t{idx + st:2d}: {layer}\n'
-        logger.info(log_str, ranks=[0])
-    # Save the partition
-    interval = parts[pipeline_rank]
-    models = []
-    for st, ed in interval:
-        model = copy.deepcopy(ori_model)
-        model.build_from_cfg(st, ed)
-        models.append(model)
-    return nn.ModuleList(models) if len(models) > 1 else models[0]
-def build_pipeline_model(layers: nn.Sequential, num_chunks: int = 1, verbose: bool = False):
-    """An intializer to split the model into different stages for pipeline parallelism.
-    Note that `layer` must be `torch.nn.Sequential`.
-    :param layers: Layers of model
-    :type layers: `torch.nn.Sequential`
-    :param num_chunks: The number of chunks you want to have on the current stage. This value should be 1
-                        in most cases unless you are using virutal pipeline parallelism.
-    :type num_chunks: int, optional
-    :param verbose: Whether to print the logs
-    :type verbose: bool, optional
-    """
-    pipeline_parallel_size = gpc.get_world_size(ParallelMode.PIPELINE)
-    pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
-    partitions = partition_uniform(len(layers), pipeline_parallel_size, num_chunks)
-    module_list = []
-    for start, end in partitions[pipeline_rank]:
-        module_list.append(nn.Sequential(*layers[start:end]))
-    if verbose:
-        logger = get_dist_logger()
-        logger.info(f'Total {len(layers)} layers', ranks=[0])
-        for rank, part in enumerate(partitions):
-            log_str = f'===== stage={rank} =====\n'
-            for chunk, (start, end) in enumerate(part):
-                log_str += f'===== chunk={chunk}, layer=[{start}-{end}] =====\n'
-                log_str += '\n'.join([str(layer) for layer in layers[start:end]]) + '\n'
-            logger.info(log_str, ranks=[0])
-    return nn.ModuleList(module_list) if len(module_list) > 1 else module_list[0]
--- a/colossalai/communication/__init__.py
+++ b/colossalai/communication/__init__.py
-from .collective import all_gather, reduce_scatter, all_reduce, broadcast, reduce
-from .p2p import (send_forward, send_forward_recv_forward,
-                  send_backward_recv_forward, send_backward,
-                  send_backward_recv_backward, send_forward_recv_backward,
-                  send_forward_backward_recv_forward_backward, recv_forward,
-                  recv_backward)
-from .ring import ring_forward
-from .utils import send_tensor_meta, recv_tensor_meta
-__all__ = [
-    'all_gather', 'reduce_scatter', 'all_reduce', 'broadcast', 'reduce',
-    'send_forward', 'send_forward_recv_forward',
-    'send_forward_backward_recv_forward_backward', 'send_backward',
-    'send_backward_recv_backward', 'send_backward_recv_forward',
-    'send_forward_recv_backward', 'recv_backward', 'recv_forward',
-    'ring_forward', 'send_tensor_meta', 'recv_tensor_meta',
-]
--- a/colossalai/communication/__pycache__/__init__.cpython-36.pyc
+++ b/colossalai/communication/__pycache__/__init__.cpython-36.pyc
--- a/colossalai/communication/__pycache__/__init__.cpython-37.pyc
+++ b/colossalai/communication/__pycache__/__init__.cpython-37.pyc
--- a/colossalai/communication/__pycache__/collective.cpython-36.pyc
+++ b/colossalai/communication/__pycache__/collective.cpython-36.pyc
--- a/colossalai/communication/__pycache__/collective.cpython-37.pyc
+++ b/colossalai/communication/__pycache__/collective.cpython-37.pyc
--- a/colossalai/communication/__pycache__/p2p.cpython-36.pyc
+++ b/colossalai/communication/__pycache__/p2p.cpython-36.pyc
--- a/colossalai/communication/__pycache__/p2p.cpython-37.pyc
+++ b/colossalai/communication/__pycache__/p2p.cpython-37.pyc
--- a/colossalai/communication/__pycache__/ring.cpython-36.pyc
+++ b/colossalai/communication/__pycache__/ring.cpython-36.pyc
--- a/colossalai/communication/__pycache__/ring.cpython-37.pyc
+++ b/colossalai/communication/__pycache__/ring.cpython-37.pyc