Merge branch 'main' of https://github.com/hpcaitech/ColossalAI

7bc5a8e3 · zhuwenwen · e6748d82 · 0f785cb1 · 7bc5a8e3 · 7bc5a8e3
Commit 7bc5a8e3 authored May 05, 2023 by zhuwenwen
20 changed files
--- a/colossalai/amp/naive_amp/_utils.py
+++ b/colossalai/amp/naive_amp/_utils.py
+from typing import List
+
+from torch import Tensor
+
+
+def has_inf_or_nan(tensor):
+    """Report whether a tensor contains ``inf`` or ``nan`` values.
+
+    The tensor is cast to fp32, summed, and the resulting Python float is
+    inspected, so a sum that is itself infinite or NaN is reported as well.
+
+    Args:
+        tensor (:class:`torch.Tensor`): a torch tensor object
+
+    Returns:
+        bool: True if the summed value is infinite or NaN, False otherwise.
+
+    Raises:
+        RuntimeError: re-raised when the error raised while summing is not a
+            "value cannot be converted" failure.
+    """
+    try:
+        # The .float() cast costs an extra copy for half tensors, but it keeps the
+        # reduction in fp32 when .sum() returns a tensor of the input dtype.
+        total = float(tensor.float().sum())
+    except RuntimeError as err:
+        # Only a failed scalar conversion is treated as overflow;
+        # any other RuntimeError must keep propagating.
+        if "value cannot be converted" in err.args[0]:
+            return True
+        raise
+
+    is_infinite = total in (float('inf'), -float('inf'))
+    is_nan = total != total    # NaN is the only float that differs from itself
+    return is_infinite or is_nan
+
+
+def zero_gard_by_list(tensor_list: List[Tensor], set_to_none: bool = True) -> None:
+    """Reset the gradient of every tensor in ``tensor_list``.
+
+    Args:
+        tensor_list (List[Tensor]): tensors whose ``.grad`` should be cleared
+        set_to_none (bool): if True, drop the gradient by assigning ``None``;
+            otherwise zero the existing gradient tensor in place
+
+    Note: copied from torch.optim.optimizer.
+    """
+    for tensor in tensor_list:
+        grad = tensor.grad
+        if grad is None:
+            continue
+        if set_to_none:
+            tensor.grad = None
+            continue
+        # Cut the gradient loose from any autograd graph before zeroing it in place.
+        if grad.grad_fn is None:
+            grad.requires_grad_(False)
+        else:
+            grad.detach_()
+        grad.zero_()
--- a/colossalai/amp/naive_amp/grad_scaler/__init__.py
+++ b/colossalai/amp/naive_amp/grad_scaler/__init__.py
+from .base_grad_scaler import BaseGradScaler
+from .constant_grad_scaler import ConstantGradScaler
+from .dynamic_grad_scaler import DynamicGradScaler
+
+__all__ = ['BaseGradScaler', 'ConstantGradScaler', 'DynamicGradScaler']
--- a/colossalai/amp/naive_amp/grad_scaler/base_grad_scaler.py
+++ b/colossalai/amp/naive_amp/grad_scaler/base_grad_scaler.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+from abc import ABC, abstractmethod
+from typing import Dict
+
+import torch
+from torch import Tensor
+
+from colossalai.logging import get_dist_logger
+
+__all__ = ['BaseGradScaler']
+
+
+class BaseGradScaler(ABC):
+    """Abstract base class for gradient scalers.
+
+    The loss scale is stored as a one-element CUDA float tensor; how it changes
+    over time is left to subclasses through :meth:`update`.
+
+    Args:
+        initial_scale (float): the initial loss scale, must be positive
+        verbose (bool): whether to log messages
+    """
+
+    def __init__(self, initial_scale: float, verbose: bool):
+        assert initial_scale > 0
+        self._verbose = verbose
+        self._scale = torch.cuda.FloatTensor([initial_scale])
+
+        # The logger is only created when messages will actually be emitted.
+        if verbose:
+            self._logger = get_dist_logger()
+
+    @property
+    def scale(self) -> Tensor:
+        """The current loss scale tensor.
+        """
+        return self._scale
+
+    @property
+    def inv_scale(self) -> Tensor:
+        """The reciprocal of the loss scale, computed in fp64 and returned as fp32.
+        """
+        as_double = self._scale.double()
+        return as_double.reciprocal().float()
+
+    def state_dict(self) -> Dict:
+        """Return the scaler state as a dict holding the ``'scale'`` entry.
+        """
+        return {'scale': self.scale}
+
+    def load_state_dict(self, state_dict: Dict) -> None:
+        """Restore the scaler state from a dict object.
+
+        Args:
+            state_dict (dict): the states of the gradient scaler; its ``'scale'``
+                entry replaces the current loss scale
+        """
+        restored_scale = state_dict['scale']
+        self._scale = restored_scale
+
+    @abstractmethod
+    def update(self, overflow: bool) -> None:
+        """Update the loss scale.
+
+        Args:
+            overflow (bool): whether overflow occurs
+        """
+
+    def log(self, message, *args, **kwargs):
+        """Emit an info-level message when the scaler is verbose.
+
+        Args:
+            message (str): the message to log
+            *args: positional arguments for :class:`colossalai.logging.DistributedLogger`
+            **kwargs: key-word arguments for :class:`colossalai.logging.DistributedLogger`
+        """
+        if not self._verbose:
+            return
+        self._logger.info(message, *args, **kwargs)
--- a/colossalai/amp/naive_amp/grad_scaler/constant_grad_scaler.py
+++ b/colossalai/amp/naive_amp/grad_scaler/constant_grad_scaler.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+from .base_grad_scaler import BaseGradScaler
+
+__all__ = ['ConstantGradScaler']
+
+
+class ConstantGradScaler(BaseGradScaler):
+    """A gradient scaler which keeps the loss scale fixed at its initial value.
+
+    Args:
+        initial_scale (float): the constant loss scale
+        verbose (bool): whether to log messages
+    """
+
+    # ``initial_scale`` is annotated as float to agree with the docstring and with
+    # the ``BaseGradScaler.__init__`` signature it is forwarded to.
+    def __init__(self, initial_scale: float, verbose: bool):
+        super().__init__(initial_scale, verbose)
+        self.log(f"Constant Gradient Scaler is initialized with scale {self.scale}", ranks=[0])
+
+    def update(self, overflow: bool) -> None:
+        """Do nothing to keep the loss scale constant.
+
+        Args:
+            overflow (bool): whether overflow occurs (ignored)
+        """
+        pass
--- a/colossalai/amp/naive_amp/grad_scaler/dynamic_grad_scaler.py
+++ b/colossalai/amp/naive_amp/grad_scaler/dynamic_grad_scaler.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+from typing import Optional
+
+import torch
+
+from .base_grad_scaler import BaseGradScaler
+
+__all__ = ['DynamicGradScaler']
+
+
+class DynamicGradScaler(BaseGradScaler):
+    """A gradient scaler whose loss scale adapts to overflow events.
+
+    The scale is multiplied by ``backoff_factor`` once enough overflows have been
+    counted and by ``growth_factor`` after ``growth_interval`` consecutive
+    overflow-free steps, optionally clamped to ``[min_scale, max_scale]``.
+
+    Args:
+        initial_scale (float): the initial loss scale, defaults to 2**16
+        growth_factor (float): the multiplication factor for increasing loss scale, defaults to 2
+        backoff_factor (float): the multiplication factor for decreasing loss scale, defaults to 0.5
+        growth_interval (int): the number of steps to increase loss scale when no overflow occurs, defaults to 1000
+        min_scale (float): the minimum loss scale, defaults to None
+        max_scale (float): the maximum loss scale, defaults to None
+        hysteresis (int):  the number of overflows before decreasing loss scale, defaults to 2
+        verbose (bool): whether to log messages, defaults to False
+    """
+
+    def __init__(self,
+                 initial_scale: float = 2**16,
+                 growth_factor: float = 2,
+                 backoff_factor: float = 0.5,
+                 growth_interval: int = 1000,
+                 min_scale: Optional[float] = None,
+                 max_scale: Optional[float] = None,
+                 hysteresis: int = 2,
+                 verbose: bool = False):
+        super().__init__(initial_scale, verbose)
+
+        # A falsy bound (None or 0) means "no bound on that side".
+        self._min_scale = torch.cuda.FloatTensor([min_scale]) if min_scale else None
+        self._max_scale = torch.cuda.FloatTensor([max_scale]) if max_scale else None
+
+        self._growth_factor = growth_factor
+        self._backoff_factor = backoff_factor
+        self._growth_interval = growth_interval
+        self._hysteresis = hysteresis
+
+        # step counters start from zero
+        self._growth_step = 0
+        self._hysteresis_step = 0
+
+        self._sanity_checks()
+
+    def _sanity_checks(self) -> None:
+        """Validate the constructor arguments with assertions.
+        """
+        lower = self._min_scale
+        if lower:
+            assert lower > 0, 'The minimum gradient scale cannot be zero or negative'
+            assert lower <= self._scale, 'The minimum gradient scale cannot be greater than the current scale'
+
+        upper = self._max_scale
+        if upper:
+            assert upper > 0, 'The maximum gradient scale cannot be zero or negative'
+            assert upper >= self._scale, 'The maximum gradient scale cannot be smaller than the current scale'
+
+        assert self._growth_factor > 1, 'The growth factor cannot be equal or smaller than 1'
+        assert 0 < self._backoff_factor < 1, 'The backoff factor must be between 0 and 1'
+        assert self._hysteresis >= 0, 'The hysteresis cannot be negative'
+
+    def update(self, overflow: bool) -> None:
+        """Update the loss scale after a training step.
+
+        Args:
+            overflow (bool): whether overflow occurs
+        """
+        if not overflow:
+            self._growth_step += 1
+            if self._growth_step != self._growth_interval:
+                return
+            # a full overflow-free interval has elapsed: reset both counters and grow
+            self._growth_step = 0
+            self._hysteresis_step = 0
+            self._grow_scale()
+            self.log(
+                f"No overflow for consecutive {self._growth_interval} steps, "
+                f"the loss scale is adjusted to {self.scale.item()}",
+                ranks=[0])
+            return
+
+        # overflow: count it and restart the growth interval
+        self._hysteresis_step += 1
+        self._growth_step = 0
+        if self._hysteresis_step >= self._hysteresis:
+            self._backoff_scale()
+            self.log(f"Overflow occurs, the loss scale is adjusted to {self.scale.item()}", ranks=[0])
+
+    def _backoff_scale(self) -> None:
+        """Shrink the loss scale by the backoff factor, honouring the minimum.
+        """
+        shrunk = self._scale * self._backoff_factor
+        if self._min_scale:
+            shrunk = torch.max(shrunk, self._min_scale)
+        self._scale = shrunk
+
+    def _grow_scale(self) -> None:
+        """Enlarge the loss scale by the growth factor, honouring the maximum.
+        """
+        grown = self._scale * self._growth_factor
+        if self._max_scale:
+            grown = torch.min(grown, self._max_scale)
+        self._scale = grown
+
+    def state_dict(self):
+        """Return the scale together with the growth, backoff and hysteresis settings.
+        """
+        return {
+            'scale': self._scale,
+            'growth_factor': self._growth_factor,
+            'backoff_factor': self._backoff_factor,
+            'hysteresis': self._hysteresis,
+        }
+
+    def load_state_dict(self, state_dict):
+        """Restore the entries written by :meth:`state_dict`.
+
+        The scale tensor is moved to the current CUDA device on load.
+        """
+        target_device = torch.cuda.current_device()
+        self._scale = state_dict['scale'].cuda(target_device)
+        self._growth_factor = state_dict['growth_factor']
+        self._backoff_factor = state_dict['backoff_factor']
+        self._hysteresis = state_dict['hysteresis']
--- a/colossalai/amp/naive_amp/naive_amp.py
+++ b/colossalai/amp/naive_amp/naive_amp.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+from typing import Any
+
+import torch
+import torch.distributed as dist
+import torch.nn as nn
+from torch import Tensor
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+from torch.distributed import ReduceOp
+from torch.optim import Optimizer
+
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.nn.optimizer import ColossalaiOptimizer
+
+from ._fp16_optimizer import FP16Optimizer
+
+
+class NaiveAMPOptimizer(ColossalaiOptimizer):
+    """Optimizer wrapper that runs a regular optimizer through ``FP16Optimizer``.
+
+    All arguments after ``optim`` are forwarded unchanged to ``FP16Optimizer``.
+
+    Args:
+        optim (torch.optim.Optimizer): A normal optimizer like Adam or SGD.
+        grad_scaler (BaseGradScaler): grad scaler for gradient chose in
+                                      ``constant_grad_scaler`` or ``dynamic_grad_scaler``.
+        clip_grad_norm (float, optional): clip gradients with this global L2 norm. Default 0.
+        verbose (bool, optional): if set to `True`, will print debug info. Default False.
+
+    Note:
+        clipping is ignored if ``clip_grad_norm`` equals 0.
+    """
+
+    def __init__(self, optim: Optimizer, *args, **kwargs):
+        super().__init__(FP16Optimizer(optim, *args, **kwargs))
+
+    def backward(self, loss: Tensor):
+        """Delegate the backward pass for ``loss`` to the wrapped fp16 optimizer."""
+        self.optim.backward(loss)
+
+    def step(self):
+        """Run one step of the wrapped fp16 optimizer and return its result."""
+        return self.optim.step()
+
+    def clip_grad_norm(self, model: nn.Module, max_norm: float):
+        """Accept only the ``max_norm`` the wrapped optimizer already uses.
+
+        Raises:
+            RuntimeError: if ``max_norm`` differs from the wrapped optimizer's ``max_norm``.
+        """
+        if self.optim.max_norm != max_norm:
+            raise RuntimeError("NaiveAMP optimizer has clipped gradients during optimizer.step(). "
+                               "If you have supplied clip_grad_norm in the amp_config, "
+                               "executing the method clip_grad_norm is not allowed.")
+
+
+class NaiveAMPModel(nn.Module):
+    r"""A wrapper class for model to cast the model into fp16 and
+    automatically cast the input and output
+
+    The wrapped model is converted with ``.half()``. Float32 tensor inputs are cast to
+    fp16 before the forward pass and, when ``output_to_fp32`` is set, fp16 outputs are
+    cast back to fp32.
+
+    Args:
+        model (torch.nn.Module): torch.nn.Module to be wrapped.
+        output_to_fp32 (bool, optional): Whether cast output of this module into fp32. (Default: True)
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this module.
+                                                                  (Default: ``ParallelMode.DATA``)
+        sync_buffer (bool, optional): whether to synchronize buffer. (Default: True)
+
+    Note:
+        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+    """
+
+    def __init__(self,
+                 model: nn.Module,
+                 output_to_fp32: bool = True,
+                 parallel_mode: ParallelMode = ParallelMode.DATA,
+                 sync_buffer: bool = True):
+        super().__init__()
+        self.model = model.half()
+        self._output_to_fp32 = output_to_fp32
+        self._sync_buf = sync_buffer
+
+        if gpc.is_initialized(parallel_mode) and gpc.get_world_size(parallel_mode) > 1:
+            self._process_group = gpc.get_group(parallel_mode)
+            self._world_size = gpc.get_world_size(parallel_mode)
+        else:
+            # a single rank has nothing to synchronise with
+            self._process_group = None
+            self._world_size = 1
+            self._sync_buf = False
+        self._first_eval_run = False
+
+    @property
+    def sync_buffer(self):
+        """Whether buffers are all-reduced before the forward pass."""
+        return self._sync_buf
+
+    @sync_buffer.setter
+    def sync_buffer(self, state: bool):
+        self._sync_buf = state
+
+    def _convert_to_fp16(self, input_: Any):
+        """Cast ``input_`` to fp16 if it is a float32 tensor; return anything else unchanged."""
+        if isinstance(input_, Tensor) and input_.dtype == torch.float32:
+            input_ = input_.half()
+        return input_
+
+    def _convert_to_fp32(self, input_: Any):
+        """Cast ``input_`` to fp32 if it is a float16 tensor; return anything else unchanged."""
+        if isinstance(input_, Tensor) and input_.dtype == torch.float16:
+            input_ = input_.float()
+        return input_
+
+    def _reduce_module_buffer(self):
+        """
+        All-reduce the buffers (e.g. running stats of batch normalization) across
+        data parallel ranks so that all the ranks will produce consistent results
+        when given the same input
+        """
+        # find valid buffers
+        buf_list = [buf for buf in self.model.buffers() if buf is not None]
+
+        # reduce buffers across data parallel ranks
+        if buf_list:
+            coalesced_buf = _flatten_dense_tensors(buf_list)
+            # dividing by the world size before the SUM all-reduce yields the mean
+            coalesced_buf.div_(self._world_size)
+            dist.all_reduce(coalesced_buf, op=ReduceOp.SUM, group=self._process_group)
+            unflattened_buf_list = _unflatten_dense_tensors(coalesced_buf, buf_list)
+            for old, new in zip(buf_list, unflattened_buf_list):
+                old.copy_(new)
+
+    def eval(self):
+        """Switch the wrapper and the wrapped model to evaluation mode.
+
+        Returns:
+            NaiveAMPModel: ``self``, matching the ``torch.nn.Module.eval`` contract.
+        """
+        # Delegate to nn.Module.eval() so that this wrapper's own ``training`` flag is
+        # cleared as well as the wrapped model's. Without this, ``self.training`` stays
+        # True and forward() keeps synchronising buffers on every evaluation step.
+        super().eval()
+
+        # we only sync buffer in the first eval iteration
+        # so that future eval iterations can be done without communication
+        self._first_eval_run = True
+        return self
+
+    def forward(self, *args, **kwargs):
+        """Run the wrapped model with fp16 inputs and optionally fp32 outputs.
+
+        Float32 tensors among ``args`` and ``kwargs`` are cast to fp16; other values are
+        passed through. When ``output_to_fp32`` is set, a tensor output, or the items of
+        a tuple/list/dict output, are cast from fp16 to fp32. A tuple or list output is
+        returned as a list in that case.
+        """
+        # reduce buffers after forward will lead to error
+        # as we cannot change the variables needed for gradient computation after forward
+        # so we sync buffer before forward
+        if (self.training or self._first_eval_run) and self._sync_buf:
+            with torch.no_grad():
+                self._reduce_module_buffer()
+
+            if self._first_eval_run:
+                self._first_eval_run = False
+
+        if args:
+            args = [self._convert_to_fp16(arg) for arg in args]
+        if kwargs:
+            kwargs = {k: self._convert_to_fp16(v) for k, v in kwargs.items()}
+
+        out = self.model(*args, **kwargs)
+
+        if self._output_to_fp32:
+            if isinstance(out, Tensor):
+                out = self._convert_to_fp32(out)
+            elif isinstance(out, (tuple, list)):
+                out = [self._convert_to_fp32(val) for val in out]
+            elif isinstance(out, dict):
+                out = {key: self._convert_to_fp32(val) for key, val in out.items()}
+        return out
--- a/colossalai/amp/torch_amp/__init__.py
+++ b/colossalai/amp/torch_amp/__init__.py
+from typing import Optional
+
+import torch.nn as nn
+from torch.nn.modules.loss import _Loss
+from torch.optim import Optimizer
+
+from colossalai.context import Config
+
+from .torch_amp import TorchAMPLoss, TorchAMPModel, TorchAMPOptimizer
+
+
+def convert_to_torch_amp(model: nn.Module,
+                         optimizer: Optimizer,
+                         criterion: Optional[_Loss] = None,
+                         amp_config: Optional[Config] = None):
+    """Wrap the training components with the Pytorch AMP wrapper classes.
+
+    Args:
+        model (:class:`torch.nn.Module`): your model object.
+        optimizer (:class:`torch.optim.Optimizer`): your optimizer object
+        criterion (:class:`torch.nn.modules.loss._Loss`, optional): your loss function object
+        amp_config (:class:`colossalai.context.Config` or dict, optional): configuration for Pytorch AMP.
+
+    The ``amp_config`` should include parameters below:
+    ::
+
+        init_scale (float, optional, default=2.**16)
+        growth_factor (float, optional, default=2.0)
+        backoff_factor (float, optional, default=0.5)
+        growth_interval (int, optional, default=2000)
+        enabled (bool, optional, default=True)
+
+    Returns:
+        A tuple (model, optimizer, criterion); a falsy ``criterion`` is returned unwrapped.
+    """
+    wrapped_model = TorchAMPModel(model)
+
+    # a missing config means "use the optimizer wrapper's defaults"
+    optimizer_kwargs = dict() if amp_config is None else amp_config
+    wrapped_optimizer = TorchAMPOptimizer(optimizer, **optimizer_kwargs)
+
+    wrapped_criterion = TorchAMPLoss(criterion) if criterion else criterion
+    return wrapped_model, wrapped_optimizer, wrapped_criterion
+
+
+__all__ = ['convert_to_torch_amp', 'TorchAMPModel', 'TorchAMPLoss', 'TorchAMPOptimizer']
--- a/colossalai/amp/torch_amp/_grad_scaler.py
+++ b/colossalai/amp/torch_amp/_grad_scaler.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+# modified from https://github.com/pytorch/pytorch/blob/master/torch/cuda/amp/grad_scaler.py
+# to support tensor parallel
+
+import warnings
+from collections import abc, defaultdict
+from enum import Enum
+from typing import Any, Dict, List, Optional, Tuple
+
+import torch
+import torch.distributed as dist
+from packaging import version
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+
+
+class _MultiDeviceReplicator(object):
+    """
+    Hands out copies of one master tensor on demand, keeping a single cached copy per device.
+    """
+
+    def __init__(self, master_tensor: torch.Tensor) -> None:
+        on_supported_device = master_tensor.is_cuda or master_tensor.device.type == 'xla'
+        assert on_supported_device
+        self.master = master_tensor
+        self._per_device_tensors: Dict[torch.device, torch.Tensor] = {}
+
+    def get(self, device) -> torch.Tensor:
+        """Return the copy of the master tensor for ``device``, creating it on first request."""
+        cache = self._per_device_tensors
+        if cache.get(device, None) is None:
+            cache[device] = self.master.to(device=device, non_blocking=True, copy=True)
+        return cache[device]
+
+
+# Defines default_factory for GradScaler's _per_optimizer_states defaultdict,
+# as well as associated "enum" values.  Prefers defining these at top level because
+# - Lambdas can't be pickled, so we don't want to supply a lambda as the factory.
+# - Defining READY, UNSCALED, STEPPED and _refresh_per_optimizer_state within GradScaler
+#   causes a circular reference, which we'd rather avoid.
+class OptState(Enum):
+    """Stage of an optimizer as tracked in ``GradScaler._per_optimizer_states``."""
+    # stage assigned by _refresh_per_optimizer_state() to a fresh state entry
+    READY = 0
+    # stage recorded by GradScaler.unscale_() after the gradients have been unscaled
+    UNSCALED = 1
+    # NOTE(review): presumably recorded once the optimizer has stepped (the setter is
+    # outside this view); GradScaler.unscale_() refuses to run in this stage
+    STEPPED = 2
+
+
+def _refresh_per_optimizer_state():
+    """Build a fresh per-optimizer state: stage ``READY`` and no recorded inf flags."""
+    fresh_state = dict(stage=OptState.READY, found_inf_per_device={})
+    return fresh_state
+
+
+class GradScaler(object):
+    _scale: Optional[torch.Tensor]
+    _grows_tracker: Optional[torch.Tensor]
+    _per_optimizer_states: Dict[int, Dict[str, Any]]
+    """
+    An instance ``scaler`` of :class:`GradScaler` helps perform the steps of gradient scaling
+    conveniently.
+
+    * ``scaler.scale(loss)`` multiplies a given loss by ``scaler``'s current scale factor.
+    * ``scaler.step(optimizer)`` safely unscales gradients and calls ``optimizer.step()``.
+    * ``scaler.update()`` updates ``scaler``'s scale factor.
+
+    Example:
+
+        # Creates a GradScaler once at the beginning of training.
+        scaler = GradScaler()
+
+        for epoch in epochs:
+            for input, target in data:
+                optimizer.zero_grad()
+                output = model(input)
+                loss = loss_fn(output, target)
+
+                # Scales loss.  Calls backward() on scaled loss to create scaled gradients.
+                scaler.scale(loss).backward()
+
+                # scaler.step() first unscales gradients of the optimizer's params.
+                # If gradients don't contain infs/NaNs, optimizer.step() is then called,
+                # otherwise, optimizer.step() is skipped.
+                scaler.step(optimizer)
+
+                # Updates the scale for next iteration.
+                scaler.update()
+
+    See the :ref:`Automatic Mixed Precision examples<amp-examples>` for usage
+    (along with autocasting) in more complex cases like gradient clipping, gradient accumulation, gradient penalty,
+    and multiple losses/optimizers.
+
+    ``scaler`` dynamically estimates the scale factor each iteration.  To minimize gradient underflow,
+    a large scale factor should be used.  However, ``float16`` values can "overflow" (become inf or NaN) if
+    the scale factor is too large.  Therefore, the optimal scale factor is the largest factor that can be used
+    without incurring inf or NaN gradient values.
+    ``scaler`` approximates the optimal scale factor over time by checking the gradients for infs and NaNs during every
+    ``scaler.step(optimizer)`` (or optional separate ``scaler.unscale_(optimizer)``, see :meth:`unscale_`).
+
+    * If infs/NaNs are found, ``scaler.step(optimizer)`` skips the underlying ``optimizer.step()`` (so the params
+      themselves remain uncorrupted) and ``update()`` multiplies the scale by ``backoff_factor``.
+
+    * If no infs/NaNs are found, ``scaler.step(optimizer)`` runs the underlying ``optimizer.step()`` as usual.
+      If ``growth_interval`` unskipped iterations occur consecutively, ``update()`` multiplies the scale by
+      ``growth_factor``.
+
+    The scale factor often causes infs/NaNs to appear in gradients for the first few iterations as its
+    value calibrates.  ``scaler.step`` will skip the underlying ``optimizer.step()`` for these
+    iterations.  After that, step skipping should occur rarely (once every few hundred or thousand iterations).
+
+    Args:
+        init_scale (float, optional, default=2.**16):  Initial scale factor.
+        growth_factor (float, optional, default=2.0):  Factor by which the scale is multiplied during
+            :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
+        backoff_factor (float, optional, default=0.5):  Factor by which the scale is multiplied during
+            :meth:`update` if inf/NaN gradients occur in an iteration.
+        growth_interval (int, optional, default=2000):  Number of consecutive iterations without inf/NaN gradients
+            that must occur for the scale to be multiplied by ``growth_factor``.
+        enabled (bool, optional, default=True):  If ``False``, disables gradient scaling. :meth:`step` simply
+            invokes the underlying ``optimizer.step()``, and other methods become no-ops.
+    """
+
+    def __init__(self, init_scale=2.**16, growth_factor=2.0, backoff_factor=0.5, growth_interval=2000, enabled=True):
+        """Configure the scaler; the scale tensors themselves are created lazily.
+
+        Args:
+            init_scale (float, optional): initial scale factor.
+            growth_factor (float, optional): factor applied when growing the scale, must be > 1.0.
+            backoff_factor (float, optional): factor applied when shrinking the scale, must be < 1.0.
+            growth_interval (int, optional): stored as the growth interval of the scaler.
+            enabled (bool, optional): requested on/off switch; forced to False with a
+                warning when CUDA is not available.
+        """
+        if enabled and not torch.cuda.is_available():
+            warnings.warn("torch.cuda.amp.GradScaler is enabled, but CUDA is not available.  Disabling.")
+            self._enabled = False
+        else:
+            self._enabled = enabled
+
+        # check version
+        # Compare (major, minor) as a tuple so that PyTorch 2.x and later count as
+        # "newer than 1.8" instead of failing a hard ``major == 1`` assertion.
+        torch_version = version.parse(torch.__version__)
+        assert torch_version.major >= 1, "PyTorch >= 1.0 is required, got {}".format(torch.__version__)
+        self._higher_than_torch18 = (torch_version.major, torch_version.minor) > (1, 8)
+
+        if self._enabled:
+            assert growth_factor > 1.0, "The growth factor must be > 1.0."
+            assert backoff_factor < 1.0, "The backoff factor must be < 1.0."
+
+            self._init_scale = init_scale
+            # self._scale will be lazily initialized during the first call to scale()
+            self._scale = None
+            self._growth_factor = growth_factor
+            self._backoff_factor = backoff_factor
+            self._growth_interval = growth_interval
+            self._init_growth_tracker = 0
+            # self._growth_tracker will be lazily initialized during the first call to scale()
+            self._growth_tracker = None
+            self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
+
+    def _check_scale_growth_tracker(self, funcname) -> Tuple[torch.Tensor, torch.Tensor]:
+        """Return ``(_scale, _growth_tracker)`` after asserting that both are initialised.
+
+        Args:
+            funcname: name of the calling operation, interpolated into the assertion message.
+        """
+        hint = "This may indicate your script did not use scaler.scale(loss or outputs) earlier in the iteration."
+
+        current_scale = self._scale
+        assert current_scale is not None, "Attempted {} but _scale is None.  ".format(funcname) + hint
+
+        tracker = self._growth_tracker
+        assert tracker is not None, "Attempted {} but _growth_tracker is None.  ".format(funcname) + hint
+
+        return (current_scale, tracker)
+
+    def _lazy_init_scale_growth_tracker(self, dev):
+        """Create the one-element scale (fp32) and growth-tracker (int32) tensors on ``dev``."""
+        assert self._growth_tracker is None, "_growth_tracker initialized before _scale"
+        one_element = (1,)
+        self._scale = torch.full(one_element, self._init_scale, dtype=torch.float32, device=dev)
+        self._growth_tracker = torch.full(one_element, self._init_growth_tracker, dtype=torch.int32, device=dev)
+
+    def scale(self, outputs):
+        """
+        Multiply a tensor, or every tensor in a (possibly nested) iterable, by the scale factor.
+
+        When this :class:`GradScaler` is not enabled, ``outputs`` is returned untouched.
+        Lists and tuples are rebuilt with their original type; any other iterable is
+        returned as a lazy ``map`` object.
+
+        Args:
+            outputs (Tensor or iterable of Tensors):  Outputs to scale.
+
+        Raises:
+            ValueError: if a value is neither a tensor nor an iterable.
+        """
+        if not self._enabled:
+            return outputs
+
+        # Fast path for the common single-tensor case.
+        if isinstance(outputs, torch.Tensor):
+            assert outputs.is_cuda or outputs.device.type == 'xla'
+            if self._scale is None:
+                self._lazy_init_scale_growth_tracker(outputs.device)
+            assert self._scale is not None
+            factor = self._scale.to(device=outputs.device, non_blocking=True)
+            return outputs * factor
+
+        # Several outputs: share one replicator, created when the first tensor is met.
+        # A list is used so the nested function can fill it in.
+        replicators: List[_MultiDeviceReplicator] = []
+
+        def _scale_nested(item):
+            if isinstance(item, torch.Tensor):
+                assert item.is_cuda or item.device.type == 'xla'
+                if not replicators:
+                    if self._scale is None:
+                        self._lazy_init_scale_growth_tracker(item.device)
+                    assert self._scale is not None
+                    replicators.append(_MultiDeviceReplicator(self._scale))
+                return item * replicators[0].get(item.device)
+
+            if not isinstance(item, abc.Iterable):
+                raise ValueError("outputs must be a Tensor or an iterable of Tensors")
+
+            scaled_items = map(_scale_nested, item)
+            if isinstance(item, (list, tuple)):
+                return type(item)(scaled_items)
+            return scaled_items
+
+        return _scale_nested(outputs)
+
+    def _unscale_grads_(self, optimizer, inv_scale, found_inf, allow_fp16):
+        """Unscale the gradients of ``optimizer``'s parameters and collect the inf/NaN flags.
+
+        Gradients are grouped by device and dtype and handed to
+        ``torch._amp_foreach_non_finite_check_and_unscale_`` one group at a time.
+        NOTE(review): that fused op is expected to multiply the gradients by ``inv_scale``
+        in place and to set ``found_inf`` when a non-finite value is seen; confirm against
+        the PyTorch version in use.
+
+        Args:
+            optimizer: object exposing ``param_groups``; the ``"params"`` of every group are visited.
+            inv_scale (torch.Tensor): tensor passed to the fused op as the inverse scale.
+            found_inf (torch.Tensor): flag tensor passed to the fused op, replicated per device.
+            allow_fp16 (bool): when False, meeting an fp16 gradient raises ``ValueError``.
+
+        Returns:
+            dict: the per-device copies of ``found_inf`` that were created, keyed by device.
+
+        Raises:
+            ValueError: if a gradient is fp16 while ``allow_fp16`` is False.
+        """
+        per_device_inv_scale = _MultiDeviceReplicator(inv_scale)
+        per_device_found_inf = _MultiDeviceReplicator(found_inf)
+
+        # To set up _amp_foreach_non_finite_check_and_unscale_, split grads by device and dtype.
+        # There could be hundreds of grads, so we'd like to iterate through them just once.
+        # However, we don't know their devices or dtypes in advance.
+
+        # https://stackoverflow.com/questions/5029934/defaultdict-of-defaultdict
+        # Google says mypy struggles with defaultdicts type annotations.
+        per_device_and_dtype_grads = defaultdict(lambda: defaultdict(list))    # type: ignore[var-annotated]
+        with torch.no_grad():
+            for group in optimizer.param_groups:
+                for param in group["params"]:
+                    # parameters without a gradient have nothing to unscale
+                    if param.grad is None:
+                        continue
+                    if (not allow_fp16) and param.grad.dtype == torch.float16:
+                        raise ValueError("Attempting to unscale FP16 gradients.")
+                    if param.grad.is_sparse:
+                        # is_coalesced() == False means the sparse grad has values with duplicate indices.
+                        # coalesce() deduplicates indices and adds all values that have the same index.
+                        # For scaled fp16 values, there's a good chance coalescing will cause overflow,
+                        # so we should check the coalesced _values().
+                        if param.grad.dtype is torch.float16:
+                            param.grad = param.grad.coalesce()
+                        to_unscale = param.grad._values()
+                    else:
+                        to_unscale = param.grad
+
+                    # TODO: is there a way to split by device and dtype without appending in the inner loop?
+                    per_device_and_dtype_grads[to_unscale.device][to_unscale.dtype].append(to_unscale)
+
+            for device, per_dtype_grads in per_device_and_dtype_grads.items():
+                for grads in per_dtype_grads.values():
+                    torch._amp_foreach_non_finite_check_and_unscale_(grads, per_device_found_inf.get(device),
+                                                                     per_device_inv_scale.get(device))
+        # For tensor parallel parameters, the found-inf flags are all-reduced (MAX) over the
+        # model parallel process group, so a non-zero flag on any rank reaches every rank
+        if gpc.is_initialized(ParallelMode.MODEL) and gpc.get_world_size(ParallelMode.MODEL) > 1:
+            vals = [val for val in per_device_found_inf._per_device_tensors.values()]
+            # flatten into one buffer so a single collective call covers all devices
+            coalesced = _flatten_dense_tensors(vals)
+            dist.all_reduce(coalesced, op=dist.ReduceOp.MAX, group=gpc.get_group(ParallelMode.MODEL))
+            # write the reduced flags back into the original per-device tensors
+            for buf, synced in zip(vals, _unflatten_dense_tensors(coalesced, vals)):
+                buf.copy_(synced)
+        return per_device_found_inf._per_device_tensors
+
+    def unscale_(self, optimizer):
+        """
+        Divides ("unscales") the optimizer's gradient tensors by the scale factor.
+
+        Calling :meth:`unscale_` is optional. It exists for the cases where gradients must be
+        :ref:`modified or inspected<working-with-unscaled-gradients>` between the backward
+        pass(es) and :meth:`step`. When it is not called explicitly, gradients are unscaled
+        automatically during :meth:`step`.
+
+        Simple example, using :meth:`unscale_` to enable clipping of unscaled gradients::
+
+            ...
+            scaler.scale(loss).backward()
+            scaler.unscale_(optimizer)
+            torch.nn.utils.clip_grad_norm_(model.parameters(), max_norm)
+            scaler.step(optimizer)
+            scaler.update()
+
+        Args:
+            optimizer (torch.optim.Optimizer):  Optimizer that owns the gradients to be unscaled.
+
+        Raises:
+            RuntimeError: if the optimizer is already in the unscaled or stepped stage.
+
+        .. note::
+            :meth:`unscale_` does not incur a CPU-GPU sync.
+
+        .. warning::
+            :meth:`unscale_` should only be called once per optimizer per :meth:`step` call,
+            and only after all gradients for that optimizer's assigned parameters have been accumulated.
+            Calling :meth:`unscale_` twice for a given optimizer between each :meth:`step` triggers a RuntimeError.
+
+        .. warning::
+            :meth:`unscale_` may unscale sparse gradients out of place, replacing the ``.grad`` attribute.
+        """
+        if not self._enabled:
+            return
+
+        self._check_scale_growth_tracker("unscale_")
+
+        state = self._per_optimizer_states[id(optimizer)]
+        stage = state["stage"]
+        if stage is OptState.UNSCALED:
+            raise RuntimeError("unscale_() has already been called on this optimizer since the last update().")
+        if stage is OptState.STEPPED:
+            raise RuntimeError("unscale_() is being called after step().")
+
+        assert self._scale is not None
+        current_scale = self._scale
+        # FP32 division can be imprecise for certain compile options, so the reciprocal is taken in FP64.
+        inv_scale = current_scale.double().reciprocal().float()
+        found_inf = torch.full((1,), 0.0, dtype=torch.float32, device=current_scale.device)
+
+        state["found_inf_per_device"] = self._unscale_grads_(optimizer, inv_scale, found_inf, False)
+        state["stage"] = OptState.UNSCALED
+
+    def _maybe_opt_step(self, optimizer, optimizer_state, *args, **kwargs):
+        """Call ``optimizer.step(*args, **kwargs)`` unless an inf/NaN flag was recorded.
+
+        The per-device flags stored under ``optimizer_state["found_inf_per_device"]`` are
+        read with ``.item()`` and summed; a non-zero total skips the step.
+
+        Returns:
+            The return value of ``optimizer.step`` or ``None`` when the step is skipped.
+        """
+        per_device_flags = optimizer_state["found_inf_per_device"].values()
+        total_found_inf = sum(flag.item() for flag in per_device_flags)
+        if total_found_inf:
+            return None
+        return optimizer.step(*args, **kwargs)
+
+    def step(self, optimizer, *args, **kwargs):
+        """
+        :meth:`step` carries out the following two operations:
+
+        1.  Internally invokes ``unscale_(optimizer)`` (unless :meth:`unscale_` was explicitly called for ``optimizer``
+            earlier in the iteration).  As part of the :meth:`unscale_`, gradients are checked for infs/NaNs.
+        2.  If no inf/NaN gradients are found, invokes ``optimizer.step()`` using the unscaled
+            gradients.  Otherwise, ``optimizer.step()`` is skipped to avoid corrupting the params.
+
+        ``*args`` and ``**kwargs`` are forwarded to ``optimizer.step()``.
+
+        Returns the return value of ``optimizer.step(*args, **kwargs)``, or ``None`` when the step is skipped.
+
+        Args:
+            optimizer (torch.optim.Optimizer):  Optimizer that applies the gradients.
+            args:  Any arguments.
+            kwargs:  Any keyword arguments.
+
+        Raises:
+            RuntimeError: If a ``closure`` keyword argument is passed while the scaler is enabled, or if
+                :meth:`step` was already called for this optimizer since the last :meth:`update`.
+
+        .. warning::
+            Closure use is not currently supported.
+        """
+        # With scaling disabled this is a plain pass-through to the wrapped optimizer.
+        if (not self._enabled):
+            return optimizer.step(*args, **kwargs)
+
+        if "closure" in kwargs:
+            raise RuntimeError("Closure use is not currently supported if GradScaler is enabled.")
+
+        self._check_scale_growth_tracker("step")
+
+        optimizer_state = self._per_optimizer_states[id(optimizer)]
+
+        if optimizer_state["stage"] is OptState.STEPPED:
+            raise RuntimeError("step() has already been called since the last update().")
+
+        retval = None
+
+        if (hasattr(optimizer, "_step_supports_amp_scaling") and optimizer._step_supports_amp_scaling):
+            # This optimizer has customized scale-handling logic, so we can call optimizer.step() directly.
+            # The contract with custom optimizers is that their step() should accept an additional,
+            # optional grad_scaler kwarg.  We append self to the kwargs so the custom optimizer has full information:
+            # it can query its own state, invoke unscale_ on itself, etc
+            retval = optimizer.step(*args, **dict(kwargs, grad_scaler=self))
+            optimizer_state["stage"] = OptState.STEPPED
+            return retval
+
+        # Gradients have not been unscaled yet this iteration, so do it now.
+        if optimizer_state["stage"] is OptState.READY:
+            self.unscale_(optimizer)
+
+        assert len(optimizer_state["found_inf_per_device"]) > 0, "No inf checks were recorded for this optimizer."
+
+        # Steps only when no inf/NaN flag was recorded; otherwise retval stays None.
+        retval = self._maybe_opt_step(optimizer, optimizer_state, *args, **kwargs)
+
+        optimizer_state["stage"] = OptState.STEPPED
+
+        return retval
+
+    def update(self, new_scale=None):
+        """
+        Refreshes the scale factor and clears the per-optimizer bookkeeping.
+
+        Without ``new_scale``, the inf/NaN flags recorded by the optimizers this iteration are
+        combined and handed to the fused scale-update kernel: a skipped step multiplies the scale
+        by ``backoff_factor``, and ``growth_interval`` consecutive unskipped iterations multiply
+        it by ``growth_factor``.
+
+        With ``new_scale``, the value is written into GradScaler's internal scale tensor
+        (filled or copied, never aliased), so later in-place changes to a tensor passed as
+        ``new_scale`` do not affect the scale used internally.
+
+        Args:
+            new_scale (float or :class:`torch.cuda.FloatTensor`, optional, default=None):  New scale factor.
+
+        .. warning::
+            :meth:`update` should only be called at the end of the iteration, after ``scaler.step(optimizer)`` has
+            been invoked for all optimizers used this iteration.
+        """
+        if not self._enabled:
+            return
+
+        scale_tensor, tracker_tensor = self._check_scale_growth_tracker("update")
+
+        if new_scale is None:
+            # Gather every recorded inf/NaN flag onto the scale's device.
+            # When a flag already lives there, the transfer is asynchronous.
+            inf_flags = []
+            for opt_state in self._per_optimizer_states.values():
+                for flag in opt_state["found_inf_per_device"].values():
+                    inf_flags.append(flag.to(device=scale_tensor.device, non_blocking=True))
+
+            assert len(inf_flags) > 0, "No inf checks were recorded prior to update."
+
+            # Accumulate in place into the first flag.
+            combined_flag = inf_flags[0]
+            for extra_flag in inf_flags[1:]:
+                combined_flag += extra_flag
+
+            if self._higher_than_torch18:
+                torch._amp_update_scale_(scale_tensor, tracker_tensor, combined_flag, self._growth_factor,
+                                         self._backoff_factor, self._growth_interval)
+            else:
+                self._scale = torch._amp_update_scale(tracker_tensor, scale_tensor, combined_flag,
+                                                      self._growth_factor, self._backoff_factor,
+                                                      self._growth_interval)
+        elif isinstance(new_scale, float):
+            # User-supplied Python float.
+            self._scale.fill_(new_scale)    # type: ignore[union-attr]
+        else:
+            # User-supplied tensor: validate before copying its value in.
+            reason = "new_scale should be a float or a 1-element torch.cuda.FloatTensor with requires_grad=False."
+            # type: ignore[attr-defined]
+            assert isinstance(new_scale, torch.cuda.FloatTensor), reason
+            assert new_scale.numel() == 1, reason
+            assert new_scale.requires_grad is False, reason
+            self._scale.copy_(new_scale)    # type: ignore[union-attr]
+
+        # Start the next iteration with fresh per-optimizer state.
+        self._per_optimizer_states = defaultdict(_refresh_per_optimizer_state)
+
+    def _get_scale_async(self):
+        """Return the internal scale object ``self._scale`` as-is, without converting it to a Python number."""
+        return self._scale
+
+    def get_scale(self):
+        """
+        Returns the current scale as a Python number.
+
+        A disabled scaler reports ``1.0``. An enabled scaler whose scale tensor has not been
+        created yet reports the stored initial scale; otherwise the tensor is read with ``.item()``.
+
+        .. warning::
+            :meth:`get_scale` incurs a CPU-GPU sync.
+        """
+        if not self._enabled:
+            return 1.0
+        if self._scale is None:
+            return self._init_scale
+        return self._get_scale_async().item()
+
+    def get_growth_factor(self):
+        r"""
+        Returns a Python float containing the scale growth factor.
+
+        The value is read straight from ``self._growth_factor``.
+        """
+        return self._growth_factor
+
+    def set_growth_factor(self, new_factor):
+        r"""
+        Sets the scale growth factor.
+
+        Args:
+            new_factor (float):  Value to use as the new scale growth factor.
+        """
+        self._growth_factor = new_factor
+
+    def get_backoff_factor(self):
+        r"""
+        Returns a Python float containing the scale backoff factor.
+
+        The value is read straight from ``self._backoff_factor``.
+        """
+        return self._backoff_factor
+
+    def set_backoff_factor(self, new_factor):
+        r"""
+        Sets the scale backoff factor.
+
+        Args:
+            new_factor (float):  Value to use as the new scale backoff factor.
+        """
+        self._backoff_factor = new_factor
+
+    def get_growth_interval(self):
+        r"""
+        Returns a Python int containing the growth interval.
+
+        The value is read straight from ``self._growth_interval``.
+        """
+        return self._growth_interval
+
+    def set_growth_interval(self, new_interval):
+        r"""
+        Sets the growth interval.
+
+        Args:
+            new_interval (int):  Value to use as the new growth interval.
+        """
+        self._growth_interval = new_interval
+
+    def _get_growth_tracker(self):
+        """Return the growth tracker as a Python number.
+
+        A disabled scaler reports ``0``. An enabled scaler whose tracker tensor has not been
+        created yet reports the stored initial tracker value; otherwise the tensor is read
+        with ``.item()``.
+        """
+        if not self._enabled:
+            return 0
+        if self._growth_tracker is None:
+            return self._init_growth_tracker
+        return self._growth_tracker.item()
+
+    def is_enabled(self):
+        r"""
+        Returns a bool indicating whether this instance is enabled.
+
+        The value is read straight from ``self._enabled``.
+        """
+        return self._enabled
+
+    def state_dict(self):
+        r"""
+        Returns the scaler's state as a :class:`dict`.
+
+        A disabled instance yields an empty dict. Otherwise the dict holds five entries:
+
+        * ``"scale"`` - the current scale, as returned by :meth:`get_scale`
+        * ``"growth_factor"`` - the current growth factor
+        * ``"backoff_factor"`` - the current backoff factor
+        * ``"growth_interval"`` - the current growth interval
+        * ``"_growth_tracker"`` - the number of recent consecutive unskipped steps
+
+        .. note::
+           To checkpoint the scaler's state after a particular iteration, call :meth:`state_dict`
+           after :meth:`update`.
+        """
+        if not self._enabled:
+            return {}
+
+        snapshot = {}
+        snapshot["scale"] = self.get_scale()
+        snapshot["growth_factor"] = self._growth_factor
+        snapshot["backoff_factor"] = self._backoff_factor
+        snapshot["growth_interval"] = self._growth_interval
+        snapshot["_growth_tracker"] = self._get_growth_tracker()
+        return snapshot
+
+    def load_state_dict(self, state_dict):
+        r"""
+        Restores the scaler from ``state_dict``.  A disabled instance ignores the call.
+
+        The saved scale and growth tracker are stored as the lazy-initialisation values and,
+        when the corresponding tensors already exist, are also written into them in place.
+
+        Args:
+           state_dict(dict): scaler state.  Should be an object returned from a call to :meth:`state_dict`.
+
+        Raises:
+            RuntimeError: If ``state_dict`` is empty.
+        """
+        if not self._enabled:
+            return
+
+        if len(state_dict) == 0:
+            raise RuntimeError("The source state dict is empty, possibly because it was saved "
+                               "from a disabled instance of GradScaler.")
+
+        saved_scale = state_dict["scale"]
+        self._init_scale = saved_scale
+        if self._scale is not None:
+            self._scale.fill_(saved_scale)
+
+        self._growth_factor = state_dict["growth_factor"]
+        self._backoff_factor = state_dict["backoff_factor"]
+        self._growth_interval = state_dict["growth_interval"]
+
+        saved_tracker = state_dict["_growth_tracker"]
+        self._init_growth_tracker = saved_tracker
+        if self._growth_tracker is not None:
+            self._growth_tracker.fill_(saved_tracker)
+
+    def __getstate__(self):
+        """Build the picklable state: a shallow copy of ``__dict__`` with the tensors replaced.
+
+        For an enabled scaler the scale and growth tracker are stored as plain Python values
+        under ``_init_scale`` / ``_init_growth_tracker`` and the tensor slots are set to ``None``,
+        so the unpickled instance re-creates them lazily.
+        """
+        snapshot = self.__dict__.copy()
+        if not self._enabled:
+            return snapshot
+
+        assert len(self._per_optimizer_states) == 0, ("A GradScaler instance may only be pickled at the beginning "
+                                                      "of an iteration, or at the end after scaler.update().")
+        # Pickling the _scale and _growth_tracker tensors directly triggers
+        # "pickle support for Storage will be removed in 1.5...", hence the plain values.
+        snapshot.update(
+            _init_scale=self.get_scale(),
+            _init_growth_tracker=self._get_growth_tracker(),
+            _scale=None,
+            _growth_tracker=None,
+        )
+        return snapshot
+
+    def __setstate__(self, state):
+        """Restore the instance by merging the unpickled ``state`` mapping into ``__dict__``."""
+        self.__dict__.update(state)
+
+    def _check_inf_per_device(self, optimizer):
+        """Record and return the per-device inf/NaN flags for ``optimizer``'s gradients.
+
+        ``_unscale_grads_`` is invoked with an inverse scale of 1.0, and its result is stored
+        under ``"found_inf_per_device"`` in this optimizer's state before being returned.
+        """
+        scale_tensor, _ = self._check_scale_growth_tracker("_check_inf_per_device")
+        target_device = scale_tensor.device
+
+        unit_inv_scale = torch.full((1,), 1.0, dtype=torch.float32, device=target_device)
+        inf_flag = torch.full((1,), 0.0, dtype=torch.float32, device=target_device)
+
+        # Run the check first, then touch the per-optimizer state, as the original assignment did.
+        per_device_flags = self._unscale_grads_(optimizer, unit_inv_scale, inf_flag, True)
+        self._per_optimizer_states[id(optimizer)]["found_inf_per_device"] = per_device_flags
+
+        return per_device_flags
+
+    def _found_inf_per_device(self, optimizer):
+        """Return the ``"found_inf_per_device"`` entry stored for ``optimizer`` (looked up by ``id(optimizer)``)."""
+        return self._per_optimizer_states[id(optimizer)]["found_inf_per_device"]
--- a/colossalai/amp/torch_amp/torch_amp.py
+++ b/colossalai/amp/torch_amp/torch_amp.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import torch.cuda.amp as torch_amp
+import torch.nn as nn
+from torch import Tensor
+from torch.nn.modules.loss import _Loss
+from torch.optim import Optimizer
+
+from colossalai.nn.optimizer import ColossalaiOptimizer
+from colossalai.utils import clip_grad_norm_fp32
+
+from ._grad_scaler import GradScaler
+
+
+class TorchAMPOptimizer(ColossalaiOptimizer):
+    """A wrapper class which integrate Pytorch AMP with an optimizer
+
+    Everything after ``optim`` is forwarded unchanged to :class:`GradScaler`; the arguments
+    listed below are described as they are documented here, not validated by this class.
+
+    Args:
+        optim (torch.optim.Optimizer): A normal optimizer like Adam or SGD.
+        init_scale (float, optional, default=2.**16):  Initial scale factor.
+        growth_factor (float, optional, default=2.0):  Factor by which the scale is multiplied during
+            :meth:`update` if no inf/NaN gradients occur for ``growth_interval`` consecutive iterations.
+        backoff_factor (float, optional, default=0.5):  Factor by which the scale is multiplied during
+            :meth:`update` if inf/NaN gradients occur in an iteration.
+        growth_interval (int, optional, default=2000):  Number of consecutive iterations without inf/NaN gradients
+            that must occur for the scale to be multiplied by ``growth_factor``.
+        enabled (bool, optional, default=True):  If ``False``, disables gradient scaling. :meth:`step` simply
+            invokes the underlying ``optimizer.step()``, and other methods become no-ops.
+    """
+
+    def __init__(self, optim: Optimizer, *args, **kwargs):
+        """Wrap ``optim`` and create the gradient scaler from the remaining arguments."""
+        super().__init__(optim)
+        self.scaler = GradScaler(*args, **kwargs)
+
+    def backward(self, loss: Tensor):
+        """Backward with torch amp gradient scaler
+
+        The loss is passed through ``self.scaler.scale`` before ``backward()`` is called on it.
+
+        Args:
+            loss (torch.Tensor): Loss computed by a loss function
+        """
+        self.scaler.scale(loss).backward()
+
+    def step(self):
+        """Update the parameters of the model
+
+        Runs ``self.scaler.step`` on the wrapped optimizer and then ``self.scaler.update()``.
+        """
+        self.scaler.step(self.optim)
+        self.scaler.update()
+
+    def clip_grad_norm(self, model: nn.Module, max_norm: float):
+        """Apply gradient clipping to the model parameters
+
+        Nothing happens unless ``max_norm`` is greater than zero.
+
+        Args:
+            model (torch.nn.Module): Your model object
+            max_norm (float): Max norm value for gradient clipping
+        """
+        if max_norm > 0.0:
+            # Unscale first so the norm is clipped on unscaled gradients.
+            self.scaler.unscale_(self.optim)
+            clip_grad_norm_fp32(model.parameters(), max_norm)
+
+
+class TorchAMPModel(nn.Module):
+    """A wrapper class for a model object which executes forward with values automatically
+    cast to fp16
+
+    Args:
+        model (:class:`torch.nn.Module`): a torch model instance
+    """
+
+    def __init__(self, model: nn.Module) -> None:
+        """Store the wrapped model as the ``model`` submodule."""
+        super().__init__()
+        self.model = model
+
+    @torch_amp.autocast()
+    def forward(self, *args, **kwargs):
+        """
+        Execute forward under the torch amp context
+
+        All positional and keyword arguments are passed to the wrapped model, and its
+        result is returned unchanged.
+        """
+        return self.model(*args, **kwargs)
+
+
+class TorchAMPLoss(nn.Module):
+    """A wrapper class for a criterion object which computes the loss in mixed-precision context
+
+    Args:
+        loss (torch.nn.modules.loss._Loss): A loss function object
+    """
+
+    def __init__(self, loss: _Loss):
+        """Store the wrapped criterion as the ``loss`` submodule."""
+        super().__init__()
+        self.loss = loss
+
+    @torch_amp.autocast()
+    def forward(self, *args, **kwargs):
+        """
+        Execute forward under the torch amp context
+
+        All positional and keyword arguments are passed to the wrapped criterion, and its
+        result is returned unchanged.
+        """
+        return self.loss(*args, **kwargs)
--- a/colossalai/auto_parallel/README.md
+++ b/colossalai/auto_parallel/README.md
+# Colossal-AUTO
+
+## Challenges
+Recently, large models have achieved the state of the art performances in various fields. In order to support large model training, we have to use distributed training techniques. However, finding an efficient distributed execution plan not only requires fine-grained model statistics, such as memory and computing overhead of each operator but also is a labor-intensive task even for an expert in the field of distributed training.
+
+## Our solution
+To simplify the process of distributed training for foundational models, recent advancements in machine learning systems have led to the emergence of automatic parallel systems. We investigated a number of current automatic parallel systems (<a href="https://arxiv.org/abs/1807.08887"> Tofu </a>, <a href="https://arxiv.org/abs/1807.05358"> Flexflow </a>, <a href="https://arxiv.org/abs/2201.12023"> Alpa </a>) and some automatic activation checkpoint algorithms (<a href="https://hal.inria.fr/hal-02352969"> Rotor </a>, <a href="https://arxiv.org/abs/1604.06174"> Sublinear </a>). Inspired by these advanced systems, we built an automatic parallel system on top of the PyTorch framework. The input of the system is serial PyTorch code, and the output is a PyTorch program with an optimized distributed execution plan. It is worth emphasizing that the output is a regular PyTorch program, so it is compatible with runtime optimization methods, such as ZeRO-Offload and PatrickStar.
+
+## Key modules
+
+### Analyzer
+
+**Analyzer** is a static analysis system consisting of three parts:
+A *symbolic profiler* for collecting computing and memory overhead related to static computation graph, a *cluster detector* for collecting hardware characteristics and detecting cluster topology and a *tensor layout manager* to find efficient tensor layout conversion path from different sharding spec and record conversion cost.
+
+### Solver
+
+**Solver** is designed to find the optimal execution plan for a given computation graph and cluster in two stages:
+1) *Intra-op parallelism stage* is to find the plan with the minimum total execution time of all nodes with respect to the constraint of the memory budget. The optimization goal of the intra-op parallelism solver is modified from <a href="https://arxiv.org/abs/2201.12023"> Alpa </a>'s intra-op parallelism ILP solver.
+2) *Activation checkpoint stage* is to search for the fastest execution plan that meets the memory budget on the computation graph after inserting the communication nodes by the intra-op parallelism stage. The algorithm to find the optimal activation checkpoints is modified from <a href="https://hal.inria.fr/hal-02352969"> Rotor </a>. The reason we use two-stage optimization is that if the two tasks are formulated together, the solving time will be significantly increased, which will greatly affect the user experience of the system. On the contrary, solving in two hierarchical levels has many advantages. Firstly, compared with the computation graph with activation checkpointing, the original graph has fewer nodes, which can reduce the solving cost of the intra-op parallelism solver. In addition, a better solution can be found by adding the communication overhead into the activation checkpoint modeling.
+
+### Generator
+**Generator** applies the searched execution plan to the computation graph and recompiles the computation graph to optimized PyTorch code. It has *a series of compile passes* to insert communication nodes or perform kernel substitution as required by the intra-op parallelism solver. Additionally, we implement a *code generation* feature to recognize the annotations from the activation checkpoint solver and inject the activation checkpoint blocks following the annotation instructions.
--- a/colossalai/auto_parallel/__init__.py
+++ b/colossalai/auto_parallel/__init__.py
--- a/colossalai/auto_parallel/checkpoint/__init__.py
+++ b/colossalai/auto_parallel/checkpoint/__init__.py
+from .ckpt_solver_base import CheckpointSolverBase
+from .ckpt_solver_chen import CheckpointSolverChen
+from .ckpt_solver_rotor import CheckpointSolverRotor
--- a/colossalai/auto_parallel/checkpoint/build_c_ext.py
+++ b/colossalai/auto_parallel/checkpoint/build_c_ext.py
+import os
+
+from setuptools import Extension, setup
+
+# Directory containing this build script; the C source is resolved relative to it.
+this_dir = os.path.dirname(os.path.abspath(__file__))
+# A single extension module named ``rotorc`` built from ``ckpt_solver_rotor.c``.
+ext_modules = [Extension(
+    'rotorc',
+    sources=[os.path.join(this_dir, 'ckpt_solver_rotor.c')],
+)]
+
+# Running this file invokes setuptools with the extension defined above.
+setup(
+    name='rotor c extension',
+    version='0.1',
+    description='rotor c extension for faster dp computing',
+    ext_modules=ext_modules,
+)
--- a/colossalai/auto_parallel/checkpoint/ckpt_solver_base.py
+++ b/colossalai/auto_parallel/checkpoint/ckpt_solver_base.py
+from abc import ABC, abstractmethod
+from copy import deepcopy
+from typing import Any, List
+
+import torch
+from torch.fx import Graph, Node
+
+from colossalai.auto_parallel.passes.runtime_apply_pass import (
+    runtime_apply,
+    runtime_apply_for_iterable_object,
+    runtime_comm_spec_apply,
+)
+from colossalai.fx.codegen.activation_checkpoint_codegen import ActivationCheckpointCodeGen
+
+# Public API of this module (the name must be exactly ``__all__`` for ``import *`` to honour it).
+__all__ = ['CheckpointSolverBase']
+
+
+def _copy_output(src: Graph, dst: Graph):
+    """Give ``dst``'s output node the ``meta`` of ``src``'s output node.
+
+    The two graphs are walked in lockstep; wherever the source node's op is ``'output'``,
+    the destination node at the same position is pointed at the same ``meta`` object.
+    """
+    for src_node, dst_node in zip(src.nodes, dst.nodes):
+        if src_node.op != 'output':
+            continue
+        dst_node.meta = src_node.meta
+
+
+def _get_param_size(module: torch.nn.Module):
+    """Return the total parameter size of ``module`` in bytes.
+
+    Each parameter contributes ``numel() * element_size()``. The element size is read from
+    the parameter itself, so no temporary tensor is allocated per parameter.
+
+    Args:
+        module (torch.nn.Module): Module whose parameters are measured.
+
+    Returns:
+        int: Sum of the byte sizes of all parameters (``0`` if the module has none).
+    """
+    return sum(p.numel() * p.element_size() for p in module.parameters())
+
+
+class CheckpointSolverBase(ABC):
+    """Shared setup for activation-checkpoint solvers operating on a ``torch.fx`` graph.
+
+    The constructor works on a deep copy of the input graph, computes the memory left after
+    parameters and optimizer storage, and prepares ``node_list`` (optionally linearized).
+    Subclasses implement :meth:`solve`.
+    """
+
+    def __init__(
+        self,
+        graph: Graph,
+        free_memory: float = -1.0,
+        requires_linearize: bool = False,
+        cnode: List[str] = None,
+        optim_multiplier: float = 1.0,
+    ):
+        """``CheckpointSolverBase`` class will integrate information provided by the components
+        and use an existing solver to find a possible optimal strategies combination for target
+        computing graph.
+
+        Existing Solvers:
+            Chen's Greedy solver: https://arxiv.org/abs/1604.06174  (CheckpointSolverChen)
+            Rotor solver: https://hal.inria.fr/hal-02352969  (CheckpointSolverRotor)
+
+        Args:
+            graph (Graph): The computing graph to be optimized.
+            free_memory (float): Memory constraint for the solution.
+            requires_linearize (bool): Whether the graph needs to be linearized.
+            cnode (List[str], optional): Common node List, should be the subset of input. Default to None.
+                The list passed in is never modified; linearization works on a private copy.
+            optim_multiplier (float, optional): The multiplier of extra weight storage for the
+            ``torch.optim.Optimizer``. Default to 1.0.
+
+        Raises:
+            RuntimeError: If any node of the graph has empty ``meta``.
+
+        Warnings:
+            Meta information of the graph is required for any ``CheckpointSolver``.
+        """
+        # super-dainiu: this graph is a temporary graph which can refer to
+        # the owning module, but we will return another deepcopy of it after
+        # the solver is executed.
+        self.graph = deepcopy(graph)
+        self.graph.owning_module = graph.owning_module
+        _copy_output(graph, self.graph)
+        self.graph.set_codegen(ActivationCheckpointCodeGen())
+
+        # check if has meta information
+        if any(len(node.meta) == 0 for node in self.graph.nodes):
+            raise RuntimeError(
+                "Nodes meta information hasn't been prepared! Please extract from graph before constructing the solver!"
+            )
+
+        # parameter memory = parameter size + optimizer extra weight storage
+        self.free_memory = free_memory - _get_param_size(self.graph.owning_module) * (optim_multiplier + 1)
+        self.cnode = cnode
+        self.requires_linearize = requires_linearize
+        if self.requires_linearize:
+            self.node_list = self._linearize_graph()
+        else:
+            self.node_list = self.get_node_list()
+
+    @abstractmethod
+    def solve(self):
+        """Solve the checkpointing problem and return the solution.
+        """
+        pass
+
+    def get_node_list(self):
+        """Get the node list: every graph node wrapped in its own single-element list.
+        """
+        return [[node] for node in self.graph.nodes]
+
+    def _linearize_graph(self) -> List[List[Node]]:
+        """Linearizing the graph
+
+        Walks ``self.graph`` in order and groups consecutive nodes into regions. ``self.cnode``
+        is replaced by a private list that is extended with every propagated common node.
+
+        Returns:
+            List[List[Node]]: List of list, each inside list of Node presents
+            the actual 'node' in linearized manner.
+
+        Raises:
+            ValueError: If a name in ``self.cnode`` does not exist in the graph.
+            AssertionError: If a name in ``self.cnode`` is not a placeholder node.
+
+        Remarks:
+            Do merge the inplace ops and shape-consistency ops into the previous node.
+        """
+
+        # Common nodes are type of nodes that could be seen as attributes and remain
+        # unchanged throughout the whole model, it will be used several times by
+        # different blocks of model, so that it is hard for us to linearize the graph
+        # when we encounter those kinds of nodes. We let users to annotate some of the
+        # input as common node, such as attention mask, and the followings are some of
+        # the ops that could actually be seen as common nodes. With our common node prop,
+        # we could find some of the "real" common nodes (e.g. the real attention mask
+        # used in BERT and GPT), the rule is simple, for node who's parents are all common
+        # nodes or it's op belongs to the following operations, we view this node as a
+        # newly born common node.
+        # List of target name that could be seen as common node
+        common_ops = ["getattr", "getitem", "size"]
+
+        def _is_cop(target: Any) -> bool:
+            """Check if an op could be seen as common node
+
+            Args:
+                target (Any): node target
+
+            Returns:
+                bool
+            """
+
+            if isinstance(target, str):
+                return target in common_ops
+            else:
+                return target.__name__ in common_ops
+
+        def _is_inplace(node: Node):
+            """Get the inplace argument from ``torch.fx.Node``
+            """
+            inplace = False
+            if node.op == "call_function":
+                inplace = node.kwargs.get("inplace", False)
+            elif node.op == "call_module":
+                inplace = getattr(node.graph.owning_module.get_submodule(node.target), "inplace", False)
+            return inplace
+
+        def _is_shape_consistency(node: Node):
+            """Check if this node is shape-consistency node (i.e. ``runtime_apply``,
+            ``runtime_apply_for_iterable_object`` or ``runtime_comm_spec_apply``)
+            """
+            return node.target in [runtime_apply, runtime_apply_for_iterable_object, runtime_comm_spec_apply]
+
+        def _is_sink(node: Node) -> bool:
+            """Check if we can free all dependencies after ``node``
+
+            True when no outstanding dependency count remains and none of the node's
+            users is an inplace op or a shape-consistency op.
+
+            Returns:
+                bool
+            """
+            return (not sum(deps.values()) and not any(map(_is_inplace, node.users))
+                    and not any(map(_is_shape_consistency, node.users)))
+
+        # make sure that item in cnode is valid
+        if self.cnode:
+            # Work on a private copy: propagated common nodes are appended below, and writing
+            # them into the caller's list would make it fail this validation when reused.
+            self.cnode = list(self.cnode)
+            for name in self.cnode:
+                try:
+                    assert next(node for node in self.graph.nodes if node.name == name).op == "placeholder", \
+                    f"Common node {name} is not an input of the model."
+                except StopIteration:
+                    raise ValueError(f"Common node name {name} not in graph.")
+
+        else:
+            self.cnode = []
+
+        deps = {}
+        node_list = []
+        region = []
+
+        for n in self.graph.nodes:
+            if n.op != "placeholder" and n.op != "output":
+                # this node consumes one use of each non-common, non-input parent
+                for n_par in n.all_input_nodes:
+                    if n_par.op != "placeholder" and n_par.name not in self.cnode:
+                        deps[n_par] -= 1
+                region.append(n)
+
+                # if the node could free all dependencies in graph
+                # we could begin a new node
+                if _is_sink(n):
+                    node_list.append(region)
+                    region = []
+
+                # propagate common node attr if possible
+                if all(parent.name in self.cnode for parent in n.all_input_nodes) or _is_cop(n.target):
+                    self.cnode.append(n.name)
+                else:
+                    deps[n] = len([user for user in n.users if user.op != "output"])
+        return node_list
--- a/colossalai/auto_parallel/checkpoint/ckpt_solver_chen.py
+++ b/colossalai/auto_parallel/checkpoint/ckpt_solver_chen.py
+import math
+from copy import deepcopy
+from typing import List, Set, Tuple
+
+from torch.fx import Graph, Node
+
+from colossalai.fx.profiler import calculate_fwd_in, calculate_fwd_tmp
+
+from .ckpt_solver_base import CheckpointSolverBase
+
+__all__ = ['CheckpointSolverChen']
+
+
+class CheckpointSolverChen(CheckpointSolverBase):
+    """Greedy activation-checkpoint solver following Algorithm 3 of https://arxiv.org/abs/1604.06174."""
+
+    def __init__(self, graph: Graph, cnode: List[str] = None, num_grids: int = 6):
+        """
+        This is the simple implementation of Algorithm 3 in https://arxiv.org/abs/1604.06174.
+        Note that this algorithm targets at memory optimization only, using techniques in appendix A.
+
+        Usage:
+            Assume that we have a ``GraphModule``, and we have already done the extractions
+            to the graph to retrieve all information needed, then we could use the following
+            code to find a solution using ``CheckpointSolverChen``:
+            >>> solver = CheckpointSolverChen(gm.graph)
+            >>> chen_graph = solver.solve()
+            >>> gm.graph = chen_graph    # set the graph to a new graph
+
+        Args:
+            graph (Graph): The computing graph to be optimized.
+            cnode (List[str], optional): Common node List, should be the subset of input. Defaults to None.
+            num_grids (int, optional): Number of grids to search for b. Defaults to 6.
+        """
+        # Keyword arguments keep this call aligned with the base-class signature
+        # (graph, free_memory, requires_linearize, cnode, optim_multiplier). The previous
+        # positional call bound ``cnode`` to ``optim_multiplier`` and disabled linearization.
+        super().__init__(graph, free_memory=0, requires_linearize=True, cnode=cnode)
+        self.num_grids = num_grids
+
+    def solve(self) -> Graph:
+        """Solve the checkpointing problem using Algorithm 3.
+
+        Every checkpointable node inside the i-th selected interval gets
+        ``meta['activation_checkpoint'] = i``.
+
+        Returns:
+            graph (Graph): The optimized graph, should be a copy of the original graph.
+        """
+        checkpointable_op = ['call_module', 'call_method', 'call_function', 'get_attr']
+        ckpt = self.grid_search()
+        for i, seg in enumerate(ckpt):
+            for idx in range(*seg):
+                nodes = self.node_list[idx]
+                for n in nodes:
+                    if n.op in checkpointable_op:
+                        n.meta['activation_checkpoint'] = i
+        return deepcopy(self.graph)
+
+    def run_chen_greedy(self, b: int = 0) -> Tuple[List[Tuple[int, int]], int]:
+        """
+        This is the simple implementation of Algorithm 3 in https://arxiv.org/abs/1604.06174.
+
+        Args:
+            b (int, optional): Memory threshold above which the running segment is closed. Defaults to 0.
+
+        Returns:
+            Tuple[List[Tuple[int, int]], int]: The half-open ``(start, end)`` index intervals into
+            ``self.node_list`` and ``floor(sqrt(x * y))`` computed from the accumulated costs.
+        """
+        ckpt_intv = []
+        temp = 0
+        x = 0
+        y = 0
+        prev_idx = 2
+        for idx, nodes in enumerate(self.node_list):
+            for n in nodes:
+                n: Node
+                temp += calculate_fwd_in(n) + calculate_fwd_tmp(n)
+                y = max(y, temp)
+            if temp > b and idx > prev_idx:
+                x += calculate_fwd_in(nodes[0])
+                temp = 0
+                ckpt_intv.append((prev_idx, idx + 1))
+                prev_idx = idx + 1
+        return ckpt_intv, math.floor(math.sqrt(x * y))
+
+    def grid_search(self) -> List[Tuple[int, int]]:
+        """
+        Search ckpt strategy with b = 0, then run the allocation algorithm again with b = √xy.
+        Grid search over [√2/2 b, √2 b] for ``ckpt_opt`` over ``num_grids`` as in appendix A.
+
+        Returns:
+            List[Tuple[int, int]]: The intervals of the candidate with the smallest estimated budget.
+        """
+        _, b_approx = self.run_chen_greedy(0)
+        b_min, b_max = math.floor(b_approx / math.sqrt(2)), math.ceil(b_approx * math.sqrt(2))
+        # A narrow interval (or num_grids < 1) would give a zero step, which ``range`` rejects.
+        step = max(1, (b_max - b_min) // max(1, self.num_grids))
+        # An empty range (b_min == b_max) still evaluates the single candidate b_min.
+        candidates = range(b_min, b_max, step) or [b_min]
+        b_opt = math.inf
+        ckpt_opt = []
+        for b in candidates:
+            ckpt_intv, b_approx = self.run_chen_greedy(b)
+            if b_approx < b_opt:
+                b_opt = b_approx
+                ckpt_opt = ckpt_intv
+        return ckpt_opt
--- a/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.c
+++ b/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.c
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+/*
+Rotor solver for checkpointing problem in C. We follow the modeling mentioned in
+paper `Optimal checkpointing for heterogeneous chains: how to train deep neural
+networks with limited memory` https://hal.inria.fr/hal-02352969. Some lines of
+the code are adapted from https://gitlab.inria.fr/hiepacs/rotor.
+*/
+/*
+Copy a Python sequence of integers into a freshly calloc'ed C array of
+`len + 1` longs whose last element is a 0 sentinel. The caller owns the
+buffer and must free() it. Returns NULL on failure; a Python exception is set
+unless `pylist` itself is NULL (the failed lookup has then already set one).
+*/
+long* PySequenceToLongArray(PyObject* pylist) {
+  if (!pylist) return NULL;
+  if (!PySequence_Check(pylist)) {
+    PyErr_SetString(PyExc_TypeError, "expected a sequence of integers");
+    return NULL;
+  }
+  Py_ssize_t len = PySequence_Size(pylist);
+  if (len < 0) return NULL; /* exception already set by PySequence_Size */
+  long* result = (long*)calloc(len + 1, sizeof(long));
+  if (!result) {
+    PyErr_NoMemory();
+    return NULL;
+  }
+  for (Py_ssize_t i = 0; i < len; ++i) {
+    PyObject* item = PySequence_GetItem(pylist, i);
+    if (!item) { /* Py_DECREF(NULL) would crash */
+      free(result);
+      return NULL;
+    }
+    result[i] = PyLong_AsLong(item);
+    Py_DECREF(item);
+    /* -1 is a legal value, so the error indicator has to be consulted too */
+    if (result[i] == -1 && PyErr_Occurred()) {
+      free(result);
+      return NULL;
+    }
+  }
+  result[len] = 0;
+  return result;
+}
+
+/*
+Copy a Python sequence of numbers into a freshly calloc'ed C array of
+`len + 1` doubles whose last element is a 0 sentinel. The caller owns the
+buffer and must free() it. Returns NULL on failure; a Python exception is set
+unless `pylist` itself is NULL (the failed lookup has then already set one).
+*/
+double* PySequenceToDoubleArray(PyObject* pylist) {
+  if (!pylist) return NULL;
+  if (!PySequence_Check(pylist)) {
+    PyErr_SetString(PyExc_TypeError, "expected a sequence of numbers");
+    return NULL;
+  }
+  Py_ssize_t len = PySequence_Size(pylist);
+  if (len < 0) return NULL; /* exception already set by PySequence_Size */
+  double* result = (double*)calloc(len + 1, sizeof(double));
+  if (!result) {
+    PyErr_NoMemory();
+    return NULL;
+  }
+  for (Py_ssize_t i = 0; i < len; ++i) {
+    PyObject* item = PySequence_GetItem(pylist, i);
+    if (!item) { /* Py_DECREF(NULL) would crash */
+      free(result);
+      return NULL;
+    }
+    result[i] = PyFloat_AsDouble(item);
+    Py_DECREF(item);
+    /* -1.0 is a legal value, so the error indicator has to be consulted too */
+    if (result[i] == -1.0 && PyErr_Occurred()) {
+      free(result);
+      return NULL;
+    }
+  }
+  result[len] = 0;
+  return result;
+}
+
+/*
+Read attribute `attributeName` of `container` and convert it with
+PySequenceToLongArray(). Returns a malloc'ed buffer owned by the caller, or
+NULL when the attribute is missing or cannot be converted.
+*/
+long* getLongArray(PyObject* container, const char* attributeName) {
+  PyObject* sequence = PyObject_GetAttrString(container, attributeName);
+  /* Lookup failed: the exception is already set, and Py_DECREF(NULL) would
+     crash, so bail out before touching the reference. */
+  if (!sequence) return NULL;
+  long* result = PySequenceToLongArray(sequence);
+  Py_DECREF(sequence);
+  return result;
+}
+
+/*
+Read attribute `attributeName` of `container` and convert it with
+PySequenceToDoubleArray(). Returns a malloc'ed buffer owned by the caller, or
+NULL when the attribute is missing or cannot be converted.
+*/
+double* getDoubleArray(PyObject* container, const char* attributeName) {
+  PyObject* sequence = PyObject_GetAttrString(container, attributeName);
+  /* Lookup failed: the exception is already set, and Py_DECREF(NULL) would
+     crash, so bail out before touching the reference. */
+  if (!sequence) return NULL;
+  double* result = PySequenceToDoubleArray(sequence);
+  Py_DECREF(sequence);
+  return result;
+}
+
+/*
+compute_table(chain, mmax) -> (cost_table, back_ptr)
+
+Dynamic program of the rotor algorithm. `chain` must expose the sequence
+attributes ftime, btime, x, xbar, ftmp and btmp and support len(); `mmax` is
+the number of memory slots.
+
+cost_table[m][i] is a dict mapping l (i <= l <= len(chain)) to the cost of
+processing stages i..l with m slots (inf when infeasible). back_ptr[m][i][l]
+is (True,) for a chain checkpoint and (False, j) for a leaf checkpoint at j.
+
+The converted arrays carry one trailing 0 sentinel, which is what makes the
+`[i + 1]` / `[idx + 1]` reads at the end of the chain valid.
+NOTE(review): the attribute lengths are trusted to satisfy
+Chain.check_lengths(); they are not re-validated here.
+*/
+static PyObject* computeTable(PyObject* self, PyObject* args) {
+  PyObject* chainParam;
+  int mmax;
+  /* Everything released in `cleanup` is declared (and nulled) up front so
+     that every error path can jump there safely. */
+  PyObject* result = NULL;
+  PyObject* pyCostTable = NULL;
+  PyObject* pyBackPtr = NULL;
+  double* ftime = NULL;
+  double* btime = NULL;
+  long* x = NULL;
+  long* xbar = NULL;
+  long* ftmp = NULL;
+  long* btmp = NULL;
+  double* costTable = NULL;
+  long* backPtr = NULL;
+  long chainLength = 0;
+  long numSlots = 0;
+  size_t tableSize = 0;
+
+  if (!PyArg_ParseTuple(args, "Oi", &chainParam, &mmax)) return NULL;
+
+  /* A negative budget yields empty tables, like `range(mmax + 1)` in the
+     Python implementation. */
+  numSlots = mmax < 0 ? 0 : (long)mmax + 1;
+
+  if (!(ftime = getDoubleArray(chainParam, "ftime"))) goto cleanup;
+  if (!(btime = getDoubleArray(chainParam, "btime"))) goto cleanup;
+  if (!(x = getLongArray(chainParam, "x"))) goto cleanup;
+  if (!(xbar = getLongArray(chainParam, "xbar"))) goto cleanup;
+  /* The forward temporary memory lives in `ftmp`; it used to be read from
+     `btmp`, which made the forward constraints use backward numbers. */
+  if (!(ftmp = getLongArray(chainParam, "ftmp"))) goto cleanup;
+  if (!(btmp = getLongArray(chainParam, "btmp"))) goto cleanup;
+
+  chainLength = PyObject_Length(chainParam);
+  if (chainLength < 0) goto cleanup; /* exception set by PyObject_Length */
+  if (chainLength == 0) {
+    PyErr_SetString(PyExc_ValueError,
+                    "chain must contain at least one stage");
+    goto cleanup;
+  }
+
+  tableSize = (size_t)numSlots * (size_t)(chainLength + 1) *
+              (size_t)(chainLength + 1);
+
+#define COST_TABLE(m, i, l)                               \
+  costTable[(m) * (chainLength + 1) * (chainLength + 1) + \
+            (i) * (chainLength + 1) + (l)]
+  /* calloc(0, ...) may legitimately return NULL, so never request 0 items */
+  costTable = (double*)calloc(tableSize ? tableSize : 1, sizeof(double));
+
+#define BACK_PTR(m, i, l)                               \
+  backPtr[(m) * (chainLength + 1) * (chainLength + 1) + \
+          (i) * (chainLength + 1) + (l)]
+  backPtr = (long*)calloc(tableSize ? tableSize : 1, sizeof(long));
+
+  if (!costTable || !backPtr) {
+    PyErr_NoMemory();
+    goto cleanup;
+  }
+
+  /* Base case: sub-chains of a single stage (lhs == rhs). */
+  for (long m = 0; m < numSlots; ++m)
+    for (long i = 0; i <= chainLength; ++i) {
+      if ((m >= x[i + 1] + xbar[i + 1] + btmp[i]) &&
+          (m >= x[i + 1] + xbar[i + 1] + ftmp[i])) {
+        COST_TABLE(m, i, i) = ftime[i] + btime[i];
+      } else {
+        COST_TABLE(m, i, i) = INFINITY;
+      }
+    }
+
+  /* General case: sub-chains i..idx of increasing length d. */
+  for (long m = 0; m < numSlots; ++m) {
+    for (long d = 1; d <= chainLength; ++d) {
+      for (long i = 0; i <= chainLength - d; ++i) {
+        long idx = i + d;
+        /* Minimum memory needed to run the forward of i..idx at all. */
+        long mmin = x[idx + 1] + x[i + 1] + ftmp[i];
+        if (idx > i + 1) {
+          long maxCostFWD = 0;
+          for (long j = i + 1; j < idx; j++) {
+            long costFWD = x[j] + x[j + 1] + ftmp[j];
+            if (costFWD > maxCostFWD) maxCostFWD = costFWD;
+          }
+          if (x[idx + 1] + maxCostFWD > mmin) mmin = x[idx + 1] + maxCostFWD;
+        }
+        if ((m >= mmin)) {
+          /* Option 1: checkpoint only the input of stage j (leaf). */
+          long bestLeaf = -1;
+          double sumFw = 0;
+          double bestLeafCost = INFINITY;
+          for (long j = i + 1; j <= idx; ++j) {
+            sumFw += ftime[j - 1];
+            if (m >= x[j]) {
+              double cost = sumFw + COST_TABLE(m - x[j], j, idx) +
+                            COST_TABLE(m, i, j - 1);
+              if (cost < bestLeafCost) {
+                bestLeafCost = cost;
+                bestLeaf = j;
+              }
+            }
+          }
+          /* Option 2: keep everything of stage i (chain checkpoint). */
+          double chainCost = INFINITY;
+          if (m >= xbar[i + 1]) {
+            chainCost =
+                COST_TABLE(m, i, i) + COST_TABLE(m - xbar[i + 1], i + 1, idx);
+          }
+          if (bestLeafCost <= chainCost) {
+            COST_TABLE(m, i, idx) = bestLeafCost;
+            BACK_PTR(m, i, idx) = bestLeaf;
+          } else {
+            COST_TABLE(m, i, idx) = chainCost;
+            BACK_PTR(m, i, idx) = -1;
+          }
+        } else {
+          COST_TABLE(m, i, idx) = INFINITY;
+        }
+      }
+    }
+  }
+
+  pyCostTable = PyList_New(numSlots);
+  pyBackPtr = PyList_New(numSlots);
+  if (!pyCostTable || !pyBackPtr) goto cleanup;
+
+  // Convert the result into Python world
+  for (long m = 0; m < numSlots; ++m) {
+    /* PyList_SET_ITEM steals the reference, so once a child is stored the
+       top-level lists own it and `cleanup` releases the whole tree. */
+    PyObject* pyCostTable_m = PyList_New(chainLength + 1);
+    if (!pyCostTable_m) goto cleanup;
+    PyList_SET_ITEM(pyCostTable, m, pyCostTable_m);
+    PyObject* pyBackPtr_m = PyList_New(chainLength + 1);
+    if (!pyBackPtr_m) goto cleanup;
+    PyList_SET_ITEM(pyBackPtr, m, pyBackPtr_m);
+    for (long i = 0; i <= chainLength; ++i) {
+      PyObject* pyCostTable_m_i = PyDict_New();
+      if (!pyCostTable_m_i) goto cleanup;
+      PyList_SET_ITEM(pyCostTable_m, i, pyCostTable_m_i);
+      PyObject* pyBackPtr_m_i = PyDict_New();
+      if (!pyBackPtr_m_i) goto cleanup;
+      PyList_SET_ITEM(pyBackPtr_m, i, pyBackPtr_m_i);
+      for (long l = i; l <= chainLength; ++l) {
+        PyObject* pyVar_l = PyLong_FromLong(l);
+        PyObject* pyCostTable_m_i_l = PyFloat_FromDouble(COST_TABLE(m, i, l));
+        PyObject* pyBackPtr_m_i_l;
+        if (BACK_PTR(m, i, l) < 0) {
+          pyBackPtr_m_i_l = Py_BuildValue("(O)", Py_True);
+        } else {
+          pyBackPtr_m_i_l = Py_BuildValue("(Ol)", Py_False, BACK_PTR(m, i, l));
+        }
+        int failed =
+            !pyVar_l || !pyCostTable_m_i_l || !pyBackPtr_m_i_l ||
+            PyDict_SetItem(pyCostTable_m_i, pyVar_l, pyCostTable_m_i_l) < 0 ||
+            PyDict_SetItem(pyBackPtr_m_i, pyVar_l, pyBackPtr_m_i_l) < 0;
+        /* PyDict_SetItem does not steal references; drop ours either way. */
+        Py_XDECREF(pyCostTable_m_i_l);
+        Py_XDECREF(pyBackPtr_m_i_l);
+        Py_XDECREF(pyVar_l);
+        if (failed) goto cleanup;
+      }
+    }
+  }
+
+  result = PyTuple_Pack(2, pyCostTable, pyBackPtr);
+
+cleanup:
+  free(ftime);
+  free(btime);
+  free(x);
+  free(xbar);
+  free(ftmp);
+  free(btmp);
+  free(costTable);
+  free(backPtr);
+  /* PyTuple_Pack took its own references; ours are released in any case. */
+  Py_XDECREF(pyCostTable);
+  Py_XDECREF(pyBackPtr);
+  /* Never hand NULL back to the interpreter without an exception set. */
+  if (!result && !PyErr_Occurred()) {
+    PyErr_SetString(PyExc_TypeError,
+                    "chain attributes must be sequences of numbers");
+  }
+  return result;
+}
+
+/* Method table of the `rotorc` module; the all-zero entry terminates it. */
+static PyMethodDef rotorMethods[] = {
+    {
+        .ml_name = "compute_table",
+        .ml_meth = computeTable,
+        .ml_flags = METH_VARARGS,
+        .ml_doc = "Compute the optimal table with the rotor algorithm.",
+    },
+    {.ml_name = NULL, .ml_meth = NULL, .ml_flags = 0, .ml_doc = NULL},
+};
+
+/* Module definition of `rotorc`. */
+static struct PyModuleDef rotorModule = {
+    .m_base = PyModuleDef_HEAD_INIT,
+    /* name of module */
+    .m_name = "rotorc",
+    /* module documentation, may be NULL */
+    .m_doc =
+        "A simple implementation of dynamic programming algorithm rotor with "
+        "C in https://hal.inria.fr/hal-02352969. Some code are adapted from "
+        "https://gitlab.inria.fr/hiepacs/rotor.",
+    /* size of per-interpreter state of the module,
+       or -1 if the module keeps state in global variables. */
+    .m_size = -1,
+    .m_methods = rotorMethods,
+};
+
+/* Entry point invoked by the interpreter on `import rotorc`. */
+PyMODINIT_FUNC PyInit_rotorc(void) {
+  PyObject* module = PyModule_Create(&rotorModule);
+  return module;
+}
--- a/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.py
+++ b/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.py
+from copy import deepcopy
+from typing import Any, Dict, List, Tuple
+
+from torch import Tensor
+from torch.fx import Graph, Node
+
+from colossalai.auto_parallel.passes.runtime_apply_pass import runtime_apply, runtime_comm_spec_apply
+from colossalai.fx.codegen.activation_checkpoint_codegen import _find_nested_ckpt_regions
+from colossalai.fx.profiler import (
+    activation_size,
+    calculate_bwd_time,
+    calculate_fwd_out,
+    calculate_fwd_time,
+    calculate_fwd_tmp,
+)
+from colossalai.logging import get_dist_logger
+
+from .ckpt_solver_base import CheckpointSolverBase
+from .operation import Backward, Chain, ForwardCheck, ForwardEnable, ForwardNograd, Loss, Sequence
+
+__all__ = ['CheckpointSolverRotor']
+
+
+class CheckpointSolverRotor(CheckpointSolverBase):
+
+    def __init__(self,
+                 graph: Graph,
+                 free_memory: float = -1,
+                 cnode: List[str] = None,
+                 memory_slots: int = 500,
+                 optim_multiplier: float = 1.0):
+        """This is the simple implementation of dynamic programming algorithm rotor
+        in https://hal.inria.fr/hal-02352969. Some code are adapted from
+        https://gitlab.inria.fr/hiepacs/rotor.
+
+        Usage:
+            Assume that we have a ``GraphModule``, and we have already done the extractions
+            to the graph to retrieve all information needed, then we could use the following
+            code to find a solution using ``CheckpointSolverRotor``:
+            >>> solver = CheckpointSolverRotor(gm.graph, free_memory=torch.cuda.mem_get_info(device=0)[0])
+            >>> rotor_graph = solver.solve(force_python=True)   # otherwise use C solver
+            >>> gm.graph = rotor_graph    # set the graph to a new graph
+
+        Args:
+            graph (Graph): The computing graph to be optimized.
+            free_memory (float, optional): Memory constraint for the solution, unit is byte.
+                Use ``torch.cuda.mem_get_info(device=0)[0]`` to estimate the free_memory. Defaults to -1.
+            cnode (List[str], optional): Common node List, should be the subset of input. Defaults to None.
+            memory_slots (int, optional): Number of slots for discretizing memory budget. Defaults to 500.
+            optim_multiplier (float, optional): The multiplier of extra weight storage for the
+            ``torch.optim.Optimizer``. Default to 1.0.
+        """
+        super().__init__(graph, free_memory, True, cnode, optim_multiplier)
+        self.memory_slots = memory_slots
+
+        # construct chain
+        unit = self.free_memory // self.memory_slots
+        self.chain = self._construct_chain(self.graph, self.node_list)
+        self.chain.discretize_all(unit)
+
+        self.cost_table = None
+        self.back_ptr = None
+        self.sequence = None
+
+    def solve(self, force_python: bool = False, verbose: bool = False) -> Graph:
+        """Solve the checkpointing problem using rotor algorithm.
+
+        Args:
+            force_python (bool, optional): Use Python version of solver, else use C version. Defaults to False.
+            verbose (bool, optional): Print verbose information. Defaults to False.
+
+        Returns:
+            graph (Graph): The optimized graph, should be a copy of the original graph.
+        """
+        chain = self.chain
+
+        # compute cost table
+        if force_python:
+            self.cost_table, self.back_ptr = self._compute_table(chain, self.memory_slots)
+        else:
+            self.cost_table, self.back_ptr = self._compute_table_c(chain, self.memory_slots)
+
+        if verbose:
+            self.print_chain()
+
+        # backtrack
+        try:
+            self.sequence = self._backtrack(chain, 0, len(chain), self.memory_slots - chain.x[0], self.cost_table,
+                                            self.back_ptr)
+            self._annotate_from_sequence(self.sequence, self.node_list)
+        except ValueError as e:
+            # using logger to annonce that the solver is failed
+            logger = get_dist_logger()
+            logger.warning(f'Checkpoint solver failed: {e}')
+            raise ValueError
+
+        if verbose:
+            self.print_sequence()
+
+        return deepcopy(self.graph)
+
+    def print_chain(self):
+        print('[input]', self.chain.x[0], self.chain.xbar[0], self.chain.ftmp[0], self.chain.btmp[0])
+        for idx in range(len(self.node_list) - 1):
+            print(self.node_list[idx], self.chain.x[idx + 1], self.chain.xbar[idx + 1], self.chain.ftmp[idx],
+                  self.chain.btmp[idx])
+        print(f'Chain = {self.chain}')
+
+    def print_sequence(self):
+        print(f'Sequence = {self.sequence}')
+
+    @classmethod
+    def _construct_chain(cls, graph: Graph, node_list: List[List[Node]]) -> Chain:
+        input_tensors = cls._extract_input(graph)
+        ftime, btime, ftmp, btmp = list(), list(), list(), list()
+        xbar, x = [activation_size(input_tensors)], [activation_size(input_tensors)]
+
+        for node in node_list:
+            node_info = cls._extract_node_info(node)
+            ftime.append(node_info[0])
+            btime.append(node_info[1])
+            x.append(node_info[2])
+            xbar.append(node_info[3])
+            ftmp.append(node_info[4])
+            btmp.append(node_info[5])
+
+        # currently we view loss backward temp as zero
+        btime.append(0)
+        btmp.append(0)
+
+        return Chain(ftime, btime, x, xbar, ftmp, btmp)
+
+    @classmethod
+    def _extract_node_info(cls, node: List[Node]) -> Tuple[int, ...]:
+        """Extract node info from a list of nodes"""
+        xbar = 0
+        ftime = 0
+        btime = 0
+        fwd_mem_peak = 0
+        for n in node:
+            assert isinstance(n, Node), f'{n} is not a Node'
+            if n.target == runtime_apply or n.target == runtime_comm_spec_apply:
+                # in this case we need to calculate memory usage directly based on the statics that hooked in node.meta
+                xbar += n.meta['fwd_mem_out']
+                fwd_mem_peak = max(fwd_mem_peak, xbar + n.meta['fwd_mem_tmp'])
+            else:
+                xbar += calculate_fwd_tmp(n) + calculate_fwd_out(n)
+                fwd_mem_peak = max(fwd_mem_peak, xbar + n.meta['fwd_mem_tmp'] + cls._extract_unused_output(n))
+
+            # minimum flop count is required
+            ftime += max(calculate_fwd_time(n), 1.0)
+            btime += max(calculate_bwd_time(n), 1.0)
+
+        x = calculate_fwd_out(node[-1])
+        xbar = max(x, xbar)
+        ftmp = fwd_mem_peak - xbar
+        btmp = cls._extract_btmp(node)
+        return ftime, btime, x, xbar, ftmp, btmp
+
+    @staticmethod
+    def _extract_input(graph: Graph) -> Tuple[Tensor, ...]:
+        """Extract input tensors from a Graph"""
+        input_tensors = []
+        for node in graph.nodes:
+            if node.op == 'placeholder':
+                input_tensors.append(node.meta['fwd_out'])
+        return input_tensors
+
+    @staticmethod
+    def _extract_unused_output(node: Node) -> int:
+        """Extract unused output from `torch.fx.Node`"""
+        return activation_size(node.meta['fwd_out']) - calculate_fwd_out(node)
+
+    @staticmethod
+    def _extract_btmp(node: List[Node]) -> int:
+        """Extract btmp from a list of nodes
+
+        The nodes are walked in reverse order. At every step the candidate value is the
+        size reported by ``_extract_deps_size()`` plus the node's ``bwd_mem_tmp``; the
+        maximum over all steps is returned.
+        """
+
+        def _extract_deps_size():
+            # reads the enclosing ``deps`` dict as it stands at call time
+            deps_size = 0
+            for k, v in deps.items():
+                k: Node
+                # still pending: count its backward output
+                if v > 0:
+                    deps_size += k.meta['bwd_mem_out']
+                # marked free (see below): its forward memory is subtracted
+                if v == float('-inf'):
+                    deps_size -= calculate_fwd_tmp(k) + calculate_fwd_out(k)
+
+            return deps_size
+
+        btmp = 0
+        # node -> remaining counter, initialised with the node's number of input nodes
+        deps = {}
+        for n in reversed(node):
+            deps[n] = len(n.all_input_nodes)
+            btmp = max(btmp, _extract_deps_size() + n.meta['bwd_mem_tmp'])
+            # every already-visited user of ``n`` has one dependency less
+            for child in n.users:
+                if child in deps:
+                    deps[child] -= 1
+                    if deps[child] <= 0:
+                        deps[child] = float('-inf')    # free
+        return btmp
+
+    @staticmethod
+    def _compute_table(chain: Chain, mmax: int) -> Tuple:
+        """Compute the table using dynamic programming. Returns the cost table and the backtracking pointer.
+
+        Args:
+            chain (Chain): A basic linearized structure for solving the dynamic programming problem.
+            mmax (int): Maximum number of memory slots.
+
+        Returns:
+            cost_table (List): cost_table[m][lhs][rhs] indicates the optimal cost of the subproblem from lhs to rhs
+            with m memory slots. Infeasible subproblems hold ``float("inf")``.
+            back_ptr (List): back_ptr[m][lhs][rhs] indicates the best operation at this point. It is (True,) if the optimal choice
+            is a chain checkpoint, it is (False, j) if the optimal choice is a leaf checkpoint of length j.
+            No entry is stored for ``lhs == rhs`` or for infeasible subproblems.
+        """
+
+        # pad every list with a trailing zero so that the ``[i + 1]`` / ``[idx + 1]`` lookups
+        # below stay in range at the end of the chain (``btime`` is used as-is)
+        ftime = chain.ftime + [0.0]
+        btime = chain.btime
+        x = chain.x + [0]
+        xbar = chain.xbar + [0]
+        ftmp = chain.ftmp + [0]
+        btmp = chain.btmp + [0]
+
+        # Build table
+        cost_table = [[{} for _ in range(len(chain) + 1)] for _ in range(mmax + 1)]
+        back_ptr = [[{} for _ in range(len(chain) + 1)] for _ in range(mmax + 1)]
+
+        # Initialize corner cases where length of sequence equals to 1, i.e. lhs == rhs
+        for m in range(mmax + 1):
+            for i in range(len(chain) + 1):
+                # both the forward and the backward temporary memory must fit
+                limit = max(x[i + 1] + xbar[i + 1] + ftmp[i], x[i + 1] + xbar[i + 1] + btmp[i])
+                if m >= limit:
+                    cost_table[m][i][i] = ftime[i] + btime[i]
+                else:
+                    cost_table[m][i][i] = float("inf")
+
+        # Compute tables
+        for m in range(mmax + 1):
+            # d is the length of the sub-chain, so shorter sub-chains are solved first
+            for d in range(1, len(chain) + 1):
+                for i in range(len(chain) + 1 - d):
+                    idx = i + d
+                    # minimum memory required to run the forward of i..idx at all
+                    mmin = x[idx + 1] + x[i + 1] + ftmp[i]
+                    if idx > i + 1:
+                        mmin = max(mmin, x[idx + 1] + max(x[j] + x[j + 1] + ftmp[j] for j in range(i + 1, idx)))
+                    if m < mmin:
+                        cost_table[m][i][idx] = float("inf")
+                    else:
+                        # option 1: store only the input of stage j, solve j..idx with the
+                        # reduced budget, then redo i..j-1 with the full budget
+                        leaf_checkpoints = [(j,
+                                             sum(ftime[i:j]) + cost_table[m - x[j]][j][idx] + cost_table[m][i][j - 1])
+                                            for j in range(i + 1, idx + 1)
+                                            if m >= x[j]]
+                        if leaf_checkpoints:
+                            best_leaf = min(leaf_checkpoints, key=lambda t: t[1])
+                        else:
+                            best_leaf = None
+                        # option 2: keep everything of stage i and solve i+1..idx with what is left
+                        if m >= xbar[i + 1]:
+                            chain_checkpoint = cost_table[m][i][i] + cost_table[m - xbar[i + 1]][i + 1][idx]
+                        else:
+                            chain_checkpoint = float("inf")
+                        # ties go to the leaf checkpoint
+                        if best_leaf and best_leaf[1] <= chain_checkpoint:
+                            cost_table[m][i][idx] = best_leaf[1]
+                            back_ptr[m][i][idx] = (False, best_leaf[0])
+                        else:
+                            cost_table[m][i][idx] = chain_checkpoint
+                            back_ptr[m][i][idx] = (True,)
+        return cost_table, back_ptr
+
+    @staticmethod
+    def _compute_table_c(chain: Chain, mmax: int) -> Tuple:
+        try:
+            from .rotorc import compute_table
+
+        # build module if module not found
+        except ModuleNotFoundError:
+            import os
+            import subprocess
+            import sys
+            logger = get_dist_logger()
+            logger.info("rotorc hasn't been built! Building library...", ranks=[0])
+            this_dir = os.path.dirname(os.path.abspath(__file__))
+            result = subprocess.Popen(
+                [
+                    f"{sys.executable}", f"{os.path.join(this_dir, 'build_c_ext.py')}", "build_ext",
+                    f"--build-lib={this_dir}"
+                ],
+                stdout=subprocess.PIPE,
+                stderr=subprocess.PIPE,
+            )
+            if result.wait() == 0:
+                logger.info("rotorc has been built!", ranks=[0])
+                from .rotorc import compute_table
+            else:
+                logger.warning("rotorc built failed! Using python version!", ranks=[0])
+                return CheckpointSolverRotor._compute_table(chain, mmax)
+        return compute_table(chain, mmax)
+
+    @staticmethod
+    def _backtrack(chain: Chain, lhs: int, rhs: int, budget: int, cost_table: List[Any],
+                   back_ptr: List[Any]) -> "Sequence":
+        """Backtrack the cost table and retrieve the optimal checkpointing strategy.
+
+        Args:
+            chain (Chain): A basic linearized structure for solving the dynamic programming problem.
+            lhs (int): The left index of the interval to backtrack.
+            rhs (int): The right index of the interval to backtrack.
+            budget (int): The memory budget for processing this interval.
+            cost_table (List[Any]): See ``._compute_table()`` for definitions
+            back_ptr (List[Any]): See ``._compute_table()`` for definitions
+
+        Raises:
+            ValueError: Can not process the chain.
+
+        Returns:
+            sequence (Sequence): The sequence of executing nodes with checkpoints.
+        """
+        if budget <= 0:
+            raise ValueError(f"Can not process a chain with negative memory {budget}")
+        elif cost_table[budget][lhs][rhs] == float("inf"):
+            raise ValueError(f"Can not process this chain from index {lhs} to {rhs} with memory {budget}")
+
+        sequence = Sequence()
+        if rhs == lhs:
+            if lhs == len(chain):
+                sequence += [Loss()]
+            else:
+                sequence += [ForwardEnable(lhs), Backward(lhs)]
+            return sequence
+
+        if back_ptr[budget][lhs][rhs][0]:
+            sequence += [
+                ForwardEnable(lhs),
+                CheckpointSolverRotor._backtrack(chain, lhs + 1, rhs, budget - chain.xbar[lhs + 1], cost_table,
+                                                 back_ptr),
+                Backward(lhs),
+            ]
+        else:
+            best_leaf = back_ptr[budget][lhs][rhs][1]
+            sequence += [ForwardCheck(lhs)]
+            sequence += [ForwardNograd(k) for k in range(lhs + 1, best_leaf)]
+            sequence += [
+                CheckpointSolverRotor._backtrack(chain, best_leaf, rhs, budget - chain.x[best_leaf], cost_table,
+                                                 back_ptr),
+                CheckpointSolverRotor._backtrack(chain, lhs, best_leaf - 1, budget, cost_table, back_ptr),
+            ]
+        return sequence
+
+    @staticmethod
+    def _annotate_from_sequence(sequence: Sequence, node_list: List[List[Node]]):
+        """Annotate the nodes in the ``node_list`` with activation checkpoint from the sequence.
+
+        Each annotated node gets ``n.meta['activation_checkpoint']`` set to a list: the first
+        entry comes from the forward part of the sequence (before ``Loss``), further entries are
+        appended from recomputations found in the backward part. Lists within one top-level
+        checkpoint region are finally padded with ``None`` to a common length.
+
+        Args:
+            sequence (Sequence): The sequence of executing nodes with activation checkpoint annotations.
+            node_list (List[List[Node]]): The list of nodes to annotate.
+        """
+        op_list = sequence.list_operations()
+        # split the flat schedule at the loss into its forward and backward halves
+        loss_op = next(op for op in op_list if isinstance(op, Loss))
+        fwd_list = op_list[:op_list.index(loss_op)]
+        bwd_list = op_list[op_list.index(loss_op) + 1:]
+        ckpt_idx = 0
+        in_ckpt = False
+        ckpt_region = []
+
+        # forward annotation
+        # NOTE: here the position ``idx`` in ``fwd_list`` is used as the index into ``node_list``
+        for idx, op in enumerate(fwd_list, 0):
+            if in_ckpt:
+                if isinstance(op, ForwardNograd):
+                    ckpt_region.append(idx)
+
+                elif isinstance(op, ForwardEnable):
+                    # the open region ends here; label it and leave checkpoint mode
+                    in_ckpt = False
+                    for node_idx in ckpt_region:
+                        for n in node_list[node_idx]:
+                            n.meta['activation_checkpoint'] = [ckpt_idx]
+
+                    ckpt_idx += 1
+                    ckpt_region = []
+
+                elif isinstance(op, ForwardCheck):
+                    # label the open region and immediately start the next one at ``idx``
+                    for node_idx in ckpt_region:
+                        for n in node_list[node_idx]:
+                            n.meta['activation_checkpoint'] = [ckpt_idx]
+
+                    ckpt_idx += 1
+                    ckpt_region = [idx]
+
+            else:
+                if isinstance(op, ForwardCheck):
+                    in_ckpt = True
+                    ckpt_region.append(idx)
+
+        # annotate the backward if there is any nested activation checkpoint
+        # NOTE: unlike the forward pass, regions are recorded with ``op.index`` here
+        in_recompute = False
+        for op in bwd_list:
+            if in_recompute:
+                if isinstance(op, ForwardNograd):
+                    ckpt_region.append(op.index)
+
+                elif isinstance(op, ForwardEnable):
+                    for node_idx in ckpt_region:
+                        for n in node_list[node_idx]:
+                            n.meta['activation_checkpoint'].append(ckpt_idx)
+
+                    ckpt_idx += 1
+                    ckpt_region = []
+
+                elif isinstance(op, ForwardCheck):
+                    for node_idx in ckpt_region:
+                        for n in node_list[node_idx]:
+                            n.meta['activation_checkpoint'].append(ckpt_idx)
+
+                    ckpt_idx += 1
+                    ckpt_region = [op.index]
+
+                elif isinstance(op, Backward):
+                    # a backward op closes the recomputation
+                    for node_idx in ckpt_region:
+                        for n in node_list[node_idx]:
+                            n.meta['activation_checkpoint'].append(ckpt_idx)
+
+                    in_recompute = False
+
+            else:
+                # any non-backward op in the backward half starts a recomputation, and the
+                # nested checkpoint numbering restarts from zero
+                if not isinstance(op, Backward):
+                    in_recompute = True
+                    ckpt_idx = 0
+                    ckpt_region = []
+                    if isinstance(op, ForwardCheck):
+                        ckpt_region.append(op.index)
+
+        # postprocess, make sure every activation checkpoint label in the
+        # same activation checkpoint region (level = 0) has the same length
+        op_list = []
+        for node in node_list:
+            op_list += node
+        ckpt_regions = _find_nested_ckpt_regions(op_list)
+        for (start_idx, end_idx) in ckpt_regions:
+            nested_length = max(
+                len(op_list[idx].meta['activation_checkpoint']) for idx in range(start_idx, end_idx + 1))
+            for idx in range(start_idx, end_idx + 1):
+                op_list[idx].meta['activation_checkpoint'] += [None] * (nested_length -
+                                                                        len(op_list[idx].meta['activation_checkpoint']))
--- a/colossalai/auto_parallel/checkpoint/operation.py
+++ b/colossalai/auto_parallel/checkpoint/operation.py
+import math
+from abc import ABC
+from typing import Any, Iterable, List
+
+from torch.utils._pytree import tree_map
+
+
+class Chain:
+
+    def __init__(self,
+                 ftime: List[float],
+                 btime: List[float],
+                 x: List[int],
+                 xbar: List[int],
+                 ftmp: List[int],
+                 btmp: List[int],
+                 check_consistency: bool = True):
+        """The chain is a basic linearized structure for solving the dynamic programming problem for activation checkpoint.
+        See paper https://hal.inria.fr/hal-02352969 for details.
+
+        Args:
+            ftime (List[float]): The forward time of each node.
+            btime (List[float]): The backward time of each node.
+            x (List[int]): The forward memory of each node (if save_output). Same as `a` in the paper.
+            xbar (List[int]): The forward memory of each node (if save_all). Same as `a_bar` in the paper.
+            ftmp (List[int]): The temporary forward memory of each node.
+            btmp (List[int]): The temporary backward memory of each node, can be used to control memory budget.
+            check_consistency (bool, optional): Check the lengths consistency for the `Chain`. Defaults to True.
+        """
+        self.ftime = ftime
+        self.btime = btime
+        self.x = x
+        self.xbar = xbar
+        self.ftmp = ftmp
+        self.btmp = btmp
+        if check_consistency and not self.check_lengths():
+            raise AttributeError("In Chain, input lists do not have consistent lengths")
+
+    def check_lengths(self):
+        return ((len(self.ftime) == len(self)) and (len(self.btime) == len(self) + 1) and (len(self.x) == len(self) + 1)
+                and (len(self.ftmp) == len(self)) and (len(self.btmp) == len(self) + 1)
+                and (len(self.xbar) == len(self) + 1))
+
+    def __repr__(self):
+        chain_list = []
+        for i in range(len(self)):
+            chain_list.append((self.ftime[i], self.btime[i], self.x[i], self.xbar[i], self.ftmp[i], self.btmp[i]))
+        i = len(self)
+        chain_list.append((None, self.btime[i], self.x[i], self.xbar[i], None, self.btmp[i]))
+        return chain_list.__repr__()
+
+    def __len__(self):
+        return len(self.ftime)
+
+    def discretize_all(self, unit: int):
+        """Discretize the chain into a list of chains according to unit size."""
+        discretizer = lambda val: math.ceil(val / unit)
+        self.x = tree_map(discretizer, self.x)
+        self.xbar = tree_map(discretizer, self.xbar)
+        self.ftmp = tree_map(discretizer, self.ftmp)
+        self.btmp = tree_map(discretizer, self.btmp)
+
+
+class Operation(ABC):
+    name = "Op"
+
+    def __repr__(self) -> str:
+        return f"{self.name}_{self.index}"
+
+    def shift(self, value):
+        if type(self.index) is tuple:
+            self.index = tuple(x + value for x in self.index)
+        else:
+            self.index += value
+
+
+class Forward(Operation):
+    name = "F"
+
+    def __init__(self, index):
+        self.index = index
+
+    def cost(self, chain: Chain):
+        if chain is not None:
+            return chain.ftime[self.index]
+        else:
+            return 1
+
+
+class ForwardEnable(Forward):
+    # Forward variant printed as ``Fe``.
+    # NOTE(review): presumably a forward pass with grad enabled - confirm against the paper.
+    name = "Fe"
+
+
+class ForwardNograd(Forward):
+    # Forward variant printed as ``Fn``.
+    # NOTE(review): presumably a forward pass that keeps nothing for backward - confirm.
+    name = "Fn"
+
+
+class ForwardCheck(Forward):
+    # Forward variant printed as ``CF``.
+    # NOTE(review): presumably a forward pass that checkpoints its input - confirm.
+    name = "CF"
+
+
+class Forwards(Operation):
+
+    def __init__(self, start, end):
+        self.index = (start, end)
+
+    def __repr__(self):
+        return "F_{i}->{j}".format(i=self.index[0], j=self.index[1])
+
+    def cost(self, chain: Chain):
+        if chain is not None:
+            return sum(chain.ftime[self.index[0]:self.index[1] + 1])
+        else:
+            return (self.index[1] - self.index[0] + 1)
+
+
+def isForward(op):
+    return type(op) is Forward or type(op) is Forwards
+
+
+class Backward(Operation):
+    name = "B"
+
+    def __init__(self, index):
+        self.index = index
+
+    def cost(self, chain: Chain):
+        if chain is not None:
+            return chain.btime[self.index]
+        else:
+            return 1
+
+
+class Loss(Operation):
+
+    def __init__(self):
+        pass
+
+    def __repr__(self):
+        return "L"
+
+    def cost(self, chain):
+        return 0
+
+
+class MemoryAccess(Operation):
+    """Operation on a single ``index`` whose ``cost`` is always ``0``."""
+    name = "MA"
+
+    def __init__(self, index):
+        self.index = index
+
+    def cost(self, chain: Chain):
+        """Return ``0`` regardless of ``chain``."""
+        return 0
+
+
+class WriteMemory(MemoryAccess):
+    """``MemoryAccess`` variant that only overrides the ``name`` tag with ``"WM"``.
+
+    NOTE(review): presumably marks a write of the value at ``index``; confirm with the code that emits it.
+    """
+    name = "WM"
+
+
+class ReadMemory(MemoryAccess):
+    """``MemoryAccess`` variant that only overrides the ``name`` tag with ``"RM"``.
+
+    NOTE(review): presumably marks a read of the value at ``index``; confirm with the code that emits it.
+    """
+    name = "RM"
+
+
+class DiscardMemory(MemoryAccess):
+    """``MemoryAccess`` variant that only overrides the ``name`` tag with ``"DM"``.
+
+    NOTE(review): presumably marks that the value at ``index`` is dropped; confirm with the code that emits it.
+    """
+    name = "DM"
+
+
+class Sequence(list):
+
+    def __init__(self):
+        super().__init__()
+
+    def __repr__(self):
+        return repr(self.list_operations())
+
+    def list_operations(self):
+        op_list = []
+        for x in self:
+            if isinstance(x, Operation):
+                op_list.append(x)
+            else:
+                assert isinstance(x, Sequence)
+                op_list += x.list_operations()
+        return op_list
--- a/colossalai/auto_parallel/meta_profiler/__init__.py
+++ b/colossalai/auto_parallel/meta_profiler/__init__.py
+from .meta_registry import *
+from .registry import meta_register
+from .shard_metainfo import *
--- a/colossalai/auto_parallel/meta_profiler/constants.py
+++ b/colossalai/auto_parallel/meta_profiler/constants.py
+import operator
+
+import torch
+import torch.nn as nn
+
+from ..tensor_shard.constants import *
+
+# Module classes treated as in-place modules.
+# NOTE(review): nn.ReLU only works in place when built with inplace=True;
+# confirm how consumers of this list account for that.
+INPLACE_MODULE = [nn.ReLU]
+
+# Callables treated as in-place operations.
+# NOTE(review): torch.flatten may return a view or a copy rather than
+# modifying its input; confirm the intended meaning of "in-place" here.
+INPLACE_OPS = [torch.flatten]
+
+# Callables that do not save forward activations.
+NO_SAVE_ACTIVATION = [torch.add, torch.sub, operator.add, operator.sub]