init colossalai, support dtk2304

08f2920e · zhuwenwen · da3f0934 · 08f2920e · 08f2920e · 08f2920e
Commit 08f2920e authored Apr 23, 2023 by zhuwenwen
20 changed files
--- a/colossalai/auto_parallel/checkpoint/ckpt_solver_base.py
+++ b/colossalai/auto_parallel/checkpoint/ckpt_solver_base.py
+from abc import ABC, abstractmethod
+from copy import deepcopy
+from typing import Any, List
+
+import torch
+from torch.fx import Graph, Node
+
+from colossalai.fx.codegen.activation_checkpoint_codegen import ActivationCheckpointCodeGen
+from colossalai.fx.profiler.memory_utils import is_inplace
+
+# Public API of this module (the name must be exactly `__all__` to be honored by `import *`).
+__all__ = ['CheckpointSolverBase']
+
+
+def _copy_output(src: Graph, dst: Graph):
+    """Make every ``output`` node of ``dst`` share the meta dict of its ``src`` counterpart.
+
+    Nodes are paired positionally by walking both graphs in lockstep; the pairing
+    stops at the shorter of the two node sequences.
+    """
+    for src_node, dst_node in zip(src.nodes, dst.nodes):
+        if src_node.op != 'output':
+            continue
+        # the meta object itself is shared, not copied
+        dst_node.meta = src_node.meta
+
+
+def _get_param_size(module: torch.nn.Module):
+    """Return the total number of bytes taken by the parameters of ``module``.
+
+    Each parameter contributes ``numel * bytes-per-element``; a module without
+    parameters yields ``0``.
+    """
+    total_bytes = 0
+    for param in module.parameters():
+        total_bytes += param.numel() * param.element_size()
+    return total_bytes
+
+
+class CheckpointSolverBase(ABC):
+
+    def __init__(
+        self,
+        graph: Graph,
+        free_memory: float = -1.0,
+        requires_linearize: bool = False,
+        cnode: List[str] = None,
+    ):
+        """CheckpointSolver class will integrate information provided by the components
+        and use an existing solver to find a possible optimal strategies combination for
+        target computing graph.
+
+        Existing Solvers:
+            Chen's Greedy solver: https://arxiv.org/abs/1604.06174  (CheckpointSolverChen)
+            Rotor solver: https://hal.inria.fr/hal-02352969  (CheckpointSolverRotor)
+
+        Args:
+            graph (Graph): The computing graph to be optimized.
+            free_memory (float): Memory constraint for the solution.
+            requires_linearize (bool): Whether the graph needs to be linearized.
+            cnode (List[str], optional): Common node List, should be the subset of input. Default to None.
+                The list is copied, so the caller's object is never modified.
+
+        Raises:
+            RuntimeError: If any node of the graph carries no meta information.
+
+        Warnings:
+            `MetaInfoProp` should be done before constructing the solver. Meta information of the graph is required.
+        """
+        # super-dainiu: this graph is a temporary graph which can refer to
+        # the owning module, but we will return another deepcopy of it after
+        # the solver is executed.
+        self.graph = deepcopy(graph)
+        self.graph.owning_module = graph.owning_module
+        _copy_output(graph, self.graph)
+        self.graph.set_codegen(ActivationCheckpointCodeGen())
+
+        # check if `MetaInfoProp` is done
+        if any(len(node.meta) == 0 for node in self.graph.nodes):
+            raise RuntimeError(
+                "Nodes meta information hasn't been prepared! Please run MetaInfoProp before constructing the solver!")
+
+        self.free_memory = free_memory
+        self.parameter_size = _get_param_size(self.graph.owning_module)
+        # Keep a private copy: `_linearize_graph` appends propagated common nodes to
+        # `self.cnode`. Sharing the caller's list would leak those names back and make
+        # a second solver built from the same list fail the placeholder check.
+        self.cnode = list(cnode) if cnode is not None else None
+        self.requires_linearize = requires_linearize
+        if self.requires_linearize:
+            self.node_list = self._linearize_graph()
+        else:
+            self.node_list = self.get_node_list()
+
+    @abstractmethod
+    def solve(self):
+        """Solve the checkpointing problem and return the solution.
+        """
+        pass
+
+    def get_node_list(self):
+        """Get the node list, with every node of the graph wrapped in its own single-element list.
+        """
+        return [[node] for node in self.graph.nodes]
+
+    def _linearize_graph(self) -> List[List[Node]]:
+        """Linearizing the graph stored in ``self.graph``.
+
+        Returns:
+            List[List[Node]]: List of list, each inside list of Node presents
+            the actual 'node' in linearized manner.
+
+        Raises:
+            ValueError: If a name in ``self.cnode`` does not exist in the graph.
+            AssertionError: If a name in ``self.cnode`` is not a placeholder node.
+
+        Remarks:
+            Do merge the inplace ops into the previous node.
+        """
+
+        # Common nodes are type of nodes that could be seen as attributes and remain
+        # unchanged throughout the whole model, it will be used several times by
+        # different blocks of model, so that it is hard for us to linearize the graph
+        # when we encounter those kinds of nodes. We let users to annotate some of the
+        # input as common node, such as attention mask, and the followings are some of
+        # the ops that could actually be seen as common nodes. With our common node prop,
+        # we could find some of the "real" common nodes (e.g. the real attention mask
+        # used in BERT and GPT), the rule is simple, for node who's parents are all common
+        # nodes or it's op belongs to the following operations, we view this node as a
+        # newly born common node.
+        # List of target name that could be seen as common node
+        common_ops = ["getattr", "getitem", "size"]
+
+        def _is_cop(target: Any) -> bool:
+            """Check if an op could be seen as common node
+
+            Args:
+                target (Any): node target
+
+            Returns:
+                bool
+            """
+
+            if isinstance(target, str):
+                return target in common_ops
+            # callables without a `__name__` can never match a common op
+            return getattr(target, "__name__", None) in common_ops
+
+        def _is_sink() -> bool:
+            """Check if we can free all dependencies
+
+            Reads the enclosing loop's current node ``n`` and the ``deps`` counters.
+
+            Returns:
+                bool
+            """
+
+            return not sum(deps.values()) and not any(map(is_inplace, n.users))
+
+        # make sure that item in cnode is valid
+        if self.cnode:
+            for name in self.cnode:
+                try:
+                    assert next(node for node in self.graph.nodes if node.name == name).op == "placeholder", \
+                    f"Common node {name} is not an input of the model."
+                except StopIteration:
+                    raise ValueError(f"Common node name {name} not in graph.")
+
+        else:
+            self.cnode = []
+
+        # deps maps a node to the number of its users that have not been visited yet
+        deps = {}
+        node_list = []
+        region = []
+
+        for n in self.graph.nodes:
+            if n.op != "placeholder" and n.op != "output":
+                for n_par in n.all_input_nodes:
+                    if n_par.op != "placeholder" and n_par.name not in self.cnode:
+                        deps[n_par] -= 1
+                region.append(n)
+
+                # if the node could free all dependencies in graph
+                # we could begin a new node
+                if _is_sink():
+                    node_list.append(region)
+                    region = []
+
+                # propagate common node attr if possible
+                if all(node.name in self.cnode for node in n.all_input_nodes) or _is_cop(n.target):
+                    self.cnode.append(n.name)
+                else:
+                    deps[n] = len([user for user in n.users if user.op != "output"])
+        return node_list
--- a/colossalai/auto_parallel/checkpoint/ckpt_solver_chen.py
+++ b/colossalai/auto_parallel/checkpoint/ckpt_solver_chen.py
+import math
+from copy import deepcopy
+from typing import List, Set, Tuple
+
+from torch.fx import Graph, Node
+
+from colossalai.fx.profiler import calculate_fwd_in, calculate_fwd_tmp
+
+from .ckpt_solver_base import CheckpointSolverBase
+
+__all__ = ['CheckpointSolverChen']
+
+
+class CheckpointSolverChen(CheckpointSolverBase):
+
+    def __init__(self, graph: Graph, cnode: List[str] = None, num_grids: int = 6):
+        """
+        This is the simple implementation of Algorithm 3 in https://arxiv.org/abs/1604.06174.
+        Note that this algorithm targets at memory optimization only, using techniques in appendix A.
+
+        Usage:
+            Assume that we have a `GraphModule`, and we already applied the `MetaInfoProp`
+            to the graph to retrieve all information needed, then we could use the following
+            code to find a solution using `CheckpointSolverChen`:
+            >>> solver = CheckpointSolverChen(gm.graph)
+            >>> chen_graph = solver.solve()
+            >>> gm.graph = chen_graph    # set the graph to a new graph
+
+        Args:
+            graph (Graph): The computing graph to be optimized.
+            cnode (List[str], optional): Common node List, should be the subset of input. Defaults to None.
+            num_grids (int, optional): Number of grids to search for b. Defaults to 6.
+        """
+        # Keyword arguments keep the call in sync with the base signature
+        # (graph, free_memory, requires_linearize, cnode); the former positional
+        # call passed one argument too many and raised TypeError.
+        super().__init__(graph, free_memory=0, requires_linearize=True, cnode=cnode)
+        self.num_grids = num_grids
+
+    def solve(self) -> Graph:
+        """Solve the checkpointing problem using Algorithm 3.
+
+        Every checkpointable node inside the i-th selected interval gets
+        ``meta['activation_checkpoint'] = i``.
+
+        Returns:
+            graph (Graph): The optimized graph, should be a copy of the original graph.
+        """
+        checkpointable_op = ['call_module', 'call_method', 'call_function', 'get_attr']
+        ckpt = self.grid_search()
+        for i, seg in enumerate(ckpt):
+            for idx in range(*seg):
+                for n in self.node_list[idx]:
+                    if n.op in checkpointable_op:
+                        n.meta['activation_checkpoint'] = i
+        return deepcopy(self.graph)
+
+    def run_chen_greedy(self, b: int = 0) -> Tuple[List[Tuple[int, int]], int]:
+        """
+        This is the simple implementation of Algorithm 3 in https://arxiv.org/abs/1604.06174.
+
+        Args:
+            b (int, optional): Memory threshold; an interval is closed once the accumulated
+                forward memory exceeds it. Defaults to 0.
+
+        Returns:
+            Tuple[List[Tuple[int, int]], int]: The half-open ``(start, end)`` index intervals
+            into ``self.node_list`` and the budget estimate ``floor(sqrt(x * y))``.
+        """
+        ckpt_intv = []
+        temp = 0    # memory accumulated since the last interval boundary
+        x = 0    # sum of fwd_in of the first node of every closed interval
+        y = 0    # peak of `temp` seen so far
+        prev_idx = 2
+        for idx, nodes in enumerate(self.node_list):
+            for n in nodes:
+                temp += calculate_fwd_in(n) + calculate_fwd_tmp(n)
+                y = max(y, temp)
+            if temp > b and idx > prev_idx:
+                x += calculate_fwd_in(nodes[0])
+                temp = 0
+                ckpt_intv.append((prev_idx, idx + 1))
+                prev_idx = idx + 1
+        return ckpt_intv, math.floor(math.sqrt(x * y))
+
+    def grid_search(self) -> List[Tuple[int, int]]:
+        """
+        Search ckpt strategy with b = 0, then run the allocation algorithm again with b = √xy.
+        Grid search over [√2/2 b, √2 b] for ckpt_opt over num_grids as in appendix A.
+
+        Returns:
+            List[Tuple[int, int]]: The intervals with the smallest budget estimate. When the
+            search range is empty, the intervals found with b = 0 are returned.
+        """
+        # the b = 0 solution doubles as the fallback when the grid below is empty
+        ckpt_opt, b_approx = self.run_chen_greedy(0)
+        b_min, b_max = math.floor(b_approx / math.sqrt(2)), math.ceil(b_approx * math.sqrt(2))
+        b_opt = math.inf
+        # a range narrower than num_grids would give a zero step, which `range` rejects
+        step = max(1, (b_max - b_min) // self.num_grids)
+        for b in range(b_min, b_max, step):
+            ckpt_intv, b_approx = self.run_chen_greedy(b)
+            if b_approx < b_opt:
+                b_opt = b_approx
+                ckpt_opt = ckpt_intv
+        return ckpt_opt
--- a/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.c
+++ b/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.c
+#define PY_SSIZE_T_CLEAN
+#include <Python.h>
+
+/* Convert a Python sequence of ints into a freshly allocated C array of long.
+ *
+ * The array has one extra trailing element set to 0. The caller owns the
+ * result and must free() it. Returns NULL with a Python exception set when
+ * the argument is not a sequence, an item is not convertible to long, or
+ * memory runs out. A NULL `pylist` returns NULL and leaves the error state
+ * untouched (the failed lookup that produced it is expected to have set one). */
+long* PySequenceToLongArray(PyObject* pylist) {
+  if (!pylist) return NULL;
+  if (!PySequence_Check(pylist)) {
+    PyErr_SetString(PyExc_TypeError, "expected a sequence of integers");
+    return NULL;
+  }
+  Py_ssize_t len = PySequence_Size(pylist);
+  if (len < 0) return NULL;
+  /* calloc zero-fills, so the trailing sentinel result[len] is already 0 */
+  long* result = (long*)calloc(len + 1, sizeof(long));
+  if (!result) {
+    PyErr_NoMemory();
+    return NULL;
+  }
+  for (Py_ssize_t i = 0; i < len; ++i) {
+    PyObject* item = PySequence_GetItem(pylist, i);
+    if (!item) {
+      free(result);
+      return NULL;
+    }
+    result[i] = PyLong_AsLong(item);
+    Py_DECREF(item);
+    /* -1 is a legal value; only treat it as failure if an exception is set */
+    if (result[i] == -1 && PyErr_Occurred()) {
+      free(result);
+      return NULL;
+    }
+  }
+  return result;
+}
+
+/* Convert a Python sequence of numbers into a freshly allocated C array of
+ * double.
+ *
+ * The array has one extra trailing element set to 0. The caller owns the
+ * result and must free() it. Returns NULL with a Python exception set when
+ * the argument is not a sequence, an item is not convertible to double, or
+ * memory runs out. A NULL `pylist` returns NULL and leaves the error state
+ * untouched (the failed lookup that produced it is expected to have set one). */
+double* PySequenceToDoubleArray(PyObject* pylist) {
+  if (!pylist) return NULL;
+  if (!PySequence_Check(pylist)) {
+    PyErr_SetString(PyExc_TypeError, "expected a sequence of numbers");
+    return NULL;
+  }
+  Py_ssize_t len = PySequence_Size(pylist);
+  if (len < 0) return NULL;
+  /* calloc zero-fills, so the trailing sentinel result[len] is already 0 */
+  double* result = (double*)calloc(len + 1, sizeof(double));
+  if (!result) {
+    PyErr_NoMemory();
+    return NULL;
+  }
+  for (Py_ssize_t i = 0; i < len; ++i) {
+    PyObject* item = PySequence_GetItem(pylist, i);
+    if (!item) {
+      free(result);
+      return NULL;
+    }
+    result[i] = PyFloat_AsDouble(item);
+    Py_DECREF(item);
+    /* -1.0 is a legal value; only treat it as failure if an exception is set */
+    if (result[i] == -1.0 && PyErr_Occurred()) {
+      free(result);
+      return NULL;
+    }
+  }
+  return result;
+}
+
+/* Read attribute `attributeName` of `container` and convert it to a C array
+ * of long via PySequenceToLongArray. The caller must free() the result.
+ * Returns NULL when the attribute lookup or the conversion fails. */
+long* getLongArray(PyObject* container, const char* attributeName) {
+  PyObject* sequence = PyObject_GetAttrString(container, attributeName);
+  /* a missing attribute yields NULL; Py_DECREF on it would crash */
+  if (!sequence) return NULL;
+  long* result = PySequenceToLongArray(sequence);
+  Py_DECREF(sequence);
+  return result;
+}
+
+/* Read attribute `attributeName` of `container` and convert it to a C array
+ * of double via PySequenceToDoubleArray. The caller must free() the result.
+ * Returns NULL when the attribute lookup or the conversion fails. */
+double* getDoubleArray(PyObject* container, const char* attributeName) {
+  PyObject* sequence = PyObject_GetAttrString(container, attributeName);
+  /* a missing attribute yields NULL; Py_DECREF on it would crash */
+  if (!sequence) return NULL;
+  double* result = PySequenceToDoubleArray(sequence);
+  Py_DECREF(sequence);
+  return result;
+}
+
+/* compute_table(chain, mmax) -> (cost_table, back_ptr)
+ *
+ * Fills the dynamic-programming tables of the rotor algorithm for memory
+ * budgets 0..mmax. `chain` must expose the sequence attributes ftime, btime,
+ * x, xbar, ftmp and btmp and support len().
+ *
+ * Both results are lists indexed by [m][i] whose elements are dicts keyed by
+ * l (i <= l <= len(chain)). cost_table values are floats (inf when the
+ * sub-chain does not fit in m); back_ptr values are (True,) or
+ * (False, bestLeaf).
+ *
+ * Returns NULL with a Python exception set on any failure; every temporary
+ * buffer is released on all paths. */
+static PyObject* computeTable(PyObject* self, PyObject* args) {
+  PyObject* chainParam;
+  int mmax;
+
+  double* ftime = NULL;
+  double* btime = NULL;
+  long* x = NULL;
+  long* xbar = NULL;
+  long* ftmp = NULL;
+  long* btmp = NULL;
+  double* costTable = NULL;
+  long* backPtr = NULL;
+  PyObject* pyCostTable = NULL;
+  PyObject* pyBackPtr = NULL;
+  PyObject* result = NULL;
+  long chainLength = 0;
+
+  if (!PyArg_ParseTuple(args, "Oi", &chainParam, &mmax)) return NULL;
+  if (mmax < 0) {
+    PyErr_SetString(PyExc_ValueError, "mmax must be non-negative");
+    return NULL;
+  }
+
+  if (!(ftime = getDoubleArray(chainParam, "ftime"))) goto cleanup;
+  if (!(btime = getDoubleArray(chainParam, "btime"))) goto cleanup;
+  if (!(x = getLongArray(chainParam, "x"))) goto cleanup;
+  if (!(xbar = getLongArray(chainParam, "xbar"))) goto cleanup;
+  /* forward temporaries come from `ftmp` (was mistakenly read from `btmp`) */
+  if (!(ftmp = getLongArray(chainParam, "ftmp"))) goto cleanup;
+  if (!(btmp = getLongArray(chainParam, "btmp"))) goto cleanup;
+
+  chainLength = (long)PyObject_Length(chainParam);
+  if (chainLength < 0) goto cleanup; /* len() failed, exception already set */
+  if (chainLength == 0) {
+    PyErr_SetString(PyExc_ValueError, "chain must not be empty");
+    goto cleanup;
+  }
+
+#define COST_TABLE(m, i, l)                               \
+  costTable[(m) * (chainLength + 1) * (chainLength + 1) + \
+            (i) * (chainLength + 1) + (l)]
+  costTable = (double*)calloc(
+      (size_t)(mmax + 1) * (chainLength + 1) * (chainLength + 1),
+      sizeof(double));
+
+#define BACK_PTR(m, i, l)                               \
+  backPtr[(m) * (chainLength + 1) * (chainLength + 1) + \
+          (i) * (chainLength + 1) + (l)]
+  backPtr = (long*)calloc(
+      (size_t)(mmax + 1) * (chainLength + 1) * (chainLength + 1),
+      sizeof(long));
+
+  if (!costTable || !backPtr) {
+    PyErr_NoMemory();
+    goto cleanup;
+  }
+
+  /* Base case: a single stage i is feasible in budget m only if both its
+   * forward and its backward temporaries fit. The input arrays carry one
+   * zeroed sentinel element, which the i + 1 accesses rely on. */
+  for (long m = 0; m <= mmax; ++m)
+    for (long i = 0; i <= chainLength; ++i)
+      if ((m >= x[i + 1] + xbar[i + 1] + btmp[i]) &&
+          (m >= x[i + 1] + xbar[i + 1] + ftmp[i]))
+        COST_TABLE(m, i, i) = ftime[i] + btime[i];
+      else
+        COST_TABLE(m, i, i) = INFINITY;
+
+  /* Sub-chains [i, idx] of increasing length d */
+  for (long m = 0; m <= mmax; ++m)
+    for (long d = 1; d <= chainLength; ++d) {
+      for (long i = 0; i <= chainLength - d; ++i) {
+        long idx = i + d;
+        /* minimal memory needed to run the forward pass through [i, idx] */
+        long mmin = x[idx + 1] + x[i + 1] + ftmp[i];
+        if (idx > i + 1) {
+          long maxCostFWD = 0;
+          for (long j = i + 1; j < idx; j++) {
+            maxCostFWD = fmaxl(maxCostFWD, x[j] + x[j + 1] + ftmp[j]);
+          }
+          mmin = fmaxl(mmin, x[idx + 1] + maxCostFWD);
+        }
+        if ((m >= mmin)) {
+          /* option 1: keep x[j], solve [j, idx] first, then [i, j - 1] */
+          long bestLeaf = -1;
+          double sumFw = 0;
+          double bestLeafCost = INFINITY;
+          for (long j = i + 1; j <= idx; ++j) {
+            sumFw += ftime[j - 1];
+            if (m >= x[j]) {
+              double cost = sumFw + COST_TABLE(m - x[j], j, idx) +
+                            COST_TABLE(m, i, j - 1);
+              if (cost < bestLeafCost) {
+                bestLeafCost = cost;
+                bestLeaf = j;
+              }
+            }
+          }
+          /* option 2: keep xbar[i + 1] and continue with [i + 1, idx] */
+          double chainCost = INFINITY;
+          if (m >= xbar[i + 1])
+            chainCost =
+                COST_TABLE(m, i, i) + COST_TABLE(m - xbar[i + 1], i + 1, idx);
+          if (bestLeafCost <= chainCost) {
+            COST_TABLE(m, i, idx) = bestLeafCost;
+            BACK_PTR(m, i, idx) = bestLeaf;
+          } else {
+            COST_TABLE(m, i, idx) = chainCost;
+            BACK_PTR(m, i, idx) = -1;
+          }
+        } else
+          COST_TABLE(m, i, idx) = INFINITY;
+      }
+    }
+
+  pyCostTable = PyList_New(mmax + 1);
+  pyBackPtr = PyList_New(mmax + 1);
+  if (!pyCostTable || !pyBackPtr) goto cleanup;
+
+  // Convert the result into Python world
+  for (long m = 0; m <= mmax; ++m) {
+    PyObject* pyCostTable_m = PyList_New(chainLength + 1);
+    if (!pyCostTable_m) goto cleanup;
+    PyList_SET_ITEM(pyCostTable, m, pyCostTable_m); /* steals the reference */
+    PyObject* pyBackPtr_m = PyList_New(chainLength + 1);
+    if (!pyBackPtr_m) goto cleanup;
+    PyList_SET_ITEM(pyBackPtr, m, pyBackPtr_m);
+    for (long i = 0; i <= chainLength; ++i) {
+      PyObject* pyCostTable_m_i = PyDict_New();
+      if (!pyCostTable_m_i) goto cleanup;
+      PyList_SET_ITEM(pyCostTable_m, i, pyCostTable_m_i);
+      PyObject* pyBackPtr_m_i = PyDict_New();
+      if (!pyBackPtr_m_i) goto cleanup;
+      PyList_SET_ITEM(pyBackPtr_m, i, pyBackPtr_m_i);
+      for (long l = i; l <= chainLength; ++l) {
+        PyObject* pyVar_l = PyLong_FromLong(l);
+        PyObject* pyCostTable_m_i_l = PyFloat_FromDouble(COST_TABLE(m, i, l));
+        PyObject* pyBackPtr_m_i_l;
+        if (BACK_PTR(m, i, l) < 0)
+          pyBackPtr_m_i_l = Py_BuildValue("(O)", Py_True);
+        else
+          pyBackPtr_m_i_l = Py_BuildValue("(Ol)", Py_False, BACK_PTR(m, i, l));
+        int failed =
+            !pyVar_l || !pyCostTable_m_i_l || !pyBackPtr_m_i_l ||
+            PyDict_SetItem(pyCostTable_m_i, pyVar_l, pyCostTable_m_i_l) < 0 ||
+            PyDict_SetItem(pyBackPtr_m_i, pyVar_l, pyBackPtr_m_i_l) < 0;
+        /* PyDict_SetItem takes its own references, drop ours either way */
+        Py_XDECREF(pyCostTable_m_i_l);
+        Py_XDECREF(pyBackPtr_m_i_l);
+        Py_XDECREF(pyVar_l);
+        if (failed) goto cleanup;
+      }
+    }
+  }
+
+  result = PyTuple_Pack(2, pyCostTable, pyBackPtr);
+
+cleanup:
+  free(ftime);
+  free(btime);
+  free(x);
+  free(xbar);
+  free(ftmp);
+  free(btmp);
+  free(costTable);
+  free(backPtr);
+  Py_XDECREF(pyCostTable);
+  Py_XDECREF(pyBackPtr);
+  /* never hand NULL back to the interpreter without an exception */
+  if (!result && !PyErr_Occurred())
+    PyErr_SetString(PyExc_RuntimeError,
+                    "compute_table: invalid chain parameter");
+  return result;
+}
+
+/* Method table of the extension module: exposes computeTable to Python under
+ * the name `compute_table`, taking positional arguments (METH_VARARGS). */
+static PyMethodDef rotorMethods[] = {
+    {"compute_table", computeTable, METH_VARARGS,
+     "Compute the optimal table with the rotor algorithm."},
+    {NULL, NULL, 0, NULL} /* Sentinel */
+};
+
+/* Module definition for the extension module named `rotorc`. */
+static struct PyModuleDef rotorModule = {
+    PyModuleDef_HEAD_INIT, "rotorc", /* name of module */
+    "A simple implementation of dynamic programming algorithm rotor with C in "
+    "https://hal.inria.fr/hal-02352969. Some code are adapted from "
+    "https://gitlab.inria.fr/hiepacs/rotor.", /* module documentation, may be
+                                                 NULL */
+    -1, /* size of per-interpreter state of the module,
+                   or -1 if the module keeps state in global variables. */
+    rotorMethods};
+
+/* Module initialization entry point: creates the module from rotorModule. */
+PyMODINIT_FUNC PyInit_rotorc(void) { return PyModule_Create(&rotorModule); }
--- a/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.py
+++ b/colossalai/auto_parallel/checkpoint/ckpt_solver_rotor.py
--- a/colossalai/auto_parallel/checkpoint/operation.py
+++ b/colossalai/auto_parallel/checkpoint/operation.py
+import math
+from abc import ABC
+from typing import Any, Iterable, List
+
+from torch.utils._pytree import tree_map
+
+
+class Chain:
+
+    def __init__(self,
+                 ftime: List[float],
+                 btime: List[float],
+                 x: List[int],
+                 xbar: List[int],
+                 ftmp: List[int],
+                 btmp: List[int],
+                 check_consistency: bool = True):
+        """Linearized description of a model used by the activation checkpoint dynamic program.
+        See paper https://hal.inria.fr/hal-02352969 for details.
+
+        The chain length is ``len(ftime)``; ``ftime`` and ``ftmp`` hold one entry per
+        stage while ``btime``, ``x``, ``xbar`` and ``btmp`` hold one extra entry.
+
+        Args:
+            ftime (List[float]): The forward time of each node.
+            btime (List[float]): The backward time of each node.
+            x (List[int]): The forward memory of each node (if save_output). Same as `a` in the paper.
+            xbar (List[int]): The forward memory of each node (if save_all). Same as `a_bar` in the paper.
+            ftmp (List[int]): The temporary forward memory of each node.
+            btmp (List[int]): The temporary backward memory of each node, can be used to control memory budget.
+            check_consistency (bool, optional): Check the lengths consistency for the `Chain`. Defaults to True.
+
+        Raises:
+            AttributeError: If ``check_consistency`` is set and the list lengths disagree.
+        """
+        self.ftime, self.btime = ftime, btime
+        self.x, self.xbar = x, xbar
+        self.ftmp, self.btmp = ftmp, btmp
+        if not check_consistency:
+            return
+        if not self.check_lengths():
+            raise AttributeError("In Chain, input lists do not have consistent lengths")
+
+    def check_lengths(self):
+        """Return True when every list has the length implied by ``len(self)``."""
+        n = len(self)
+        expected = (
+            (self.ftime, n),
+            (self.btime, n + 1),
+            (self.x, n + 1),
+            (self.ftmp, n),
+            (self.btmp, n + 1),
+            (self.xbar, n + 1),
+        )
+        return all(len(values) == size for values, size in expected)
+
+    def __repr__(self):
+        n = len(self)
+        rows = [(self.ftime[k], self.btime[k], self.x[k], self.xbar[k], self.ftmp[k], self.btmp[k]) for k in range(n)]
+        # the trailing entry has no forward time and no forward temporary
+        rows.append((None, self.btime[n], self.x[n], self.xbar[n], None, self.btmp[n]))
+        return repr(rows)
+
+    def __len__(self):
+        return len(self.ftime)
+
+    def discretize_all(self, unit: int):
+        """Replace, in place, every memory value (x, xbar, ftmp, btmp) by ``ceil(value / unit)``."""
+
+        def _to_units(val):
+            return math.ceil(val / unit)
+
+        for attr in ("x", "xbar", "ftmp", "btmp"):
+            setattr(self, attr, tree_map(_to_units, getattr(self, attr)))
+
+
+class Operation(ABC):
+    """Base class of schedule operations; subclasses set ``index`` and may override ``name``."""
+    name = "Op"
+
+    def __repr__(self) -> str:
+        return "{}_{}".format(self.name, self.index)
+
+    def shift(self, value):
+        """Offset ``index`` by ``value``; a tuple index is shifted element-wise."""
+        if type(self.index) is not tuple:
+            self.index += value
+            return
+        self.index = tuple(component + value for component in self.index)
+
+
+class Forward(Operation):
+    """Forward operation on the stage at ``index``."""
+    name = "F"
+
+    def __init__(self, index):
+        self.index = index
+
+    def cost(self, chain: Chain):
+        """Return the forward time of this stage in ``chain``, or 1 when no chain is given."""
+        return 1 if chain is None else chain.ftime[self.index]
+
+
+class ForwardEnable(Forward):
+    """`Forward` variant labelled "Fe"."""
+    name = "Fe"
+
+
+class ForwardNograd(Forward):
+    """`Forward` variant labelled "Fn"."""
+    name = "Fn"
+
+
+class ForwardCheck(Forward):
+    """`Forward` variant labelled "CF"."""
+    name = "CF"
+
+
+class Forwards(Operation):
+    """Forward operation over the inclusive stage range ``(start, end)``."""
+
+    def __init__(self, start, end):
+        self.index = (start, end)
+
+    def __repr__(self):
+        first, last = self.index[0], self.index[1]
+        return "F_{i}->{j}".format(i=first, j=last)
+
+    def cost(self, chain: Chain):
+        """Return the summed forward time of the range, or its length when no chain is given."""
+        first, last = self.index[0], self.index[1]
+        if chain is None:
+            return (last - first + 1)
+        return sum(chain.ftime[first:last + 1])
+
+
+def isForward(op):
+    """Return True only when ``op`` is exactly a `Forward` or a `Forwards` (subclasses do not count)."""
+    op_type = type(op)
+    if op_type is Forward:
+        return True
+    return op_type is Forwards
+
+
+class Backward(Operation):
+    """Backward operation on the stage at ``index``."""
+    name = "B"
+
+    def __init__(self, index):
+        self.index = index
+
+    def cost(self, chain: Chain):
+        """Return the backward time of this stage in ``chain``, or 1 when no chain is given."""
+        return 1 if chain is None else chain.btime[self.index]
+
+
+class Loss(Operation):
+    """Loss operation; it carries no index and always costs 0."""
+
+    def __init__(self):
+        # nothing to store
+        return None
+
+    def __repr__(self):
+        return "L"
+
+    def cost(self, chain):
+        return 0
+
+
+class MemoryAccess(Operation):
+    """Memory operation on the slot at ``index``; its cost is always 0."""
+    name = "MA"
+
+    def __init__(self, index):
+        self.index = index
+
+    def cost(self, chain: Chain):
+        return 0
+
+
+class WriteMemory(MemoryAccess):
+    """`MemoryAccess` variant labelled "WM"."""
+    name = "WM"
+
+
+class ReadMemory(MemoryAccess):
+    """`MemoryAccess` variant labelled "RM"."""
+    name = "RM"
+
+
+class DiscardMemory(MemoryAccess):
+    """`MemoryAccess` variant labelled "DM"."""
+    name = "DM"
+
+
+class Sequence(list):
+    """A list of `Operation` objects and nested `Sequence` objects."""
+
+    def __init__(self):
+        super().__init__()
+
+    def __repr__(self):
+        return repr(self.list_operations())
+
+    def list_operations(self):
+        """Return the operations as one flat list, expanding nested sequences depth-first."""
+        flat = []
+        for item in self:
+            if not isinstance(item, Operation):
+                assert isinstance(item, Sequence)
+                flat.extend(item.list_operations())
+                continue
+            flat.append(item)
+        return flat
--- a/colossalai/auto_parallel/meta_profiler/__init__.py
+++ b/colossalai/auto_parallel/meta_profiler/__init__.py
+from .meta_registry import *
+from .metainfo import *
+from .registry import meta_register
--- a/colossalai/auto_parallel/meta_profiler/constants.py
+++ b/colossalai/auto_parallel/meta_profiler/constants.py
+import operator
+
+import torch
+import torch.nn as nn
+
+from ..tensor_shard.constants import *
+
+# list of module classes treated as inplace operations
+# NOTE(review): nn.ReLU is only in-place when constructed with inplace=True — confirm how consumers use this list
+INPLACE_MODULE = [nn.ReLU]
+
+# list of operations (torch and `operator` add/sub) that do not save forward activations
+NO_SAVE_ACTIVATION = [torch.add, torch.sub, operator.add, operator.sub]
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/__init__.py
+from .activation import *
+from .binary_elementwise_ops import *
+from .conv import *
+from .linear import *
+from .norm import *
+from .pooling import *
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/activation.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/activation.py
+from typing import List, Tuple
+
+import torch
+
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import MemoryCost, OperationDataType, TrainCycleItem
+from colossalai.fx.profiler.memory_utils import activation_size
+from colossalai.fx.profiler.opcount import flop_mapping
+
+from ..registry import meta_register
+
+__all__ = ["relu_meta_info"]
+
+
+@meta_register.register(torch.nn.ReLU)
+def relu_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:
+    """Meta information generator for torch.nn.ReLU.
+
+    The aten graph of torch.nn.ReLU is
+    graph():
+    %input_2 : [#users=1] = placeholder[target=placeholder](default=)
+    %relu_default : [#users=2] = call_function[target=torch.ops.aten.relu.default](args = (%input_2,), kwargs = {})
+    %zeros_like_default : [#users=1] = call_function[target=torch.ops.aten.zeros_like.default](args = (%relu_default,), kwargs = {dtype: None, layout: None, device: None, pin_memory: None})
+    %detach_default : [#users=1] = call_function[target=torch.ops.aten.detach.default](args = (%relu_default,), kwargs = {})
+    %threshold_backward_default : [#users=1] = call_function[target=torch.ops.aten.threshold_backward.default](args = (%zeros_like_default, %detach_default, None), kwargs = {})
+    %detach_default_1 : [#users=1] = call_function[target=torch.ops.aten.detach.default](args = (%threshold_backward_default,), kwargs = {})
+    %detach_default_2 : [#users=0] = call_function[target=torch.ops.aten.detach.default](args = (%detach_default_1,), kwargs = {})
+
+    Args:
+        *args: operation data items; the first of type ARG is the input and the first of
+            type OUTPUT is the output.
+        **kwargs: ``inplace`` (defaults to False) selects the in-place memory estimate.
+
+    Returns:
+        Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]: compute cost, memory cost and forward inputs
+    """
+
+    input_tensor = next(item for item in args if item.type == OperationDataType.ARG).data
+    output_tensor = next(item for item in args if item.type == OperationDataType.OUTPUT).data
+    is_inplace_relu = kwargs.get("inplace", False)
+
+    # compute cost: relu.default carries the forward flops, threshold_backward the backward flops
+    fwd_flops = flop_mapping[torch.ops.aten.relu.default]([input_tensor], (output_tensor,))
+    bwd_flops = flop_mapping[torch.ops.aten.threshold_backward.default]([output_tensor], (input_tensor,))
+    compute_cost = TrainCycleItem(fwd=fwd_flops, bwd=bwd_flops, total=fwd_flops + bwd_flops)
+
+    # memory cost
+    # NOTE: the inplace ReLU don't have forward memory cost
+    # NOTE: currently in SPMD solver we always believe that there will be a new tensor created in forward
+    if is_inplace_relu:
+        fwd_activation = activation_size(input_tensor)
+    else:
+        fwd_activation = activation_size([output_tensor, input_tensor])
+    fwd_memory_cost = MemoryCost(activation=fwd_activation, parameter=0, temp=0, buffer=0)
+    bwd_memory_cost = MemoryCost(activation=activation_size(input_tensor), parameter=0, temp=0, buffer=0)
+
+    # the total only aggregates activation and parameter of both passes
+    total_memory_cost = MemoryCost(activation=fwd_memory_cost.activation + bwd_memory_cost.activation,
+                                   parameter=fwd_memory_cost.parameter + bwd_memory_cost.parameter)
+    memory_cost = TrainCycleItem(fwd=fwd_memory_cost, bwd=bwd_memory_cost, total=total_memory_cost)
+
+    # the forward input is handed back in a fresh list
+    return compute_cost, memory_cost, [input_tensor]
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/binary_elementwise_ops.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/binary_elementwise_ops.py
+from typing import List, Tuple
+
+import torch
+
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import MemoryCost, OperationDataType, TrainCycleItem
+from colossalai.fx.profiler.memory_utils import activation_size
+from colossalai.fx.profiler.opcount import flop_mapping
+
+from ..constants import BCAST_FUNC_OP
+from ..registry import meta_register
+
+__all__ = ['binary_elementwise_meta_info']
+
+
+@meta_register.register(BCAST_FUNC_OP)
+def binary_elementwise_meta_info(*args, **kwargs) -> Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]:
+    """Meta information generator for binary elementwise operations
+    NOTE: Some of the binary elementwise operations will discard the input activation after computation, as they
+    don't need those tensors for back propagation, for example, if there are two tensors being sent for `torch.add`,
+    they will be discarded right after add operation is done. We create a simple API in `MetaInfo` class to identify
+    this behavior, it is critical for better memory estimation.
+
+    Args:
+        *args: operation data items; exactly two of them must not be of type OUTPUT
+            (the two operands) and the first of type OUTPUT is taken as the result.
+
+    Returns:
+        Tuple[TrainCycleItem, TrainCycleItem, List[torch.Tensor]]: compute cost, memory cost and forward inputs
+    """
+
+    operands = [arg for arg in args if arg.type != OperationDataType.OUTPUT]
+    input_op_data, other_op_data = operands
+    output_op_data = next(arg for arg in args if arg.type == OperationDataType.OUTPUT)
+
+    # forward inputs / outputs handed to the flop counter
+    fwd_in_args = [input_op_data.data, other_op_data.data]
+    fwd_out_args = [output_op_data.data]
+
+    # compute cost
+    # NOTE: we set bwd_compute_cost two times of fwd_compute_cost in this case
+    flop_counter = flop_mapping[torch.ops.aten._adaptive_avg_pool2d.default]
+    fwd_compute_cost = flop_counter(fwd_in_args, fwd_out_args)
+    bwd_compute_cost = fwd_compute_cost * 2
+    compute_cost = TrainCycleItem(fwd=fwd_compute_cost, bwd=bwd_compute_cost, total=fwd_compute_cost + bwd_compute_cost)
+
+    # memory cost: only operands of type PARAM count as parameter memory
+    param_tensors = [arg.data for arg in [input_op_data, other_op_data] if arg.type == OperationDataType.PARAM]
+    param_mem_cost = activation_size(param_tensors)
+    fwd_mem_cost = MemoryCost(
+        activation=activation_size([input_op_data.data, output_op_data.data]),
+        parameter=param_mem_cost,
+    )
+    bwd_mem_cost = MemoryCost(
+        activation=activation_size(fwd_in_args),
+        parameter=param_mem_cost,
+    )
+    total_mem_cost = MemoryCost(
+        activation=fwd_mem_cost.activation + bwd_mem_cost.activation,
+        parameter=fwd_mem_cost.parameter + bwd_mem_cost.parameter,
+    )
+    memory_cost = TrainCycleItem(fwd=fwd_mem_cost, bwd=bwd_mem_cost, total=total_mem_cost)
+
+    # the forward inputs are returned as the same list built above
+    return compute_cost, memory_cost, fwd_in_args
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/conv.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/conv.py
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/linear.py
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/norm.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/norm.py
--- a/colossalai/auto_parallel/meta_profiler/meta_registry/pooling.py
+++ b/colossalai/auto_parallel/meta_profiler/meta_registry/pooling.py
--- a/colossalai/auto_parallel/meta_profiler/metainfo.py
+++ b/colossalai/auto_parallel/meta_profiler/metainfo.py
--- a/colossalai/auto_parallel/meta_profiler/registry.py
+++ b/colossalai/auto_parallel/meta_profiler/registry.py
+__all__ = ['Registry']
+
+
+class Registry:
+    """A named mapping from sources (any hashable key) to registered functions."""
+
+    def __init__(self, name):
+        self.name = name
+        self.store = {}
+
+    def register(self, source):
+        """Return a decorator storing the decorated function under ``source``.
+
+        A list or tuple registers the function under each of its elements; the
+        decorated function is returned unchanged.
+        """
+
+        def wrapper(func):
+            # support register a list of items for this func
+            keys = source if isinstance(source, (list, tuple)) else (source,)
+            for key in keys:
+                self.store[key] = func
+            return func
+
+        return wrapper
+
+    def get(self, source):
+        """Return the function registered for ``source``; asserts that it exists."""
+        assert self.has(source), f'{source} not found in the {self.name} registry'
+        return self.store[source]
+
+    def has(self, source):
+        """Tell whether ``source`` has a registered function."""
+        return source in self.store
+
+
+meta_register = Registry('meta')
--- a/colossalai/auto_parallel/passes/__init__.py
+++ b/colossalai/auto_parallel/passes/__init__.py
--- a/colossalai/auto_parallel/passes/runtime_apply_pass.py
+++ b/colossalai/auto_parallel/passes/runtime_apply_pass.py
--- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py
+++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
--- a/colossalai/auto_parallel/pipeline_shard/__init__.py
+++ b/colossalai/auto_parallel/pipeline_shard/__init__.py