[autoparallel] remove deprecated codes (#2664)

0b2a7383 · YuliangLiu0306 · GitHub · 7fa6be49 · 7fa6be49 · 7fa6be49
Unverified Commit 0b2a7383 authored Feb 15, 2023 by YuliangLiu0306 Committed by GitHub Feb 15, 2023
20 changed files
--- a/colossalai/auto_parallel/tensor_shard/deprecated/__init__.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/__init__.py
-from .cost_graph import CostGraph
-from .graph_analysis import GraphAnalyser
-from .options import SolverOptions
-from .sharding_strategy import ShardingStrategy, StrategiesVector
-from .solver import Solver
-from .strategies_constructor import StrategiesConstructor
--- a/colossalai/auto_parallel/tensor_shard/deprecated/_utils.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/_utils.py
-import functools
-import operator
-import warnings
-from functools import reduce
-from typing import Dict, List, Optional, Union
-
-import torch
-from torch.fx.node import Node
-
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.tensor.sharding_spec import ShardingSpec
-
-from .constants import INFINITY_COST
-
-
-def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: DeviceMesh,
-                           dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec:
-    """
-    Generate the sharding spec of the tensor based on the given dim_partition_dict.
-
-
-    Args:
-        input_ (Union[Node, torch.Tensor]): the input can be a Node object or a PyTorch tensor. If a node is used, it will look for its meta data associated with this node.
-        device_mesh (DeviceMesh): a DeviceMesh object which contains the meta information about the cluster.
-        dim_partition_dict (Dict[int, List[int]]): a dictionary to specify the sharding specs, the key is the tensor dimension and the value is the mesh dimension for sharding.
-    """
-
-    if isinstance(input_, Node):
-        assert hasattr(input_, '_meta_data'), f'The given node has no attribte _meta_data'
-        meta_tensor = input_._meta_data
-        assert meta_tensor is not None, "The given node's _meta_data attribute is None"
-        shape = meta_tensor.shape
-    elif isinstance(input_, torch.Tensor):
-        shape = input_.shape
-    else:
-        raise TypeError(
-            f'We cannot generate sharding spec for {type(input_)} type, only torch.fx.Node or torch.Tensor is expected.'
-        )
-    for dim_index, sharding_index_list in dim_partition_dict.items():
-        sharding_list = [device_mesh.mesh_shape[sharding_index] for sharding_index in sharding_index_list]
-        sharding_size = reduce(operator.mul, sharding_list, 1)
-        assert shape[
-            dim_index] % sharding_size == 0, f'we cannot shard the {dim_index} dimension of tensor into {sharding_size} partitions.'
-
-    sharding_spec = ShardingSpec(device_mesh=device_mesh, entire_shape=shape, dim_partition_dict=dim_partition_dict)
-    return sharding_spec
-
-
-def generate_resharding_costs(nodes: List[Node],
-                              sharding_specs: List[ShardingSpec],
-                              count_backward: Optional[bool] = True,
-                              dtype: Optional[torch.dtype] = None,
-                              index=None):
-    '''
-    Compute the resharding costs with this specific strategy.
-
-    Argument:
-        nodes (List[Node]): a list of nodes
-        sharding_spec_for_input(ShardingSpec): a list of ShardingSpec for the nodes.
-        count_backward (Optional[bool]): whether to include the cost of resharding in the backward pass, default is True. False can be used for inference.
-        dtype (Optional[torch.dtype]): the data type for cost calculation, default is None.
-    '''
-    # The resharding_cost of weight is counted due to sharing weight cases.
-    resharding_costs = {}
-    size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
-
-    # shape consistency manager is a singleton class
-    shape_consistency_manager = ShapeConsistencyManager()
-
-    for input_node, input_spec in zip(nodes, sharding_specs):
-        resharding_costs[input_node] = []
-        for strategy in input_node.strategies_vector:
-            input_sharding_spec = strategy.output_sharding_spec
-            if not isinstance(input_sharding_spec, ShardingSpec):
-                assert isinstance(input_sharding_spec, list), 'only ShardingSpec or List[ShardingSpec] is expected.'
-                input_sharding_spec = input_sharding_spec[index]
-            assert isinstance(input_sharding_spec, ShardingSpec), f'The input node should NOT be a tuple of tensor.'
-            try:
-                # compute the resharding cost
-                _, _, total_resharding_cost = shape_consistency_manager.shape_consistency(
-                    input_sharding_spec, input_spec)
-
-                # we need multiply the size of elem dtype to get correct communication cost
-                resharding_cost = total_resharding_cost["total"] * size_per_elem_bytes
-            except AssertionError as e:
-                warnings.warn(f'{e}')
-                resharding_cost = INFINITY_COST
-            resharding_costs[input_node].append(resharding_cost)
-    return resharding_costs
-
-
-def ignore_sharding_exception(func):
-    """
-    A function wrapper which executes the function with a specified seed.
-    """
-
-    @functools.wraps(func)
-    def wrapper(*args, **kwargs):
-        try:
-            rst = func(*args, **kwargs)
-            return rst
-        except AssertionError as e:
-            warnings.warn(f'{e}')
-
-    return wrapper
-
-
-def enumerate_all_possible_2d_sharding(mesh_dim_0, mesh_dim_1, dim_size):
-    dim_partition_list = []
-    # enumerate all the 2D sharding cases
-    for i in range(dim_size):
-        for j in range(i + 1, dim_size):
-            dim_partition_dict_0 = {i: [mesh_dim_0], j: [mesh_dim_1]}
-            dim_partition_dict_1 = {i: [mesh_dim_1], j: [mesh_dim_0]}
-            dim_partition_list.append(dim_partition_dict_0)
-            dim_partition_list.append(dim_partition_dict_1)
-    for i in range(dim_size):
-        dim_partition_dict_flatten = {i: [mesh_dim_0, mesh_dim_1]}
-        dim_partition_list.append(dim_partition_dict_flatten)
-
-    return dim_partition_list
-
-
-def enumerate_all_possible_1d_sharding(mesh_dim_0, dim_size):
-    dim_partition_list = []
-    # enumerate all the 1D sharding cases
-    for i in range(dim_size):
-        dim_partition_dict_0 = {i: [mesh_dim_0]}
-        dim_partition_list.append(dim_partition_dict_0)
-
-    return dim_partition_list
-
-
-def generate_sharding_size(dim_partition_dict, device_mesh):
-    total_sharding_size = 1
-    for mesh_dim_list in dim_partition_dict.values():
-        mesh_dim_sharding_size = [device_mesh.shape[mesh_dim] for mesh_dim in mesh_dim_list]
-        sharding_size = reduce(operator.mul, mesh_dim_sharding_size)
-        total_sharding_size *= sharding_size
-
-    return total_sharding_size
--- a/colossalai/auto_parallel/tensor_shard/deprecated/constants.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/constants.py
-import torch
-import operator
-
-__all__ = [
-    'ELEMENTWISE_MODULE_OP', 'ELEMENTWISE_FUNC_OP', 'RESHAPE_FUNC_OP', 'CONV_MODULE_OP', 'CONV_FUNC_OP',
-    'LINEAR_MODULE_OP', 'LINEAR_FUNC_OP', 'BATCHNORM_MODULE_OP', 'POOL_MODULE_OP', 'NON_PARAM_FUNC_OP', 'BCAST_FUNC_OP',
-    'EMBEDDING_MODULE_OP', 'LAYERNORM_MODULE_OP', 'ELEMENTWISE_METHOD_OP', 'RESHAPE_METHOD_OP', 'INFINITY_COST'
-]
-
-ELEMENTWISE_MODULE_OP = [torch.nn.Dropout, torch.nn.ReLU]
-ELEMENTWISE_FUNC_OP = [
-    torch.abs,
-    torch.cos,
-    torch.exp,
-    operator.neg,
-    torch.multiply,
-    torch.nn.functional.relu,
-    torch.nn.functional.dropout,
-    # softmax should not be here
-    torch.nn.functional.softmax
-]
-ELEMENTWISE_METHOD_OP = [
-    torch.Tensor.to,
-    torch.Tensor.type,
-    # TODO: contiguous maybe need some extra processes.
-    torch.Tensor.contiguous
-]
-RESHAPE_FUNC_OP = [torch.flatten, torch.reshape]
-RESHAPE_METHOD_OP = [
-    torch.Tensor.view,
-    torch.Tensor.unsqueeze,
-    torch.Tensor.split,
-    torch.Tensor.permute,
-    torch.Tensor.transpose,
-]
-BCAST_FUNC_OP = [
-    torch.add, torch.sub, torch.mul, torch.div, torch.floor_divide, torch.true_divide, operator.add, operator.sub,
-    operator.mul, operator.floordiv, operator.truediv, torch.matmul, torch.where, operator.pow, torch.pow, torch.tanh
-]
-CONV_MODULE_OP = [
-    torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d, torch.nn.ConvTranspose1d, torch.nn.ConvTranspose2d,
-    torch.nn.ConvTranspose3d
-]
-CONV_FUNC_OP = [
-    torch.conv1d, torch.conv2d, torch.conv3d, torch.conv_transpose1d, torch.conv_transpose2d, torch.conv_transpose3d
-]
-EMBEDDING_MODULE_OP = [torch.nn.modules.sparse.Embedding]
-LINEAR_MODULE_OP = [torch.nn.Linear]
-LINEAR_FUNC_OP = [torch.nn.functional.linear, torch.matmul, torch.bmm]
-BATCHNORM_MODULE_OP = [torch.nn.BatchNorm1d, torch.nn.BatchNorm2d, torch.nn.BatchNorm3d, torch.nn.SyncBatchNorm]
-LAYERNORM_MODULE_OP = [torch.nn.LayerNorm]
-POOL_MODULE_OP = [torch.nn.MaxPool1d, torch.nn.MaxPool2d, torch.nn.MaxPool3d, torch.nn.AdaptiveAvgPool2d]
-NON_PARAM_FUNC_OP = [
-    torch.flatten,
-    torch.reshape,
-    torch.abs,
-    torch.cos,
-    torch.exp,
-    operator.neg,
-    torch.multiply,
-    torch.nn.functional.relu,
-    torch.nn.functional.dropout,
-    torch.flatten,
-    torch.where,
-    operator.pow,
-    torch.pow,
-    torch.tanh,
-    torch.add,
-    torch.sub,
-    torch.mul,
-    torch.div,
-    torch.floor_divide,
-    torch.true_divide,
-    operator.add,
-    operator.sub,
-    operator.mul,
-    operator.floordiv,
-    operator.truediv,
-    # softmax should not be here
-    torch.nn.functional.softmax
-]
-
-INFINITY_COST = 1e13
--- a/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/cost_graph.py
-from typing import List
-import math
-from torch.fx.node import Node
-from .constants import INFINITY_COST
-
-
-class CostGraph:
-    '''
-    A graph data structure to simplify the edge cost graph. It has two main functions:
-    1. To feed the quadratic resharding costs into solver, we need to linearize it. We build edge_cost in
-    CostGraph, and it stored every combinations of strategies for a src-dst node pair in an 1D list.
-    2. To reduce the searching space, we merge computationally-trivial operators, such as 
-    element-wise operators, transpose, and reduction, into their following nodes. The merging infomation will
-    be given by the StrategiesVector depending on the type of target node and following nodes.
-
-    Argument:
-        leaf_strategies(List[StrategiesVector]): It stores StrategiesVector of every nodes on the graph.
-        simplify(bool, optional): The generated cost graph will be simplified if it is true. (default to True)
-    '''
-
-    def __init__(self, leaf_strategies, simplify=True):
-        self.leaf_strategies = leaf_strategies
-        self.nodes = [strategies_vector.node for strategies_vector in self.leaf_strategies]
-        # stores number of strategies in each node
-        self.node_lens = {strategies_vector.node: len(strategies_vector) for strategies_vector in self.leaf_strategies}
-        # extra_node_costs will store the extra costs introduced by merging nodes
-        self.extra_node_costs = {}
-        self.following_dict = {}
-        self.simplify = simplify
-        self._build_cost_graph()
-
-    def _remove_invalid_node(self, node, attr_name):
-        remove_list = []
-        target_node_list = getattr(node, attr_name, [])
-        for target_node in target_node_list:
-            if target_node not in self.nodes:
-                remove_list.append(target_node)
-        for element in remove_list:
-            target_node_list.remove(element)
-
-    def _build_cost_graph(self):
-        '''
-        This method will generate edge_cost for adjacent node pair. Additionally, 'parents' and 'children' attribute will be
-        set to node.
-        '''
-        self.edge_costs = {}
-        if self.simplify:
-            self.merge_pair = []
-        for strategies_vector in self.leaf_strategies:
-            # build edge_cost
-            dst_node = strategies_vector.node
-            for src_node in strategies_vector.predecessor_nodes:
-                if src_node not in self.nodes:
-                    continue
-                node_pair = (src_node, dst_node)
-                # src_index = strategies_vector.predecessor_nodes.index(src_node)
-                edge_cost = {}
-                for i in range(len(strategies_vector)):
-                    for j in range(len(src_node.strategies_vector)):
-                        edge_cost[(j, i)] = strategies_vector[i].resharding_costs[src_node][j]
-                self.edge_costs[node_pair] = edge_cost
-            # add parents and children attribute to node
-            setattr(dst_node, 'parents', strategies_vector.predecessor_nodes)
-            setattr(dst_node, 'children', strategies_vector.successor_nodes)
-            self._remove_invalid_node(dst_node, 'parents')
-            self._remove_invalid_node(dst_node, 'children')
-
-            if self.simplify and strategies_vector.check_merge():
-                for followed_node in strategies_vector.predecessor_nodes:
-                    self.merge_pair.append((followed_node, dst_node))
-
-    def get_edge_cost(self, src_node, dst_node):
-        return self.edge_costs[(src_node, dst_node)]
-
-    def merge_node(self, src_node, dst_node):
-        '''
-        To merge dst_node into src_node, we need to do it in following steps:
-        
-        1. For each strategy in dst_node, we need to pick an appropriate strategy
-        of src_node to merge, it is important because the logical resharding costs 
-        between the parents node of src_node and merged node depend on the src_node 
-        strategies dispatching. For example, for the graph 0->1->2, after merging node 1
-        into node 2, edge_costs[(node 0, node 2)][(0, 0)] = edge_costs[(node 0, node 1)][(0, x)]
-        x represents the picking strategy of node 1 merged into node 2 strategy 0.
-        
-        2. We need to accumulate the extra costs introduced by merging nodes, the extra costs
-        contains two parts, one is resharding costs between src_node strategy and dst_node strategy,
-        another is the origin extra costs in src_node strategy.
-
-        3. Build connections between new node pairs, and remove the src_node after all consumer nodes
-        detached from it.
-
-        Argument:
-            src_node(Node): The node will be merged into dst_node.
-            dst_node(Node): The node to integrate src_node.
-        '''
-        src_node_index = dst_node.parents.index(src_node)
-        # build merge_map
-        merge_map = {}
-        for src_index, strategy in enumerate(src_node.strategies_vector):
-            min_cost = INFINITY_COST
-            lowest_cost_index = -1
-            for dst_index, dst_strategy in enumerate(dst_node.strategies_vector):
-                resharding_cost = dst_strategy.resharding_costs[src_node][src_index]
-                if resharding_cost <= min_cost:
-                    min_cost = resharding_cost
-                    lowest_cost_index = dst_index
-            merge_map[src_index] = lowest_cost_index
-
-        # extra_node_cost for src node
-        self.extra_node_costs[src_node] = [0.0] * self.node_lens[src_node]
-        for src_index, strategy in enumerate(src_node.strategies_vector):
-            target_strate_index = merge_map[src_index]
-            target_strategy = dst_node.strategies_vector[target_strate_index]
-            self.extra_node_costs[src_node][src_index] += target_strategy.resharding_costs[src_node][src_index]
-            if dst_node in self.extra_node_costs:
-                self.extra_node_costs[src_node][src_index] += self.extra_node_costs[dst_node][target_strate_index]
-
-        # add new node pair to cost graph
-        for child_node in dst_node.children:
-            new_node_pair = (src_node, child_node)
-            old_node_pair = (dst_node, child_node)
-            if new_node_pair in self.edge_costs:
-                continue
-            edge_cost = {}
-            for i in range(self.node_lens[src_node]):
-                for j in range(self.node_lens[child_node]):
-                    dst_strate_index = merge_map[i]
-                    # dst_strategy = dst_node.strategies_vector[dst_strate_index]
-                    edge_cost[(i, j)] = self.edge_costs[old_node_pair][(dst_strate_index, j)]
-            if new_node_pair not in self.edge_costs:
-                self.edge_costs[new_node_pair] = edge_cost
-            else:
-                # we should accumulate the resharding costs if args of child node contain
-                # both src node and dst node.
-                for index_pair, resharding_cost in self.edge_costs[new_node_pair]:
-                    self.edge_costs[new_node_pair][index_pair] += edge_cost[index_pair]
-
-        # connect src node and children of dst node
-        dst_node.parents.remove(src_node)
-        src_node.children.remove(dst_node)
-        self.edge_costs.pop((src_node, dst_node))
-        for child_node in dst_node.children:
-            if child_node not in src_node.children:
-                src_node.children.append(child_node)
-            if src_node not in child_node.parents:
-                child_node.parents.append(src_node)
-            # remove dst node from cost graph when dst node has no producer.
-            if len(dst_node.parents) == 0:
-                child_node.parents.remove(dst_node)
-                node_pair = (dst_node, child_node)
-                self.edge_costs.pop(node_pair)
-        if len(dst_node.parents) == 0:
-            self.following_dict[dst_node] = src_node
-            dst_node.children = []
-
-    def _reindexing_src(self, src):
-        if src not in self.following_dict:
-            return src
-        return self._reindexing_src(self.following_dict[src])
-
-    def simplify_graph(self):
-        if not self.simplify:
-            return
-        self.merge_pair.reverse()
-        for (src_node, dst_node) in self.merge_pair:
-            self.merge_node(src_node, dst_node)
-        self.merge_pair.reverse()
-        reindexing_following_dict = {}
-        for dst, src in self.following_dict.items():
-            reindexing_following_dict[dst] = self._reindexing_src(src)
-        self.following_dict = reindexing_following_dict
--- a/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/graph_analysis.py
-from dataclasses import dataclass
-from torch.fx.node import Node
-from torch.fx.graph import Graph
-from torch.fx.graph_module import GraphModule
-from collections import OrderedDict as ODict
-from typing import List, OrderedDict, Union, Any
-from colossalai.fx.passes.utils import get_node_module
-
-__all__ = ['LiveVariable', 'LiveVariableVector', 'LiveStage', 'GraphAnalyser']
-
-
-@dataclass
-class LiveVariable:
-    """
-    LiveVariable is a data structure to store the meta information of a variable for liveness analysis.
-    """
-    name: str
-    node: Node
-    is_inplace: bool
-
-
-class LiveVariableVector(list):
-    """
-    LiveVariableVector is a data structure to store the list of LiveVariable objects.
-    """
-
-    def exists(self, name) -> bool:
-        """
-        Check if a variable has already existed in the current list by name.
-        """
-        for var in self:
-            if name == var.name:
-                return True
-        return False
-
-    def get(self, name) -> LiveVariable:
-        for var in self:
-            if name == var.name:
-                return var
-        raise KeyError(f"Variable {name} is not found")
-
-    def copy(self) -> "LiveVariableVector":
-        """
-        Create a copy of this vector
-        """
-        vector = LiveVariableVector()
-        for var in self:
-            vector.append(var)
-        return vector
-
-
-@dataclass
-class LiveStage:
-    """
-    LiveStage is a data structure to record the living variables at this current node.
-    """
-    name: str
-    node: Node
-    all_live_vars: LiveVariableVector
-    unique_live_vars: LiveVariableVector
-
-
-class GraphAnalyser:
-
-    def __init__(self, gm: GraphModule):
-        self._gm = gm
-        self._graph = gm.graph
-
-    @property
-    def gm(self) -> GraphModule:
-        """
-        Return the GraphModule object associated with this analyser.
-        """
-        return self._gm
-
-    @property
-    def graph(self) -> Graph:
-        """
-        Return the Graph object associated with this analyser.
-        """
-        return self._graph
-
-    def liveness_analysis(self) -> List[LiveStage]:
-        """
-        Analyse the graph to obtain the variable liveness information. This function returns
-        an ordered dictionary where the key is the compute stage ID and the value is a LivenessStage object.
-        """
-        compute_nodes = self.graph.nodes
-        liveness_list = []
-
-        # checked: record all variables created since the first stage
-        # all: record the live variables only exist until the current stage.
-        #       this can be different from the `checked list`` as some varialbes may be destroyed prior to this stage.
-        # unique: record the unique live variables only exist until the current stage.
-        #       this is different from `all list` as some variables are duplicated.
-        checked_variables = LiveVariableVector()
-        all_live_variables = LiveVariableVector()
-        unique_live_vars = LiveVariableVector()
-
-        for idx, node in enumerate(compute_nodes):
-            #############################
-            # find new living variables #
-            #############################
-            # detect whether the current op is an in-place op
-            # if it is an in-place op, we would deem it as a duplciate var
-            is_inplace = False
-            if node.op == 'call_function':
-                # check if this is an inplace op such as torch.nn.functional.relu(x, inplace=True)
-                if node.kwargs.get('inplace', False):
-                    is_inplace = True
-            elif node.op == 'call_module':
-                # to check if this is an inplace op such as torch.nn.Relu(inplace=True)
-                module = get_node_module(node)
-                if getattr(module, 'inplace', False):
-                    is_inplace = True
-
-            # add the output var
-            meta = getattr(node, '_meta_data', None)
-            live_var = LiveVariable(name=node.name, node=node, is_inplace=is_inplace)
-            if not is_inplace:
-                unique_live_vars.append(live_var)
-            checked_variables.append(live_var)
-            all_live_variables.append(live_var)
-
-            # check if any input is not checked yet
-            for arg in node.args:
-                if not isinstance(arg, Node):
-                    continue
-                arg_name = arg.name
-                if not checked_variables.exists(arg_name):
-                    live_var_from_arg = LiveVariable(name=arg_name, node=node, is_inplace=False)
-                    all_live_variables.append(live_var_from_arg)
-                    checked_variables.append(live_var_from_arg)
-                    unique_live_vars.append(live_var_from_arg)
-
-            # TODO: add the logic to remove live variables
-            # this should be completed if we are able to trace the backward compute graph
-
-            # add this stage to liveness dict
-            stage = LiveStage(name=node.name,
-                              node=node,
-                              all_live_vars=all_live_variables.copy(),
-                              unique_live_vars=unique_live_vars.copy())
-            # if a LiveStage is covered by another LiveStage, we just keep the larger one.
-            replace = False
-            for index, prev_stage in enumerate(liveness_list):
-                all_covered = True
-                for ele in prev_stage.unique_live_vars:
-                    if ele not in stage.unique_live_vars:
-                        all_covered = False
-                        break
-                if all_covered:
-                    replace = True
-                    break
-            if replace:
-                liveness_list[index] = stage
-            else:
-                liveness_list.append(stage)
-
-        return liveness_list
-
-    def get_alias_set(self):
-        pass
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/__init__.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/__init__.py
-from .batch_norm_handler import BatchNormHandler
-from .bcast_op_handler import BcastOpHandler
-from .conv_handler import ConvHandler
-from .dot_handler import DotHandler
-from .embedding_handler import EmbeddingHandler
-from .layer_norm_handler import LayerNormHandler
-from .operator_handler import OperatorHandler
-from .reshape_handler import ReshapeHandler
-from .unary_elementwise_handler import UnaryElementwiseHandler
-from .where_handler import WhereHandler
-
-__all__ = [
-    'OperatorHandler', 'DotHandler', 'ConvHandler', 'BatchNormHandler', 'ReshapeHandler', 'BcastOpHandler',
-    'UnaryElementwiseHandler', 'EmbeddingHandler', 'WhereHandler', 'LayerNormHandler'
-]
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/batch_norm_handler.py
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/bcast_op_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/bcast_op_handler.py
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/conv_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/conv_handler.py
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/dot_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/dot_handler.py
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/embedding_handler.py
-import operator
-import warnings
-from copy import deepcopy
-from functools import reduce
-from typing import Dict, List
-
-import torch
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import \
-    ignore_sharding_exception
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector)
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.tensor.sharding_spec import ShardingSpec
-
-from .operator_handler import OperatorHandler
-
-__all__ = ['EmbeddingHandler']
-
-
-class EmbeddingHandler(OperatorHandler):
-    """
-    An OperatorHandler which deals with the sharding strategies of Embedding operators(such as nn.embedding).
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.input_data = self.predecessor_node[0]._meta_data
-        self.weight = self.module_named_parameters['weight']
-        self.output_data = self.node._meta_data
-
-    def _generate_compute_cost(self, total_sharding_size):
-        input_shape = self.input_data.shape
-        weight_shape = self.weight.shape
-        input_shape_product = reduce(operator.mul, input_shape, 1)
-        weight_shape_product = reduce(operator.mul, weight_shape, 1)
-        compute_cost = input_shape_product * weight_shape_product * 2 / total_sharding_size
-        return compute_cost
-
-    def _generate_memory_cost(self, sharding_size_forward, sharding_size_backward_activation, sharding_size_weight):
-        '''
-        Compute the memory cost per device with this specific strategy.
-
-        Argument:
-            sharding_size_forward(int): The forward activation will be divided
-                into sharding_size_forward number partions.
-            sharding_size_backward_activation(int): The backward activation will 
-                be divided into sharding_size_backward_activation number partions.
-            sharding_size_weight(int): The backward weight will be divided
-                into sharding_size_weight number partions.
-
-        Return:
-            memory_cost(Tuple[float]): Memory cost per device with this 
-                specific strategy, the first element of this tuple is forward
-                memory cost, and the second element of this tuple is backward
-                memory cost.
-            memory_cost_forward(float): Memory cost of forward activation per 
-                device with this specific strategy.
-            memory_cost_backward_activation(float): Memory cost of backward activation 
-                per device with this specific strategy.
-        '''
-        # compute the memory cost of this strategy
-        dtype = self.input_data.dtype
-        numel_output = self.output_data.numel()
-        numel_input = self.input_data.numel()
-        numel_weight = self.weight.numel()
-        size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
-
-        # forward memory_cost
-        memory_cost_forward_activation = numel_output * size_per_elem_bytes / sharding_size_forward
-        memory_cost_forward_weight = numel_weight * size_per_elem_bytes / sharding_size_weight
-        memory_cost_forward = memory_cost_forward_activation + memory_cost_forward_weight
-
-        # backward memory_cost
-        memory_cost_backward_activation = numel_input * size_per_elem_bytes / sharding_size_backward_activation
-        memory_cost_backward_weight = numel_weight * size_per_elem_bytes / sharding_size_weight
-        memory_cost_backward = memory_cost_backward_activation + memory_cost_backward_weight
-
-        # memory_cost pair
-        memory_cost = (memory_cost_forward, memory_cost_backward)
-
-        return memory_cost, memory_cost_forward_activation, memory_cost_backward_activation, memory_cost_backward_weight
-
-    @ignore_sharding_exception
-    def split_weight_both_dim(self, mesh_dim_0, mesh_dim_1):
-        name = f'RRS{mesh_dim_1} = RR x S{mesh_dim_0}S{mesh_dim_1}'
-
-        dim_partition_dict_for_input = {}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {0: [mesh_dim_0], 1: [mesh_dim_1]}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {2: [mesh_dim_1]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        total_sharding_size = self.device_mesh.shape[0] * self.device_mesh.shape[1]
-        compute_cost = self._generate_compute_cost(total_sharding_size)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = self.device_mesh.shape[mesh_dim_1]
-        sharding_size_backward_activation = 1
-        sharding_size_weight = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        memory_cost, memory_cost_forward_activation, memory_cost_backward_activation, _ = self._generate_memory_cost(
-            sharding_size_forward, sharding_size_backward_activation, sharding_size_weight)
-
-        # compute the communication cost of this strategy during forward phase
-        communication_cost_forward = self.device_mesh.all_reduce_cost(memory_cost_forward_activation, mesh_dim_0)
-        # compute the communication cost of this strategy during backward phase
-        communication_cost_backward = self.device_mesh.all_reduce_cost(memory_cost_backward_activation, mesh_dim_1)
-        communication_cost = communication_cost_forward + communication_cost_backward
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_input_both_dim(self, mesh_dim_0, mesh_dim_1):
-        name = f'S{mesh_dim_0}S{mesh_dim_1}R = S{mesh_dim_0}S{mesh_dim_1} x RR'
-
-        dim_partition_dict_for_input = {0: [mesh_dim_0], 1: [mesh_dim_1]}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {0: [mesh_dim_0], 1: [mesh_dim_1]}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
-
-        # compute the computation cost of this strategy
-        total_sharding_size = self.device_mesh.shape[0] * self.device_mesh.shape[1]
-        compute_cost = self._generate_compute_cost(total_sharding_size)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        sharding_size_backward_activation = self.device_mesh.shape[mesh_dim_0] * self.device_mesh.shape[mesh_dim_1]
-        sharding_size_weight = 1
-        memory_cost, memory_cost_forward_activation, memory_cost_backward_activation, memory_cost_backward_weight = self._generate_memory_cost(
-            sharding_size_forward, sharding_size_backward_activation, sharding_size_weight)
-
-        # This strategy do not need to do all_reduce during forward phase
-        communication_cost_forward = 0
-        # compute the communication cost of this strategy during backward phase
-        communication_cost_backward_activation = 0
-        communication_cost_backward_weight = self.device_mesh.flatten_device_mesh.all_reduce_cost(
-            memory_cost_backward_weight, 0)
-        communication_cost_backward = communication_cost_backward_activation + communication_cost_backward_weight
-        communication_cost = communication_cost_forward + communication_cost_backward
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-        self.strategies_vector.append(sharding_strategies)
-
-    def register_strategy(self) -> StrategiesVector:
-        '''
-        Generate every possible strategies for a Conv node, and record all strategies into the strategies_vector.
-        '''
-        # RRS = RR x SS
-        self.split_weight_both_dim(0, 1)
-        self.split_weight_both_dim(1, 0)
-
-        # SSR = SS x RR
-        self.split_input_both_dim(0, 1)
-        self.split_input_both_dim(1, 0)
-
-        return self.strategies_vector
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/layer_norm_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/layer_norm_handler.py
-import operator
-from functools import reduce
-
-import torch
-
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import (
-    enumerate_all_possible_1d_sharding,
-    enumerate_all_possible_2d_sharding,
-    generate_sharding_size,
-    ignore_sharding_exception,
-)
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector
-
-from .operator_handler import OperatorHandler
-
-__all__ = ['LayerNormHandler']
-
-
-class LayerNormHandler(OperatorHandler):
-    """
-    A OperatorHandler which deals with the sharding strategies of normalization.
-
-    Note: To keep the math consistency, LayerNorm do not allow shards on hidden dimension.
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.input_data = self.predecessor_node[0]._meta_data
-        self.weight = self.module_named_parameters['weight']
-        self.bias = self.module_named_parameters['bias']
-        self.output_data = self.node._meta_data
-
-    def _generate_compute_cost(self, total_sharding_size):
-        '''
-        Compute the computation cost per device with this specific strategy.
-
-        Note: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
-
-        Argument:
-            bs(int): Batch size of the input data.
-            channel_in(int): The channel dimension of input data.
-
-        Return:
-            compute_cost(float): Computation cost per device with this specific strategy
-        '''
-        # TODO: compute_cost need to be devided by TFLOPS, now it just shows the computation size.
-        # TODO: a constant coefficient need to be added.
-
-        norm_kernel_size = self.weight.shape
-        # in LayerNorm context, batch dimensions mean all the dimensions do not join the normalization.
-        input_batch_shape = self.input_data.shape[:-len(norm_kernel_size)]
-        input_batch_product = reduce(operator.mul, input_batch_shape, 1)
-        norm_kernel_product = reduce(operator.mul, norm_kernel_size, 1)
-        forward_compute_cost = input_batch_product * norm_kernel_product / total_sharding_size
-        backward_activation_compute_cost = input_batch_product * norm_kernel_product / total_sharding_size
-        # To compute gradient of on norm kernel element requires input_batch_product times computation, so
-        # the total cost is input_batch_product * norm_kernel_product
-        backward_weight_compute_cost = input_batch_product * norm_kernel_product / total_sharding_size
-        backward_compute_cost = backward_activation_compute_cost + backward_weight_compute_cost
-        compute_cost = forward_compute_cost + backward_compute_cost
-        return compute_cost
-
-    def _generate_memory_cost(self, sharding_size_forward, sharding_size_backward_activation, sharding_size_weight):
-        '''
-        Compute the memory cost per device with this specific strategy.
-
-        Argument:
-            sharding_size_forward(int): The forward activation will be divided
-                into sharding_size_forward number partions.
-            sharding_size_backward_activation(int): The backward activation will
-                be divided into sharding_size_backward_activation number partions.
-            sharding_size_weight(int): The backward weight will be divided
-                into sharding_size_weight number partions.
-
-        Return:
-            memory_cost(Tuple[float]): Memory cost per device with this
-                specific strategy, the first element of this tuple is forward
-                memory cost, and the second element of this tuple is backward
-                memory cost.
-            memory_cost_forward(float): Memory cost of forward activation per
-                device with this specific strategy.
-            memory_cost_backward_activation(float): Memory cost of backward activation
-                per device with this specific strategy.
-        '''
-        # compute the memory cost of this strategy
-        dtype = self.input_data.dtype
-        numel_output = self.output_data.numel()
-        # this operation will not change the shape of input
-        numel_input = numel_output
-        numel_weight = self.weight.numel()
-        size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
-
-        # forward memory_cost
-        memory_cost_forward_activation = numel_output * size_per_elem_bytes / sharding_size_forward
-        memory_cost_forward_weight = numel_weight * size_per_elem_bytes / sharding_size_weight
-        memory_cost_forward = memory_cost_forward_activation + memory_cost_forward_weight
-
-        # backward memory_cost
-        memory_cost_backward_activation = numel_input * size_per_elem_bytes / sharding_size_backward_activation
-        memory_cost_backward_weight = numel_weight * size_per_elem_bytes / sharding_size_weight
-        memory_cost_backward = memory_cost_backward_activation + memory_cost_backward_weight
-
-        # memory_cost pair
-        memory_cost = (memory_cost_forward, memory_cost_backward)
-
-        return memory_cost, memory_cost_forward_activation, memory_cost_backward_activation, memory_cost_backward_weight
-
-    def _generate_strategy_with_dim_partition(self, dim_partition):
-        dim_partition_dict_for_input = dim_partition
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = dim_partition
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        name = f'{sharding_spec_for_output.sharding_sequence} = {sharding_spec_for_input.sharding_sequence} x {sharding_spec_for_weight.sharding_sequence}'
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input])
-
-        total_sharding_size = generate_sharding_size(dim_partition, self.device_mesh)
-        # compute the computation cost of this strategy
-        compute_cost = self._generate_compute_cost(total_sharding_size)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = generate_sharding_size(dim_partition_dict_for_input, self.device_mesh)
-        sharding_size_backward_activation = generate_sharding_size(dim_partition_dict_for_output, self.device_mesh)
-        sharding_size_weight = generate_sharding_size(dim_partition_dict_for_weight, self.device_mesh)
-        memory_cost, _, _, memory_cost_backward_weight = self._generate_memory_cost(sharding_size_forward,
-                                                                                    sharding_size_backward_activation,
-                                                                                    sharding_size_weight)
-
-        total_mesh_dim_list = []
-        for mesh_dim_list in dim_partition.values():
-            total_mesh_dim_list.extend(mesh_dim_list)
-
-        # This strategy do not need to do all_reduce operation for activation
-        communication_cost_forward_activation = 0
-        communication_cost_backward_activation = 0
-        if len(total_mesh_dim_list) == 1:
-            communication_cost_backward_weight = self.device_mesh.all_reduce_cost(memory_cost_backward_weight,
-                                                                                  total_mesh_dim_list[0])
-        else:
-            assert len(total_mesh_dim_list) == 2, f'temporally we just support 2d device mesh.'
-            communication_cost_backward_weight = self.device_mesh.flatten_device_mesh.all_reduce_cost(
-                memory_cost_backward_weight, 0)
-        communication_cost = communication_cost_forward_activation + communication_cost_backward_activation + communication_cost_backward_weight
-
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    @ignore_sharding_exception
-    def split_input_batch_single_mesh_dim(self, mesh_dim_0):
-        batch_dimension_length = self.input_data.dim() - self.weight.dim()
-        dim_partition_list = enumerate_all_possible_1d_sharding(mesh_dim_0, batch_dimension_length)
-        for dim_partition in dim_partition_list:
-            self._generate_strategy_with_dim_partition(dim_partition)
-
-    @ignore_sharding_exception
-    def split_input_batch_both_mesh_dim(self, mesh_dim_0, mesh_dim_1):
-        batch_dimension_length = self.input_data.dim() - self.weight.dim()
-        dim_partition_list = enumerate_all_possible_2d_sharding(mesh_dim_0, mesh_dim_1, batch_dimension_length)
-        for dim_partition in dim_partition_list:
-            self._generate_strategy_with_dim_partition(dim_partition)
-
-    @ignore_sharding_exception
-    def non_split(self):
-        name = f'RR = RR x R'
-
-        dim_partition_dict_for_input = {}
-        sharding_spec_for_input = self._generate_sharding_spec(self.input_data, dim_partition_dict_for_input)
-
-        dim_partition_dict_for_weight = {}
-        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)
-
-        dim_partition_dict_for_output = {}
-        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input])
-
-        total_sharding_size = 1
-        # compute the computation cost of this strategy
-        compute_cost = self._generate_compute_cost(total_sharding_size)
-
-        # compute the memory cost of this strategy
-        sharding_size_forward = 1
-        sharding_size_backward_activation = 1
-        sharding_size_weight = 1
-        memory_cost, _, _, _ = self._generate_memory_cost(sharding_size_forward, sharding_size_backward_activation,
-                                                          sharding_size_weight)
-
-        # This strategy do not need to do all_reduce operation
-        communication_cost = 0
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_output,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_input, sharding_spec_for_weight))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    def register_strategy(self) -> StrategiesVector:
-        '''
-        Generate every possible strategies for a BatchNorm node, and record all strategies into the strategies_vector.
-
-        Example:
-            norm_handler = BatchNormHandler(node,  strategies_vector,
-                                               self.shape_consistency_manager)
-            norm_handler.register_strategy()
-            for strategy in norm_handler.strategies_vector:
-                print(f'{strategy.name}, computation_cost: {strategy.compute_cost}, memory_cost: {strategy.memory_cost}')
-
-        Output:
-            RS0 = RS0 x S0, computation_cost: 131072, memory_cost: 524288.0
-            RS1 = RS1 x S1, computation_cost: 131072, memory_cost: 524288.0
-            RR = RR x R, computation_cost: 262144, memory_cost: 1048576
-            RS01 = RS01 x S01, computation_cost: 65536, memory_cost: 262144.0
-        '''
-
-        # SR = SR x R with single mesh dim on batch dimensions
-        self.split_input_batch_single_mesh_dim(0)
-        self.split_input_batch_single_mesh_dim(1)
-
-        # SR = SR x R with both mesh dims on batch dimensions
-        self.split_input_batch_both_mesh_dim(0, 1)
-
-        # RR = RR x R
-        self.non_split()
-
-        return self.strategies_vector
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/operator_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/operator_handler.py
-from abc import ABC, abstractmethod
-from typing import Dict, List
-from webbrowser import Opera
-
-import torch
-import torch.nn as nn
-from torch.fx.node import Node
-
-from colossalai.auto_parallel.tensor_shard.deprecated.constants import *
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.tensor.sharding_spec import ShardingSpec
-
-from .._utils import generate_resharding_costs, generate_sharding_spec
-from ..sharding_strategy import StrategiesVector
-
-__all__ = ['OperatorHandler']
-
-
-class OperatorHandler(ABC):
-    '''
-    The OperatorHandler is an abstract class used to generate every possible strategies for an operator node.
-
-    Args:
-        node (Node): the input node in node argument list.
-        device_mesh (DeviceMesh): A logical view of a physical mesh.
-        strategies_vector (StrategiesVector): all the strategies generated in this handler will be recorded into the strategies_vector.
-        handle_backward (Optional[bool]): whether to consider the backward pass. The default value is True. False can be used for inference.
-    '''
-
-    def __init__(self,
-                 node: Node,
-                 device_mesh: DeviceMesh,
-                 strategies_vector: StrategiesVector,
-                 handle_backward: bool = True):
-        self.node = node
-        self.predecessor_node = list(node._input_nodes.keys())
-        self.successor_node = list(node.users.keys())
-        self.device_mesh = device_mesh
-        self.strategies_vector = strategies_vector
-        self.handle_backward = handle_backward
-
-        # find the module and its parameters associated with this node
-        # this can be used to compute the compute/communication/sharding cost
-        if self.node.op == 'call_module':
-            module = node.graph.owning_module.get_submodule(node.target)
-            named_parameters = list(module.named_parameters(recurse=False))
-            # convert named parameters from list to dict
-            named_parameters = {k: v for k, v in named_parameters}
-        elif self.node.op == 'call_function' and self.node.target not in NON_PARAM_FUNC_OP:
-            module = None
-            parameters = list(self.node.args)[1]
-            if isinstance(parameters, Node):
-                named_parameters = {'weight': parameters._meta_data}
-            else:
-                named_parameters = {}
-        else:
-            module = None
-            named_parameters = None
-        self.module = module
-        self.module_named_parameters = named_parameters
-
-    @abstractmethod
-    def register_strategy(self) -> StrategiesVector:
-        """
-        Register
-        """
-        pass
-
-    def _generate_memory_cost(self, dim_partition_dict_for_output, dim_partition_dict_for_weight,
-                              sharding_spec_for_input):
-        '''
-        Compute the memory cost per device with this specific strategy.
-
-        Argument:
-            dim_partition_dict_for_output(List[int]): The key is the dimension of output to be sharded,
-                and the value of the key decribe which logical axis will be sharded in that dimension.
-            dim_partition_dict_for_weight(List[int]): The key is the dimension of weight to be sharded,
-                and the value of the key decribe which logical axis will be sharded in that dimension.
-        Return:
-            total_memory_cost(float): total memory cost per device with this specific strategy
-            activation_cost(float): the memory cost of activation per device with this specific strategy
-            weight_memory_cost(float): the memory cost of weight per device with this specific strategy
-        '''
-        # compute the size of one element with specific dtype
-        dtype = self.input_data.dtype
-        size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
-
-        # compute the memory cost of activation
-        activation_numel = self.output_data.numel()
-        output_mesh_dims = []
-        for sharding_dim, mesh_dims in dim_partition_dict_for_output.items():
-            output_mesh_dims.extend(mesh_dims)
-        activation_sharding_size = 1
-        for mesh_dim in output_mesh_dims:
-            activation_sharding_size *= self.device_mesh.shape[mesh_dim]
-        activation_memory_cost = activation_numel / activation_sharding_size * size_per_elem_bytes
-
-        # compute the memory cost of weight
-        weight_numel = self.weight.numel()
-        weight_sharding_size = 1
-        weight_mesh_dims = []
-        for sharding_dim, mesh_dims in dim_partition_dict_for_weight.items():
-            weight_mesh_dims.extend(mesh_dims)
-        for mesh_dim in weight_mesh_dims:
-            weight_sharding_size *= self.device_mesh.shape[mesh_dim]
-        weight_memory_cost = weight_numel / weight_sharding_size * size_per_elem_bytes
-
-        # compute the memory cost of input grad
-        input_grad_numel = self.input_data.numel()
-        input_grad_sharding_size = 1
-        input_grad_mesh_dims = []
-        for sharding_dim, mesh_dims in sharding_spec_for_input.items():
-            input_grad_mesh_dims.extend(mesh_dims)
-        for mesh_dim in input_grad_mesh_dims:
-            input_grad_sharding_size *= self.device_mesh.shape[mesh_dim]
-        input_grad_memory_cost = input_grad_numel / input_grad_sharding_size * size_per_elem_bytes
-
-        memory_cost_forward = activation_memory_cost + weight_memory_cost
-        memory_cost_backward = input_grad_memory_cost + weight_memory_cost
-
-        return (memory_cost_forward,
-                memory_cost_backward), activation_memory_cost, weight_memory_cost, input_grad_memory_cost
-
-    def _generate_resharding_costs(self, sharding_specs):
-        # The resharding_cost of weight is counted due to sharing weight cases.
-        if hasattr(self.node._meta_data, 'dtype'):
-            dtype = self.node._meta_data.dtype
-        else:
-            assert isinstance(self.node._meta_data,
-                              tuple), f'Only torch.Tensor, torch.fx.Node and tuple of torch.Tensor is expected'
-            dtype = self.node._meta_data[0].dtype
-
-        nodes = self.predecessor_node
-        return generate_resharding_costs(nodes=nodes,
-                                         sharding_specs=sharding_specs,
-                                         count_backward=self.handle_backward,
-                                         dtype=dtype)
-
-    def _generate_sharding_spec(self, input_: torch.Tensor, dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec:
-        return generate_sharding_spec(input_=input_,
-                                      device_mesh=self.device_mesh,
-                                      dim_partition_dict=dim_partition_dict)
-
-    @abstractmethod
-    def _generate_compute_cost(self, *args, **kwargs):
-        """
-        Compute the flops involved in the node.
-        """
-        pass
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/reshape_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/reshape_handler.py
-import colorsys
-import math
-import warnings
-from copy import deepcopy
-
-import torch
-
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import ignore_sharding_exception
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.tensor.sharding_spec import ShardingSpec
-
-from ..constants import INFINITY_COST
-from .operator_handler import OperatorHandler
-
-
-class ReshapeHandler(OperatorHandler):
-    """
-    An OperatorHandler which deals with the sharding strategies of Reshape Operator, such as torch.reshape, torch.flatten, etc.
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        self.input_data = self.predecessor_node[0]._meta_data
-        self.output_data = self.node._meta_data
-
-    def _generate_compute_cost(self, *args, **kwargs):
-        return super()._generate_compute_cost(*args, **kwargs)
-
-    @ignore_sharding_exception
-    def register_strategy(self):
-        # TODO: add strategies with more output sharding specs other than only fully replicated.
-        input_node = self.strategies_vector.predecessor_nodes[0]
-        # For reshape function, to keep the computing correctness we keep the sharding
-        # spec of input is fully replicated. In addition, we will keep the output in
-        # replica status and let the successor node choose the way to resharding the
-        # output node. Therefore, the different strategies of input node with same
-        # output sharding spec will generate same strategy for reshape function.
-        sharding_spec_checklist = []
-        for strategy in input_node.strategies_vector:
-            # It looks a little bit confusing, the input of the processing node
-            # is the output of the input_node.
-            input_sharding_spec = strategy.output_sharding_spec
-            assert isinstance(input_sharding_spec, ShardingSpec), f'The input node should NOT be a tuple of tensor.'
-            if input_sharding_spec in sharding_spec_checklist:
-                continue
-            sharding_spec_checklist.append(input_sharding_spec)
-            dim_partition_dict_for_output = {}
-            if isinstance(self.output_data, tuple):
-                dim_partition_dict_for_output = [{} for _ in range(len(self.output_data))]
-            try:
-                if isinstance(self.output_data, tuple):
-                    output_sharding_spec = []
-                    for output, dim_partition_dict in zip(self.output_data, dim_partition_dict_for_output):
-                        output_sharding_spec.append(self._generate_sharding_spec(output, dim_partition_dict))
-                else:
-                    output_sharding_spec = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
-            except AssertionError as e:
-                warnings.warn(f'{e}')
-                continue
-            name = f'{input_sharding_spec.sharding_sequence} -> FULLY REPLICATED'
-            # TODO: use meta_info_prop to profile memory cost and compute cost
-            compute_cost = 0
-            # consider node._meta_data is in type of tuple
-            memory_cost = 0
-
-            # compute the communication cost, in reshape op, the communication happens during casting the input sharding spec to fully replicating.
-            dim_partition_dict_for_replicate_input = {}
-            replicate_input_sharding_spec = self._generate_sharding_spec(self.input_data,
-                                                                         dim_partition_dict_for_replicate_input)
-            # shape consistency manager is a singleton class
-            shape_consistency_manager = ShapeConsistencyManager()
-            _, _, communication_cost = shape_consistency_manager.shape_consistency(input_sharding_spec,
-                                                                                   replicate_input_sharding_spec)
-            communication_cost = communication_cost["total"]
-
-            # generate resharding cost
-            resharding_costs = self._generate_resharding_costs([input_sharding_spec])
-
-            # to prevent the resharding happening, set their resharding cost to inf.
-            resharding_costs[input_node] = [0 if cost == 0 else INFINITY_COST for cost in resharding_costs[input_node]]
-            sharding_strategy = ShardingStrategy(name,
-                                                 output_sharding_spec,
-                                                 compute_cost=compute_cost,
-                                                 communication_cost=communication_cost,
-                                                 memory_cost=memory_cost,
-                                                 resharding_costs=resharding_costs,
-                                                 input_shardings=[input_sharding_spec])
-            self.strategies_vector.append(sharding_strategy)
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/strategy_generator.py
-from dataclasses import dataclass
-from abc import ABC, abstractmethod
-from typing import List, Dict
-from colossalai.device.device_mesh import DeviceMesh
-
-__all__ = ['IntermediateStrategy', 'StrategyGenerator']
-
-
-@dataclass
-class IntermediateStrategy:
-    """
-    IntermediateStrategy contains the subset of meta information for ShardingStrategy. It is 
-    to store the essential information regarding the tensor sharding and leave other meta information to OperatorHandler.
-
-    Args:
-        name (str): name of the sharding strategy.
-        dim_partition_dict (Dict[Dict]): stores the tensor to dim partition dict mapping.
-        all_reduce_dims (List[int]): stores the dimensions which require an all-reduce operation.
-    """
-    name: str
-    dim_partition_dict: Dict[str, Dict[int, List[int]]]
-    all_reduce_axis: List[int] = None
-
-
-class StrategyGenerator(ABC):
-    """
-    StrategyGenerator is used to generate the same group of sharding strategies. 
-    """
-
-    def __init__(self, device_mesh: DeviceMesh):
-        self.device_mesh = device_mesh
-
-    @abstractmethod
-    def generate(self) -> List[IntermediateStrategy]:
-        """
-        """
-        pass
-
-    @abstractmethod
-    def validate(self, *args, **kwargs) -> bool:
-        """
-        Validate if the operands are of desired shape. 
-        If True, means this generator can be used for the current operation.
-        """
-        pass
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/unary_elementwise_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/unary_elementwise_handler.py
-import math
-import operator
-import warnings
-from copy import deepcopy
-from functools import reduce
-from typing import Dict, List
-
-import torch
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import \
-    ignore_sharding_exception
-from colossalai.auto_parallel.tensor_shard.deprecated.constants import \
-    INFINITY_COST
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector)
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.tensor.sharding_spec import ShardingSpec
-
-from .operator_handler import OperatorHandler
-
-__all__ = ['UnaryElementwiseHandler']
-
-
-class UnaryElementwiseHandler(OperatorHandler):
-    """
-    An OperatorHandler which deals with the sharding strategies of UnaryElementwiseOp.
-    """
-
-    def __init__(self, *args, **kwargs):
-        super().__init__(*args, **kwargs)
-        if self.node.op == 'call_module':
-            target = self.node.target
-            submod = self.node.graph.owning_module.get_submodule(target)
-            submod_type = type(submod)
-            if submod_type == torch.nn.Dropout:
-                print(f'predecessor nodes of dropout node are {self.predecessor_node}')
-        input_nodes_len = 0
-        for check_node in self.predecessor_node:
-            if isinstance(check_node._meta_data, torch.Tensor):
-                input_nodes_len += 1
-        assert input_nodes_len == 1, f'Temporally, we just support single input element-wise op, node name is {self.node}, node args is {self.node.args}.'
-        self.input_data = self.predecessor_node[0]._meta_data
-        self.input_node = self.predecessor_node[0]
-        self.output_data = self.node._meta_data
-
-    def _generate_compute_cost(self, *args, **kwargs):
-        return super()._generate_compute_cost(*args, **kwargs)
-
-    @ignore_sharding_exception
-    def register_strategy(self):
-        # TODO: integrate element-wise func and module together
-        # create sharding strategy for element-wise function
-
-        # For element-wise function, we keep the sharding spec of output node same as
-        # the input. Therefore, the different strategies of input node with same
-        # output sharding spec will generate same strategy for element-wise function.
-
-        for index, strategy in enumerate(self.input_node.strategies_vector):
-            # It looks a little bit confusing, the input of the processing node
-            # is the output of the input_node.
-            input_sharding_spec = strategy.output_sharding_spec
-            assert isinstance(input_sharding_spec, ShardingSpec), f'The input node should NOT be a tuple of tensor.'
-
-            dim_partition_dict = deepcopy(input_sharding_spec.dim_partition_dict)
-            try:
-                output_sharding_spec = self._generate_sharding_spec(self.output_data, dim_partition_dict)
-            except AssertionError as e:
-                warnings.warn(f'{e}')
-                continue
-            # add index into name to pass the duplicated check
-            # we keep same strategies with different name for node merging, and it will not increase the searching space,
-            # because in solver, this node will be merged into other nodes, and solver will not create a new variable for this node.
-            name = f'{input_sharding_spec.sharding_sequence} -> {output_sharding_spec.sharding_sequence}_{index}'
-            # TODO: use meta_info_prop to profile memory cost and compute cost
-            compute_cost = self.output_data.numel()
-            memory_cost = 0
-
-            resharding_costs = self._generate_resharding_costs([input_sharding_spec])
-
-            # to prevent the resharding happening, set their resharding cost to inf.
-            resharding_costs[self.input_node] = [
-                0 if cost == 0 else INFINITY_COST for cost in resharding_costs[self.input_node]
-            ]
-            sharding_strategy = ShardingStrategy(name,
-                                                 output_sharding_spec,
-                                                 compute_cost=compute_cost,
-                                                 memory_cost=memory_cost,
-                                                 resharding_costs=resharding_costs,
-                                                 input_shardings=[input_sharding_spec])
-            self.strategies_vector.append(sharding_strategy)
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/where_handler.py
-import operator
-import warnings
-from copy import deepcopy
-from functools import reduce
-from typing import Dict, List
-
-import torch
-
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import (enumerate_all_possible_1d_sharding,
-                                                                     enumerate_all_possible_2d_sharding,
-                                                                     ignore_sharding_exception)
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector)
-from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.tensor.sharding_spec import ShardingSpec
-
-from .operator_handler import OperatorHandler
-
-__all__ = ['WhereHandler']
-
-
-class WhereHandler(OperatorHandler):
-    """
-    An OperatorHandler which deals with the sharding strategies of torch.where.
-    """
-
-    def __init__(self, *args, **kwargs):
-        # TODO: x or y could be scalar
-        super().__init__(*args, **kwargs)
-        assert len(self.predecessor_node) == 3
-        self.condition_data = self.predecessor_node[0]._meta_data
-        self.x_data = self.predecessor_node[1]._meta_data
-        self.y_data = self.predecessor_node[2]._meta_data
-        self.condition = self.predecessor_node[0]
-        self.x = self.predecessor_node[1]
-        self.y = self.predecessor_node[2]
-        self.output_data = self.node._meta_data
-
-    def _generate_sharding_spec(self, input_: torch.Tensor, dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec:
-        shape = list(input_.shape)
-
-        # padding the shape to the same length as output_data
-        while len(shape) < self.output_data.dim():
-            shape.insert(0, 1)
-        shape = torch.Size(shape)
-
-        # if the sharding happens on a size one dimension, we should record it as R.
-        processed_dim_partition_dict = deepcopy(dim_partition_dict)
-        for dim_index, _ in dim_partition_dict.items():
-            if shape[dim_index] == 1:
-                processed_dim_partition_dict.pop(dim_index)
-        for dim_index, sharding_index_list in processed_dim_partition_dict.items():
-            sharding_list = [self.device_mesh.mesh_shape[sharding_index] for sharding_index in sharding_index_list]
-            sharding_size = reduce(operator.mul, sharding_list, 1)
-            assert shape[
-                dim_index] % sharding_size == 0, f'we cannot shard the {dim_index} dimension of tensor into {sharding_size} partitions.'
-        sharding_spec = ShardingSpec(device_mesh=self.device_mesh,
-                                     entire_shape=shape,
-                                     dim_partition_dict=processed_dim_partition_dict)
-
-        return sharding_spec
-
-    def _generate_compute_cost(self, total_sharding_size):
-        lhs_matrix_shape = self.lhs_data.shape[-2:]
-        rhs_matrix_shape = self.rhs_data.shape[-2:]
-        batch_dimensions_shape = self.output_data.shape[:-2]
-        batch_dimensions_product = reduce(operator.mul, batch_dimensions_shape, 1)
-        compute_cost = reduce(
-            operator.mul, lhs_matrix_shape) * rhs_matrix_shape[0] * batch_dimensions_product * 2 / total_sharding_size
-        return compute_cost
-
-    def _generate_resharding_costs(self, sharding_specs):
-        # The resharding_cost of weight is counted due to sharing weight cases.
-        dtype = self.node._meta_data.dtype
-        nodes = self.predecessor_node
-        resharding_costs = {}
-        size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
-
-        # shape consistency manager is a singleton class
-        shape_consistency_manager = ShapeConsistencyManager()
-
-        for input_node, input_spec in zip(nodes, sharding_specs):
-            resharding_costs[input_node] = []
-            for strategy in input_node.strategies_vector:
-                input_sharding_spec = strategy.output_sharding_spec
-                assert isinstance(input_sharding_spec, ShardingSpec), f'The input node should NOT be a tuple of tensor.'
-                # if the input shape is smaller than the target input, we will fill the input to the same length as target.
-                # Then, use the padded input sharding spec to compute the resharding cost.
-                if len(input_sharding_spec.entire_shape) < len(input_spec.entire_shape):
-                    new_entire_shape = list(input_sharding_spec.entire_shape)
-                    while len(new_entire_shape) < len(input_spec.entire_shape):
-                        new_entire_shape.insert(0, 1)
-                    new_entire_shape = torch.Size(new_entire_shape)
-                    new_device_mesh = input_sharding_spec.device_mesh
-                    new_dim_partition_dict = input_sharding_spec.dim_partition_dict
-                    input_sharding_spec = ShardingSpec(device_mesh=new_device_mesh,
-                                                       entire_shape=new_entire_shape,
-                                                       dim_partition_dict=new_dim_partition_dict)
-
-                # compute the resharding cost
-                _, _, total_resharding_cost = shape_consistency_manager.shape_consistency(
-                    input_sharding_spec, input_spec)
-                total_resharding_cost = total_resharding_cost['total']
-                # we need multiply the size of elem dtype to get correct communication cost
-                resharding_cost = total_resharding_cost * size_per_elem_bytes
-                resharding_costs[input_node].append(resharding_cost)
-
-        return resharding_costs
-
-    def _convert_partition_dict_to_sharding_spec(self, dim_partition_list):
-
-        sharding_spec_list = []
-        check_duplicated_list = []
-        for output_dim_partition_dict in dim_partition_list:
-            try:
-                output_sharding_spec = self._generate_sharding_spec(self.output_data, output_dim_partition_dict)
-            except AssertionError as e:
-                warnings.warn(f'{e}')
-                break
-            sharding_seq = output_sharding_spec.sharding_sequence
-            if sharding_seq not in check_duplicated_list:
-                check_duplicated_list.append(sharding_seq)
-                sharding_spec_list.append(output_sharding_spec)
-
-        return sharding_spec_list
-
-    def _enumerate_all_possible_output(self, mesh_dim_0, mesh_dim_1):
-        # use mesh_dim_0, mesh_dim_1 instead of constant 0, 1 in here for N-D device mesh scaliablity.
-
-        output_dim_partition_list = []
-        dim_size = self.output_data.dim()
-        # enumerate all the 2D sharding cases
-        sharding_list_2d = enumerate_all_possible_2d_sharding(mesh_dim_0, mesh_dim_1, dim_size)
-        output_dim_partition_list.extend(sharding_list_2d)
-
-        # enumerate all the 1D sharding cases
-        sharding_list_1d_on_dim_0 = enumerate_all_possible_1d_sharding(mesh_dim_0, dim_size)
-        output_dim_partition_list.extend(sharding_list_1d_on_dim_0)
-        sharding_list_1d_on_dim_1 = enumerate_all_possible_1d_sharding(mesh_dim_1, dim_size)
-        output_dim_partition_list.extend(sharding_list_1d_on_dim_1)
-
-        # add empty dict for fully replicated case
-        output_dim_partition_list.append({})
-        output_sharding_spec_list = self._convert_partition_dict_to_sharding_spec(output_dim_partition_list)
-
-        return output_sharding_spec_list
-
-    @ignore_sharding_exception
-    def _register_strategy(self, output_sharding_spec):
-        dim_partition_dict_for_input = output_sharding_spec.dim_partition_dict
-        sharding_spec_for_condition = self._generate_sharding_spec(self.condition_data, dim_partition_dict_for_input)
-        sharding_spec_for_x = self._generate_sharding_spec(self.x_data, dim_partition_dict_for_input)
-        sharding_spec_for_y = self._generate_sharding_spec(self.y_data, dim_partition_dict_for_input)
-
-        name = f'{output_sharding_spec.sharding_sequence} = {sharding_spec_for_condition.sharding_sequence} x {sharding_spec_for_x.sharding_sequence} x {sharding_spec_for_y.sharding_sequence}'
-        dim_partition_dict_for_output = output_sharding_spec.dim_partition_dict
-
-        # generate resharding cost for this strategy
-        resharding_costs = self._generate_resharding_costs(
-            [sharding_spec_for_condition, sharding_spec_for_x, sharding_spec_for_y])
-
-        # compute the computation cost of this strategy
-        sharding_dims = []
-        for mesh_dims in dim_partition_dict_for_output.values():
-            for mesh_dim in mesh_dims:
-                sharding_dims.append(self.device_mesh.shape[mesh_dim])
-        sharding_size = reduce(operator.mul, sharding_dims, 1)
-        memory_cost = self.output_data.numel() / sharding_size
-        compute_cost = memory_cost
-        communication_cost = 0
-
-        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=output_sharding_spec,
-                                               compute_cost=compute_cost,
-                                               communication_cost=communication_cost,
-                                               memory_cost=memory_cost,
-                                               resharding_costs=resharding_costs,
-                                               input_shardings=(sharding_spec_for_condition, sharding_spec_for_x,
-                                                                sharding_spec_for_y))
-
-        self.strategies_vector.append(sharding_strategies)
-
-    def register_strategy(self) -> StrategiesVector:
-        MESH_DIM_LIST = [0, 1]
-        output_sharding_specs = self._enumerate_all_possible_output(MESH_DIM_LIST[0], MESH_DIM_LIST[1])
-        for output_sharding_spec in output_sharding_specs:
-            self._register_strategy(output_sharding_spec)
--- a/colossalai/auto_parallel/tensor_shard/deprecated/options.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/options.py
-from dataclasses import dataclass
-
-__all__ = ['SolverOptions']
-
-
-@dataclass
-class SolverOptions:
-    """
-    SolverOptions is a dataclass used to configure the preferences for the parallel execution plan search.
-    """
-    fast: bool = False
--- a/colossalai/auto_parallel/tensor_shard/deprecated/sharding_strategy.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/sharding_strategy.py
-from copy import deepcopy
-from dataclasses import dataclass
-from abc import ABC, abstractmethod
-from enum import Enum
-import operator
-import torch
-from functools import reduce
-
-from colossalai.device.device_mesh import DeviceMesh
-from colossalai.tensor.sharding_spec import ShardingSpec
-from colossalai.tensor.shape_consistency import CollectiveCommPattern, CommSpec
-from typing import Dict, List, Union, Tuple, Any
-from torch.fx.node import Node
-from .constants import *
-
-__all__ = ['ShardingStrategy', 'StrategiesVector']
-
-
-@dataclass
-class ShardingStrategy:
-    '''
-    ShardingStrategy is a structure containing sharding strategies of inputs and output of this node
-    and costs information using in solver.
-
-    Argument:
-        name(str): express the sharding strategies in string, such as 'S0S1 = S0R x RS1'.
-        output_sharding_spec(ShardingSpec): ShardingSpec of the output node.
-        compute_cost(float): Computation cost to complete this strategy.(default to 0)
-        communication_cost(float): Communication cost to complete this strategy.(default to 0)
-        memory_cost(float): Memory cost of the output node using this strategy.(default to 0)
-        resharding_costs(Dict[int, List[float]]): resharding_cost[i][j] means the cost of i-th argument in the output node argument list
-                                                  with j-th strategy in its strategies_vector transforms to sharding spec wanted in this
-                                                  strategy.(default to None)
-        input_shardings(List(ShardingSpec)): The ShardingSpecs of the input nodes.
-    '''
-
-    name: str
-    # TODO: output of fx node,such as torch.var_mean, could be a tuple, so we cannot simply suppose it is a tensor.
-    output_sharding_spec: Union[ShardingSpec, Tuple[ShardingSpec]]
-    compute_cost: float = 0.
-    communication_cost: float = 0.
-    memory_cost: float = 0.
-    resharding_costs: Dict[Node, List[float]] = None
-    # sometimes the input node could be a tuple of nodes, but most of op won't accept tuple of node as input.
-    # Therefore, we could process them at the specific op(operator.getitem)
-    input_shardings: List[ShardingSpec] = None
-
-
-class StrategiesVector(list):
-    '''
-    Each node in fx graph will have a corresponding StrategiesVector, to store all the possible
-    strategies of the node.
-
-    Argument:
-        node (Node): node for which the list of sharding strategies are generated.
-    '''
-
-    def __init__(self, node: Node):
-        super().__init__()
-        self.node = node
-        # fetch its input and output nodes
-        # TODO: placeholder input nodes
-        self.predecessor_nodes = list(node._input_nodes.keys())
-        if self.node.op == 'output':
-            self.predecessor_nodes = list(node._input_nodes.keys())[:1]
-        self.successor_nodes = list(node.users.keys())
-
-    def check_merge(self):
-        merge_label = False
-        if self.node.op == 'call_module':
-            target = self.node.target
-            root_module = self.node.graph.owning_module
-            submod = root_module.get_submodule(target)
-            submod_type = type(submod)
-            # merge elementwise module node into source nodes
-            # we could merge element-wise op, because the output sharding spec is always same as the input sharding spec.
-            if submod_type in ELEMENTWISE_MODULE_OP:
-                merge_label = True
-
-        if self.node.op == 'call_function':
-            # we could merge element-wise op, because the output sharding spec is always same as the input sharding spec.
-            if self.node.target in ELEMENTWISE_FUNC_OP:
-                merge_label = True
-            # we could merge bcast op if the rhs is a scalar, because it will fall back to the element-wise case.
-            if self.node.target in BCAST_FUNC_OP and len(self.predecessor_nodes) == 1:
-                merge_label = True
-            # we could merge reshape op, because the output sharding spec of reshape op is always fully replicated.
-            if self.node.target in RESHAPE_FUNC_OP:
-                merge_label = True
-
-        return merge_label
--- a/colossalai/auto_parallel/tensor_shard/deprecated/solver.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/solver.py
-import multiprocessing
-import time
-import warnings
-from typing import Dict
-
-import numpy as np
-from torch.fx.graph import Graph
-from torch.fx.node import Node
-
-from .constants import INFINITY_COST
-from .cost_graph import CostGraph
-from .graph_analysis import GraphAnalyser
-from .strategies_constructor import StrategiesConstructor
-
-try:
-    import pulp
-    from pulp import LpMinimize, LpProblem, LpStatus, LpVariable, lpDot, lpSum
-except:
-    warnings.warn(f'please install the pulp')
-
-__all___ = ['Solver']
-
-
-class Solver:
-
-    def __init__(self,
-                 graph: Graph,
-                 strategies_constructor: StrategiesConstructor,
-                 cost_graph: CostGraph,
-                 graph_analyser: GraphAnalyser,
-                 memory_budget: float = -1.0,
-                 solution_numbers: int = 1,
-                 memory_increasing_coefficient: float = 1.3):
-        '''
-        Solver class will integrate information provided by the components and use ILP solver to find a possible optimal strategies combination for target computing graph.
-
-        Argument:
-            graph: The computing graph to be optimized.
-            strategies_constructor: It will provide all the possible strategies for each node in the computing graph.
-            cost_graph: A graph data structure to simplify the edge cost graph.
-            graph_analyser: graph_analyser will analyse the graph to obtain the variable liveness information, which will be used to generate memory constraints.
-            memory_budget: Memory constraint for the solution.
-            solution_numbers: If solution_numbers is larger than one, solver will us a serious of solutions based on different memory budget.
-            memory_increasing_coefficient: If solution_numbers is larger than one, we will use this coefficient to generate new memory budget.
-        '''
-        self.graph = graph
-        self.strategies_constructor = strategies_constructor
-        self.cost_graph = cost_graph
-        self.graph_analyser = graph_analyser
-        self.leaf_strategies = self.strategies_constructor.leaf_strategies
-        self.nodes = [strategies_vector.node for strategies_vector in self.leaf_strategies]
-        self.strategy_map = self.strategies_constructor.strategy_map
-        self.memory_budget = memory_budget
-        self.solution_numbers = solution_numbers
-        if self.solution_numbers > 1:
-            self.memory_increasing_coefficient = memory_increasing_coefficient
-        else:
-            self.memory_increasing_coefficient = 1
-        self.liveness_list = self.graph_analyser.liveness_analysis()
-        self.node_index_dict = self._generate_node_index_dict()
-        # The last solution vector of auto sharding.
-        self.last_s_val = None
-        # The last objective value of the best ILP solution.
-        self.last_objective = None
-
-    def _recover_merged_node_strategy(self):
-        '''
-        During cost graph constructing, some nodes, such as unary element-wise node or ReshapeOp, were merged into the previous node.
-        Therefore, the index of those strategies are copied from the previous node. This method is used to recover the strategy index of those merged
-        node.
-        '''
-        for node_index, node in enumerate(self.nodes):
-            if node.strategies_vector.check_merge():
-                # the merged node has only one input, and its strategies follow the input sharding strategy
-                input_strategies_vector = node.args[0].strategies_vector
-                input_best_strategy_index = self.last_s_val[node_index - 1]
-                input_sharding_spec = input_strategies_vector[input_best_strategy_index].output_sharding_spec
-                for strategy_index, strategy in enumerate(node.strategies_vector):
-                    if strategy.input_shardings[0].sharding_sequence == input_sharding_spec.sharding_sequence:
-                        self.last_s_val[node_index] = strategy_index
-                        break
-
-    def _generate_node_index_dict(self) -> Dict[Node, int]:
-        node_index_dict = {}
-        for index, strategies_vector in enumerate(self.leaf_strategies):
-            node_index_dict[strategies_vector.node] = index
-        return node_index_dict
-
-    def _prepare_data_for_solver(self):
-        '''
-        Extract information from components for solver.
-        '''
-        node_nums = len(self.leaf_strategies)
-        memory_budget = self.memory_budget
-
-        # prepare strategies_len
-        strategies_len = []
-        for node in self.nodes:
-            strategies_len.append(self.cost_graph.node_lens[node])
-        strategies_len = np.array(strategies_len)
-
-        # prepare following_nodes
-        following_nodes = self.cost_graph.following_dict
-        index_following_nodes = {}
-        for src, target in following_nodes.items():
-            src_index = self.node_index_dict[src]
-            target_index = self.node_index_dict[target]
-            index_following_nodes[src_index] = target_index
-        following_nodes = index_following_nodes
-        for index in range(node_nums):
-            if index not in following_nodes:
-                following_nodes[index] = -1
-
-        # prepare edge_pairs and resharding costs
-        edge_pairs = []
-        resharding_costs = []
-        for pairs, edge_cost in self.cost_graph.edge_costs.items():
-            src_node = pairs[0]
-            dst_node = pairs[1]
-            src_node_index = self.node_index_dict[src_node]
-            dst_node_index = self.node_index_dict[dst_node]
-            edge_pairs.append(src_node_index)
-            edge_pairs.append(dst_node_index)
-
-            for i in range(strategies_len[src_node_index]):
-                for j in range(strategies_len[dst_node_index]):
-                    resharding_costs.append(edge_cost[(i, j)])
-        edge_pairs = np.array(edge_pairs)
-        resharding_costs = np.array(resharding_costs)
-
-        # prepare liveness_set
-        liveness_set = self.liveness_list
-
-        # omit alias_set now
-        alias_set = None
-        alias_convert_costs = None
-
-        # prepare compute_costs, communication_costs and memory_costs
-        compute_costs = []
-        communication_costs = []
-        memory_costs = []
-        extra_node_costs = self.cost_graph.extra_node_costs
-        for strategies_vector in self.leaf_strategies:
-            node = strategies_vector.node
-            for index, strategy in enumerate(strategies_vector):
-                compute_costs.append(strategy.compute_cost)
-                # node in extra_node_costs means it has some extra communication
-                # cost from node merging, so we need to add those extra communication
-                # cost into
-                if node in extra_node_costs:
-                    origin_communication_cost = strategy.communication_cost
-                    extra_node_cost = extra_node_costs[node][index]
-                    communication_cost = origin_communication_cost + extra_node_cost
-                    communication_costs.append(communication_cost)
-                else:
-                    communication_costs.append(strategy.communication_cost)
-                # temporarily we just consider the forward memory cost
-                memory_cost = strategy.memory_cost
-                if isinstance(memory_cost, tuple):
-                    memory_costs.append(memory_cost[0])
-                else:
-                    memory_costs.append(memory_cost)
-        compute_costs = np.array(compute_costs)
-        communication_costs = np.array(communication_costs)
-        memory_costs = np.array(memory_costs)
-
-        # omit initial value for nodes
-        s_init_np = None
-
-        return node_nums, memory_budget, strategies_len, following_nodes, edge_pairs, alias_set, liveness_set, compute_costs, communication_costs, memory_costs, resharding_costs, alias_convert_costs, s_init_np
-
-    def _call_solver_serialized_args(self,
-                                     node_nums,
-                                     memory_budget,
-                                     strategies_len,
-                                     following_nodes,
-                                     edge_pairs,
-                                     alias_set,
-                                     liveness_set,
-                                     compute_costs,
-                                     communication_costs,
-                                     memory_costs,
-                                     resharding_costs,
-                                     alias_convert_costs,
-                                     s_init_np=None):
-        """
-        Call the solver with serialized arguments.
-        """
-
-        tic = time.time()
-
-        for x in [strategies_len, edge_pairs, compute_costs, communication_costs, memory_costs, resharding_costs]:
-            assert isinstance(x, np.ndarray)
-        assert len(strategies_len) == node_nums, "strategies_len"
-
-        def get_non_zero_index(binary_vector):
-            """
-            Get the index of non-zero item in a vector.
-            """
-            ct = 0
-            ret = None
-            for i, elem in enumerate(binary_vector):
-                if pulp.value(elem):
-                    ret = i
-                    ct += 1
-
-            assert ct == 1
-            return ret
-
-        # 0. Unpack flatten numpy arrays
-        s_follow = following_nodes
-
-        E = edge_pairs.reshape((-1, 2))    # noqa
-        r = []
-        pt = 0
-        edge_set = set()
-        for (i, j) in E:
-            prod_length = strategies_len[i] * strategies_len[j]
-
-            if (i, j) in edge_set:
-                raise ValueError(f"Duplicated edges: {(i, j)}")
-
-            edge_set.add((i, j))
-            r.append(resharding_costs[pt:pt + prod_length])
-            pt += prod_length
-        assert pt == len(resharding_costs)
-
-        ######################
-        # omit alias set now #
-        ######################
-
-        # A = alias_set.reshape((-1, 2))  # noqa
-        # for (i, j) in A:
-        #     prod_length = strategies_len[i] * strategies_len[j]
-        #     v.append(alias_convert_costs[pt:pt + prod_length])
-        #     pt += prod_length
-        # assert pt == len(alias_convert_costs)
-
-        # L = []  # noqa
-        # pt = node_nums
-        # for i in range(node_nums):
-        #     length = liveness_set[i]
-        #     L.append(liveness_set[pt:pt + length])
-        #     pt += length
-        # assert pt == len(liveness_set)
-        v = []
-        pt = 0
-
-        c = []
-        d = []
-        m = []
-        pt = 0
-        for i in range(node_nums):
-            length = strategies_len[i]
-            c.append(compute_costs[pt:pt + length])
-            d.append(communication_costs[pt:pt + length])
-            m.append(memory_costs[pt:pt + length])
-            pt += length
-        assert pt == len(compute_costs), f"{pt} == {len(compute_costs)}"
-        assert pt == len(communication_costs), f"{pt} == {len(communication_costs)}"
-        assert pt == len(memory_costs), f"{pt} == {len(memory_costs)}"
-
-        # 1. Create variables
-
-        #############################
-        # create variables for node #
-        #############################
-        s = []
-        num_nodes = 0
-        reverse_follow_backpatch = []
-        for i in range(node_nums):
-            if s_follow[i] < 0:
-                if strategies_len[i] == 1:
-                    s.append([1])
-                else:
-                    num_nodes += 1
-                    s.append(LpVariable.matrix(f"s[{i}]", (range(strategies_len[i]),), cat="Binary"))
-            else:
-                if s_follow[i] < len(s):
-                    s.append(s[s_follow[i]])
-                else:
-                    s.append(None)
-                    reverse_follow_backpatch.append(i)
-
-        for i in reverse_follow_backpatch:
-            s[i] = s[s_follow[i]]
-
-        #############################
-        # create variables for edge #
-        #############################
-        e = []
-        num_edges = 0
-        for (idx, (i, j)) in enumerate(E):
-            if len(s[i]) == 1:
-                e.append(s[j])
-            elif len(s[j]) == 1:
-                e.append(s[i])
-            else:
-                num_edges += 1
-                e.append(LpVariable.matrix(f"e[{i},{j}]", (range(len(s[i]) * len(s[j])),), cat="Binary"))
-            assert len(e[idx]) == len(r[idx])
-        for element in s:
-            assert len(element) > 0
-        # 2. Set initial value
-        ######################################
-        # set a initial value for warm start #
-        ######################################
-        if s_init_np is not None:
-            s_init = s_init_np.reshape((-1, 3))
-            for (idx, value, fix) in s_init:
-                for i in range(len(s[idx])):
-                    s[idx][i].setInitialValue(i == value)
-                    if fix:
-                        s[idx][i].fixValue()
-
-        # 3. Objective
-        prob = LpProblem("myProblem", LpMinimize)
-        ###################################################################
-        # computing the node cost(computing cost and communication cost)  #
-        ###################################################################
-        obj = 0
-        for i in range(node_nums):
-            assert len(s[i]) == len(c[i])
-            assert len(s[i]) == len(d[i])
-
-            obj += lpDot(s[i], c[i]) + lpDot(s[i], d[i])
-
-        #############################################
-        # computing the edge cost(resharding cost)  #
-        #############################################
-        for i in range(len(E)):
-            assert len(e[i]) == len(r[i])
-            obj += lpDot(e[i], r[i])
-
-        prob += obj
-
-        # 4. Constraints
-        # (a). specified by `cat="Binary"`
-
-        # (b)
-        #################################################
-        # make sure each node only choose one strategy  #
-        #################################################
-        for i in range(node_nums):
-            if s_follow[i] < 0:
-                prob += lpSum(s[i]) == 1
-
-        # (c)
-        #################################################
-        # compute memory consumption with liveness set  #
-        #################################################
-        if memory_budget > 0:
-            for liveness_stage in liveness_set:
-                mem = 0
-                for live_variable in liveness_stage.unique_live_vars:
-                    node_index = self.node_index_dict[live_variable.node]
-                    mem += lpSum(s[node_index][j] * m[node_index][j] for j in range(len(s[node_index])))
-                prob += mem <= memory_budget
-
-        # (d). specified by `cat="Binary"`
-
-        for (idx, (i, j)) in enumerate(E):
-            if strategies_len[i] == 1 or strategies_len[j] == 1:
-                continue
-
-            # (e)
-            prob += lpSum(e[idx]) == 1
-
-            # (f)
-            for row in range(len(s[i])):
-                C = len(s[j])    # noqa
-                prob += lpSum(e[idx][row * C + col] for col in range(0, C)) <= s[i][row]
-
-            # (g)
-            for col in range(len(s[j])):
-                R = len(s[i])    # noqa
-                C = len(s[j])    # noqa
-                prob += lpSum(e[idx][row * C + col] for row in range(0, R)) <= s[j][col]
-
-        # (h)
-        ######################
-        # omit alias set now #
-        ######################
-
-        # alias_set = set()
-        # for (idx, (i, j)) in enumerate(A):
-        #     R = len(s[i])  # noqa
-        #     C = len(s[j])  # noqa
-        #     if (i, j) in alias_set:
-        #         raise ValueError(f"Duplicated edges: {(i, j)}")
-
-        #     alias_set.add((i, j))
-        #     alias_set.add((j, i))
-
-        #     for row in range(len(s[i])):
-        #         for col in range(len(s[j])):
-        #             if v[idx][row * C + col] > 0.5:
-        #                 prob += s[i][row] + s[j][col] <= 1
-
-        verbose = True
-
-        msg = verbose
-        time_limit = 600
-        assert "COIN_CMD" in pulp.listSolvers(
-            onlyAvailable=True), ("Please install ILP solvers by 'sudo apt install coinor-cbc'")
-
-        solver = pulp.COIN_CMD(mip=True, msg=msg, timeLimit=time_limit, threads=multiprocessing.cpu_count())
-        # solver = pulp.GLPK_CMD(mip=True, msg=msg, timeLimit=time_limit)
-        prob.solve(solver)
-
-        status = prob.status
-        objective = pulp.value(prob.objective)
-        objective = float(objective) if objective is not None else -1.0
-        if verbose:
-            print(f"ILP Status: {LpStatus[status]}\tObjective: {objective}\t"
-                  f"Time: {time.time() - tic}")
-            print(f"#nodes: {num_nodes},  #edges: {num_edges}")
-
-        if prob.status in [pulp.LpStatusInfeasible]:
-            raise RuntimeError("Cannot run the function under the given memory budget. "
-                               "Please increase the memory budget.")
-
-        # Get and check results
-        s_val = np.full((node_nums,), -1, dtype=np.int32)
-        for i in range(node_nums):
-            s_val[i] = get_non_zero_index(s[i])
-
-        e_val = np.full((len(E),), -1, dtype=np.int32)
-        for (idx, (i, j)) in enumerate(E):
-            e_val[idx] = get_non_zero_index(e[idx])
-            i_spec_index = e_val[idx] // len(s[j])
-            j_spec_index = e_val[idx] % len(s[j])
-            assert i_spec_index == s_val[i], f"e_val[{i}][{j}]"
-            assert j_spec_index == s_val[j], f"e_val[{i}][{j}]"
-            if verbose and r[idx][e_val[idx]] > 0:
-                print(f"Edge cost {(i, j)} : {r[idx][e_val[idx]]}")
-
-        self.last_s_val = list(s_val)
-        self._recover_merged_node_strategy()
-        self.last_objective = objective
-
-        if objective > INFINITY_COST:
-            warnings.warn("Detect unexpected behaviors in the auto-sharding pass.")
-
-        return self.last_s_val, e_val, self.last_objective, status
-
-    def call_solver_serialized_args(self):
-        """
-        Call the solver with serialized arguments and handle python errors. Additionally,
-        we could give a serious of solutions with different memory budget.
-        """
-        if self.solution_numbers == 1:
-            args = self._prepare_data_for_solver()
-            ret = self._call_solver_serialized_args(*args)
-
-            return ret
-
-        origin_memory_budget = self.memory_budget
-        memory_budget_list = [
-            origin_memory_budget * self.memory_increasing_coefficient**i for i in range(self.solution_numbers)
-        ]
-        ret_list = []
-        for memory_budget in memory_budget_list:
-            self.memory_budget = memory_budget
-            args = self._prepare_data_for_solver()
-            ret = self._call_solver_serialized_args(*args)
-            ret_list.append(ret)
-
-        return ret_list