Merge branch 'main' of https://github.com/oahzxl/ColossalAI into chunk

e532679c · oahzxl · c1492e50 · 7d5640b9 · e532679c · e532679c
Commit e532679c authored Jan 10, 2023 by oahzxl
20 changed files
--- a/colossalai/auto_parallel/passes/__init__.py
+++ b/colossalai/auto_parallel/passes/__init__.py
--- a/colossalai/auto_parallel/passes/comm_metainfo_pass.py
+++ b/colossalai/auto_parallel/passes/comm_metainfo_pass.py
+from typing import Dict
+
+import torch
+from torch.fx import GraphModule
+from torch.fx.node import Node
+
+from colossalai.auto_parallel.meta_profiler import MetaInfo
+from colossalai.auto_parallel.passes.runtime_apply_pass import runtime_apply, runtime_comm_spec_apply
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import MemoryCost, TrainCycleItem
+from colossalai.tensor.comm_spec import CommSpec
+from colossalai.tensor.shape_consistency import ShapeConsistencyManager
+from colossalai.tensor.sharding_spec import ShardingSpec
+
+shape_consistency_manager = ShapeConsistencyManager()
+
+
+def _construct_meta_info(node: Node, origin_sharding_spec: ShardingSpec,
+                         target_sharding_spec: ShardingSpec) -> MetaInfo:
+    """
+    Build the `MetaInfo` of converting a tensor from `origin_sharding_spec` to `target_sharding_spec`.
+
+    Args:
+        node (Node): the communication node, the first of its input nodes carrying `_meta_data`
+            provides the element size used to scale the costs.
+        origin_sharding_spec (ShardingSpec): the sharding spec before the conversion.
+        target_sharding_spec (ShardingSpec): the sharding spec after the conversion.
+
+    Returns:
+        MetaInfo: holds the scaled memory cost, the scaled compute cost and meta tensors
+            with the per-device sharded input and output shape.
+    """
+    # the shape consistency manager gives the comm action sequence and its cost
+    conversion = shape_consistency_manager.shape_consistency(origin_sharding_spec, target_sharding_spec)
+    _, comm_action_sequence, total_cost = conversion
+
+    meta_info = MetaInfo()
+    # NOTE: the cost in shape_consistency_manager.mem_cost is the count in number of numel
+    mem_cost = shape_consistency_manager.mem_cost(comm_action_sequence)
+
+    # the element size comes from the first input node that carries _meta_data
+    meta_carrier = next(n for n in node._input_nodes if hasattr(n, '_meta_data'))
+    bytes_per_element = meta_carrier._meta_data.element_size()
+
+    # scale the numel based memory cost by the element size
+    for phase_cost in (mem_cost.fwd, mem_cost.bwd):
+        phase_cost.activation *= bytes_per_element
+        phase_cost.temp *= bytes_per_element
+    mem_cost.total.activation *= bytes_per_element
+    meta_info.memory_cost = mem_cost
+
+    # the computation cost is scaled by the element size as well
+    scaled_costs = [total_cost[phase] * bytes_per_element for phase in ('forward', 'backward', 'total')]
+    meta_info.compute_cost = TrainCycleItem(*scaled_costs)
+
+    # meta tensors with the sharded shape stand for the input and output of the node
+    sharded_in_shape = origin_sharding_spec.get_sharded_shape_per_device()
+    sharded_out_shape = target_sharding_spec.get_sharded_shape_per_device()
+    meta_info.fwd_in = [torch.rand(sharded_in_shape, device='meta')]
+    meta_info.fwd_buffer = []
+    meta_info.fwd_out = [torch.rand(sharded_out_shape, device='meta')]
+
+    return meta_info
+
+
+def _runtime_apply_meta_info(node: Node, origin_spec_dict, sharding_spec_dict) -> MetaInfo:
+    """
+    Construct the `MetaInfo` of a shape consistency (`runtime_apply`) node.
+
+    Args:
+        node (Node): the node whose args[3] is the index of the origin node and args[4] the index of the user node.
+        origin_spec_dict: maps the node index to the origin sharding spec.
+        sharding_spec_dict: maps the node index to the target sharding specs, indexed by the user node index.
+
+    Returns:
+        MetaInfo: the meta info of converting the origin sharding spec into the target sharding spec.
+    """
+    call_args = node.args
+    node_index = call_args[3]
+    user_node_index = call_args[4]
+
+    # look up the spec before the conversion and the spec expected by the user node
+    source_spec = origin_spec_dict[node_index]
+    destination_spec = sharding_spec_dict[node_index][user_node_index]
+
+    return _construct_meta_info(node, source_spec, destination_spec)
+
+
+def _runtime_comm_spec_apply_meta_info(node: Node, comm_actions_dict: Dict) -> MetaInfo:
+    """
+    Construct the `MetaInfo` of a `runtime_comm_spec_apply` node.
+
+    Args:
+        node (Node): the node whose args[2] is the node index and args[3] the name of the operation data.
+        comm_actions_dict (Dict): maps the node index to a dict from operation data name to comm action.
+
+    Returns:
+        MetaInfo: the meta info of the communication described by the comm action.
+    """
+    # extract node_index and op_data_name
+    node_index, op_data_name = node.args[2], node.args[3]
+
+    comm_action = comm_actions_dict[node_index][op_data_name]
+    if isinstance(comm_action.comm_spec, CommSpec):
+        # this case is for all_reduce, there will be no memory cost
+        meta_info = MetaInfo()
+        # every item must be its own MemoryCost instance, passing the class itself would make
+        # later updates of the total cost modify the class instead of this meta info
+        meta_info.memory_cost = TrainCycleItem(MemoryCost(), MemoryCost(), MemoryCost())
+        # the element size comes from the first user node that carries _meta_data
+        output_node = next(n for n in node.users if hasattr(n, '_meta_data'))
+        element_length = output_node._meta_data.element_size()
+
+        total_cost = comm_action.comm_spec.get_comm_cost()
+        meta_info.compute_cost = TrainCycleItem(total_cost['forward'] * element_length,
+                                                total_cost['backward'] * element_length,
+                                                total_cost['total'] * element_length)
+
+        # the same sharded shape is used for the input and the output meta tensor
+        input_shape = output_shape = comm_action.comm_spec.sharding_spec.get_sharded_shape_per_device()
+        meta_info.fwd_in = [torch.rand(input_shape, device='meta')]
+        meta_info.fwd_buffer = []
+        meta_info.fwd_out = [torch.rand(output_shape, device='meta')]
+    else:
+        # this case will be handled by shape consistency manager
+        origin_sharding_spec, target_sharding_spec = comm_action.comm_spec['src_spec'], comm_action.comm_spec[
+            'tgt_spec']
+        meta_info = _construct_meta_info(node, origin_sharding_spec, target_sharding_spec)
+
+    return meta_info
+
+
+def comm_metainfo_pass(gm: GraphModule, sharding_spec_dict: Dict, origin_spec_dict: Dict,
+                       comm_actions_dict: Dict) -> GraphModule:
+    """
+    Attach `best_metainfo` to every communication node (runtime_apply, runtime_comm_spec_apply) in the graph.
+
+    Nodes with any other target are left untouched, the given module is returned.
+    """
+    for node in gm.graph.nodes:
+        target = node.target
+        if target == runtime_apply:
+            # shape consistency node
+            node.best_metainfo = _runtime_apply_meta_info(node, origin_spec_dict, sharding_spec_dict)
+        elif target == runtime_comm_spec_apply:
+            # comm spec apply node
+            node.best_metainfo = _runtime_comm_spec_apply_meta_info(node, comm_actions_dict)
+    return gm
--- a/colossalai/auto_parallel/passes/constants.py
+++ b/colossalai/auto_parallel/passes/constants.py
+import torch
+
+# Functions that `MetaInfoProp._is_inplace` (meta_info_prop.py) compares with the target of
+# `call_function` nodes.
+OUTPUT_SAVED_OPS = [torch.nn.functional.relu, torch.nn.functional.softmax, torch.flatten]
+
+# Module classes that `MetaInfoProp._is_inplace` (meta_info_prop.py) compares with the class of the
+# submodule targeted by `call_module` nodes.
+OUTPUT_SAVED_MOD = [
+    torch.nn.ReLU,
+    torch.nn.Softmax,
+]
--- a/colossalai/auto_parallel/passes/meta_info_prop.py
+++ b/colossalai/auto_parallel/passes/meta_info_prop.py
+import uuid
+from dataclasses import asdict
+from typing import List
+
+import torch
+import torch.fx
+from torch.fx import GraphModule
+from torch.fx.node import Node
+
+from colossalai.auto_parallel.meta_profiler import MetaInfo
+from colossalai.auto_parallel.passes.constants import OUTPUT_SAVED_MOD, OUTPUT_SAVED_OPS
+from colossalai.fx._compatibility import compatibility
+from colossalai.fx.profiler import GraphInfo
+
+
+def _normalize_tuple(x):
+    """
+    Return `x` unchanged if it is a tuple, otherwise wrap it into a one-element tuple.
+    """
+    return x if isinstance(x, tuple) else (x,)
+
+
+@compatibility(is_backward_compatible=False)
+class MetaInfoProp:
+    """
+    Propagate the meta information of every node of a `GraphModule` into `node.meta`.
+
+    Each node is dispatched on `node.op` to a handler which fills a `GraphInfo` and stores it,
+    converted into a dict, as `node.meta`.
+    """
+
+    def __init__(self, module: GraphModule) -> None:
+        """
+        Args:
+            module (GraphModule): the module whose graph nodes will be annotated.
+        """
+        self.module = module
+        # dispatch table from node.op to its handler
+        self.func_dict = {
+            'placeholder': self.placeholder_handler,
+            'get_attr': self.get_attr_handler,
+            'output': self.output_handler,
+            'call_function': self.node_handler,
+            'call_module': self.node_handler,
+            'call_method': self.node_handler,
+        }
+
+    def _set_data_ptr(self, x):
+        """
+        Set uuid to tensor
+
+        Only a `torch.Tensor` whose `data_ptr()` is falsy gets a new uuid, anything else is left as it is.
+        """
+        if isinstance(x, torch.Tensor):
+            if not x.data_ptr():
+                data_ptr = uuid.uuid4()
+                # shadow the data_ptr method on this tensor so that it returns the uuid
+                x.data_ptr = lambda: data_ptr
+
+    def _is_inplace(self, node: Node):
+        """
+        Check if the node is inplace operation.
+
+        A `call_module` node is checked by the class of its submodule against OUTPUT_SAVED_MOD,
+        a `call_function` node by its target against OUTPUT_SAVED_OPS, any other node returns False.
+        """
+        if node.op == 'call_module':
+            return node.graph.owning_module.get_submodule(node.target).__class__ in OUTPUT_SAVED_MOD
+        elif node.op == "call_function":
+            return node.target in OUTPUT_SAVED_OPS
+        return False
+
+    def run(self) -> GraphModule:
+        """
+        Run the meta information propagation pass on the module.
+
+        Returns:
+            GraphModule: the module given to the constructor, its nodes carry the propagated `meta`.
+        """
+        for node in self.module.graph.nodes:
+            node: Node
+            self.func_dict[node.op](node)
+        # the signature promises a GraphModule, so return the annotated module instead of None
+        return self.module
+
+    @compatibility(is_backward_compatible=False)
+    def placeholder_handler(self, node: Node) -> None:
+        """
+        Handle the placeholder node.
+
+        The `_meta_data` of the node, if there is any, becomes the `fwd_out` of the node.
+        """
+        graph_info = GraphInfo()
+        out = _normalize_tuple(getattr(node, '_meta_data', None))
+        # a missing _meta_data is normalized into (None,), which means no output
+        graph_info.fwd_out = list(out) if out[0] is not None else []
+        node.meta = {**asdict(graph_info)}
+
+    @compatibility(is_backward_compatible=False)
+    def get_attr_handler(self, node: Node) -> None:
+        """
+        Handle the get_attr node.
+
+        The node gets the dict of a default constructed `GraphInfo` as its meta.
+        """
+        graph_info = GraphInfo()
+        node.meta = {**asdict(graph_info)}
+
+    @compatibility(is_backward_compatible=False)
+    def output_handler(self, node: Node) -> None:
+        """
+        Handle the output node.
+
+        The `fwd_out` of all input nodes with a non-empty meta are collected as the `fwd_in` of the node.
+        """
+        graph_info = GraphInfo()
+        output_tensors = []
+        for par in node._input_nodes:
+            if par.meta:
+                output_tensors += par.meta["fwd_out"]
+        graph_info.fwd_in = output_tensors
+        node.meta = {**asdict(graph_info)}
+
+    @compatibility(is_backward_compatible=False)
+    def node_handler(self, node: Node) -> None:
+        """
+        Handle other kind of nodes
+
+        The node must carry `best_metainfo`. Its input, buffer and output tensors get a data_ptr,
+        then the tensors, the memory cost and the compute cost are stored into `node.meta`.
+        """
+        assert hasattr(node, 'best_metainfo'), f"Cannot find best_metainfo in node {node}, {node.op}"
+        graph_info = GraphInfo()
+        meta_info = node.best_metainfo
+        meta_info: MetaInfo
+
+        # set data_ptr for input_tensor in MetaInfo class
+        input_tensors: List[torch.Tensor] = meta_info.fwd_in
+        buffer_tensors: List[torch.Tensor] = meta_info.fwd_buffer
+        output_tensors: List[torch.Tensor] = meta_info.fwd_out
+
+        if self._is_inplace(node):
+            # inplace operation will not create new tensor, and it only has one parent node
+            # TODO: Verify this observation
+            # set data_ptr for input_tensor, buffer_tensor and output_tensor of current node
+            parent_node = list(node._input_nodes.keys())[0]
+            parent_tensor = parent_node.meta.get("fwd_out")[0]
+            parent_tensor: torch.Tensor
+            for tensor in input_tensors:
+                tensor.data_ptr = parent_tensor.data_ptr
+            for tensor in buffer_tensors:
+                tensor.data_ptr = parent_tensor.data_ptr
+            for tensor in output_tensors:
+                tensor.data_ptr = parent_tensor.data_ptr
+
+        else:
+            for par in node._input_nodes:
+                # set data_ptr for the input_tensor of current node from the output_tensor of its parent node
+                for tensor in par.meta.get("fwd_out", []):
+                    tensor: torch.Tensor
+                    # the first input tensor without data_ptr and with the same shape is taken as the match
+                    target_input_tensor = next(
+                        (x for x in input_tensors if not x.data_ptr() and x.shape == tensor.shape), None)
+                    if target_input_tensor is not None:
+                        target_input_tensor.data_ptr = tensor.data_ptr
+
+            # set data_ptr for tensor in input_tensor that is not set
+            for tensor in input_tensors:
+                if not tensor.data_ptr():
+                    self._set_data_ptr(tensor)
+
+            # set data_ptr for buffer_tensor
+            for tensor in buffer_tensors:
+                self._set_data_ptr(tensor)
+
+            # set data_ptr for output_tensor
+            for tensor in output_tensors:
+                self._set_data_ptr(tensor)
+
+        # attach them to graph_info
+        graph_info.fwd_in = input_tensors
+        graph_info.fwd_tmp = buffer_tensors
+        graph_info.fwd_out = output_tensors
+
+        # fetch other memory informations
+        memory_cost = meta_info.memory_cost
+        graph_info.fwd_mem_tmp = memory_cost.fwd.temp
+        graph_info.fwd_mem_out = memory_cost.fwd.activation
+        graph_info.bwd_mem_tmp = memory_cost.bwd.temp
+        graph_info.bwd_mem_out = memory_cost.bwd.activation
+
+        # fetch flop information
+        # here we use fwd_time and bwd_time to deal with the case that
+        # communication cost is a float
+        compute_cost = meta_info.compute_cost
+        graph_info.fwd_time = compute_cost.fwd
+        graph_info.bwd_time = compute_cost.bwd
+
+        node.meta = {**asdict(graph_info)}
--- a/colossalai/fx/passes/experimental/adding_shape_consistency_pass_v2.py
+++ b/colossalai/fx/passes/experimental/adding_shape_consistency_pass_v2.py
-import builtins
-import copy
-import operator
-from ast import NodeTransformer
 from copy import deepcopy
-from typing import List
+from typing import Dict, List

 import torch
-from torch.fx import symbolic_trace
 from torch.fx.node import Node

-from colossalai.auto_parallel.tensor_shard.sharding_strategy import CommAction, CommType, OperationDataType
+from colossalai.auto_parallel.meta_profiler import MetaInfo
+from colossalai.auto_parallel.tensor_shard.sharding_strategy import (
+    CommAction,
+    CommType,
+    OperationData,
+    OperationDataType,
+    TrainCycleItem,
+)
 from colossalai.device.device_mesh import DeviceMesh
-from colossalai.fx.passes.split_module import split_module
-from colossalai.tensor.comm_spec import CollectiveCommPattern, CommSpec, _all_reduce, pattern_to_func_dict
+from colossalai.tensor.comm_spec import CommSpec
 from colossalai.tensor.shape_consistency import ShapeConsistencyManager
-from colossalai.tensor.sharding_spec import ShardingSpec, _DimSpec
+from colossalai.tensor.sharding_spec import ShardingSpec

 shape_consistency_manager = ShapeConsistencyManager()


-def runtime_apply(node, origin_dict, input_dict, node_index, user_node_index):
+def runtime_apply(node: Node, origin_dict: Dict, input_dict: Dict, node_index: int, user_node_index: int):
+    """
+    This method will be invoked during runtime to do the shape consistency, which make sure the activations is converted into
+    the user node expected form.
+    """
    origin_sharding_spec = origin_dict[node_index]
    target_sharding_spec = input_dict[node_index][user_node_index]
    return shape_consistency_manager.apply_for_autoparallel_runtime(node, origin_sharding_spec, target_sharding_spec)


-def runtime_comm_spec_apply(tensor, comm_actions_dict, node_index, op_data):
+def runtime_apply_for_iterable_object(node: Node, origin_dict: Dict, input_dict: Dict, node_index: int,
+                                      user_node_index: int):
+    """
+    This method will be invoked during runtime to do the shape consistency for activations in type of tuple or list.
+    Each element is converted from its origin sharding spec into the form expected by the user node, and the
+    converted elements are packed into the same container type as the input.
+    """
+    spec_pairs = zip(origin_dict[node_index], input_dict[node_index][user_node_index])
+    converted = [
+        shape_consistency_manager.apply_for_autoparallel_runtime(node[position], source_spec, destination_spec)
+        for position, (source_spec, destination_spec) in enumerate(spec_pairs)
+    ]
+    # rebuild the container with the type of the incoming object
+    return type(node)(converted)
+

-    comm_action = comm_actions_dict[node_index][op_data]
+def runtime_comm_spec_apply(tensor: torch.Tensor, comm_actions_dict: Dict, node_index: int, op_data_name: str):
+    """
+    This method will be invoked during runtime to apply the comm action following the instruction of comm spec.
+    """
+    comm_action = comm_actions_dict[node_index][op_data_name]
    if isinstance(comm_action.comm_spec, CommSpec):
        rst = comm_action.comm_spec.covert_spec_to_action(tensor)
    else:
@@ -37,94 +61,11 @@ def runtime_comm_spec_apply(tensor, comm_actions_dict, node_index, op_data):
    return rst


-def solution_annotatation_pass(gm: torch.fx.GraphModule, solution: List[int], device_mesh):
-    mod_graph = gm.graph
-    nodes = tuple(mod_graph.nodes)
-
-    # the dict to get origin sharding spec of node
-    origin_node_sharding_spec_dict = {}
-    for node_index, (node, strategy_index) in enumerate(zip(nodes, solution)):
-        strategies_vector = node.strategies_vector
-        setattr(node, 'best_strategy', strategies_vector[strategy_index])
-        setattr(node, 'sharding_spec', strategies_vector[strategy_index].get_sharding_spec_by_name(str(node)))
-        origin_node_sharding_spec_dict[node_index] = strategies_vector[strategy_index].get_sharding_spec_by_name(
-            str(node))
-
-    # apply the sharding spec of parameters
-    for node in nodes:
-        if node.op == 'call_module':
-            target_module = node.graph.owning_module.get_submodule(node.target)
-            for name, param in target_module.named_parameters():
-                target_sharding_spec = node.best_strategy.get_sharding_spec_by_name(name)
-                if target_sharding_spec.dim_partition_dict != {}:
-                    origin_sharding_spec = ShardingSpec(device_mesh, param.shape, {})
-                    setattr(param, 'sharding_spec', origin_sharding_spec)
-                    param_sharded = torch.nn.Parameter(
-                        shape_consistency_manager.apply_for_autoparallel_runtime(param.data, param.sharding_spec,
-                                                                                 target_sharding_spec).detach().clone())
-                else:
-                    param_sharded = param
-                setattr(target_module, name, param_sharded)
-                comm_actions = node.best_strategy.communication_actions
-                for operation_data, comm_action in comm_actions.items():
-                    comm_spec_to_use = comm_action.comm_spec
-                    if operation_data.type == OperationDataType.PARAM and operation_data.name == name and comm_action.comm_type == CommType.HOOK:
-
-                        def wrapper(param, comm_spec):
-
-                            def hook_fn(grad):
-                                _all_reduce(grad, comm_spec)
-
-                            param.register_hook(hook_fn)
-
-                        wrapper(param_sharded, comm_spec_to_use)
-
-            sharded_buffer_dict = {}
-            for name, buffer in target_module.named_buffers():
-                origin_sharding_spec = ShardingSpec(device_mesh, buffer.shape, {})
-                setattr(buffer, 'sharding_spec', origin_sharding_spec)
-                target_sharding_spec = node.best_strategy.get_sharding_spec_by_name(name)
-                buffer_sharded = shape_consistency_manager.apply(buffer, target_sharding_spec)
-                sharded_buffer_dict[name] = buffer_sharded
-
-            for name, buffer_sharded in sharded_buffer_dict.items():
-                setattr(target_module, name, buffer_sharded.detach().clone())
-
-    # the dict to get input sharding specs of user node
-    sharding_spec_convert_dict = {}
-    for index, node in enumerate(nodes):
-        target_sharding_specs = []
-        for user_node in node.strategies_vector.successor_nodes:
-            target_sharding_spec = user_node.best_strategy.get_sharding_spec_by_name(str(node.name))
-            target_sharding_specs.append(target_sharding_spec)
-        sharding_spec_convert_dict[index] = target_sharding_specs
-
-    # the dict to record comm actions of nodes
-    comm_actions_dict = {}
-    for index, node in enumerate(nodes):
-        comm_action_dict = {}
-        for op_data, comm_action in node.best_strategy.communication_actions.items():
-            comm_action_dict[op_data.name] = comm_action
-        comm_actions_dict[index] = comm_action_dict
-
-    # add above dicts into graph
-    for node in nodes:
-        if node.op != 'placeholder':
-            with mod_graph.inserting_before(node):
-                input_specs_node = mod_graph.create_node('placeholder', target='sharding_spec_convert_dict')
-                origin_specs_node = mod_graph.create_node('placeholder', target='origin_node_sharding_spec_dict')
-                comm_actions_dict_node = mod_graph.create_node('placeholder', target='comm_actions_dict')
-            break
-
-    return sharding_spec_convert_dict, origin_node_sharding_spec_dict, comm_actions_dict
-
-
-def shape_consistency_pass(gm: torch.fx.GraphModule):
-    mod_graph = gm.graph
-    nodes = tuple(mod_graph.nodes)
-    input_dict_node = None
-    origin_dict_node = None
-
+def _preprocess_graph(nodes: List[Node]):
+    """
+    This method is used to extract all the placeholders with sharding information,
+    and mapping the nodes into the index of the origin graph.
+    """
    # mapping the node into the origin graph index
    node_to_index_dict = {}
    index = 0
@@ -142,40 +83,110 @@ def shape_consistency_pass(gm: torch.fx.GraphModule):
            continue
        node_to_index_dict[node] = index
        index += 1
-    assert input_dict_node is not None

-    # add shape consistency apply function into graph
+    return input_dict_node, origin_dict_node, comm_actions_dict_node, node_to_index_dict
+
+
+def _shape_consistency_apply(gm: torch.fx.GraphModule):
+    """
+    This pass is used to add the shape consistency node to the origin graph.
+    """
+    mod_graph = gm.graph
+    nodes = tuple(mod_graph.nodes)
+
+    input_dict_node, origin_dict_node, _, node_to_index_dict = _preprocess_graph(nodes)
+
    for node in nodes:
        if not hasattr(node, 'best_strategy') or node.op == 'output':
            continue

-        for user_node in node.strategies_vector.successor_nodes:
-            user_node_index = user_node.strategies_vector.predecessor_nodes.index(node)
-            with mod_graph.inserting_before(user_node):
-                shape_consistency_node = mod_graph.create_node('call_function',
-                                                               runtime_apply,
-                                                               args=(node, origin_dict_node, input_dict_node,
-                                                                     node_to_index_dict[node], user_node_index))
+        for user_node_index, user_node in enumerate(node.strategies_vector.successor_nodes):
+            if isinstance(node.sharding_spec, (list, tuple)):
+                assert isinstance(
+                    node.target_sharding_specs,
+                    (list,
+                     tuple)), 'target sharding specs should be tuple or list when node.sharding_spec is tuple or list'
+                total_difference = 0
+                for sharding_spec, target_sharding_spec in zip(node.sharding_spec,
+                                                               node.target_sharding_specs[user_node_index]):
+                    total_difference += sharding_spec.sharding_sequence_difference(target_sharding_spec)
+                if total_difference == 0:
+                    continue
+                with mod_graph.inserting_before(user_node):
+                    shape_consistency_node = mod_graph.create_node('call_function',
+                                                                   runtime_apply_for_iterable_object,
+                                                                   args=(node, origin_dict_node, input_dict_node,
+                                                                         node_to_index_dict[node], user_node_index))
+
+            else:
+                assert isinstance(node.sharding_spec,
+                                  ShardingSpec), 'node.sharding_spec should be type of ShardingSpec, tuple or list.'
+                if node.sharding_spec.sharding_sequence_difference(node.target_sharding_specs[user_node_index]) == 0:
+                    continue
+                with mod_graph.inserting_before(user_node):
+                    shape_consistency_node = mod_graph.create_node('call_function',
+                                                                   runtime_apply,
+                                                                   args=(node, origin_dict_node, input_dict_node,
+                                                                         node_to_index_dict[node], user_node_index))

-            origin_index_args = user_node.args.index(node)
            new_args = list(user_node.args)
-            new_args[origin_index_args] = shape_consistency_node
-            user_node.args = new_args
+            new_kwargs = dict(user_node.kwargs)
+            # the origin node may be a positional argument or key word argument of user node
+            if node in new_args:
+                # substitute the origin node with shape_consistency_node
+                origin_index_args = new_args.index(node)
+                new_args[origin_index_args] = shape_consistency_node
+                user_node.args = tuple(new_args)
+            elif str(node) in new_kwargs:
+                # substitute the origin node with shape_consistency_node
+                new_kwargs[str(node)] = shape_consistency_node
+                user_node.kwargs = new_kwargs
+
+    return gm
+
+
+def _comm_spec_apply(gm: torch.fx.GraphModule):
+    """
+    This pass is used to add the comm spec apply node to the origin graph.
+    """
+    mod_graph = gm.graph
+    nodes = tuple(mod_graph.nodes)
+
+    _, _, comm_actions_dict_node, node_to_index_dict = _preprocess_graph(nodes)
+
+    for node in nodes:
+        if not hasattr(node, 'best_strategy') or node.op == 'output':
+            continue

        comm_actions = node.best_strategy.communication_actions
        for op_data, comm_action in comm_actions.items():
-            comm_object = node.args[comm_action.arg_index]
-            if op_data.type == OperationDataType.PARAM:
+
+            if comm_action.comm_type == CommType.HOOK:
                continue
            if comm_action.comm_type == CommType.BEFORE:
+                if op_data.type == OperationDataType.OUTPUT:
+                    comm_object = node
+                elif comm_action.key_for_kwarg is not None:
+                    comm_object = node.kwargs[comm_action.key_for_kwarg]
+                else:
+                    comm_object = node.args[comm_action.arg_index]
                with mod_graph.inserting_before(node):
                    comm_spec_apply_node = mod_graph.create_node('call_function',
                                                                 runtime_comm_spec_apply,
                                                                 args=(comm_object, comm_actions_dict_node,
                                                                       node_to_index_dict[node], op_data.name))
-                new_args = list(node.args)
-                new_args[comm_action.arg_index] = comm_spec_apply_node
-                node.args = new_args
+                # the origin node may be a positional argument or key word argument of user node
+                if comm_action.key_for_kwarg is not None:
+                    # substitute the origin node with comm_spec_apply_node
+                    new_kwargs = dict(node.kwargs)
+                    new_kwargs[comm_action.key_for_kwarg] = comm_spec_apply_node
+                    node.kwargs = new_kwargs
+                else:
+                    # substitute the origin node with comm_spec_apply_node
+                    new_args = list(node.args)
+                    new_args[comm_action.arg_index] = comm_spec_apply_node
+                    node.args = tuple(new_args)
+
            elif comm_action.comm_type == CommType.AFTER:
                with mod_graph.inserting_after(node):
                    comm_spec_apply_node = mod_graph.create_node('call_function',
@@ -187,7 +198,24 @@ def shape_consistency_pass(gm: torch.fx.GraphModule):
                    if user == comm_spec_apply_node:
                        continue
                    new_args = list(user.args)
-                    new_args[new_args.index(node)] = comm_spec_apply_node
-                    user.args = tuple(new_args)
-            # TODO: consider other OperationDataType, such as OperationDataType.OUTPUT
+                    new_kwargs = dict(user.kwargs)
+                    # the origin node may be a positional argument or key word argument of user node
+                    if node in new_args:
+                        # substitute the origin node with comm_spec_apply_node
+                        new_args[new_args.index(node)] = comm_spec_apply_node
+                        user.args = tuple(new_args)
+                    elif str(node) in new_kwargs:
+                        # substitute the origin node with comm_spec_apply_node
+                        new_kwargs[str(node)] = comm_spec_apply_node
+                        user.kwargs = new_kwargs
+    return gm
+
+
+def runtime_apply_pass(gm: torch.fx.GraphModule):
+    """
+    The method manages all the passes acting on the distributed training runtime.
+    """
+    gm = _shape_consistency_apply(gm)
+    gm = _comm_spec_apply(gm)
+
    return gm
--- a/colossalai/auto_parallel/passes/runtime_preparation_pass.py
+++ b/colossalai/auto_parallel/passes/runtime_preparation_pass.py
--- a/colossalai/auto_parallel/tensor_shard/constants.py
+++ b/colossalai/auto_parallel/tensor_shard/constants.py
-import torch
 import operator

+import torch
+
 __all__ = [
    'ELEMENTWISE_MODULE_OP', 'ELEMENTWISE_FUNC_OP', 'RESHAPE_FUNC_OP', 'CONV_MODULE_OP', 'CONV_FUNC_OP',
    'LINEAR_MODULE_OP', 'LINEAR_FUNC_OP', 'BATCHNORM_MODULE_OP', 'POOL_MODULE_OP', 'NON_PARAM_FUNC_OP', 'BCAST_FUNC_OP',
@@ -25,7 +26,14 @@ ELEMENTWISE_METHOD_OP = [
    # TODO: contiguous maybe need some extra processes.
    torch.Tensor.contiguous
 ]
-RESHAPE_FUNC_OP = [torch.flatten, torch.reshape]
+RESHAPE_FUNC_OP = [
+    torch.flatten,
+    torch.reshape,
+    torch.transpose,
+    torch.split,
+    torch.permute,
+    operator.getitem,
+]
 RESHAPE_METHOD_OP = [
    torch.Tensor.view,
    torch.Tensor.unsqueeze,
@@ -35,7 +43,7 @@ RESHAPE_METHOD_OP = [
 ]
 BCAST_FUNC_OP = [
    torch.add, torch.sub, torch.mul, torch.div, torch.floor_divide, torch.true_divide, operator.add, operator.sub,
-    operator.mul, operator.floordiv, operator.truediv, torch.matmul, torch.where, operator.pow, torch.pow, torch.tanh
+    operator.mul, operator.floordiv, operator.truediv, torch.matmul, operator.pow, torch.pow
 ]
 CONV_MODULE_OP = [
    torch.nn.Conv1d, torch.nn.Conv2d, torch.nn.Conv3d, torch.nn.ConvTranspose1d, torch.nn.ConvTranspose2d,

--- a/colossalai/auto_parallel/tensor_shard/deprecated/__init__.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/__init__.py
+from .cost_graph import CostGraph
+from .graph_analysis import GraphAnalyser
 from .options import SolverOptions
-from .strategies_constructor import StrategiesConstructor
 from .sharding_strategy import ShardingStrategy, StrategiesVector
-from .cost_graph import CostGraph
 from .solver import Solver
-from .graph_analysis import GraphAnalyser
\ No newline at end of file
+from .strategies_constructor import StrategiesConstructor
--- a/colossalai/auto_parallel/tensor_shard/deprecated/_utils.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/_utils.py
@@ -5,10 +5,11 @@ from functools import reduce
 from typing import Dict, List, Optional, Union

 import torch
+from torch.fx.node import Node
+
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.tensor.shape_consistency import ShapeConsistencyManager
 from colossalai.tensor.sharding_spec import ShardingSpec
-from torch.fx.node import Node

 from .constants import INFINITY_COST

@@ -17,7 +18,7 @@ def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: Devic
                           dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec:
    """
    Generate the sharding spec of the tensor based on the given dim_partition_dict.
-    
+

    Args:
        input_ (Union[Node, torch.Tensor]): the input can be a Node object or a PyTorch tensor. If a node is used, it will look for its meta data associated with this node.
@@ -58,7 +59,7 @@ def generate_resharding_costs(nodes: List[Node],
        nodes (List[Node]): a list of nodes
        sharding_spec_for_input(ShardingSpec): a list of ShardingSpec for the nodes.
        count_backward (Optional[bool]): whether to include the cost of resharding in the backward pass, default is True. False can be used for inference.
-        dtype (Optional[torch.dtype]): the data type for cost calculation, default is None. 
+        dtype (Optional[torch.dtype]): the data type for cost calculation, default is None.
    '''
    # The resharding_cost of weight is counted due to sharing weight cases.
    resharding_costs = {}

--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/conv_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/conv_handler.py
@@ -3,9 +3,9 @@ import warnings
 from functools import reduce

 import torch
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import \
-    ignore_sharding_exception
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector)
+
+from colossalai.auto_parallel.tensor_shard.deprecated._utils import ignore_sharding_exception
+from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector

 from .operator_handler import OperatorHandler

@@ -71,19 +71,19 @@ class ConvHandler(OperatorHandler):
        Argument:
            sharding_size_forward(int): The forward activation will be divided
                into sharding_size_forward number partitions.
-            sharding_size_backward_activation(int): The backward activation will 
+            sharding_size_backward_activation(int): The backward activation will
                be divided into sharding_size_backward_activation number partitions.
            sharding_size_weight(int): The backward weight will be divided
                into sharding_size_weight number partitions.

        Return:
-            memory_cost(Tuple[float]): Memory cost per device with this 
+            memory_cost(Tuple[float]): Memory cost per device with this
                specific strategy, the first element of this tuple is forward
                memory cost, and the second element of this tuple is backward
                memory cost.
-            memory_cost_forward(float): Memory cost of forward activation per 
+            memory_cost_forward(float): Memory cost of forward activation per
                device with this specific strategy.
-            memory_cost_backward_activation(float): Memory cost of backward activation 
+            memory_cost_backward_activation(float): Memory cost of backward activation
                per device with this specific strategy.
        '''
        # compute the memory cost of this strategy
@@ -541,14 +541,14 @@ class ConvHandler(OperatorHandler):
            # strategies_for_input = [[R, R, R, R], [R, S0, R, R], [R, S1, R, R], [S0, R, R, R], [S0, S1, R, R], [S1, R, R, R], [S1, S0, R, R]]
            strategies_vector_for_input = StrategiesVector(node=nodes[0], in_nodes=[nodes[1], 2], strategies=strategies_for_input)
            setattr(nodes[1], 'strategies_vector', strategies_vector_for_input)
-            
+
            strategies_vector = StrategiesVector(node=nodes[2], in_nodes=[nodes[1], ])
            conv_handler = ConvHandler(input_node=nodes[1], input_index=0, weight=dict(gm.named_modules())[nodes[2].name].weight, output_node=nodes[2],
                                    device_mesh=device_mesh, strategies_vector=strategies_vector, shape_consistency_manager=shape_consistency_manager)
            conv_handler.register_strategy_into_strategies_vector()
            for strategy in conv_handler.strategies_vector:
                print(f'{strategy.name}: compute_cost is {strategy.compute_cost}, communication_cost is {strategy.communication_cost}, memory_cost is {strategy.memory_cost}, resharding_costs is {strategy.resharding_costs}')
-        
+
        Output:
            S0S1 = S0R x RS1: compute_cost is 8856576, communication_cost is 0, memory_cost is 492032.0, resharding_costs is {mul: [0, 32769.001, 131074.2, 0, 32769.1, 131074.2, 98307.201]}
            S1S0 = S1R x RS0: compute_cost is 8856576, communication_cost is 0, memory_cost is 492032.0, resharding_costs is {mul: [0, 131074.2, 32769.001, 131074.2, 98307.201, 0, 32769.1]}

--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/dot_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/dot_handler.py
@@ -6,9 +6,9 @@ from typing import List
 import torch
 import torch.nn as nn
 import torch.nn.functional as F
-from colossalai.auto_parallel.tensor_shard.deprecated._utils import \
-    ignore_sharding_exception
-from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import (ShardingStrategy, StrategiesVector)
+
+from colossalai.auto_parallel.tensor_shard.deprecated._utils import ignore_sharding_exception
+from colossalai.auto_parallel.tensor_shard.deprecated.sharding_strategy import ShardingStrategy, StrategiesVector

 from ..constants import LINEAR_FUNC_OP, LINEAR_MODULE_OP
 from .operator_handler import OperatorHandler
@@ -82,13 +82,13 @@ class MatVecStrategyGenerator(StrategyGenerator):

 class MatMulStrategyGenerator(StrategyGenerator):
    """
-    MatMulStrategyGenerator is used to generate the sharding strategies when the second tensor is 
+    MatMulStrategyGenerator is used to generate the sharding strategies when the second tensor is
    a 2D tensor. This is used for nn.Linear, F.linear, torch.matmul and torch.addmm.

    A matmul can be formulated as [n, p] x [p, q] = [n, q]

    Args:
-        is_linear (bool): whether this generator is used for nn.Linear and F.linear. 
+        is_linear (bool): whether this generator is used for nn.Linear and F.linear.
            This will incur extra transformation of the dim partitioning as the weight is transposed.
    """

@@ -255,7 +255,7 @@ class BatchedMatMulStrategyGenerator(StrategyGenerator):
    """
    Generate sharding strategies for the batched matrix multiplication.

-    A batched matrix multiplication can be viewed as 
+    A batched matrix multiplication can be viewed as
    [b, i, k] x [b, k, j] -> [b, i, j]
    """

@@ -431,7 +431,7 @@ class DotHandler(OperatorHandler):
        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)

        dim_partition_dict_for_output = {0: [mesh_dim_0], 1: [mesh_dim_1]}
-        sharding_spec_for_ouput = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_input)
+        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_input)

        # generate resharding cost for this strategy
        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
@@ -451,7 +451,7 @@ class DotHandler(OperatorHandler):

        # create and register strategy
        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_ouput,
+                                               output_sharding_spec=sharding_spec_for_output,
                                               compute_cost=compute_cost,
                                               communication_cost=communication_cost,
                                               memory_cost=toatl_memory_cost,
@@ -473,7 +473,7 @@ class DotHandler(OperatorHandler):
        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)

        dim_partition_dict_for_output = {0: [mesh_dim_0]}
-        sharding_spec_for_ouput = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
+        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)

        # generate resharding cost for this strategy
        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
@@ -491,7 +491,7 @@ class DotHandler(OperatorHandler):
        communication_cost_grad_backward = self.device_mesh.all_reduce_cost(weight_memory_cost, mesh_dim_0)
        communication_cost = communication_cost_activation_forward + communication_cost_grad_backward
        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_ouput,
+                                               output_sharding_spec=sharding_spec_for_output,
                                               compute_cost=compute_cost,
                                               communication_cost=communication_cost,
                                               memory_cost=toatl_memory_cost,
@@ -510,7 +510,7 @@ class DotHandler(OperatorHandler):
        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)

        dim_partition_dict_for_output = {1: [mesh_dim_1]}
-        sharding_spec_for_ouput = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_input)
+        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_input)

        # generate resharding cost for this strategy
        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
@@ -529,7 +529,7 @@ class DotHandler(OperatorHandler):
        communication_cost = communication_cost_activation_backward + communication_cost_activation_forward

        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_ouput,
+                                               output_sharding_spec=sharding_spec_for_output,
                                               compute_cost=compute_cost,
                                               communication_cost=communication_cost,
                                               memory_cost=toatl_memory_cost,
@@ -548,7 +548,7 @@ class DotHandler(OperatorHandler):
        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)

        dim_partition_dict_for_output = {}
-        sharding_spec_for_ouput = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
+        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)

        # generate resharding cost for this strategy
        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
@@ -564,7 +564,7 @@ class DotHandler(OperatorHandler):
        # compute the communication cost of this strategy
        communication_cost = self.device_mesh.all_reduce_cost(activation_memory_cost, mesh_dim)
        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_ouput,
+                                               output_sharding_spec=sharding_spec_for_output,
                                               compute_cost=compute_cost,
                                               communication_cost=communication_cost,
                                               memory_cost=toatl_memory_cost,
@@ -583,7 +583,7 @@ class DotHandler(OperatorHandler):
        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)

        dim_partition_dict_for_output = {1: [mesh_dim]}
-        sharding_spec_for_ouput = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
+        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)

        # generate resharding cost for this strategy
        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
@@ -600,7 +600,7 @@ class DotHandler(OperatorHandler):
        communication_cost_activation_backward = self.device_mesh.all_reduce_cost(input_grad_memory_cost, mesh_dim)
        communication_cost = communication_cost_activation_backward
        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_ouput,
+                                               output_sharding_spec=sharding_spec_for_output,
                                               compute_cost=compute_cost,
                                               communication_cost=communication_cost,
                                               memory_cost=toatl_memory_cost,
@@ -619,7 +619,7 @@ class DotHandler(OperatorHandler):
        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)

        dim_partition_dict_for_output = {0: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_ouput = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
+        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)

        # generate resharding cost for this strategy
        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
@@ -636,7 +636,7 @@ class DotHandler(OperatorHandler):
        communication_cost_weight_backward = self.device_mesh.flatten_device_mesh.all_reduce_cost(weight_memory_cost, 0)
        communication_cost = communication_cost_weight_backward
        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_ouput,
+                                               output_sharding_spec=sharding_spec_for_output,
                                               compute_cost=compute_cost,
                                               communication_cost=communication_cost,
                                               memory_cost=toatl_memory_cost,
@@ -655,7 +655,7 @@ class DotHandler(OperatorHandler):
        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)

        dim_partition_dict_for_output = {}
-        sharding_spec_for_ouput = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
+        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)

        # generate resharding cost for this strategy
        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
@@ -673,7 +673,7 @@ class DotHandler(OperatorHandler):
            activation_memory_cost, 0)
        communication_cost = communication_cost_forward_activation
        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_ouput,
+                                               output_sharding_spec=sharding_spec_for_output,
                                               compute_cost=compute_cost,
                                               communication_cost=communication_cost,
                                               memory_cost=toatl_memory_cost,
@@ -692,7 +692,7 @@ class DotHandler(OperatorHandler):
        sharding_spec_for_weight = self._generate_sharding_spec(self.weight, dim_partition_dict_for_weight)

        dim_partition_dict_for_output = {1: [mesh_dim_0, mesh_dim_1]}
-        sharding_spec_for_ouput = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)
+        sharding_spec_for_output = self._generate_sharding_spec(self.output_data, dim_partition_dict_for_output)

        # generate resharding cost for this strategy
        resharding_costs = self._generate_resharding_costs([sharding_spec_for_input, sharding_spec_for_weight])
@@ -709,7 +709,7 @@ class DotHandler(OperatorHandler):
            input_grad_memory_cost, 0)
        communication_cost = communication_cost_activation_backward
        sharding_strategies = ShardingStrategy(name,
-                                               output_sharding_spec=sharding_spec_for_ouput,
+                                               output_sharding_spec=sharding_spec_for_output,
                                               compute_cost=compute_cost,
                                               communication_cost=communication_cost,
                                               memory_cost=toatl_memory_cost,

--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/layer_norm_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/layer_norm_handler.py
--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/operator_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/operator_handler.py
+from abc import ABC, abstractmethod
+from typing import Dict, List
 from webbrowser import Opera
+
 import torch
 import torch.nn as nn
-from abc import ABC, abstractmethod
 from torch.fx.node import Node
-from typing import Dict, List
+
+from colossalai.auto_parallel.tensor_shard.deprecated.constants import *
 from colossalai.device.device_mesh import DeviceMesh
 from colossalai.tensor.sharding_spec import ShardingSpec
-from .._utils import generate_resharding_costs, generate_sharding_spec
-from colossalai.auto_parallel.tensor_shard.deprecated.constants import *

+from .._utils import generate_resharding_costs, generate_sharding_spec
 from ..sharding_strategy import StrategiesVector

 __all__ = ['OperatorHandler']
@@ -60,7 +62,7 @@ class OperatorHandler(ABC):
    @abstractmethod
    def register_strategy(self) -> StrategiesVector:
        """
-        Register 
+        Register
        """
        pass


--- a/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/reshape_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/op_handler/reshape_handler.py
--- a/colossalai/auto_parallel/tensor_shard/deprecated/strategies_constructor.py
+++ b/colossalai/auto_parallel/tensor_shard/deprecated/strategies_constructor.py
--- a/colossalai/auto_parallel/tensor_shard/initialize.py
+++ b/colossalai/auto_parallel/tensor_shard/initialize.py
--- a/colossalai/auto_parallel/tensor_shard/node_handler/__init__.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/__init__.py
--- a/colossalai/auto_parallel/tensor_shard/node_handler/addmm_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/addmm_handler.py
--- a/colossalai/auto_parallel/tensor_shard/node_handler/batch_norm_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/batch_norm_handler.py
--- a/colossalai/auto_parallel/tensor_shard/node_handler/binary_elementwise_handler.py
+++ b/colossalai/auto_parallel/tensor_shard/node_handler/binary_elementwise_handler.py