Commit 08f2920e authored by zhuwenwen

init colossalai, support dtk2304

parent da3f0934
import operator
import warnings
from functools import reduce
from typing import Dict, List, Optional, Union
import torch
from colossalai.device.device_mesh import DeviceMesh
from colossalai.tensor.shape_consistency import ShapeConsistencyManager
from colossalai.tensor.sharding_spec import ShardingSpec
from torch.fx.node import Node
from ..constants import INFINITY_COST
__all__ = ['generate_sharding_spec', 'generate_resharding_costs']
def generate_sharding_spec(input_: Union[Node, torch.Tensor], device_mesh: DeviceMesh,
dim_partition_dict: Dict[int, List[int]]) -> ShardingSpec:
"""
Generate the sharding spec of the tensor based on the given dim_partition_dict.
Args:
input_ (Union[Node, torch.Tensor]): the input can be a Node object or a PyTorch tensor. If a node is used, it will look for its meta data associated with this node.
device_mesh (DeviceMesh): a DeviceMesh object which contains the meta information about the cluster.
dim_partition_dict (Dict[int, List[int]]): a dictionary to specify the sharding specs, the key is the tensor dimension and the value is the mesh dimension for sharding.
"""
if isinstance(input_, Node):
assert hasattr(input_, '_meta_data'), f'The given node has no attribute _meta_data'
meta_tensor = input_._meta_data
assert meta_tensor is not None, "The given node's _meta_data attribute is None"
shape = meta_tensor.shape
elif isinstance(input_, torch.Tensor):
shape = input_.shape
else:
raise TypeError(
f'We cannot generate sharding spec for {type(input_)} type, only torch.fx.Node or torch.Tensor is expected.'
)
for dim_index, sharding_index_list in dim_partition_dict.items():
sharding_list = [device_mesh.mesh_shape[sharding_index] for sharding_index in sharding_index_list]
sharding_size = reduce(operator.mul, sharding_list, 1)
assert shape[
dim_index] % sharding_size == 0, f'we cannot shard the {dim_index} dimension of tensor into {sharding_size} partitions.'
sharding_spec = ShardingSpec(device_mesh=device_mesh, entire_shape=shape, dim_partition_dict=dim_partition_dict)
return sharding_spec
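# --- Usage sketch (illustrative, not part of the original module) ---
# A minimal example of generating a sharding spec for a plain tensor on a 2x2 device mesh.
# The DeviceMesh construction below (a physical mesh id tensor plus a mesh shape) is an
# assumption about its constructor, so the example is kept as a comment rather than executed:
#
#     physical_mesh_id = torch.arange(0, 4)
#     device_mesh = DeviceMesh(physical_mesh_id, mesh_shape=(2, 2))    # assumed signature
#     tensor = torch.empty(8, 16)
#     # shard tensor dim 0 over mesh dim 0 and tensor dim 1 over mesh dim 1
#     spec = generate_sharding_spec(tensor, device_mesh, dim_partition_dict={0: [0], 1: [1]})
#     # each device then holds a (4, 8) shard, since 8 % 2 == 0 and 16 % 2 == 0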
def generate_resharding_costs(nodes: List[Node],
sharding_specs: List[ShardingSpec],
count_backward: Optional[bool] = True,
dtype: Optional[torch.dtype] = None,
index=None):
'''
Compute the resharding costs with this specific strategy.
Args:
nodes (List[Node]): a list of nodes
sharding_specs (List[ShardingSpec]): a list of ShardingSpec objects, one for each node.
count_backward (Optional[bool]): whether to include the cost of resharding in the backward pass, default is True. False can be used for inference.
dtype (Optional[torch.dtype]): the data type for cost calculation, default is None.
index (Optional[int]): the index used to pick a ShardingSpec when a strategy outputs a list of sharding specs, default is None.
'''
# The resharding_cost of weight is counted due to sharing weight cases.
resharding_costs = {}
size_per_elem_bytes = torch.tensor([], dtype=dtype).element_size()
# shape consistency manager is a singleton class
shape_consistency_manager = ShapeConsistencyManager()
for input_node, input_spec in zip(nodes, sharding_specs):
resharding_costs[input_node] = []
for strategy in input_node.strategies_vector:
input_sharding_spec = strategy.output_sharding_spec
if not isinstance(input_sharding_spec, ShardingSpec):
assert isinstance(input_sharding_spec, list), 'only ShardingSpec or List[ShardingSpec] is expected.'
input_sharding_spec = input_sharding_spec[index]
assert isinstance(input_sharding_spec, ShardingSpec), f'The input node should NOT be a tuple of tensor.'
try:
# compute the resharding cost
_, _, total_resharding_cost = shape_consistency_manager.shape_consistency(
input_sharding_spec, input_spec)
# we need to multiply by the element size of the dtype to get the correct communication cost
resharding_cost = total_resharding_cost["total"] * size_per_elem_bytes
except AssertionError as e:
warnings.warn(f'{e}')
resharding_cost = INFINITY_COST
resharding_costs[input_node].append(resharding_cost)
return resharding_costs
import functools
from typing import Any, Callable, Dict, List, Tuple, Type, Union
import torch
from colossalai.logging import get_dist_logger
from colossalai.tensor.sharding_spec import ShardingSpec, ShardingSpecException
__all__ = ['ignore_sharding_exception', 'pytree_map']
def ignore_sharding_exception(func):
"""
A function wrapper to handle the ShardingSpecException in the function.
If ShardingSpecException occurs, this function will return None.
Usage:
# mute the assertion error in the function
@ignore_sharding_exception
def do_something():
...
"""
@functools.wraps(func)
def wrapper(*args, **kwargs):
try:
logger = get_dist_logger()
rst = func(*args, **kwargs)
return rst
except ShardingSpecException as e:
logger.debug(e)
return None
return wrapper
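# --- Usage sketch (illustrative, not part of the original module) ---
# A decorated helper that raises ShardingSpecException simply returns None instead of
# propagating the error; the exception message is only logged at debug level.
# The helper name below is hypothetical.
@ignore_sharding_exception
def _raise_invalid_sharding_demo():
    raise ShardingSpecException('illegal sharding spec for this strategy')


assert _raise_invalid_sharding_demo() is None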
def check_sharding_spec_validity(sharding_spec: ShardingSpec, tensor: torch.Tensor):
"""
This function checks whether the ShardingSpec is valid for the physical tensor.
This check includes 3 items:
1. the sharding spec covers all dimensions of the physical tensor
2. the size of each sharded dimension is divisible by the number of devices it is sharded over.
3. the sharding spec's entire shape must match the tensor shape
"""
# make sure all dims are covered in sharding spec
sharding_len = len(sharding_spec.sharding_sequence)
tensor_num_dim = tensor.dim()
num_devices_in_col = sharding_spec.device_mesh.mesh_shape[0]
num_devices_in_row = sharding_spec.device_mesh.mesh_shape[1]
assert sharding_len == tensor_num_dim, \
f'The ShardingSpec ({sharding_spec.sharding_sequence}) is created for {sharding_len}-dimension tensor, but the given tensor is {tensor_num_dim}-dimension ({tensor.shape}).'
# make sure the sharding is valid for each dim
for i in range(tensor_num_dim):
dim_size = tensor.shape[i]
dim_spec = sharding_spec.sharding_sequence[i]
if str(dim_spec).startswith('S'):
devices_str = str(dim_spec).lstrip('S')
num_devices = 1
if '0' in devices_str:
num_devices *= num_devices_in_col
if '1' in devices_str:
num_devices *= num_devices_in_row
assert dim_size >= num_devices and dim_size % num_devices == 0, \
f'The dimension at index {i} has value {dim_size}, but it is sharded over {num_devices} devices.'
# make sure the entire shape matches the physical tensor shape
assert sharding_spec.entire_shape == tensor.shape, \
f'The entire_shape of the sharding spec {sharding_spec.entire_shape} does not match the tensor shape {tensor.shape}'
def pytree_map(obj: Any, fn: Callable, process_types: Union[Type, Tuple[Type]] = (), map_all: bool = False) -> Any:
"""process object recursively, like pytree
Args:
obj (:class:`Any`): object to process
fn (:class:`Callable`): a function to process subobject in obj
process_types (:class: `type | tuple[type]`): types to determine the type to process
map_all (:class: `bool`): if map_all is True, then any type of element will use fn
Returns:
:class:`Any`: returns have the same structure of `obj` and type in process_types after map of `fn`
"""
if isinstance(obj, dict):
return {k: pytree_map(obj[k], fn, process_types, map_all) for k in obj}
elif isinstance(obj, tuple):
return tuple(pytree_map(o, fn, process_types, map_all) for o in obj)
elif isinstance(obj, list):
return list(pytree_map(o, fn, process_types, map_all) for o in obj)
elif isinstance(obj, process_types):
return fn(obj)
else:
return fn(obj) if map_all else obj
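# --- Usage sketch (illustrative, not part of the original module) ---
# Double every int nested inside dicts, lists and tuples; the string leaf is left
# untouched because map_all defaults to False.
_pytree_demo = pytree_map({'a': [1, 2], 'b': (3, 'keep-me')}, fn=lambda x: x * 2, process_types=int)
assert _pytree_demo == {'a': [2, 4], 'b': (6, 'keep-me')}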
from enum import Enum
from typing import Dict, List, Tuple
import torch
class PreviousStatus(Enum):
"""
This class shows the status of the previous comparison.
"""
RESET = 0
# ORIGIN means the dimension size of the original tensor was larger in the previous comparison.
ORIGIN = 1
# TGT means the dimension size of the target tensor was larger in the previous comparison.
TGT = 2
def detect_reshape_mapping(origin_shape: torch.Size, tgt_shape: torch.Size) -> Dict[Tuple[int], Tuple[int]]:
"""
This method is used to detect the reshape mapping between original tensor and target tensor.
Returns:
reshape_mapping_dict: The dictionary that maps each tuple of origin dims (keys) to the related
target dims (values) during the reshape operation.
Examples:
import torch
origin_shape = torch.Size([4, 4, 4])
tgt_shape = torch.Size([2, 8, 2, 2])
reshape_mapping_dict = detect_reshape_mapping(origin_shape, tgt_shape)
print(reshape_mapping_dict)
Output:
{(2,): (3, 2), (1, 0): (1,), (0,): (0, 1)}
"""
# reverse the shape object
origin_shape = list(origin_shape)
tgt_shape = list(tgt_shape)
origin_shape.reverse()
tgt_shape.reverse()
# initialize arguments
reshape_mapping_dict = {}
origin_len = len(origin_shape)
tgt_len = len(tgt_shape)
origin_index = 0
tgt_index = 0
original_dimension_size = origin_shape[origin_index]
tgt_dimension_size = tgt_shape[tgt_index]
tgt_dims = [tgt_len - tgt_index - 1]
origin_dims = [origin_len - origin_index - 1]
previous_label = PreviousStatus.RESET
while origin_index != len(origin_shape) or tgt_index != len(tgt_shape):
if original_dimension_size == tgt_dimension_size:
reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)
# if the origin_dims has no element, it means the original tensor has been fully matched.
# Therefore, we do not have to increase the origin_index for that case.
if len(origin_dims) > 0:
origin_index += 1
# if the tgt_dims has no element, it means the target tensor has been fully matched.
# Therefore, we do not have to increase the tgt_index for that case.
if len(tgt_dims) > 0:
tgt_index += 1
# the last iteration of the loop always ends in this equal-size branch,
# so we need to manually skip the preparation for the next step
# once both indices have reached the end.
if origin_index == len(origin_shape) and tgt_index == len(tgt_shape):
continue
# If origin_index equals origin_len, we just need to set the original_dimension_size
# to 1 to match the remaining '1's in the target tensor shape.
if origin_index == len(origin_shape):
original_dimension_size = 1
origin_dims = []
else:
original_dimension_size = origin_shape[origin_index]
origin_dims = [origin_len - origin_index - 1]
# If tgt_index equals tgt_len, we just need to set the tgt_dimension_size
# to 1 to match the remaining '1's in the original tensor shape.
if tgt_index == len(tgt_shape):
tgt_dimension_size = 1
tgt_dims = []
else:
tgt_dimension_size = tgt_shape[tgt_index]
tgt_dims = [tgt_len - tgt_index - 1]
previous_label = PreviousStatus.RESET
elif original_dimension_size > tgt_dimension_size:
tgt_index += 1
if previous_label == PreviousStatus.TGT:
# if the target dimension size was larger in the previous comparison, it means
# the origin dimension size has now accumulated past the target dimension size, so
# we need to offload the origin dims and tgt dims into the reshape_mapping_dict.
reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)
original_dimension_size = original_dimension_size // tgt_dimension_size
origin_dims = [origin_len - origin_index - 1]
tgt_dimension_size = tgt_shape[tgt_index]
tgt_dims = [tgt_len - tgt_index - 1, tgt_len - tgt_index]
# reset the previous_label after offloading the origin dims and tgt dims
previous_label = PreviousStatus.RESET
else:
# accumulate the tgt_dimension_size until it is larger than the original_dimension_size
tgt_dimension_size *= tgt_shape[tgt_index]
tgt_dims.append(tgt_len - tgt_index - 1)
previous_label = PreviousStatus.ORIGIN
else:
origin_index += 1
if previous_label == PreviousStatus.ORIGIN:
# if the origin dimension size was larger in the previous comparison, it means
# the target dimension size has now accumulated past the origin dimension size, so
# we need to offload the origin dims and tgt dims into the reshape_mapping_dict.
reshape_mapping_dict[tuple(origin_dims)] = tuple(tgt_dims)
tgt_dimension_size = tgt_dimension_size // original_dimension_size
tgt_dims = [tgt_len - tgt_index - 1]
original_dimension_size = origin_shape[origin_index]
origin_dims = [origin_len - origin_index - 1, origin_len - origin_index]
# reset the previous_label after offloading the origin dims and tgt dims
previous_label = PreviousStatus.RESET
else:
# accumulate the original_dimension_size until it is larger than the tgt_dimension_size
original_dimension_size *= origin_shape[origin_index]
origin_dims.append(origin_len - origin_index - 1)
previous_label = PreviousStatus.TGT
return reshape_mapping_dict
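# --- Usage sketch (illustrative, not part of the original module) ---
# Reproduce the docstring example above: reshape a (4, 4, 4) tensor into (2, 8, 2, 2).
# The dims are discovered from the reversed traversal, which explains the tuple ordering.
_reshape_demo = detect_reshape_mapping(torch.Size([4, 4, 4]), torch.Size([2, 8, 2, 2]))
assert _reshape_demo == {(2,): (3, 2), (1, 0): (1,), (0,): (0, 1)}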
def check_keep_sharding_status(input_dim_partition_dict: Dict[int, List[int]],
reshape_mapping_dict: Dict[Tuple[int], Tuple[int]]) -> bool:
"""
This method is used to check whether the reshape operation can be implemented without converting
the input to a fully replicated status.
Rule:
For a sharded dimension of the input tensor, if it is not the minimum element of its input tuple,
the function will return False.
To illustrate this rule, there are two cases to analyse:
1. no sharded dims in the input tuple: we could do the reshape operation safely, just as for a normal
operation without a distributed tensor.
2. sharded dims in the input tuple: the sharded dim must be the minimum element; then during the shape
consistency process, torch.cat will be applied on the sharded dim, and everything after the sharded
dim gets recovered.
Examples:
# the second dimension of the input has been sharded.
input_dim_partition_dict = {1: [1]}
origin_shape = torch.Size([8, 4, 2])
tgt_shape = torch.Size([2, 4, 8])
reshape_mapping_dict = detect_reshape_mapping(origin_shape, tgt_shape)
# {(2, 1): (2,), (0,): (1, 0)}
# the sharded dim of input is 1, which is the minimum element of the tuple (2, 1),
# so we do not have to convert the input to fully replicated status.
print(check_keep_sharding_status(input_dim_partition_dict, reshape_mapping_dict))
Output:
True
"""
sharded_dims = list(input_dim_partition_dict.keys())
for input_dims in reshape_mapping_dict.keys():
# if input_dims has no element, we could just skip this iteration.
if len(input_dims) == 0:
continue
min_element = min(input_dims)
for dim in input_dims:
if dim in sharded_dims and dim != min_element:
return False
return True
def infer_output_dim_partition_dict(input_dim_partition_dict: Dict[int, List[int]],
reshape_mapping_dict: Dict[Tuple[int], Tuple[int]]) -> Dict[Tuple[int], Tuple[int]]:
"""
This method is used to infer the output dim partition dict for a reshape operation,
given the input dim partition dict and reshape mapping dict.
"""
assert check_keep_sharding_status(input_dim_partition_dict, reshape_mapping_dict), \
'we only infer the output dim partition dict for reshape operations that can keep the sharding spec.'
sharded_dims = list(input_dim_partition_dict.keys())
output_dim_partition_dict = {}
for input_dims, output_dims in reshape_mapping_dict.items():
for dim in input_dims:
if dim in sharded_dims:
output_dim_partition_dict[min(output_dims)] = input_dim_partition_dict[dim]
# we could break because input dims cannot contain two sharded dims, otherwise
# the keep sharding status check will fail.
break
return output_dim_partition_dict
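# --- Usage sketch (illustrative, not part of the original module) ---
# With the second input dimension sharded over mesh dim 1, the reshape from (8, 4, 2) to
# (2, 4, 8) keeps its sharding spec, and the sharding moves to output dim 2, the minimum
# of the matching output dim tuple.
_mapping_demo = detect_reshape_mapping(torch.Size([8, 4, 2]), torch.Size([2, 4, 8]))
assert check_keep_sharding_status({1: [1]}, _mapping_demo)
assert infer_output_dim_partition_dict({1: [1]}, _mapping_demo) == {2: [1]}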
import operator
from copy import deepcopy
from functools import reduce
from typing import Dict
import torch
from colossalai.tensor.sharding_spec import ShardingSpec
__all__ = [
'transpose_partition_dim', 'update_partition_dim', 'enumerate_all_possible_1d_sharding',
'enumerate_all_possible_2d_sharding', 'generate_sharding_size'
]
def transpose_partition_dim(sharding_spec: ShardingSpec, dim1: int, dim2: int) -> ShardingSpec:
"""
Switch the sharding mesh dimensions for two tensor dimensions. This operation is in-place.
Args:
sharding_spec (ShardingSpec): the sharding spec for which partition dims are switched
dim1 (int): the tensor dimension to switch
dim2 (int): the tensor dimension to switch
"""
assert len(sharding_spec.entire_shape) >= 2, \
'The entire_shape of the sharding spec must have at least 2 dimensions'
dim_partition_dict = sharding_spec.dim_partition_dict
# transpose the dim partition
dim1_partition = dim_partition_dict.pop(dim1, None)
dim2_partition = dim_partition_dict.pop(dim2, None)
if dim1_partition:
dim_partition_dict[dim2] = dim1_partition
if dim2_partition:
dim_partition_dict[dim1] = dim2_partition
# get the transposed shape
new_shape = list(sharding_spec.entire_shape[:])
new_shape[dim2], new_shape[dim1] = new_shape[dim1], new_shape[dim2]
new_shape = torch.Size(new_shape)
# re-init the sharding spec
sharding_spec.__init__(sharding_spec.device_mesh, new_shape, dim_partition_dict)
return sharding_spec
def update_partition_dim(sharding_spec: ShardingSpec,
dim_mapping: Dict[int, int],
physical_shape: torch.Size,
inplace: bool = False):
"""
This method is used to update the partition dim dict from the logical one to the physical one.
Args:
sharding_spec (ShardingSpec): the sharding spec for which partition dims are updated
dim_mapping (Dict[int, int]): the mapping from the logical tensor dimension to the physical tensor dimension
physical_shape (torch.Size): the physical shape for the tensor
inplace (bool): whether to modify the given sharding spec in place, default is False
"""
if inplace:
current_sharding_spec = sharding_spec
else:
current_sharding_spec = deepcopy(sharding_spec)
old_dim_partition_dict = current_sharding_spec.dim_partition_dict
new_dim_partition_dict = {}
# assign new dim
for old_dim, new_dim in dim_mapping.items():
mesh_dims = old_dim_partition_dict.pop(old_dim)
new_dim_partition_dict[new_dim] = mesh_dims
for tensor_dim, mesh_dims in old_dim_partition_dict.items():
if tensor_dim in new_dim_partition_dict:
raise KeyError(f"There are duplicated entries for the tensor sharding dimension {tensor_dim}")
else:
new_dim_partition_dict[tensor_dim] = mesh_dims
# update sharding spec
current_sharding_spec.__init__(device_mesh=sharding_spec.device_mesh,
entire_shape=physical_shape,
dim_partition_dict=new_dim_partition_dict)
return current_sharding_spec
def enumerate_all_possible_2d_sharding(mesh_dim_0, mesh_dim_1, dim_size):
dim_partition_list = []
# enumerate all the 2D sharding cases
for i in range(dim_size):
for j in range(i + 1, dim_size):
dim_partition_dict_0 = {i: [mesh_dim_0], j: [mesh_dim_1]}
dim_partition_dict_1 = {i: [mesh_dim_1], j: [mesh_dim_0]}
dim_partition_list.append(dim_partition_dict_0)
dim_partition_list.append(dim_partition_dict_1)
for i in range(dim_size):
dim_partition_dict_flatten = {i: [mesh_dim_0, mesh_dim_1]}
dim_partition_list.append(dim_partition_dict_flatten)
return dim_partition_list
def enumerate_all_possible_1d_sharding(mesh_dim_0, dim_size):
dim_partition_list = []
# enumerate all the 1D sharding cases
for i in range(dim_size):
dim_partition_dict_0 = {i: [mesh_dim_0]}
dim_partition_list.append(dim_partition_dict_0)
return dim_partition_list
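# --- Usage sketch (illustrative, not part of the original module) ---
# Enumerate sharding candidates for a 3-dimensional tensor over mesh dims 0 and 1.
# 1D sharding yields one candidate per tensor dim, while 2D sharding also covers the
# flattened case where one tensor dim uses both mesh dims.
assert enumerate_all_possible_1d_sharding(0, 3) == [{0: [0]}, {1: [0]}, {2: [0]}]
_2d_demo = enumerate_all_possible_2d_sharding(0, 1, 3)
assert {0: [0], 1: [1]} in _2d_demo and {2: [0, 1]} in _2d_demo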
def generate_sharding_size(dim_partition_dict, device_mesh):
total_sharding_size = 1
for mesh_dim_list in dim_partition_dict.values():
mesh_dim_sharding_size = [device_mesh.shape[mesh_dim] for mesh_dim in mesh_dim_list]
sharding_size = reduce(operator.mul, mesh_dim_sharding_size)
total_sharding_size *= sharding_size
return total_sharding_size
from .builder import build_from_config, build_from_registry, build_gradient_handler
__all__ = ['build_gradient_handler', 'build_from_config', 'build_from_registry']
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import inspect
from colossalai.registry import *
def build_from_config(module, config: dict):
"""Returns an object of :class:`module` constructed from `config`.
Args:
module: A python or user-defined class
config: A python dict containing information used in the construction of the return object
Returns: An ``object`` of interest
Raises:
AssertionError: Raises an AssertionError if `module` is not a class
"""
assert inspect.isclass(module), 'module must be a class'
return module(**config)
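# --- Usage sketch (illustrative, not part of the original module) ---
# Build an instance of a locally defined class from a config dict. The class and
# field names below are hypothetical.
class _DemoLayerConfig:
    def __init__(self, in_features: int, out_features: int):
        self.in_features = in_features
        self.out_features = out_features


_demo_layer = build_from_config(_DemoLayerConfig, {'in_features': 16, 'out_features': 32})
assert _demo_layer.out_features == 32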
def build_from_registry(config, registry: Registry):
r"""Returns an object constructed from `config`, the type of the object
is specified by `registry`.
Note:
the `config` is used to construct the return object of types such as `LAYERS`, `OPTIMIZERS`
and other supported types in `registry`. The `config` should contain
all required parameters of the corresponding object. The details of the supported
types in `registry` and the `mod_type` in `config` can be found in
`registry <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/registry/__init__.py>`_.
Args:
config (dict or :class:`colossalai.context.Config`): information
used in the construction of the return object.
registry (:class:`Registry`): A registry specifying the type of the return object
Returns:
A Python object specified by `registry`.
Raises:
Exception: Raises an Exception if an error occurred when building from registry.
"""
config_ = config.copy() # keep the original config untouched
assert isinstance(registry, Registry), f'Expected type Registry but got {type(registry)}'
mod_type = config_.pop('type')
assert registry.has(mod_type), f'{mod_type} is not found in registry {registry.name}'
try:
obj = registry.get_module(mod_type)(**config_)
except Exception as e:
print(f'An error occurred when building {mod_type} from registry {registry.name}', flush=True)
raise e
return obj
def build_gradient_handler(config, model, optimizer):
"""Returns a gradient handler object of :class:`BaseGradientHandler` constructed from `config`,
`model` and `optimizer`.
Args:
config (dict or :class:`colossalai.context.Config`): A python dict or
a :class:`colossalai.context.Config` object containing information
used in the construction of the ``GRADIENT_HANDLER``.
model (:class:`nn.Module`): A model containing parameters for the gradient handler
optimizer (:class:`torch.optim.Optimizer`): An optimizer object containing parameters for the gradient handler
Returns:
An object of :class:`colossalai.engine.BaseGradientHandler`
"""
config_ = config.copy()
config_['model'] = model
config_['optimizer'] = optimizer
return build_from_registry(config_, GRADIENT_HANDLER)
from .cli import cli
__all__ = ['cli']
import click
from .utils import *
from .benchmark import run_benchmark
from colossalai.context import Config
__all__ = ['benchmark']
@click.command()
@click.option("-g", "--gpus", type=int, default=None, help="Total number of devices to use.")
@click.option("-b", "--batch_size", type=int, default=8, help="Batch size of the input tensor.")
@click.option("-s", "--seq_len", type=int, default=512, help="Sequence length of the input tensor.")
@click.option("-d", "--dimension", type=int, default=1024, help="Hidden dimension of the input tensor.")
@click.option("-w", "--warmup_steps", type=int, default=10, help="The number of warmup steps.")
@click.option("-p", "--profile_steps", type=int, default=50, help="The number of profiling steps.")
@click.option("-l", "--layers", type=int, default=2)
@click.option("-m",
"--model",
type=click.Choice(['mlp'], case_sensitive=False),
default='mlp',
help="Select the model to benchmark, currently only supports MLP")
def benchmark(gpus: int, batch_size: int, seq_len: int, dimension: int, warmup_steps: int, profile_steps: int,
layers: int, model: str):
args_dict = locals()
args = Config(args_dict)
run_benchmark(args)
import colossalai
import click
import torch.multiprocessing as mp
from functools import partial
from typing import List, Dict
from colossalai.context import Config
from colossalai.context.random import reset_seeds
from colossalai.core import global_context as gpc
from colossalai.logging import disable_existing_loggers, get_dist_logger
from colossalai.utils import free_port, MultiTimer
from colossalai.cli.benchmark.utils import find_all_configs, profile_model, get_batch_data
from .models import MLP
def run_benchmark(args: Config) -> None:
"""
Run benchmarking with torch.multiprocessing.
"""
# sanity checks
if args.gpus is None:
click.echo("Error: --num_gpus is not given")
exit()
if args.gpus <= 1:
click.echo("Warning: tensor parallel will be activated with at least 2 devices.")
click.echo("=== Benchmarking Parameters ===")
for k, v in args.items():
click.echo(f'{k}: {v}')
click.echo('')
config_list = find_all_configs(args.gpus)
avail_ports = [free_port() for _ in range(len(config_list))]
run_func = partial(run_dist_profiling,
world_size=args.gpus,
port_list=avail_ports,
config_list=config_list,
hyperparams=args)
mp.spawn(run_func, nprocs=args.gpus)
def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_list: List[Dict],
hyperparams: Config) -> None:
"""
A function executed for profiling; it should be spawned by torch.multiprocessing.
Args:
rank (int): rank of the process
world_size (int): the number of processes
port_list (List[int]): a list of free ports for initializing distributed networks
config_list (List[Dict]): a list of configurations
hyperparams (Config): the hyperparameters given by the user
"""
# disable logging for clean output
disable_existing_loggers()
logger = get_dist_logger()
logger.set_level('WARNING')
for config, port in zip(config_list, port_list):
colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
timer = MultiTimer()
# 1D parallel should be skipped if in_features or out_features is not divisible by the 1D parallel size.
if config.parallel.tensor.mode == '1d' and hyperparams.dimension % config.parallel.tensor.size != 0:
click.echo(
"1D parallel will be skipped because in_features or out_features is not divisible by the 1D parallel size."
)
continue
if hyperparams.model == 'mlp':
model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers)
else:
if gpc.get_global_rank() == 0:
click.echo("Error: Invalid argument for --model")
exit()
data_func = partial(get_batch_data,
dim=hyperparams.dimension,
batch_size=hyperparams.batch_size,
seq_length=hyperparams.seq_len,
mode=config.parallel.tensor.mode)
fwd_time, bwd_time, max_allocated, max_cached = profile_model(model=model,
warmup_steps=hyperparams.warmup_steps,
profile_steps=hyperparams.profile_steps,
data_func=data_func,
timer=timer)
gpc.destroy()
reset_seeds()
if gpc.get_global_rank() == 0:
config_str = ', '.join([f'{k}: {v}' for k, v in config.parallel.tensor.items()])
click.echo(f"=== {config_str} ===")
click.echo(f"Average forward time: {fwd_time}")
click.echo(f"Average backward time: {bwd_time}")
click.echo(f"Max allocated GPU memory: {max_allocated}")
click.echo(f"Max cached GPU memory: {max_cached}\n")
import torch
import colossalai.nn as col_nn
class MLP(torch.nn.Module):
def __init__(self, dim: int, layers: int):
super().__init__()
self.layers = torch.nn.ModuleList()
for _ in range(layers):
self.layers.append(col_nn.Linear(dim, dim))
def forward(self, x):
for layer in self.layers:
x = layer(x)
return x
import math
import time
import torch
from colossalai.utils import MultiTimer
from colossalai.context import ParallelMode, Config
from typing import List, Dict, Tuple, Callable
def get_time_stamp() -> float:
"""
Return the time stamp for profiling.
Returns:
time_stamp (float): the time given by time.time()
"""
torch.cuda.synchronize()
time_stamp = time.time()
return time_stamp
def get_memory_states() -> Tuple[float]:
"""
Return the memory statistics.
Returns:
max_allocated (float): the maximum allocated CUDA memory in GB
max_cached (float): the maximum reserved (cached) CUDA memory in GB
"""
max_allocated = torch.cuda.max_memory_allocated() / (1024**3)
max_cached = torch.cuda.max_memory_reserved() / (1024**3)
torch.cuda.reset_peak_memory_stats()
torch.cuda.empty_cache()
return max_allocated, max_cached
def find_all_configs(device_cnt: int) -> List[Dict]:
"""
Find all possible configurations for tensor parallelism
Args:
device_cnt (int): the number of devices
Returns:
config_list (List[Dict]): a list of configurations
"""
def _is_square(num):
# 2D parallel should be implemented with at least 2 devices.
if num <= 1:
return False
return math.floor(math.sqrt(num))**2 == num
def _is_cube(num):
# 3D parallel should be implemented with at least 2 devices.
if num <= 1:
return False
return math.floor(num**(1. / 3.))**3 == num
config_list = []
# add non-parallel config
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode=None)))
config_list.append(config)
# add 1D config
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='1d')))
config_list.append(config)
# add 2D config only if device_cnt is a square
if _is_square(device_cnt):
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2d')))
config_list.append(config)
# check for 2.5D
# iterate over depth
for depth in range(1, device_cnt):
if device_cnt % depth == 0 and _is_square(device_cnt // depth):
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='2.5d', depth=depth)))
config_list.append(config)
# check for 3D if device_cnt is a cube
if _is_cube(device_cnt):
config = dict(parallel=dict(tensor=dict(size=device_cnt, mode='3d')))
config_list.append(config)
config_list = [Config(cfg) for cfg in config_list]
return config_list
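# --- Usage sketch (illustrative, not part of the original module) ---
# With 4 devices we expect the non-parallel, 1D, 2D and 2.5D (depth=1) tensor-parallel
# configurations, but no 3D one since 4 is a square but not a cube.
_demo_configs = find_all_configs(device_cnt=4)
assert [cfg.parallel.tensor.mode for cfg in _demo_configs] == [None, '1d', '2d', '2.5d']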
def profile_model(model: torch.nn.Module, warmup_steps: int, profile_steps: int, data_func: Callable,
timer: MultiTimer) -> Tuple[float]:
"""
Profile the forward and backward of a model
Args:
model (torch.nn.Module): a PyTorch model
warmup_steps (int): the number of steps for warmup
profile_steps (int): the number of steps for profiling
data_func (Callable): a function to generate random data
timer (colossalai.utils.MultiTimer): a timer instance for time recording
Returns:
fwd_time (float): the average time taken by the forward pass in seconds
bwd_time (float): the average time taken by the backward pass in seconds
max_allocated (float): the maximum GPU memory allocated in GB
max_cached (float): the maximum GPU memory cached in GB
"""
def _run_step(data):
timer.start('forward')
out = model(data)
timer.stop('forward', keep_in_history=True)
timer.start('backward')
out.mean().backward()
timer.stop('backward', keep_in_history=True)
data_list = [data_func() for _ in range(warmup_steps)]
for data in data_list:
_run_step(data)
timer.reset('forward')
timer.reset('backward')
for _ in range(profile_steps):
data = data_func()
_run_step(data)
max_allocated, max_cached = get_memory_states()
fwd_time = timer.get_timer('forward').get_history_mean()
bwd_time = timer.get_timer('backward').get_history_mean()
return fwd_time, bwd_time, max_allocated, max_cached
def get_batch_data(dim: int, batch_size: int, seq_length: int, mode: ParallelMode) -> torch.Tensor:
"""
Return random data of shape (batch_size, seq_length, dim) for profiling.
Args:
dim (int): hidden size
batch_size (int): the number of data samples
seq_length (int): the number of tokens
mode (ParallelMode): Colossal-AI ParallelMode enum
Returns:
data (torch.Tensor): random data
"""
if mode in ['2d', '2.5d']:
batch_size = batch_size // 2
dim = dim // 2
elif mode == '3d':
batch_size = batch_size // 4
dim = dim // 2
data = torch.rand(batch_size, seq_length, dim).cuda()
return data
import click
from .check_installation import check_installation
__all__ = ['check']
@click.command(help="Check if Colossal-AI is correct based on the given option")
@click.option('-i', '--installation', is_flag=True, help="Check if Colossal-AI is built correctly")
def check(installation):
if installation:
check_installation()
return
click.echo("No option is given")
import subprocess
import click
import torch
from torch.utils.cpp_extension import CUDA_HOME
import colossalai
def check_installation():
cuda_ext_installed = _check_cuda_extension_installed()
cuda_version, torch_version, torch_cuda_version = _check_cuda_torch()
colossalai_version, torch_version_required, cuda_version_required = _parse_colossalai_version()
cuda_compatibility = _get_compatibility_string([cuda_version, torch_cuda_version, cuda_version_required])
torch_compatibility = _get_compatibility_string([torch_version, torch_version_required])
click.echo(f'#### Installation Report ####\n')
click.echo(f"Colossal-AI version: {colossalai_verison}")
click.echo(f'----------------------------')
click.echo(f"PyTorch Version: {torch_version}")
click.echo(f"PyTorch Version required by Colossal-AI: {torch_version_required}")
click.echo(f'PyTorch version match: {torch_compatibility}')
click.echo(f'----------------------------')
click.echo(f"System CUDA Version: {cuda_version}")
click.echo(f"CUDA Version required by PyTorch: {torch_cuda_version}")
click.echo(f"CUDA Version required by Colossal-AI: {cuda_version_required}")
click.echo(f"CUDA Version Match: {cuda_compatibility}")
click.echo(f'----------------------------')
click.echo(f"CUDA Extension: {cuda_ext_installed}")
def _get_compatibility_string(versions):
# split version into [major, minor, patch]
versions = [version.split('.') for version in versions]
for version in versions:
if len(version) == 2:
# x means unknown
version.append('x')
for idx, version_values in enumerate(zip(*versions)):
equal = len(set(version_values)) == 1
if idx in [0, 1] and not equal:
# if the major/minor versions do not match
# return a cross
return 'x'
elif idx == 1:
# if the minor versions match
# return a tick
return u'\u2713'
else:
continue
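# --- Usage sketch (illustrative, not part of the original module) ---
# Matching major.minor versions give a tick, while a major or minor mismatch gives an 'x'.
assert _get_compatibility_string(['11.3', '11.3', '11.3']) == u'\u2713'
assert _get_compatibility_string(['11.3', '10.2']) == 'x'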
def _parse_colossalai_version():
colossalai_version = colossalai.__version__.split('+')[0]
torch_version_required = colossalai.__version__.split('torch')[1].split('cu')[0]
cuda_version_required = colossalai.__version__.split('cu')[1]
return colossalai_version, torch_version_required, cuda_version_required
def _check_cuda_extension_installed():
try:
import colossalai._C.fused_optim
is_cuda_extension_installed = u'\u2713'
except ImportError:
is_cuda_extension_installed = 'x'
return is_cuda_extension_installed
def _check_cuda_torch():
# get cuda version
if CUDA_HOME is None:
cuda_version = 'N/A (CUDA_HOME is not set)'
else:
raw_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True)
output = raw_output.split()
release_idx = output.index("release") + 1
release = output[release_idx].split(".")
bare_metal_major = release[0]
bare_metal_minor = release[1][0]
cuda_version = f'{bare_metal_major}.{bare_metal_minor}'
# get torch version
torch_version = torch.__version__.split('+')[0]
# get cuda version in pytorch build
torch_cuda_major = torch.version.cuda.split(".")[0]
torch_cuda_minor = torch.version.cuda.split(".")[1]
torch_cuda_version = f'{torch_cuda_major}.{torch_cuda_minor}'
return cuda_version, torch_version, torch_cuda_version
import click
from .launcher import run
from .check import check
from .benchmark import benchmark
class Arguments():
def __init__(self, arg_dict):
for k, v in arg_dict.items():
self.__dict__[k] = v
@click.group()
def cli():
pass
cli.add_command(run)
cli.add_command(check)
cli.add_command(benchmark)
if __name__ == '__main__':
cli()
import click
from .run import launch_multi_processes
from colossalai.context import Config
@click.command(help="Launch distributed training on a single node or multiple nodes",
context_settings=dict(ignore_unknown_options=True))
@click.option("-H",
"-host",
"--host",
type=str,
default=None,
help="the list of hostnames to launch in the format <host1>,<host2>")
@click.option(
"--hostfile",
type=str,
default=None,
help="Hostfile path that defines the device pool available to the job, each line in the file is a hostname")
@click.option("--include",
type=str,
default=None,
help="Specify computing devices to use during execution. String format is <host1>,<host2>,"
" only effective when used with --hostfile.")
@click.option(
"--exclude",
type=str,
default=None,
help=
"Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --includ,"
" only effective when used with --hostfile.")
@click.option("--num_nodes",
type=int,
default=-1,
help="Total number of worker nodes to use, only effective when used with --hostfile.")
@click.option("--nproc_per_node", type=int, default=None, help="Number of GPUs to use on each node.")
@click.option("--master_port",
type=int,
default=29500,
help="(optional) Port used by PyTorch distributed for communication during distributed training.")
@click.option("--master_addr",
type=str,
default="127.0.0.1",
help="(optional) IP address of node 0, will be inferred via 'hostname -I' if not specified.")
@click.option(
"--extra_launch_args",
type=str,
default=None,
help=
"Set additional torch distributed launcher arguments such as --standalone. The format is --extra_launch_args arg1=1,arg2=2. "
"This will be converted to --arg1=1 --arg2=2 during execution")
@click.option("--ssh-port", type=int, default=None, help="(optional) the port used for ssh connection")
@click.argument("user_script", type=str)
@click.argument('user_args', nargs=-1)
def run(host: str, hostfile: str, num_nodes: int, nproc_per_node: int, include: str, exclude: str, master_addr: str,
master_port: int, extra_launch_args: str, ssh_port: int, user_script: str, user_args: str) -> None:
"""
To launch multiple processes on a single node or multiple nodes via command line.
Usage::
# run with 4 GPUs on the current node using the default port 29500
colossalai run --nproc_per_node 4 train.py
# run with 2 GPUs on the current node at port 29550
colossalai run --nproc_per_node 2 --master_port 29550 train.py
# run on two nodes
colossalai run --host <host1>,<host2> --master_addr host1 --nproc_per_node 4 train.py
# run with hostfile
colossalai run --hostfile <file_path> --master_addr <host> --nproc_per_node 4 train.py
# run with hostfile with only included hosts
colossalai run --hostfile <file_path> --master_addr host1 --include host1,host2 --nproc_per_node 4 train.py
# run with hostfile excluding the hosts selected
colossalai run --hostfile <file_path> --master_addr host1 --exclude host2 --nproc_per_node 4 train.py
"""
if not user_script.endswith('.py'):
click.echo(f'Error: invalid Python file {user_script}. Did you use a wrong option? Try colossalai run --help')
exit()
args_dict = locals()
args = Config(args_dict)
args.user_args = list(args.user_args)
launch_multi_processes(args)
from typing import List
import socket
class HostInfo:
"""
A data class to store host connection-related data.
Args:
hostname (str): name or IP address of the host
port (str): the port for ssh connection
"""
def __init__(
self,
hostname: str,
port: str = None,
):
self.hostname = hostname
self.port = port
self.is_local_host = HostInfo.is_host_localhost(hostname, port)
@staticmethod
def is_host_localhost(hostname: str, port: str = None) -> bool:
"""
Check if the host refers to the local machine.
Args:
hostname (str): name or IP address of the host
port (str): the port for ssh connection
Returns:
bool: True if it is local, False otherwise
"""
if port is None:
port = 22 # no port specified, let's just use the default ssh port
hostname = socket.getfqdn(hostname)
if hostname in ("localhost", "127.0.0.1", "0.0.0.0"):
return True
localhost = socket.gethostname()
localaddrs = socket.getaddrinfo(localhost, port)
targetaddrs = socket.getaddrinfo(hostname, port)
for (family, socktype, proto, canonname, sockaddr) in localaddrs:
for (rfamily, rsocktype, rproto, rcanonname, rsockaddr) in targetaddrs:
if rsockaddr[0] == sockaddr[0]:
return True
return False
def __str__(self):
return f'hostname: {self.hostname}, port: {self.port}'
def __repr__(self):
return self.__str__()
class HostInfoList:
"""
A data class to store a list of HostInfo objects.
"""
def __init__(self):
self.hostinfo_list = []
def append(self, hostinfo: HostInfo) -> None:
"""
Add a HostInfo object to the list.
Args:
hostinfo (HostInfo): host information
"""
self.hostinfo_list.append(hostinfo)
def remove(self, hostname: str) -> None:
"""
Remove a HostInfo object from the list by its hostname.
Args:
hostname (str): the name of the host
"""
hostinfo = self.get_hostinfo(hostname)
self.hostinfo_list.remove(hostinfo)
def get_hostinfo(self, hostname: str) -> HostInfo:
"""
Return the HostInfo object which matches with the hostname.
Args:
hostname (str): the name of the host
Returns:
hostinfo (HostInfo): the HostInfo object which matches with the hostname
"""
for hostinfo in self.hostinfo_list:
if hostinfo.hostname == hostname:
return hostinfo
raise Exception(f"Hostname {hostname} is not found")
def has(self, hostname: str) -> bool:
"""
Check if the hostname has been added.
Args:
hostname (str): the name of the host
Returns:
bool: True if added, False otherwise
"""
for hostinfo in self.hostinfo_list:
if hostinfo.hostname == hostname:
return True
return False
def __iter__(self):
return iter(self.hostinfo_list)
def __len__(self):
return len(self.hostinfo_list)
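# --- Usage sketch (illustrative, not part of the original module) ---
# 'localhost' is recognised as the local machine without an ssh lookup on most systems,
# so a single-entry pool can be built offline.
_demo_pool = HostInfoList()
_demo_pool.append(HostInfo(hostname='localhost'))
assert _demo_pool.has('localhost') and _demo_pool.get_hostinfo('localhost').is_local_host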
import fabric
from .hostinfo import HostInfo, HostInfoList
from multiprocessing import Pipe, Process
from multiprocessing import connection as mp_connection
import click
def run_on_host(hostinfo: HostInfo, workdir: str, recv_conn: mp_connection.Connection,
send_conn: mp_connection.Connection, env: dict) -> None:
"""
Use fabric connection to execute command on local or remote hosts.
Args:
hostinfo (HostInfo): host information
workdir (str): the directory to execute the command
recv_conn (multiprocessing.connection.Connection): receive messages from the master sender
send_conn (multiprocessing.connection.Connection): send messages to the master receiver
env (dict): a dictionary for environment variables
"""
fab_conn = fabric.Connection(hostinfo.hostname, port=hostinfo.port)
finish = False
env_msg = ' '.join([f'{k}=\"{v}\"' for k, v in env.items()])
# keep listening until exit
while not finish:
# receive cmd
cmds = recv_conn.recv()
if cmds == 'exit':
# exit from the loop
finish = True
break
else:
# execute the commands
try:
# cd to execute directory
with fab_conn.cd(workdir):
# propagate the runtime environment
with fab_conn.prefix(f"export {env_msg}"):
if hostinfo.is_local_host:
# execute on the local machine
fab_conn.local(cmds, hide=False)
else:
# execute on the remote machine
fab_conn.run(cmds, hide=False)
send_conn.send('success')
except Exception:
click.echo(f"Error: failed to run {cmds} on {hostinfo.hostname}")
send_conn.send('failure')
# shutdown
send_conn.send("finish")
fab_conn.close()
class MultiNodeRunner:
"""
A runner to execute commands on an array of machines. This runner
is inspired by Nezha (https://github.com/zhuzilin/NeZha).
"""
def __init__(self):
self.processes = {}
self.master_send_conns = {}
self.master_recv_conns = {}
def connect(self, host_info_list: HostInfoList, workdir: str, env: dict) -> None:
"""
Establish connections to a list of hosts
Args:
host_info_list (HostInfoList): a list of HostInfo objects
workdir (str): the directory where command is executed
env (dict): environment variables to propagate to hosts
"""
for hostinfo in host_info_list:
master_send_conn, worker_recv_conn = Pipe()
master_recv_conn, worker_send_conn = Pipe()
p = Process(target=run_on_host, args=(hostinfo, workdir, worker_recv_conn, worker_send_conn, env))
p.start()
self.processes[hostinfo.hostname] = p
self.master_recv_conns[hostinfo.hostname] = master_recv_conn
self.master_send_conns[hostinfo.hostname] = master_send_conn
def send(self, hostinfo: HostInfo, cmd: str) -> None:
"""
Send a command to a local/remote host.
Args:
hostinfo (HostInfo): host information
cmd (str): the command to execute
"""
assert hostinfo.hostname in self.master_send_conns, \
f'{hostinfo} is not found in the current connections'
conn = self.master_send_conns[hostinfo.hostname]
conn.send(cmd)
def stop_all(self) -> None:
"""
Stop connections to all hosts.
"""
for hostname, conn in self.master_send_conns.items():
conn.send('exit')
def recv_from_all(self) -> dict:
"""
Receive messages from all hosts
Returns:
msg_from_node (dict): a dictionary that contains the message from each node
"""
msg_from_node = dict()
for hostname, conn in self.master_recv_conns.items():
msg_from_node[hostname] = conn.recv()
return msg_from_node
import click
import sys
import os
import torch
from colossalai.context import Config
from .multinode_runner import MultiNodeRunner
from .hostinfo import HostInfo, HostInfoList
from typing import List
from packaging import version
# Constants that define our syntax
NODE_SEP = ','
def fetch_hostfile(hostfile_path: str, ssh_port: int) -> HostInfoList:
"""
Parse the hostfile to obtain a list of hosts.
A hostfile should look like:
worker-0
worker-1
worker-2
...
Args:
hostfile_path (str): the path to the hostfile
ssh_port (int): the port to connect to the host
"""
if not os.path.isfile(hostfile_path):
click.echo(f"Error: Unable to find the hostfile, no such file: {hostfile_path}")
exit()
with open(hostfile_path, 'r') as fd:
device_pool = HostInfoList()
for line in fd.readlines():
line = line.strip()
if line == '':
# skip empty lines
continue
# build the HostInfo object
hostname = line.strip()
hostinfo = HostInfo(hostname=hostname, port=ssh_port)
if device_pool.has(hostname):
click.echo(f"Error: found duplicate host {hostname} in the hostfile")
exit()
device_pool.append(hostinfo)
return device_pool
def parse_device_filter(device_pool: HostInfoList, include_str=None, exclude_str=None) -> HostInfoList:
'''Parse an inclusion or exclusion string and filter the device pool accordingly.
Examples:
include_str="worker-0,worker-1" will execute jobs only on worker-0 and worker-1.
exclude_str="worker-1" will use all available devices except worker-1.
Args:
device_pool (HostInfoList): a list of HostInfo objects
include_str (str): --include option passed by user, default None
exclude_str (str): --exclude option passed by user, default None
Returns:
filtered_hosts (HostInfoList): filtered hosts after inclusion/exclusion
'''
# Ensure include/exclude are mutually exclusive
if include_str and exclude_str:
click.echo("--include and --exclude are mutually exclusive, only one can be used")
exit()
# no-op
if include_str is None and exclude_str is None:
return device_pool
# Either build from scratch or remove items
if include_str:
parse_str = include_str
filtered_hosts = HostInfoList()
elif exclude_str:
parse_str = exclude_str
filtered_hosts = device_pool
# foreach node in the list
for node_config in parse_str.split(NODE_SEP):
hostname = node_config
# sanity check hostname
if not device_pool.has(hostname):
click.echo(f"Error: Hostname '{hostname}' not found in hostfile")
exit()
hostinfo = device_pool.get_hostinfo(hostname)
if include_str:
filtered_hosts.append(hostinfo)
elif exclude_str:
filtered_hosts.remove(hostname)
return filtered_hosts
def get_launch_command(
master_addr: str,
master_port: int,
nproc_per_node: int,
user_script: str,
user_args: List[str],
node_rank: int,
num_nodes: int,
extra_launch_args: str = None,
) -> str:
"""
Generate a command for distributed training.
Args:
master_addr (str): the host of the master node
master_port (int): the port of the master node
nproc_per_node (int): the number of processes to launch on each node
user_script (str): the user Python file
user_args (List[str]): the arguments for the user script
node_rank (int): the unique ID for the node
num_nodes (int): the number of nodes to execute jobs
extra_launch_args (str): extra arguments for the torch distributed launcher, default is None
Returns:
cmd (str): the command to start distributed training
"""
def _arg_dict_to_list(arg_dict):
ret = []
for k, v in arg_dict.items():
if v:
ret.append(f'--{k}={v}')
else:
ret.append(f'--{k}')
return ret
if extra_launch_args:
extra_launch_args_dict = dict()
for arg in extra_launch_args.split(','):
if '=' in arg:
k, v = arg.split('=')
extra_launch_args_dict[k] = v
else:
extra_launch_args_dict[arg] = None
extra_launch_args = extra_launch_args_dict
else:
extra_launch_args = dict()
torch_version = version.parse(torch.__version__)
assert torch_version.major == 1
if torch_version.minor < 9:
cmd = [
sys.executable, "-m", "torch.distributed.launch", f"--nproc_per_node={nproc_per_node}",
f"--master_addr={master_addr}", f"--master_port={master_port}", f"--nnodes={num_nodes}",
f"--node_rank={node_rank}"
]
else:
# extra launch args for torch distributed launcher with torch >= 1.9
default_torchrun_rdzv_args = dict(rdzv_backend="c10d",
rdzv_endpoint=f"{master_addr}:{master_port}",
rdzv_id="colossalai-default-job")
# update rdzv arguments
for key in default_torchrun_rdzv_args.keys():
if key in extra_launch_args:
value = extra_launch_args.pop(key)
default_torchrun_rdzv_args[key] = value
if torch_version.minor < 10:
cmd = [
sys.executable, "-m", "torch.distributed.run", f"--nproc_per_node={nproc_per_node}",
f"--nnodes={num_nodes}", f"--node_rank={node_rank}"
]
else:
cmd = [
"torchrun", f"--nproc_per_node={nproc_per_node}", f"--nnodes={num_nodes}", f"--node_rank={node_rank}"
]
cmd += _arg_dict_to_list(default_torchrun_rdzv_args)
cmd += _arg_dict_to_list(extra_launch_args) + [user_script] + user_args
cmd = ' '.join(cmd)
return cmd
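# --- Usage sketch (illustrative, not part of the original module) ---
# On a torch 1.10+ install (the function asserts a torch 1.x major version), a single-node
# launch command is rendered with torchrun and the default c10d rendezvous arguments.
# The script name below is hypothetical.
_demo_cmd = get_launch_command(master_addr='127.0.0.1',
                               master_port=29500,
                               nproc_per_node=4,
                               user_script='train.py',
                               user_args=[],
                               node_rank=0,
                               num_nodes=1)
# e.g. "torchrun --nproc_per_node=4 --nnodes=1 --node_rank=0 --rdzv_backend=c10d
#       --rdzv_endpoint=127.0.0.1:29500 --rdzv_id=colossalai-default-job train.py"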
def launch_multi_processes(args: Config) -> None:
"""
Launch multiple processes on a single node or multiple nodes.
The overall logic can be summarized as the pseudo code below:
if hostfile given:
hostinfo = parse_hostfile(hostfile)
hostinfo = include_or_exclude_hosts(hostinfo)
launch_on_multi_nodes(hostinfo)
elif hosts given:
hostinfo = parse_hosts(hosts)
launch_on_multi_nodes(hostinfo)
else:
launch_on_current_node()
Args:
args (Config): the arguments taken from command line
"""
assert isinstance(args, Config)
if args.nproc_per_node is None:
click.echo("--nproc_per_node did not receive any value")
exit()
# cannot accept hosts and hostfile at the same time
if args.host and args.hostfile:
click.echo("Error: hostfile and hosts are mutually exclusive, only one is required")
# check if hostfile is given
if args.hostfile:
device_pool = fetch_hostfile(args.hostfile, ssh_port=args.ssh_port)
active_device_pool = parse_device_filter(device_pool, args.include, args.exclude)
if args.num_nodes > 0:
# only keep the first num_nodes to execute jobs
updated_active_device_pool = HostInfoList()
for count, hostinfo in enumerate(active_device_pool):
if args.num_nodes == count:
break
updated_active_device_pool.append(hostinfo)
active_device_pool = updated_active_device_pool
else:
active_device_pool = None
env = os.environ.copy()
# use hosts if hostfile is not given
if args.host and active_device_pool is None:
active_device_pool = HostInfoList()
host_list = args.host.strip().split(NODE_SEP)
for hostname in host_list:
hostinfo = HostInfo(hostname=hostname, port=args.ssh_port)
active_device_pool.append(hostinfo)
if not active_device_pool:
# run on the local node if neither --host nor --hostfile is given
# add local node to host info list
active_device_pool = HostInfoList()
localhost_info = HostInfo(hostname='127.0.0.1', port=args.ssh_port)
active_device_pool.append(localhost_info)
# launch distributed processes
runner = MultiNodeRunner()
curr_path = os.path.abspath('.')
# collect environment variables from the current process
env = dict()
for k, v in os.environ.items():
# do not support multi-line env var
if v and '\n' not in v:
env[k] = v
# establish remote connection
runner.connect(host_info_list=active_device_pool, workdir=curr_path, env=env)
# execute distributed launching command
for node_id, hostinfo in enumerate(active_device_pool):
cmd = get_launch_command(master_addr=args.master_addr,
master_port=args.master_port,
nproc_per_node=args.nproc_per_node,
user_script=args.user_script,
user_args=args.user_args,
node_rank=node_id,
num_nodes=len(active_device_pool),
extra_launch_args=args.extra_launch_args)
runner.send(hostinfo=hostinfo, cmd=cmd)
runner.recv_from_all()
runner.stop_all()
runner.recv_from_all()
from .collective import all_gather, reduce_scatter, all_reduce, broadcast, reduce
from .p2p import (send_forward, send_forward_recv_forward, send_backward_recv_forward, send_backward,
send_backward_recv_backward, send_forward_recv_backward, send_forward_backward_recv_forward_backward,
recv_forward, recv_backward)
from .ring import ring_forward
from .utils import send_obj_meta, recv_obj_meta
__all__ = [
'all_gather',
'reduce_scatter',
'all_reduce',
'broadcast',
'reduce',
'send_forward',
'send_forward_recv_forward',
'send_forward_backward_recv_forward_backward',
'send_backward',
'send_backward_recv_backward',
'send_backward_recv_forward',
'send_forward_recv_backward',
'recv_backward',
'recv_forward',
'ring_forward',
'send_obj_meta',
'recv_obj_meta',
]
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import torch
import torch.distributed as dist
from torch.distributed import ReduceOp
from torch import Tensor
from colossalai.context import ParallelMode
from colossalai.core import global_context as gpc
def all_gather(tensor: Tensor, dim: int, parallel_mode: ParallelMode, async_op: bool = False) -> Tensor:
r"""Gathers all tensors from the parallel group and concatenates them in a
specific dimension.
Note:
The parallel_mode should be a mode defined in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
Args:
tensor (:class:`torch.Tensor`): Tensor to be gathered.
dim (int): The dimension to concatenate along.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
async_op (bool, optional): Whether operations are asynchronous.
Returns:
Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of all-gather only,
if async_op is set to False. A tuple of the all-gather output and an async work handle, if async_op is set to True.
"""
depth = gpc.get_world_size(parallel_mode)
if depth == 1:
out = tensor
work = None
else:
shape = list(tensor.shape)
shape[0], shape[dim] = shape[dim], shape[0]
shape[0] *= depth
out = torch.empty(shape, dtype=tensor.dtype, device=tensor.device)
temp = list(torch.chunk(out, depth, dim=0))
group = gpc.get_cpu_group(parallel_mode) if tensor.device.type == "cpu" else gpc.get_group(parallel_mode)
work = dist.all_gather(tensor_list=temp,
tensor=tensor.transpose(0, dim).contiguous(),
group=group,
async_op=async_op)
out = torch.transpose(out, 0, dim)
if async_op:
return out, work
else:
return out
def reduce_scatter(tensor: Tensor,
dim: int,
parallel_mode: ParallelMode,
op: ReduceOp = ReduceOp.SUM,
async_op: bool = False) -> Tensor:
r"""Reduces all tensors then scatters it in a specific dimension to all
members in the parallel group.
Note:
The parallel_mode should be a mode defined in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
Args:
tensor (:class:`torch.Tensor`): Tensor to be reduce_scattered.
dim (int): The dimension along which the tensor is scattered.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
op (torch.distributed.ReduceOp, optional): The type of reduce operation,
should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
More details about ReduceOp please refer to
`ReduceOp <https://pytorch.org/docs/stable/distributed.html#torch.distributed.ReduceOp>`_.
async_op (bool, optional): Whether operations are asynchronous.
Returns:
Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of reduce_scatter only,
if async_op is set to False. A tuple of the reduce_scatter output and an async work handle, if async_op is set to True.
"""
depth = gpc.get_world_size(parallel_mode)
if depth == 1:
out = tensor
work = None
else:
temp = list(map(lambda x: x.contiguous(), torch.chunk(tensor, depth, dim=dim)))
out = torch.empty(temp[0].shape, dtype=tensor.dtype, device=tensor.device)
group = gpc.get_cpu_group(parallel_mode) if tensor.device.type == "cpu" else gpc.get_group(parallel_mode)
work = dist.reduce_scatter(output=out, input_list=temp, op=op, group=group, async_op=async_op)
if async_op:
return out, work
else:
return out
def all_reduce(tensor: Tensor,
parallel_mode: ParallelMode,
op: ReduceOp = ReduceOp.SUM,
async_op: bool = False) -> Tensor:
r"""Reduces the tensor data across whole parallel group in such a way that all get the final result.
Note:
The parallel_mode should be a mode defined in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
Args:
tensor (:class:`torch.Tensor`): Tensor to be all-reduced.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
op (torch.distributed.ReduceOp, optional): The type of reduce operation,
should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
More details about ReduceOp please refer to
`ReduceOp <https://pytorch.org/docs/stable/distributed.html#torch.distributed.ReduceOp>`_.
async_op (bool, optional): Whether operations are asynchronous.
Returns:
Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of all-reduce only,
if async_op is set to False. A tuple of the all-reduce output and an async work handle, if async_op is set to True.
"""
depth = gpc.get_world_size(parallel_mode)
if depth == 1:
out = tensor
work = None
else:
out = tensor.contiguous()
group = gpc.get_cpu_group(parallel_mode) if tensor.device.type == "cpu" else gpc.get_group(parallel_mode)
work = dist.all_reduce(out, op=op, group=group, async_op=async_op)
if async_op:
return out, work
else:
return out
def broadcast(tensor: Tensor, src: int, parallel_mode: ParallelMode, async_op: bool = False):
r"""Broadcast tensors to whole parallel group. Tensor must have the same
number of elements in all processes participating in the collective.
Note:
The parallel_mode should be a mode defined in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
Args:
tensor (:class:`torch.Tensor`): Tensor to be broadcast.
src (int): Source rank.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
async_op (bool, optional): Whether operations are asynchronous.
Returns:
Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The broadcast tensor only,
if async_op is set to False. A tuple of the broadcast tensor and an async work handle, if async_op is set to True.
"""
depth = gpc.get_world_size(parallel_mode)
if depth == 1:
out = tensor
work = None
else:
out = tensor.contiguous()
group = gpc.get_cpu_group(parallel_mode) if tensor.device.type == "cpu" else gpc.get_group(parallel_mode)
work = dist.broadcast(out, src=src, group=group, async_op=async_op)
if async_op:
return out, work
else:
return out
def reduce(tensor: Tensor, dst: int, parallel_mode: ParallelMode, op: ReduceOp = ReduceOp.SUM, async_op: bool = False):
r"""Reduce tensors across whole parallel group. Only the process with
rank ``dst`` is going to receive the final result.
Note:
The parallel_mode should be a mode defined in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
Args:
tensor (:class:`torch.Tensor`): Tensor to be reduced.
dst (int): Destination rank.
parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
async_op (bool, optional): Whether operations are asynchronous.
Returns:
Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of reduce only,
if async_op is set to False. A tuple of the reduce output and an async work handle, if async_op is set to True.
"""
depth = gpc.get_world_size(parallel_mode)
if depth == 1:
out = tensor
work = None
else:
out = tensor.contiguous()
group = gpc.get_cpu_group(parallel_mode) if tensor.device.type == "cpu" else gpc.get_group(parallel_mode)
work = dist.reduce(out, dst=dst, op=op, group=group, async_op=async_op)
if async_op:
return out, work
else:
return out
def scatter_object_list(scatter_object_output_list, scatter_object_input_list, src=0, group=None) -> None:
r"""Modified from `torch.distributed.scatter_object_list <https://pytorch.org/docs/stable/_modules/torch/distributed/distributed_c10d.html#scatter_object_list>` to fix issues
"""
if dist.distributed_c10d._rank_not_in_group(group):
return
if (not isinstance(scatter_object_output_list, list) or len(scatter_object_output_list) < 1):
raise RuntimeError("Expected argument scatter_object_output_list to be a list of size at least 1.")
# set tensor device to cuda if backend is nccl
device = torch.cuda.current_device() if dist.get_backend(group) == 'nccl' else torch.device("cpu")
my_rank = dist.get_rank() # use global rank
if my_rank == src:
tensor_list, tensor_sizes = zip(
*[dist.distributed_c10d._object_to_tensor(obj) for obj in scatter_object_input_list])
tensor_list = list(map(lambda x: x.to(device), tensor_list))
tensor_sizes = list(map(lambda x: x.to(device), tensor_sizes))
# Src rank broadcasts the maximum tensor size. This is because all ranks are
# expected to call into scatter() with equal-sized tensors.
if my_rank == src:
max_tensor_size = max(tensor_sizes)
for tensor in tensor_list:
tensor.resize_(max_tensor_size)
else:
max_tensor_size = torch.tensor([0], dtype=torch.long).to(device)
dist.broadcast(max_tensor_size, src=src, group=group)
# Scatter actual serialized objects
output_tensor = torch.empty(max_tensor_size.item(), dtype=torch.uint8).to(device)
dist.scatter(
output_tensor,
scatter_list=None if my_rank != src else tensor_list,
src=src,
group=group,
)
# Scatter per-object sizes to trim tensors when deserializing back to object
obj_tensor_size = torch.tensor([0], dtype=torch.long).to(device)
dist.scatter(
obj_tensor_size,
scatter_list=None if my_rank != src else tensor_sizes,
src=src,
group=group,
)
output_tensor, obj_tensor_size = output_tensor.cpu(), obj_tensor_size.cpu()
# Deserialize back to object
scatter_object_output_list[0] = dist.distributed_c10d._tensor_to_object(output_tensor, obj_tensor_size)