Merge branch 'main' of https://github.com/hpcaitech/ColossalAI

7bc5a8e3 · zhuwenwen · e6748d82 · 0f785cb1 · 7bc5a8e3 · 7bc5a8e3
Commit 7bc5a8e3 authored May 05, 2023 by zhuwenwen
20 changed files
--- a/colossalai/checkpoint_io/utils.py
+++ b/colossalai/checkpoint_io/utils.py
+# coding=utf-8
+from pathlib import Path
+import torch
+import torch.nn as nn
+from typing import List, Dict, Mapping, OrderedDict, Optional, Tuple
+from colossalai.tensor.d_tensor.d_tensor import DTensor
+import re
+
+# Default checkpoint file names. The ".bin" names are the torch-serialized variants and the
+# "SAFE_" names are their safetensors counterparts; the "*.index.json" names are the index
+# files that accompany a sharded checkpoint.
+SAFE_WEIGHTS_NAME = "model.safetensors"
+WEIGHTS_NAME = "pytorch_model.bin"
+SAFE_WEIGHTS_INDEX_NAME = "model.safetensors.index.json"
+WEIGHTS_INDEX_NAME = "pytorch_model.bin.index.json"
+
+# ======================================
+# General helper functions
+# ======================================
+
+def calculate_tensor_size(tensor: torch.Tensor) -> float:
+    """
+    Return the storage footprint of a tensor in MB.
+
+    The value is used to decide whether adding a parameter to the current shard would
+    push it past the shard size limit, in which case a new shard is started.
+
+    Args:
+        tensor (torch.Tensor): the tensor to measure.
+
+    Returns:
+        float: number of elements times bytes per element, expressed in MB.
+    """
+    size_in_bytes = tensor.numel() * tensor.element_size()
+    return size_in_bytes / 1024 / 1024
+
+def is_safetensors_available() -> bool:
+    """
+    Report whether the ``safetensors`` package can be imported.
+
+    Returns:
+        bool: True if ``import safetensors`` succeeds, False if it raises ImportError.
+    """
+    try:
+        import safetensors
+    except ImportError:
+        return False
+    return True
+
+
+def is_dtensor_checkpoint(checkpoint_file_path: str) -> bool:
+    """
+    Check whether the checkpoint file is a dtensor checkpoint.
+
+    A path counts as a dtensor checkpoint when it ends with the literal wildcard form
+    ``.*.safetensors`` or ``.*.bin`` (the ``*`` is matched literally, not as a glob).
+
+    Args:
+        checkpoint_file_path (str): path to the checkpoint file. A ``pathlib.Path`` is
+            accepted as well.
+
+    Returns:
+        bool: whether the checkpoint file is a dtensor checkpoint.
+    """
+    # str() makes the check work for pathlib.Path arguments, which have no ``endswith``.
+    return str(checkpoint_file_path).endswith(('.*.safetensors', '.*.bin'))
+
+
+def is_safetensor_checkpoint(checkpoint_file_path: str) -> bool:
+    """
+    Check whether the checkpoint file is a safetensor checkpoint.
+
+    Args:
+        checkpoint_file_path (str): path to the checkpoint file. A ``pathlib.Path`` is
+            accepted as well.
+
+    Returns:
+        bool: True if the path ends with ``.safetensors``.
+    """
+    # str() makes the check work for pathlib.Path arguments, which have no ``endswith``.
+    return str(checkpoint_file_path).endswith('.safetensors')
+
+
+# ======================================
+# Helper functions for saving shard file
+# ======================================
+def shard_checkpoint(state_dict: Dict[str, torch.Tensor], max_shard_size: int = 1024, weights_name: str = WEIGHTS_NAME):
+    """
+    Splits a model state dictionary in sub-checkpoints so that the final size of each sub-checkpoint does not exceed a
+    given size.
+
+    Weights are packed greedily in iteration order. A single weight larger than ``max_shard_size``
+    gets a shard of its own. ``DTensor`` entries are skipped and do not appear in any shard.
+
+    Args:
+        state_dict (Dict[str, torch.Tensor]): mapping from parameter name to tensor.
+        max_shard_size (int): maximum size of one shard in MB. Default: 1024.
+        weights_name (str): base file name from which the shard file names are derived.
+
+    Returns:
+        tuple: ``({weights_name: state_dict}, None)`` when everything fits into one shard, otherwise
+        ``(shards, index)`` where ``shards`` maps shard file name to its state dict and ``index``
+        holds ``metadata`` (``total_size`` in MB) and ``weight_map`` (parameter name -> shard file name).
+    """
+    sharded_state_dicts = []
+    current_block = {}
+    current_block_size = 0
+    total_size = 0
+
+    for key, weight in state_dict.items():
+        # distributed tensors are not packed into regular shards
+        if isinstance(weight, DTensor):
+            continue
+
+        weight_size = calculate_tensor_size(weight)
+
+        # If this weight is going to tip up over the maximal size, we split.
+        # Never close an empty block: an oversized first weight must not leave an empty shard behind.
+        if current_block and current_block_size + weight_size > max_shard_size:
+            sharded_state_dicts.append(current_block)
+            current_block = {}
+            current_block_size = 0
+
+        current_block[key] = weight
+        current_block_size += weight_size
+        total_size += weight_size
+
+    # Add the last block
+    sharded_state_dicts.append(current_block)
+
+    # If we only have one shard, we return it
+    if len(sharded_state_dicts) == 1:
+        return {weights_name: sharded_state_dicts[0]}, None
+
+    # Otherwise, let's build the index
+    weight_map = {}
+    shards = {}
+    num_shards = len(sharded_state_dicts)
+
+    for idx, shard in enumerate(sharded_state_dicts):
+        tag = f"-{idx + 1:05d}-of-{num_shards:05d}"
+        shard_file = weights_name.replace(".bin", f"{tag}.bin")
+        shard_file = shard_file.replace(".safetensors", f"{tag}.safetensors")
+        shards[shard_file] = shard
+        for key in shard:
+            weight_map[key] = shard_file
+
+    # Add the metadata
+    metadata = {"total_size": total_size}
+    index = {"metadata": metadata, "weight_map": weight_map}
+    return shards, index
+
+def load_shard_state_dict(checkpoint_file: Path, use_safetensors: bool = False):
+    """
+    Load the state dict stored in one shard file.
+
+    Args:
+        checkpoint_file (Path): path to the shard file; a plain ``str`` path is accepted as well.
+        use_safetensors (bool): load with ``safetensors`` instead of ``torch.load``. Default: False.
+
+    Returns:
+        The loaded state dict.
+
+    Raises:
+        Exception: if ``use_safetensors`` is set but the file does not have the ``.safetensors`` suffix.
+        NotImplementedError: if the safetensors archive declares a format other than ``pt``.
+    """
+    if not use_safetensors:
+        # NOTE(review): torch.load unpickles the file; only load checkpoints from trusted sources.
+        return torch.load(checkpoint_file)
+
+    # Path() keeps the suffix check working when a str path is passed in.
+    if Path(checkpoint_file).suffix != ".safetensors":
+        raise Exception("load the model using `safetensors`, but no file endwith .safetensors")
+
+    from safetensors import safe_open
+    from safetensors.torch import load_file as safe_load_file
+    with safe_open(checkpoint_file, framework="pt") as f:
+        metadata = f.metadata()
+
+    # The archive may carry no metadata at all (metadata() then yields None) or no "format"
+    # entry; only reject archives that explicitly declare a non-PyTorch format.
+    archive_format = None if metadata is None else metadata.get("format")
+    if archive_format is not None and archive_format != "pt":
+        raise NotImplementedError(
+            f"Conversion from a {archive_format} safetensors archive to PyTorch is not implemented yet."
+        )
+    return safe_load_file(checkpoint_file)
+    
+def load_state_dict_into_model(model: nn.Module, state_dict: Mapping, missing_keys: List, strict: bool = False):
+    r"""Copies parameters and buffers from :attr:`state_dict` into
+    this module and its descendants.
+
+    Args:
+        model (nn.Module): the module to load into.
+        state_dict (dict): a dict containing parameters and
+            persistent buffers.
+        missing_keys (List): running list of keys that are still missing. It is updated in place:
+            only keys that the visited modules report as missing from ``state_dict`` are kept.
+        strict (bool): if True, raise when ``state_dict`` contains keys the model does not expect.
+            Default: False.
+
+    Raises:
+        TypeError: if ``state_dict`` is not a mapping.
+        RuntimeError: if ``strict`` is True and unexpected keys are found.
+    """
+    if not isinstance(state_dict, Mapping):
+        raise TypeError("Expected state_dict to be dict-like, got {}.".format(type(state_dict)))
+
+    unexpected_keys: List[str] = []
+    sub_missing_keys: List[str] = []
+    error_msgs: List[str] = []
+
+    # copy state_dict so _load_from_state_dict can modify it
+    metadata = getattr(state_dict, '_metadata', None)
+    state_dict = OrderedDict(state_dict)
+    if metadata is not None:
+        state_dict._metadata = metadata
+
+    def load(module: nn.Module, state_dict, prefix=""):
+        local_metadata = {} if metadata is None else metadata.get(prefix[:-1], {})
+        # Hand the shared lists to the module so that missing/unexpected keys are actually recorded.
+        args = (state_dict, prefix, local_metadata, True, sub_missing_keys, unexpected_keys, error_msgs)
+        # Parameters of module and children will start with prefix. We can exit early if there are none in this
+        # state_dict
+        # NOTE(review): modules skipped by this early exit never report their keys as missing,
+        # so those keys are dropped from ``missing_keys`` below - confirm this is intended.
+        if any(key.startswith(prefix) for key in state_dict):
+            module._load_from_state_dict(*args)
+
+        for name, child in module._modules.items():
+            if child is not None:
+                load(child, state_dict, prefix + name + ".")
+
+    load(model, state_dict, "")
+    del load
+
+    # deal with missing key: keep only the keys that are still missing after this load,
+    # mutating the caller's list in place
+    still_missing = set(sub_missing_keys)
+    missing_keys[:] = [key for key in missing_keys if key in still_missing]
+
+    if strict and len(unexpected_keys) > 0:
+        # error_msgs must stay a list of messages; joining a plain string would split it into characters
+        error_msgs.insert(
+            0, 'Unexpected key(s) in state_dict: {}. '.format(', '.join('"{}"'.format(k) for k in unexpected_keys)))
+        raise RuntimeError('Error(s) in loading state_dict for {}:\n\t{}'.format(
+            model.__class__.__name__, "\n\t".join(error_msgs)))
+        
+# ======================================
+# Helper functions for saving state dict
+# ======================================
+
+
+def save_state_dict(state_dict: dict, checkpoint_file_path: str, use_safetensors: bool) -> None:
+    """
+    Write a state dict to a checkpoint file.
+
+    Args:
+        state_dict (dict): state dict to serialize.
+        checkpoint_file_path (str): destination path of the checkpoint file.
+        use_safetensors (bool): write with safetensors (requires a ``.safetensors`` path)
+            instead of ``torch.save``.
+    """
+    if not use_safetensors:
+        torch.save(state_dict, checkpoint_file_path)
+        return
+
+    assert is_safetensors_available(), "safetensors is not available."
+    assert checkpoint_file_path.endswith('.safetensors'), \
+        "safetensors only supports .safetensors suffix for checkpoint file."
+    from safetensors.torch import save_file as safe_save_file
+    safe_save_file(state_dict, checkpoint_file_path, metadata={"format": "pt"})
+
+
+def save_dtensor(name: str, tensor: torch.Tensor, index_file: "CheckpointIndexFile", use_safetensors: bool) -> None:
+    """
+    Save a distributed tensor to its own checkpoint file, i.e. a dictionary holding exactly
+    one tensor, inside the ``dtensor`` sub-directory of the index file's root path.
+
+    Args:
+        name (str): name of the tensor; used as the dictionary key and in the file name.
+        tensor (Tensor): tensor to be saved.
+        index_file (CheckpointIndexFile): index file whose ``root_path`` locates the output
+            directory and whose weight map is updated.
+        use_safetensors (bool): whether to use safetensors to save the checkpoint.
+    """
+    # create directory
+    dtensor_dir = index_file.root_path.joinpath('dtensor')
+    dtensor_dir.mkdir(exist_ok=True)
+
+    # TODO(YuliangLiu): get index of the tensor shard
+    shard_index = 0
+
+    # dtensor ckpt file always contains only one tensor
+    target_path = dtensor_dir.joinpath(generate_dtensor_file_name(name, shard_index, use_safetensors))
+    save_state_dict({name: tensor}, str(target_path), use_safetensors)
+
+    # update the weight map; * means all shards
+    wildcard_entry = 'dtensor/' + generate_dtensor_file_name(name, '*', use_safetensors)
+    index_file.append_weight_map(name, wildcard_entry)
+
+
+def get_checkpoint_file_suffix(use_safetensors: bool) -> str:
+    """
+    Pick the file suffix matching the serialization backend.
+
+    Args:
+        use_safetensors (bool): whether to use safetensors to save the checkpoint.
+
+    Returns:
+        str: ``'.safetensors'`` when safetensors is used, ``'.bin'`` otherwise
+        (the leading dot is part of the suffix).
+    """
+    return '.safetensors' if use_safetensors else '.bin'
+
+
+def generate_checkpoint_shard_file_name(index: int,
+                                        total_number: int,
+                                        use_safetensors: bool,
+                                        prefix: Optional[str] = None) -> str:
+    """
+    Generate checkpoint shard file name.
+
+    Args:
+        index (int): index of the shard.
+        total_number (int): total number of shards.
+        use_safetensors (bool): whether to use safetensors to save the checkpoint.
+        prefix (str): prefix of the shard file name. Default: None.
+
+    Returns:
+        str: checkpoint shard file name, e.g. ``00001-of-00002.bin`` or
+        ``<prefix>-00001-of-00002.bin`` when a prefix is given.
+    """
+    # the suffix already starts with ".", so it must not be preceded by another dot
+    suffix = get_checkpoint_file_suffix(use_safetensors)
+    shard_tag = f"{index:05d}-of-{total_number:05d}{suffix}"
+
+    if prefix is None:
+        return shard_tag
+    return f"{prefix}-{shard_tag}"
+
+
+def generate_dtensor_file_name(param_name: str, index: int, use_safetensors: bool) -> str:
+    """
+    Generate dtensor file name.
+
+    Args:
+        param_name (str): name of the distributed parameter.
+        index (int): index of the shard. ``save_dtensor`` also passes the string ``'*'``
+            to build the weight-map entry that stands for all shards.
+        use_safetensors (bool): whether to use safetensors to save the checkpoint.
+
+    Returns:
+        str: dtensor file name of the form ``<param_name>.<index><suffix>``,
+        e.g. ``weight.0.bin``.
+    """
+    # the suffix already starts with ".", so it must not be preceded by another dot
+    suffix = get_checkpoint_file_suffix(use_safetensors)
+    return f'{param_name}.{index}{suffix}'
+
+
+def save_state_dict_as_shard(
+    state_dict: dict,
+    checkpoint_path: str,
+    index: int,
+    total_number: int,
+    use_safetensors: bool,
+    prefix: str = None,
+) -> None:
+    """
+    Write one shard of a checkpoint into the checkpoint directory.
+
+    Args:
+        state_dict (dict): state dict of this shard.
+        checkpoint_path (str): directory the shard file is written to.
+        index (int): index of the shard.
+        total_number (int): total number of shards.
+        use_safetensors (bool): whether to use safetensors to save the checkpoint.
+        prefix (str): prefix of the shard file name.
+    """
+    # resolve the absolute location of the shard file
+    file_name = generate_checkpoint_shard_file_name(index, total_number, use_safetensors, prefix)
+    destination = Path(checkpoint_path).joinpath(file_name).absolute()
+
+    # save the shard
+    save_state_dict(state_dict, str(destination), use_safetensors)
+
+
+# ========================================
+# Helper functions for loading state dict
+# ========================================
+
+
+def has_index_file(checkpoint_path: str) -> Tuple[bool, Optional[Path]]:
+    """
+    Check whether the checkpoint has an index file.
+
+    If ``checkpoint_path`` is a file, it is the index file when its name looks like
+    ``<name>.index[.<anything>].json``. If it is a directory, the directory is searched for a
+    single ``*.index.*json`` file. Any other path (e.g. one that does not exist) has no index file.
+
+    Args:
+        checkpoint_path (str): path to the checkpoint.
+
+    Returns:
+        Tuple[bool, Optional[Path]]: a tuple of (has_index_file, index_file_path)
+
+    Raises:
+        AssertionError: if a directory contains more than one index file.
+    """
+    checkpoint_path = Path(checkpoint_path)
+
+    if checkpoint_path.is_file():
+        # check if it is .index.json; dots are escaped so they only match literal dots
+        index_name_pattern = re.compile(r"(.*?)\.index((\..*)?)\.json")
+        if index_name_pattern.fullmatch(checkpoint_path.name) is not None:
+            return True, checkpoint_path
+        return False, None
+
+    if checkpoint_path.is_dir():
+        # check if there is only one a file ending with .index.json in this directory
+        index_files = list(checkpoint_path.glob('*.index.*json'))
+
+        # if we found a .index.json file, make sure there is only one
+        assert len(
+            index_files
+        ) <= 1, f'Expected to find one .index.json file in {checkpoint_path}, but found {len(index_files)}'
+
+        if index_files:
+            return True, index_files[0]
+
+    # neither an index file nor a directory containing one; always return a tuple so that
+    # callers can unpack the result
+    return False, None
+
+
+def load_state_dict(checkpoint_file_path: Path):
+    """
+    Load state dict from checkpoint.
+
+    Files ending in ``.safetensors`` are read tensor by tensor onto the CPU with safetensors;
+    everything else is read with ``torch.load``.
+
+    Args:
+        checkpoint_file_path (Path): path to the checkpoint file (a ``str`` works as well).
+
+    Returns:
+        dict: state dict.
+
+    Raises:
+        AssertionError: if the file is a dtensor checkpoint, or if it is a safetensors
+            checkpoint but safetensors is not installed.
+    """
+    # The suffix helpers rely on str.endswith, which pathlib.Path does not provide.
+    path_str = str(checkpoint_file_path)
+
+    assert not is_dtensor_checkpoint(path_str), \
+        f'Cannot load state dict from dtensor checkpoint {checkpoint_file_path}, you should convert the distributed tensors to gathered tensors with our CLI offline.'
+
+    if is_safetensor_checkpoint(path_str):
+        assert is_safetensors_available(), \
+            f'Cannot load state dict from safetensor checkpoint {checkpoint_file_path}, because safetensors is not available. Please install safetensors first with pip install safetensors.'
+        # load with safetensors
+        from safetensors import safe_open
+        with safe_open(checkpoint_file_path, framework="pt", device="cpu") as f:
+            return {k: f.get_tensor(k) for k in f.keys()}
+
+    # load with torch
+    # NOTE(review): torch.load unpickles the file; only load checkpoints from trusted sources.
+    return torch.load(checkpoint_file_path)
+    
+
+
+def add_variant(weights_name: str, variant: Optional[str] = None) -> str:
+    """
+    Insert a variant tag in front of the last dot-separated component of a file name.
+
+    Args:
+        weights_name (str): file name, e.g. ``pytorch_model.bin``.
+        variant (Optional[str]): tag to insert, e.g. ``fp16``. ``None`` or an empty string
+            leaves the name unchanged.
+
+    Returns:
+        str: the file name with the variant inserted, e.g. ``pytorch_model.fp16.bin``.
+    """
+    if variant is None or len(variant) == 0:
+        return weights_name
+
+    *stem, extension = weights_name.split(".")
+    return ".".join([*stem, variant, extension])
--- a/colossalai/cli/__init__.py
+++ b/colossalai/cli/__init__.py
+from .cli import cli
+
+__all__ = ['cli']
--- a/colossalai/cli/benchmark/__init__.py
+++ b/colossalai/cli/benchmark/__init__.py
+import click
+
+from colossalai.context import Config
+
+from .benchmark import run_benchmark
+from .utils import *
+
+__all__ = ['benchmark']
+
+
+@click.command()
+@click.option("-g", "--gpus", type=int, default=None, help="Total number of devices to use.")
+@click.option("-b", "--batch_size", type=int, default=8, help="Batch size of the input tensor.")
+@click.option("-s", "--seq_len", type=int, default=512, help="Sequence length of the input tensor.")
+@click.option("-d", "--dimension", type=int, default=1024, help="Hidden dimension of the input tensor.")
+@click.option("-w", "--warmup_steps", type=int, default=10, help="The number of warmup steps.")
+@click.option("-p", "--profile_steps", type=int, default=50, help="The number of profiling steps.")
+@click.option("-l", "--layers", type=int, default=2)
+@click.option("-m",
+              "--model",
+              type=click.Choice(['mlp'], case_sensitive=False),
+              default='mlp',
+              help="Select the model to benchmark, currently only supports MLP")
+def benchmark(gpus: int, batch_size: int, seq_len: int, dimension: int, warmup_steps: int, profile_steps: int,
+              layers: int, model: str):
+    # locals() is read before any other local is bound, so it holds exactly the CLI options
+    hyperparams = Config(locals())
+    run_benchmark(hyperparams)
--- a/colossalai/cli/benchmark/benchmark.py
+++ b/colossalai/cli/benchmark/benchmark.py
+from functools import partial
+from typing import Dict, List
+
+import click
+import torch.multiprocessing as mp
+
+import colossalai
+from colossalai.cli.benchmark.utils import find_all_configs, get_batch_data, profile_model
+from colossalai.context import Config
+from colossalai.context.random import reset_seeds
+from colossalai.core import global_context as gpc
+from colossalai.logging import disable_existing_loggers, get_dist_logger
+from colossalai.testing import free_port
+from colossalai.utils import MultiTimer
+
+from .models import MLP
+
+
+def run_benchmark(args: Config) -> None:
+    """
+    Run benchmarking with torch.multiprocessing.
+
+    Validates the requested device count, prints the benchmarking parameters and spawns
+    ``args.gpus`` processes, each of which runs ``run_dist_profiling`` over every configuration
+    returned by ``find_all_configs``.
+
+    Args:
+        args (Config): the hyperparameters given by the user.
+
+    Raises:
+        SystemExit: with exit status 1 if ``args.gpus`` is not given.
+    """
+
+    # sanity checks
+    if args.gpus is None:
+        # the command line option is registered as --gpus
+        click.echo("Error: --gpus is not given")
+        # signal the failure to the shell with a non-zero exit status
+        raise SystemExit(1)
+    if args.gpus <= 1:
+        click.echo("Warning: tensor parallel will be activated with at least 2 devices.")
+
+    click.echo("=== Benchmarking Parameters ===")
+    for k, v in args.items():
+        click.echo(f'{k}: {v}')
+    click.echo('')
+
+    config_list = find_all_configs(args.gpus)
+
+    # one free port per configuration, since every configuration is launched separately
+    avail_ports = [free_port() for _ in range(len(config_list))]
+    run_func = partial(run_dist_profiling,
+                       world_size=args.gpus,
+                       port_list=avail_ports,
+                       config_list=config_list,
+                       hyperparams=args)
+    mp.spawn(run_func, nprocs=args.gpus)
+
+
+def run_dist_profiling(rank: int, world_size: int, port_list: List[int], config_list: List[Dict],
+                       hyperparams: Config) -> None:
+    """
+    A function executed for profiling, this function should be spawn by torch.multiprocessing.
+
+    For every configuration the distributed environment is launched, the model is profiled,
+    the environment is torn down again and the rank-0 process prints the results.
+
+    Args:
+        rank (int): rank of the process
+        world_size (int): the number of processes
+        port_list (List[int]): a list of free ports for initializing distributed networks
+        config_list (List[Dict]): a list of configuration
+        hyperparams (Config): the hyperparameters given by the user
+
+    """
+
+    # disable logging for clean output
+    disable_existing_loggers()
+    logger = get_dist_logger()
+    logger.set_level('WARNING')
+
+    for config, port in zip(config_list, port_list):
+        tensor_config = config.parallel.tensor
+
+        # 1D parallel should be skipped if in_features or out_features is not able to be divided exactly by 1D parallel size.
+        # The check only needs the config, so do it before launching: skipping after the launch
+        # would leave the global context initialized when the next configuration is launched.
+        if tensor_config.mode == '1d' and hyperparams.dimension % tensor_config.size != 0:
+            click.echo(
+                "1D parallel will be skipped because in_features or out_features is not able to be divided exactly by 1D parallel size."
+            )
+            continue
+
+        colossalai.launch(config=config, rank=rank, world_size=world_size, host='localhost', port=port, backend='nccl')
+        timer = MultiTimer()
+
+        # Query the rank while the context is still alive; it is destroyed before the report is printed.
+        is_master = gpc.get_global_rank() == 0
+
+        if hyperparams.model != 'mlp':
+            if is_master:
+                click.echo("Error: Invalid argument for --model")
+            # every rank has to stop, not only rank 0: there is no model to profile
+            raise SystemExit
+
+        model = MLP(dim=hyperparams.dimension, layers=hyperparams.layers)
+
+        data_func = partial(get_batch_data,
+                            dim=hyperparams.dimension,
+                            batch_size=hyperparams.batch_size,
+                            seq_length=hyperparams.seq_len,
+                            mode=tensor_config.mode)
+
+        fwd_time, bwd_time, max_allocated, max_cached = profile_model(model=model,
+                                                                      warmup_steps=hyperparams.warmup_steps,
+                                                                      profile_steps=hyperparams.profile_steps,
+                                                                      data_func=data_func,
+                                                                      timer=timer)
+
+        # tear down the distributed context so that the next configuration can be launched
+        gpc.destroy()
+        reset_seeds()
+
+        if is_master:
+            config_str = ', '.join([f'{k}: {v}' for k, v in tensor_config.items()])
+            click.echo(f"=== {config_str} ===")
+            click.echo(f"Average forward time: {fwd_time}")
+            click.echo(f"Average backward time: {bwd_time}")
+            click.echo(f"Max allocated GPU memory: {max_allocated}")
+            click.echo(f"Max cached GPU memory: {max_cached}\n")
--- a/colossalai/cli/benchmark/models.py
+++ b/colossalai/cli/benchmark/models.py
+import torch
+
+import colossalai.nn as col_nn
+
+
+class MLP(torch.nn.Module):
+    """A stack of ``layers`` square ``col_nn.Linear(dim, dim)`` layers applied one after another."""
+
+    def __init__(self, dim: int, layers: int):
+        super().__init__()
+        blocks = [col_nn.Linear(dim, dim) for _ in range(layers)]
+        self.layers = torch.nn.ModuleList(blocks)
+
+    def forward(self, x):
+        # feed the output of each layer into the next one
+        hidden = x
+        for block in self.layers:
+            hidden = block(hidden)
+        return hidden
--- a/colossalai/cli/benchmark/utils.py
+++ b/colossalai/cli/benchmark/utils.py
+import math
+import time
+import torch
+
+from colossalai.utils import MultiTimer
+from colossalai.context import ParallelMode, Config
+from typing import List, Dict, Tuple, Callable
+
+
+def get_time_stamp() -> int:
+    """
+    Return the time stamp for profiling.
+
+    ``torch.cuda.synchronize()`` is called first, then the clock is read.
+
+    Returns:
+        time_stamp (int): the time given by time.time()
+    """
+    torch.cuda.synchronize()
+    return time.time()
+
+
+def get_memory_states() -> Tuple[float]:
+    """
+    Read the peak CUDA memory statistics, then reset them and empty the CUDA cache.
+
+    Returns:
+        max_allocated (float): ``torch.cuda.max_memory_allocated()`` divided by 1024**3
+        max_cached (float): ``torch.cuda.max_memory_reserved()`` divided by 1024**3
+    """
+    bytes_per_gb = 1024**3
+    peak_allocated = torch.cuda.max_memory_allocated() / bytes_per_gb
+    peak_reserved = torch.cuda.max_memory_reserved() / bytes_per_gb
+
+    # start the next measurement from a clean slate
+    torch.cuda.reset_peak_memory_stats()
+    torch.cuda.empty_cache()
+    return peak_allocated, peak_reserved
+
+
+def find_all_configs(device_cnt: int) -> List[Dict]:
+    """
+    Find all possible configurations for tensor parallelism
+
+    The non-parallel and 1D configurations are always included. 2D is added when ``device_cnt``
+    is a perfect square, 2.5D for every depth that divides ``device_cnt`` into a perfect square,
+    and 3D when ``device_cnt`` is a perfect cube.
+
+    Args:
+        device_cnt (int): the number of devices
+
+    Returns:
+        config_list (List[Dict]): a list of configurations
+    """
+
+    def _is_square(num):
+        # 2D parallel should be implemented with at least 2 devices.
+        if num <= 1:
+            return False
+        root = round(math.sqrt(num))
+        return root * root == num
+
+    def _is_cube(num):
+        # 3D parallel should be implemented with at least 2 devices.
+        if num <= 1:
+            return False
+        # Round instead of floor: the float cube root of a perfect cube can land just below
+        # the integer (64 ** (1 / 3) == 3.9999999999999996), which floor would turn into 3.
+        root = round(num**(1. / 3.))
+        return root**3 == num
+
+    def _tensor_config(mode, **extra):
+        # every configuration uses all devices for tensor parallelism
+        return dict(parallel=dict(tensor=dict(size=device_cnt, mode=mode, **extra)))
+
+    # the non-parallel and the 1D config are always candidates
+    config_list = [_tensor_config(None), _tensor_config('1d')]
+
+    # add 2D config only if device_cnt is a square
+    if _is_square(device_cnt):
+        config_list.append(_tensor_config('2d'))
+
+    # check for 2.5D
+    # iterate over depth
+    for depth in range(1, device_cnt):
+        if device_cnt % depth == 0 and _is_square(device_cnt // depth):
+            config_list.append(_tensor_config('2.5d', depth=depth))
+
+    # check for 3D if device_cnt is a cube
+    if _is_cube(device_cnt):
+        config_list.append(_tensor_config('3d'))
+
+    return [Config(cfg) for cfg in config_list]
+
+
+def profile_model(model: torch.nn.Module, warmup_steps: int, profile_steps: int, data_func: Callable,
+                  timer: MultiTimer) -> Tuple[float]:
+    """
+    Measure the forward and backward pass of a model.
+
+    The model is first run for ``warmup_steps`` steps whose timings are discarded, then for
+    ``profile_steps`` steps whose timings are averaged.
+
+    Args:
+        model (torch.nn.Module): a PyTorch model
+        warmup_steps (int): the number of steps for warmup
+        profile_steps (int): the number of steps for profiling
+        data_func (Callable): a function to generate random data
+        timer (colossalai.utils.Multitimer): a timer instance for time recording
+
+    Returns:
+        fwd_time (float): mean of the recorded 'forward' timer history
+        bwd_time (float): mean of the recorded 'backward' timer history
+        max_allocated (float): the maximum GPU memory allocated in GB
+        max_cached (float): the maximum GPU memory cached in GB
+    """
+
+    def _step(batch):
+        timer.start('forward')
+        output = model(batch)
+        timer.stop('forward', keep_in_history=True)
+
+        timer.start('backward')
+        output.mean().backward()
+        timer.stop('backward', keep_in_history=True)
+
+    # warmup: all batches are generated up front, then run
+    warmup_batches = [data_func() for _ in range(warmup_steps)]
+    for batch in warmup_batches:
+        _step(batch)
+
+    # drop the warmup timings
+    for tag in ('forward', 'backward'):
+        timer.reset(tag)
+
+    for _ in range(profile_steps):
+        _step(data_func())
+
+    max_allocated, max_cached = get_memory_states()
+    fwd_time = timer.get_timer('forward').get_history_mean()
+    bwd_time = timer.get_timer('backward').get_history_mean()
+    return fwd_time, bwd_time, max_allocated, max_cached
+
+
+def get_batch_data(dim: int, batch_size: int, seq_length: int, mode: ParallelMode) -> torch.Tensor:
+    """
+    Create a random CUDA tensor for profiling.
+
+    The batch size and hidden size are scaled down depending on ``mode``: both are halved
+    for '2d' and '2.5d'; for '3d' the batch size is divided by 4 and the hidden size by 2.
+    Any other mode leaves the shape at (batch_size, seq_length, dim).
+
+    Args:
+        dim (int): hidden size
+        batch_size (int): the number of data samples
+        seq_length (int): the number of tokens
+        mode (ParallelMode): tensor parallel mode; compared against the strings '2d', '2.5d' and '3d'
+
+    Returns:
+        data (torch.Tensor): random data
+    """
+    if mode == '3d':
+        batch_size, dim = batch_size // 4, dim // 2
+    elif mode in ('2d', '2.5d'):
+        batch_size, dim = batch_size // 2, dim // 2
+
+    return torch.rand(batch_size, seq_length, dim).cuda()
--- a/colossalai/cli/check/__init__.py
+++ b/colossalai/cli/check/__init__.py
+import click
+from .check_installation import check_installation
+
+__all__ = ['check']
+
+
+@click.command(help="Check if Colossal-AI is correct based on the given option")
+@click.option('-i', '--installation', is_flag=True, help="Check if Colossal-AI is built correctly")
+def check(installation):
+    # without any flag there is nothing to check
+    if not installation:
+        click.echo("No option is given")
+        return
+    check_installation()
--- a/colossalai/cli/check/check_installation.py
+++ b/colossalai/cli/check/check_installation.py
+import subprocess
+
+import click
+import torch
+from torch.utils.cpp_extension import CUDA_HOME
+
+import colossalai
+
+
+def to_click_output(val):
+    # Map True / False / None to a check mark, 'x' and 'N/A' for readability;
+    # every other value is returned unchanged.
+    symbols = {True: u'\u2713', False: 'x', None: 'N/A'}
+    return symbols.get(val, val)
+
+
+def check_installation():
+    """
+    This function will check the installation of colossalai, specifically, the version compatibility of
+    colossalai, pytorch and cuda.
+
+    The report is printed with ``click.echo`` in three sections (environment, CUDA extension
+    AOT compilation, compatibility); nothing is returned.
+    """
+    # gather the raw version information
+    found_aot_cuda_ext = _check_aot_built_cuda_extension_installed()
+    cuda_version = _check_cuda_version()
+    torch_version, torch_cuda_version = _check_torch_version()
+    colossalai_verison, prebuilt_torch_version_required, prebuilt_cuda_version_required = _parse_colossalai_version()
+
+    # if cuda_version is None, that means either
+    # CUDA_HOME is not found, thus cannot compare the version compatibility
+    if not cuda_version:
+        sys_torch_cuda_compatibility = None
+    else:
+        sys_torch_cuda_compatibility = _is_compatible([cuda_version, torch_cuda_version])
+
+    # if cuda_version or cuda_version_required is None, that means either
+    # CUDA_HOME is not found or AOT compilation is not enabled
+    # thus, there is no need to compare the version compatibility at all
+    if not cuda_version or not prebuilt_cuda_version_required:
+        sys_colossalai_cuda_compatibility = None
+    else:
+        sys_colossalai_cuda_compatibility = _is_compatible([cuda_version, prebuilt_cuda_version_required])
+
+    # if torch_version_required is None, that means AOT compilation is not enabled
+    # thus there is no need to compare the versions
+    if prebuilt_torch_version_required is None:
+        torch_compatibility = None
+    else:
+        torch_compatibility = _is_compatible([torch_version, prebuilt_torch_version_required])
+
+    # section 1: versions found in the current environment
+    click.echo(f'#### Installation Report ####')
+    click.echo(f'\n------------ Environment ------------')
+    click.echo(f"Colossal-AI version: {to_click_output(colossalai_verison)}")
+    click.echo(f"PyTorch version: {to_click_output(torch_version)}")
+    click.echo(f"System CUDA version: {to_click_output(cuda_version)}")
+    click.echo(f"CUDA version required by PyTorch: {to_click_output(torch_cuda_version)}")
+    click.echo("")
+    click.echo(f"Note:")
+    click.echo(f"1. The table above checks the versions of the libraries/tools in the current environment")
+    click.echo(f"2. If the System CUDA version is N/A, you can set the CUDA_HOME environment variable to locate it")
+    click.echo(
+        f"3. If the CUDA version required by PyTorch is N/A, you probably did not install a CUDA-compatible PyTorch. This value is give by torch.version.cuda and you can go to https://pytorch.org/get-started/locally/ to download the correct version."
+    )
+
+    # section 2: versions the prebuilt (AOT) CUDA extensions were compiled against
+    click.echo(f'\n------------ CUDA Extensions AOT Compilation ------------')
+    click.echo(f"Found AOT CUDA Extension: {to_click_output(found_aot_cuda_ext)}")
+    click.echo(f"PyTorch version used for AOT compilation: {to_click_output(prebuilt_torch_version_required)}")
+    click.echo(f"CUDA version used for AOT compilation: {to_click_output(prebuilt_cuda_version_required)}")
+    click.echo("")
+    click.echo(f"Note:")
+    click.echo(
+        f"1. AOT (ahead-of-time) compilation of the CUDA kernels occurs during installation when the environment variable CUDA_EXT=1 is set"
+    )
+    click.echo(f"2. If AOT compilation is not enabled, stay calm as the CUDA kernels can still be built during runtime")
+
+    # section 3: results of the compatibility checks computed above
+    click.echo(f"\n------------ Compatibility ------------")
+    click.echo(f'PyTorch version match: {to_click_output(torch_compatibility)}')
+    click.echo(f"System and PyTorch CUDA version match: {to_click_output(sys_torch_cuda_compatibility)}")
+    click.echo(f"System and Colossal-AI CUDA version match: {to_click_output(sys_colossalai_cuda_compatibility)}")
+    click.echo(f"")
+    click.echo(f"Note:")
+    click.echo(f"1. The table above checks the version compatibility of the libraries/tools in the current environment")
+    # NOTE(review): the label below says "mismatch" while the table row above says "match"
+    click.echo(
+        f"   - PyTorch version mismatch: whether the PyTorch version in the current environment is compatible with the PyTorch version used for AOT compilation"
+    )
+    click.echo(
+        f"   - System and PyTorch CUDA version match: whether the CUDA version in the current environment is compatible with the CUDA version required by PyTorch"
+    )
+    click.echo(
+        f"   - System and Colossal-AI CUDA version match: whether the CUDA version in the current environment is compatible with the CUDA version used for AOT compilation"
+    )
+
+
+def _is_compatible(versions):
+    """
+    Compare the list of versions and return whether they are compatible.
+
+    Versions are compatible when their major and minor components are identical; the patch
+    component is ignored. A component that a version string does not specify is treated as
+    the unknown value ``'x'``.
+
+    Args:
+        versions (list): version strings such as ``'11.3'`` or ``'1.13.1'``; entries may be None.
+
+    Returns:
+        bool: True if all versions agree on major and minor, False otherwise or if any entry is None.
+    """
+    if None in versions:
+        return False
+
+    # split version into [major, minor, patch]
+    split_versions = [version.split('.') for version in versions]
+
+    # only the major (0) and minor (1) components decide compatibility
+    for idx in (0, 1):
+        # x means unknown
+        components = {parts[idx] if idx < len(parts) else 'x' for parts in split_versions}
+        if len(components) > 1:
+            return False
+
+    # always return an explicit bool instead of falling off the end with None
+    return True
+
+
+def _parse_colossalai_version():
+    """
+    Get the Colossal-AI version information.
+
+    Returns:
+        colossalai_version: Colossal-AI version.
+        torch_version_for_aot_build: PyTorch version used for AOT compilation of CUDA kernels.
+        cuda_version_for_aot_build: CUDA version used for AOT compilation of CUDA kernels.
+    """
+    # colossalai version can be in two formats
+    # 1. X.X.X+torchX.XXcuXX.X (when colossalai is installed with CUDA extensions)
+    # 2. X.X.X (when colossalai is not installed with CUDA extensions)
+    # where X represents an integer.
+    colossalai_verison = colossalai.__version__.split('+')[0]
+
+    try:
+        torch_version_for_aot_build = colossalai.__version__.split('torch')[1].split('cu')[0]
+        cuda_version_for_aot_build = colossalai.__version__.split('cu')[1]
+    except:
+        torch_version_for_aot_build = None
+        cuda_version_for_aot_build = None
+    return colossalai_verison, torch_version_for_aot_build, cuda_version_for_aot_build
+
+
+def _check_aot_built_cuda_extension_installed():
+    """
+    According to `op_builder/README.md`, the CUDA extension can be built with either
+    AOT (ahead-of-time) or JIT (just-in-time) compilation.
+    AOT compilation will build CUDA extensions to `colossalai._C` during installation.
+    JIT (just-in-time) compilation will build CUDA extensions to `~/.cache/colossalai/torch_extensions` during runtime.
+    """
+    try:
+        import colossalai._C.fused_optim
+        found_aot_cuda_ext = True
+    except ImportError:
+        found_aot_cuda_ext = False
+    return found_aot_cuda_ext
+
+
+def _check_torch_version():
+    """
+    Get the PyTorch version information.
+
+    Returns:
+        torch_version: PyTorch version.
+        torch_cuda_version: CUDA version required by PyTorch.
+    """
+    # get torch version
+    # torch version can be of two formats
+    # - 1.13.1+cu113
+    # - 1.13.1.devxxx
+    torch_version = torch.__version__.split('+')[0]
+    torch_version = '.'.join(torch_version.split('.')[:3])
+
+    # get cuda version in pytorch build
+    try:
+        torch_cuda_major = torch.version.cuda.split(".")[0]
+        torch_cuda_minor = torch.version.cuda.split(".")[1]
+        torch_cuda_version = f'{torch_cuda_major}.{torch_cuda_minor}'
+    except:
+        torch_cuda_version = None
+
+    return torch_version, torch_cuda_version
+
+
+def _check_cuda_version():
+    """
+    Get the CUDA version information.
+
+    Returns:
+        cuda_version: CUDA version found on the system.
+    """
+
+    # get cuda version
+    if CUDA_HOME is None:
+        cuda_version = CUDA_HOME
+    else:
+        try:
+            raw_output = subprocess.check_output([CUDA_HOME + "/bin/nvcc", "-V"], universal_newlines=True)
+            output = raw_output.split()
+            release_idx = output.index("release") + 1
+            release = output[release_idx].split(".")
+            bare_metal_major = release[0]
+            bare_metal_minor = release[1][0]
+            cuda_version = f'{bare_metal_major}.{bare_metal_minor}'
+        except:
+            cuda_version = None
+    return cuda_version
--- a/colossalai/cli/cli.py
+++ b/colossalai/cli/cli.py
+import click
+
+from .benchmark import benchmark
+from .check import check
+from .launcher import run
+
+
+class Arguments():
+
+    def __init__(self, arg_dict):
+        for k, v in arg_dict.items():
+            self.__dict__[k] = v
+
+
+# Root command group of the command line interface, it has no behavior of its own.
+# NOTE(review): documented with comments instead of a docstring because click may use
+# a docstring as the help text of the group - confirm before converting.
+@click.group()
+def cli():
+    pass
+
+
+# register the sub-commands on the root command group
+cli.add_command(run)
+cli.add_command(check)
+cli.add_command(benchmark)
+
+# allow this module to be executed directly as a script
+if __name__ == '__main__':
+    cli()
--- a/colossalai/cli/launcher/__init__.py
+++ b/colossalai/cli/launcher/__init__.py
+import click
+
+from colossalai.context import Config
+
+from .run import launch_multi_processes
+
+
+@click.command(help="Launch distributed training on a single node or multiple nodes",
+               context_settings=dict(ignore_unknown_options=True))
+@click.option("-H",
+              "-host",
+              "--host",
+              type=str,
+              default=None,
+              help="the list of hostnames to launch in the format <host1>,<host2>")
+@click.option(
+    "--hostfile",
+    type=str,
+    default=None,
+    help="Hostfile path that defines the device pool available to the job, each line in the file is a hostname")
+@click.option("--include",
+              type=str,
+              default=None,
+              help="Specify computing devices to use during execution. String format is <host1>,<host2>,"
+              " only effective when used with --hostfile.")
+@click.option(
+    "--exclude",
+    type=str,
+    default=None,
+    help=
+    "Specify computing devices to NOT use during execution. Mutually exclusive with --include. Formatting is the same as --includ,"
+    " only effective when used with --hostfile.")
+@click.option("--num_nodes",
+              type=int,
+              default=-1,
+              help="Total number of worker nodes to use, only effective when used with --hostfile.")
+@click.option("--nproc_per_node", type=int, default=None, help="Number of GPUs to use on each node.")
+@click.option("--master_port",
+              type=int,
+              default=29500,
+              help="(optional) Port used by PyTorch distributed for communication during distributed training.")
+@click.option("--master_addr",
+              type=str,
+              default="127.0.0.1",
+              help="(optional) IP address of node 0, will be inferred via 'hostname -I' if not specified.")
+@click.option(
+    "--extra_launch_args",
+    type=str,
+    default=None,
+    help=
+    "Set additional torch distributed launcher arguments such as --standalone. The format is --extra_launch_args arg1=1,arg2=2. "
+    "This will be converted to --arg1=1 --arg2=2 during execution")
+@click.option("--ssh-port", type=int, default=None, help="(optional) the port used for ssh connection")
+@click.argument("user_script", type=str)
+@click.argument('user_args', nargs=-1)
+def run(host: str, hostfile: str, num_nodes: int, nproc_per_node: int, include: str, exclude: str, master_addr: str,
+        master_port: int, extra_launch_args: str, ssh_port: int, user_script: str, user_args: str) -> None:
+    """
+    To launch multiple processes on a single node or multiple nodes via command line.
+
+    Usage::
+        # run with 4 GPUs on the current node use default port 29500
+        colossalai run --nprocs_per_node 4 train.py
+
+        # run with 2 GPUs on the current node at port 29550
+        colossalai run --nprocs_per_node 4 --master_port 29550 train.py
+
+        # run on two nodes
+        colossalai run --host <host1>,<host2> --master_addr host1  --nprocs_per_node 4 train.py
+
+        # run with hostfile
+        colossalai run --hostfile <file_path> --master_addr <host>  --nprocs_per_node 4 train.py
+
+        # run with hostfile with only included hosts
+        colossalai run --hostfile <file_path> --master_addr host1 --include host1,host2  --nprocs_per_node 4 train.py
+
+        # run with hostfile excluding the hosts selected
+        colossalai run --hostfile <file_path> --master_addr host1 --exclude host2  --nprocs_per_node 4 train.py
+    """
+    if not user_script.endswith('.py'):
+        click.echo(f'Error: invalid Python file {user_script}. Did you use a wrong option? Try colossalai run --help')
+        exit()
+
+    args_dict = locals()
+    args = Config(args_dict)
+    args.user_args = list(args.user_args)
+    launch_multi_processes(args)
--- a/colossalai/cli/launcher/hostinfo.py
+++ b/colossalai/cli/launcher/hostinfo.py
+import socket
+from typing import List
+
+
+class HostInfo:
+    """
+    A data class to store host connection-related data.
+
+    Args:
+        hostname (str): name or IP address of the host
+        port (str): the port for ssh connection
+    """
+
+    def __init__(
+        self,
+        hostname: str,
+        port: str = None,
+    ):
+        self.hostname = hostname
+        self.port = port
+        self.is_local_host = HostInfo.is_host_localhost(hostname, port)
+
+    @staticmethod
+    def is_host_localhost(hostname: str, port: str = None) -> None:
+        """
+        Check if the host refers to the local machine.
+
+        Args:
+            hostname (str): name or IP address of the host
+            port (str): the port for ssh connection
+
+        Returns:
+            bool: True if it is local, False otherwise
+        """
+
+        if port is None:
+            port = 22    # no port specified, lets just use the ssh port
+
+        # socket.getfqdn("127.0.0.1") does not return localhost
+        # on some users' machines
+        # thus, we directly return True if hostname is locahost, 127.0.0.1 or 0.0.0.0
+        if hostname in ("localhost", "127.0.0.1", "0.0.0.0"):
+            return True
+
+        hostname = socket.getfqdn(hostname)
+        localhost = socket.gethostname()
+        localaddrs = socket.getaddrinfo(localhost, port)
+        targetaddrs = socket.getaddrinfo(hostname, port)
+        for (family, socktype, proto, canonname, sockaddr) in localaddrs:
+            for (rfamily, rsocktype, rproto, rcanonname, rsockaddr) in targetaddrs:
+                if rsockaddr[0] == sockaddr[0]:
+                    return True
+        return False
+
+    def __str__(self):
+        return f'hostname: {self.hostname}, port: {self.port}'
+
+    def __repr__(self):
+        return self.__str__()
+
+
+class HostInfoList:
+    """
+    A data class to store a list of HostInfo objects.
+    """
+
+    def __init__(self):
+        self.hostinfo_list = []
+
+    def append(self, hostinfo: HostInfo) -> None:
+        """
+        Add an HostInfo object to the list.
+
+        Args:
+            hostinfo (HostInfo): host information
+        """
+
+        self.hostinfo_list.append(hostinfo)
+
+    def remove(self, hostname: str) -> None:
+        """
+        Add an HostInfo object to the list.
+
+        Args:
+            hostname (str): the name of the host
+        """
+
+        hostinfo = self.get_hostinfo(hostname)
+        self.hostinfo_list.remove(hostinfo)
+
+    def get_hostinfo(self, hostname: str) -> HostInfo:
+        """
+        Return the HostInfo object which matches with the hostname.
+
+        Args:
+            hostname (str): the name of the host
+
+        Returns:
+            hostinfo (HostInfo): the HostInfo object which matches with the hostname
+        """
+
+        for hostinfo in self.hostinfo_list:
+            if hostinfo.hostname == hostname:
+                return hostinfo
+
+        raise Exception(f"Hostname {hostname} is not found")
+
+    def has(self, hostname: str) -> bool:
+        """
+        Check if the hostname has been added.
+
+        Args:
+            hostname (str): the name of the host
+
+        Returns:
+            bool: True if added, False otherwise
+        """
+        for hostinfo in self.hostinfo_list:
+            if hostinfo.hostname == hostname:
+                return True
+        return False
+
+    def __iter__(self):
+        return iter(self.hostinfo_list)
+
+    def __len__(self):
+        return len(self.hostinfo_list)
--- a/colossalai/cli/launcher/multinode_runner.py
+++ b/colossalai/cli/launcher/multinode_runner.py
+from multiprocessing import Pipe, Process
+from multiprocessing import connection as mp_connection
+
+import click
+import fabric
+
+from .hostinfo import HostInfo, HostInfoList
+
+
+def run_on_host(hostinfo: HostInfo, workdir: str, recv_conn: mp_connection.Connection,
+                send_conn: mp_connection.Connection, env: dict) -> None:
+    """
+    Use fabric connection to execute command on local or remote hosts.
+
+    Args:
+        hostinfo (HostInfo): host information
+        workdir (str): the directory to execute the command
+        recv_conn (multiprocessing.connection.Connection): receive messages from the master sender
+        send_conn (multiprocessing.connection.Connection): send messages to the master receiver
+        env (dict): a dictionary for environment variables
+    """
+
+    fab_conn = fabric.Connection(hostinfo.hostname, port=hostinfo.port)
+    finish = False
+    env_msg = ' '.join([f'{k}=\"{v}\"' for k, v in env.items()])
+
+    # keep listening until exit
+    while not finish:
+        # receive cmd
+        cmds = recv_conn.recv()
+
+        if cmds == 'exit':
+            # exit from the loop
+            finish = True
+            break
+        else:
+            # execute the commands
+            try:
+                # cd to execute directory
+                with fab_conn.cd(workdir):
+                    # propagate the runtime environment
+                    with fab_conn.prefix(f"export {env_msg}"):
+                        if hostinfo.is_local_host:
+                            # execute on the local machine
+                            fab_conn.local(cmds, hide=False)
+                        else:
+                            # execute on the remote machine
+                            fab_conn.run(cmds, hide=False)
+                    send_conn.send('success')
+            except Exception as e:
+                click.echo(
+                    f"Error: failed to run {cmds} on {hostinfo.hostname}, is localhost: {hostinfo.is_local_host}, exception: {e}"
+                )
+                send_conn.send('failure')
+
+    # shutdown
+    send_conn.send("finish")
+    fab_conn.close()
+
+
+class MultiNodeRunner:
+    """
+    A runner to execute commands on an array of machines. This runner
+    is inspired by Nezha (https://github.com/zhuzilin/NeZha).
+    """
+
+    def __init__(self):
+        self.processes = {}
+        self.master_send_conns = {}
+        self.master_recv_conns = {}
+
+    def connect(self, host_info_list: HostInfoList, workdir: str, env: dict) -> None:
+        """
+        Establish connections to a list of hosts
+
+        Args:
+            host_info_list (HostInfoList): a list of HostInfo objects
+            workdir (str): the directory where command is executed
+            env (dict): environment variables to propagate to hosts
+        """
+        for hostinfo in host_info_list:
+            master_send_conn, worker_recv_conn = Pipe()
+            master_recv_conn, worker_send_conn = Pipe()
+            p = Process(target=run_on_host, args=(hostinfo, workdir, worker_recv_conn, worker_send_conn, env))
+            p.start()
+            self.processes[hostinfo.hostname] = p
+            self.master_recv_conns[hostinfo.hostname] = master_recv_conn
+            self.master_send_conns[hostinfo.hostname] = master_send_conn
+
+    def send(self, hostinfo: HostInfo, cmd: str) -> None:
+        """
+        Send a command to a local/remote host.
+
+        Args:
+            hostinfo (HostInfo): host information
+            cmd (str): the command to execute
+        """
+
+        assert hostinfo.hostname in self.master_send_conns, \
+            f'{hostinfo} is not found in the current connections'
+        conn = self.master_send_conns[hostinfo.hostname]
+        conn.send(cmd)
+
+    def stop_all(self) -> None:
+        """
+        Stop connections to all hosts.
+        """
+
+        for hostname, conn in self.master_send_conns.items():
+            conn.send('exit')
+
+    def recv_from_all(self) -> dict:
+        """
+        Receive messages from all hosts
+
+        Returns:
+            msg_from_node (dict): a dictionry which contains messages from each node
+        """
+
+        msg_from_node = dict()
+        for hostname, conn in self.master_recv_conns.items():
+            msg_from_node[hostname] = conn.recv()
+        return msg_from_node
--- a/colossalai/cli/launcher/run.py
+++ b/colossalai/cli/launcher/run.py
+import os
+import sys
+from typing import List
+
+import click
+import torch
+from packaging import version
+
+from colossalai.context import Config
+
+from .hostinfo import HostInfo, HostInfoList
+from .multinode_runner import MultiNodeRunner
+
+# Constants that define our syntax
+# separator between the host names of a host list string
+NODE_SEP = ','
+
+
+def fetch_hostfile(hostfile_path: str, ssh_port: int) -> HostInfoList:
+    """
+    Parse the hostfile to obtain a list of hosts.
+
+    A hostfile should look like:
+    worker-0
+    worker-1
+    worker-2
+    ...
+
+    Args:
+        hostfile_path (str): the path to the hostfile
+        ssh_port (int): the port to connect to the host
+    """
+
+    if not os.path.isfile(hostfile_path):
+        click.echo(f"Error: Unable to find the hostfile, no such file: {hostfile_path}")
+        exit()
+
+    with open(hostfile_path, 'r') as fd:
+        device_pool = HostInfoList()
+
+        for line in fd.readlines():
+            line = line.strip()
+            if line == '':
+                # skip empty lines
+                continue
+
+            # build the HostInfo object
+            hostname = line.strip()
+            hostinfo = HostInfo(hostname=hostname, port=ssh_port)
+
+            if device_pool.has(hostname):
+                click.echo(f"Error: found duplicate host {hostname} in the hostfile")
+                exit()
+
+            device_pool.append(hostinfo)
+    return device_pool
+
+
+def parse_device_filter(device_pool: HostInfoList, include_str=None, exclude_str=None) -> HostInfoList:
+    '''Parse an inclusion or exclusion string and filter a hostfile dictionary.
+
+    Examples:
+        include_str="worker-0,worker-1" will execute jobs only on worker-0 and worker-1.
+        exclude_str="worker-1" will use all available devices except worker-1.
+
+    Args:
+        device_pool (HostInfoList): a list of HostInfo objects
+        include_str (str): --include option passed by user, default None
+        exclude_str (str): --exclude option passed by user, default None
+
+    Returns:
+        filtered_hosts (HostInfoList): filtered hosts after inclusion/exclusion
+    '''
+
+    # Ensure include/exclude are mutually exclusive
+    if include_str and exclude_str:
+        click.echo("--include and --exclude are mutually exclusive, only one can be used")
+        exit()
+
+    # no-op
+    if include_str is None and exclude_str is None:
+        return device_pool
+
+    # Either build from scratch or remove items
+    if include_str:
+        parse_str = include_str
+        filtered_hosts = HostInfoList()
+    elif exclude_str:
+        parse_str = exclude_str
+        filtered_hosts = device_pool
+
+    # foreach node in the list
+    for node_config in parse_str.split(NODE_SEP):
+        hostname = node_config
+        hostinfo = device_pool.get_hostinfo(hostname)
+        # sanity check hostname
+        if not device_pool.has(hostname):
+            click.echo(f"Error: Hostname '{hostname}' not found in hostfile")
+            exit()
+
+        if include_str:
+            filtered_hosts.append(hostinfo)
+        elif exclude_str:
+            filtered_hosts.remove(hostname)
+
+    return filtered_hosts
+
+
+def get_launch_command(
+    master_addr: str,
+    master_port: int,
+    nproc_per_node: int,
+    user_script: str,
+    user_args: List[str],
+    node_rank: int,
+    num_nodes: int,
+    extra_launch_args: str = None,
+) -> str:
+    """
+    Generate a command for distributed training.
+
+    Args:
+        master_addr (str): the host of the master node
+        master_port (str): the port of the master node
+        nproc_per_node (str): the number of processes to launch on each node
+        user_script (str): the user Python file
+        user_args (str): the arguments for the user script
+        node_rank (int): the unique ID for the node
+        num_nodes (int): the number of nodes to execute jobs
+
+    Returns:
+        cmd (str): the command the start distributed training
+    """
+
+    def _arg_dict_to_list(arg_dict):
+        ret = []
+
+        for k, v in arg_dict.items():
+            if v:
+                ret.append(f'--{k}={v}')
+            else:
+                ret.append(f'--{k}')
+        return ret
+
+    if extra_launch_args:
+        extra_launch_args_dict = dict()
+        for arg in extra_launch_args.split(','):
+            if '=' in arg:
+                k, v = arg.split('=')
+                extra_launch_args_dict[k] = v
+            else:
+                extra_launch_args_dict[arg] = None
+        extra_launch_args = extra_launch_args_dict
+    else:
+        extra_launch_args = dict()
+
+    torch_version = version.parse(torch.__version__)
+    assert torch_version.major == 1
+
+    if torch_version.minor < 9:
+        cmd = [
+            sys.executable, "-m", "torch.distributed.launch", f"--nproc_per_node={nproc_per_node}",
+            f"--master_addr={master_addr}", f"--master_port={master_port}", f"--nnodes={num_nodes}",
+            f"--node_rank={node_rank}"
+        ]
+    else:
+        # extra launch args for torch distributed launcher with torch >= 1.9
+        default_torchrun_rdzv_args = dict(rdzv_backend="c10d",
+                                          rdzv_endpoint=f"{master_addr}:{master_port}",
+                                          rdzv_id="colossalai-default-job")
+
+        # update rdzv arguments
+        for key in default_torchrun_rdzv_args.keys():
+            if key in extra_launch_args:
+                value = extra_launch_args.pop(key)
+                default_torchrun_rdzv_args[key] = value
+
+        if torch_version.minor < 10:
+            cmd = [
+                sys.executable, "-m", "torch.distributed.run", f"--nproc_per_node={nproc_per_node}",
+                f"--nnodes={num_nodes}", f"--node_rank={node_rank}"
+            ]
+        else:
+            cmd = [
+                "torchrun", f"--nproc_per_node={nproc_per_node}", f"--nnodes={num_nodes}", f"--node_rank={node_rank}"
+            ]
+        cmd += _arg_dict_to_list(default_torchrun_rdzv_args)
+
+    cmd += _arg_dict_to_list(extra_launch_args) + [user_script] + user_args
+    cmd = ' '.join(cmd)
+    return cmd
+
+
+def launch_multi_processes(args: Config) -> None:
+    """
+    Launch multiple processes on a single node or multiple nodes.
+
+    The overall logic can be summarized as the pseudo code below:
+
+        if hostfile given:
+            hostinfo = parse_hostfile(hostfile)
+            hostinfo = include_or_exclude_hosts(hostinfo)
+            launch_on_multi_nodes(hostinfo)
+        elif hosts given:
+            hostinfo = parse_hosts(hosts)
+            launch_on_multi_nodes(hostinfo)
+        else:
+            launch_on_current_node()
+
+    Args:
+        args (Config): the arguments taken from command line
+
+    """
+    assert isinstance(args, Config)
+
+    if args.nproc_per_node is None:
+        click.echo("--nproc_per_node did not receive any value")
+        exit()
+
+    # cannot accept hosts and hostfile at the same time
+    if args.host and args.hostfile:
+        click.echo("Error: hostfile and hosts are mutually exclusive, only one is required")
+
+    # check if hostfile is given
+    if args.hostfile:
+        device_pool = fetch_hostfile(args.hostfile, ssh_port=args.ssh_port)
+        active_device_pool = parse_device_filter(device_pool, args.include, args.exclude)
+
+        if args.num_nodes > 0:
+            # only keep the first num_nodes to execute jobs
+            updated_active_device_pool = HostInfoList()
+            for count, hostinfo in enumerate(active_device_pool):
+                if args.num_nodes == count:
+                    break
+                updated_active_device_pool.append(hostinfo)
+            active_device_pool = updated_active_device_pool
+    else:
+        active_device_pool = None
+
+    env = os.environ.copy()
+
+    # use hosts if hostfile is not given
+    if args.host and active_device_pool is None:
+        active_device_pool = HostInfoList()
+        host_list = args.host.strip().split(NODE_SEP)
+        for hostname in host_list:
+            hostinfo = HostInfo(hostname=hostname, port=args.ssh_port)
+            active_device_pool.append(hostinfo)
+
+    if not active_device_pool:
+        # run on local node if not hosts or hostfile is given
+        # add local node to host info list
+        active_device_pool = HostInfoList()
+        localhost_info = HostInfo(hostname='127.0.0.1', port=args.ssh_port)
+        active_device_pool.append(localhost_info)
+
+    # launch distributed processes
+    runner = MultiNodeRunner()
+    curr_path = os.path.abspath('.')
+
+    # collect current path env
+    env = dict()
+    for k, v in os.environ.items():
+        # do not support multi-line env var
+        if v and '\n' not in v:
+            env[k] = v
+
+    # establish remote connection
+    runner.connect(host_info_list=active_device_pool, workdir=curr_path, env=env)
+
+    # execute distributed launching command
+    for node_id, hostinfo in enumerate(active_device_pool):
+        cmd = get_launch_command(master_addr=args.master_addr,
+                                 master_port=args.master_port,
+                                 nproc_per_node=args.nproc_per_node,
+                                 user_script=args.user_script,
+                                 user_args=args.user_args,
+                                 node_rank=node_id,
+                                 num_nodes=len(active_device_pool),
+                                 extra_launch_args=args.extra_launch_args)
+        runner.send(hostinfo=hostinfo, cmd=cmd)
+
+    # start training
+    msg_from_node = runner.recv_from_all()
+    has_error = False
+
+    # print node status
+    click.echo("\n====== Training on All Nodes =====")
+    for hostname, msg in msg_from_node.items():
+        click.echo(f"{hostname}: {msg}")
+
+        # check if a process failed
+        if msg == "failure":
+            has_error = True
+
+    # stop all nodes
+    runner.stop_all()
+
+    # receive the stop status
+    msg_from_node = runner.recv_from_all()
+
+    # printe node status
+    click.echo("\n====== Stopping All Nodes =====")
+    for hostname, msg in msg_from_node.items():
+        click.echo(f"{hostname}: {msg}")
+
+    # give the process an exit code
+    # so that it behaves like a normal process
+    if has_error:
+        sys.exit(1)
+    else:
+        sys.exit(0)
--- a/colossalai/cluster/__init__.py
+++ b/colossalai/cluster/__init__.py
+from .device_mesh_manager import DeviceMeshManager
+from .dist_coordinator import DistCoordinator
+from .process_group_manager import ProcessGroupManager
+
+# public names of this package, all of them are imported above
+__all__ = ['DistCoordinator', 'ProcessGroupManager', 'DeviceMeshManager']
--- a/colossalai/cluster/device_mesh_manager.py
+++ b/colossalai/cluster/device_mesh_manager.py
+from dataclasses import dataclass
+from typing import Dict, List, Tuple, Union
+
+import torch
+import torch.distributed as dist
+
+from colossalai.device.alpha_beta_profiler import AlphaBetaProfiler
+from colossalai.device.device_mesh import DeviceMesh
+
+
+@dataclass
+class DeviceMeshInfo:
+    '''
+    This class is used to store the information used to initialize the device mesh.
+
+    Args:
+        physical_ids (List[int]): The physical ids of the current booster. For example, if we have the last 4 GPUs on a 8-devices cluster, then the physical ids should be [4, 5, 6, 7].
+        mesh_shapes (List[Union[torch.Size, List[int], Tuple[int]]]): The shape of the mesh. For example, if we have 4 GPUs and we want to use 2D mesh with mesh shape [2, 2], then the mesh shape should be [2, 2].
+    '''
+    physical_ids: List[int]
+    mesh_shape: Union[torch.Size, List[int], Tuple[int]] = None
+
+    def __post_init__(self):
+        if self.mesh_shape is not None:
+            world_size = len(self.physical_ids)
+            mesh_shape_numel = torch.Size(self.mesh_shape).numel()
+            assert world_size == mesh_shape_numel, f'the numel of mesh_shape should be equal to world size, but got {world_size} != {mesh_shape_numel}'
+
+
+def initialize_device_mesh(device_mesh_info: DeviceMeshInfo):
+    '''
+    This method is used to initialize the device mesh.
+
+    Args:
+        device_mesh_info (DeviceMeshInfo): The information used to initialize device mesh.
+    '''
+    # parse the device mesh info
+    physical_devices = device_mesh_info.physical_ids
+    physical_mesh = torch.tensor(physical_devices)
+    logical_mesh_shape = device_mesh_info.mesh_shape
+
+    if logical_mesh_shape is None:
+        ab_profiler = AlphaBetaProfiler(physical_devices)
+        # search for the best logical mesh shape
+        logical_mesh_id = ab_profiler.search_best_logical_mesh()
+        logical_mesh_id = torch.Tensor(logical_mesh_id).to(torch.int)
+
+    else:
+        logical_mesh_id = physical_mesh.reshape(logical_mesh_shape)
+
+    device_mesh = DeviceMesh(physical_mesh_id=physical_mesh, logical_mesh_id=logical_mesh_id, init_process_group=True)
+    return device_mesh
+
+
+class DeviceMeshManager:
+    """
+    Device mesh manager is responsible for creating and managing device meshes.
+    """
+
+    def __init__(self):
+        self.device_mesh_store: Dict[str, DeviceMesh] = dict()
+
+    def create_device_mesh(self, name, device_mesh_info: DeviceMeshInfo) -> DeviceMesh:
+        """
+        Create a device mesh and store it in the manager.
+
+        Args:
+            name (str): name of the device mesh
+            device_mesh_info (DeviceMeshInfo): the information used to initialize the device mesh
+       """
+        if name not in self.device_mesh_store:
+            device_mesh = initialize_device_mesh(device_mesh_info)
+            self.device_mesh_store[name] = device_mesh
+            return device_mesh
+        else:
+            raise ValueError(f'Device mesh {name} already exists.')
+
+    def get(self, name: str) -> DeviceMesh:
+        """
+        Get a device mesh by name.
+
+        Args:
+            name (str): name of the device mesh
+
+        Returns:
+            DeviceMesh: the device mesh
+        """
+        if name in self.device_mesh_store:
+            return self.device_mesh_store[name]
+        else:
+            raise ValueError(f'Device mesh {name} does not exist.')
+
+    def destroy(self, name: str) -> None:
+        """
+        Destroy a device mesh by name.
+
+        Args:
+            name (str): name of the device mesh
+        """
+        if name in self.device_mesh_store:
+            for pgs in self.device_mesh_store[name].process_groups_dict.values():
+                for pg in pgs:
+                    dist.destroy_process_group(pg)
+            del self.device_mesh_store[name]
+        else:
+            raise ValueError(f'Device mesh {name} does not exist.')
+
+    def destroy_all(self):
+        """
+        Destroy all device meshes.
+        """
+        for name in self.device_mesh_store:
+            for pgs in self.device_mesh_store[name].process_groups_dict.values():
+                for pg in pgs:
+                    dist.destroy_process_group(pg)
+
+        self.device_mesh_store.clear()
--- a/colossalai/cluster/dist_coordinator.py
+++ b/colossalai/cluster/dist_coordinator.py
+import functools
+import os
+from contextlib import contextmanager
+
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+from colossalai.context.singleton_meta import SingletonMeta
+
+
+class DistCoordinator(metaclass=SingletonMeta):
+    """
+    This class is used to coordinate distributed training. It is a singleton class, which means that there is only one instance of this
+    class in the whole program.
+
+    There are some terms that are used in this class:
+        - rank: the rank of the current process
+        - world size: the total number of processes
+        - local rank: the rank of the current process on the current node
+        - master: the process with rank 0
+        - node master: the process with local rank 0 on the current node
+
+    Example:
+        >>> from colossalai.cluster.dist_coordinator import DistCoordinator
+        >>> coordinator = DistCoordinator()
+        >>>
+        >>> if coordinator.is_master():
+        >>>     do_something()
+        >>>
+        >>> coordinator.print_on_master('hello world')
+
+    Attributes:
+        rank (int): the rank of the current process
+        world_size (int): the total number of processes
+        local_rank (int): the rank of the current process on the current node, -1 if ``LOCAL_RANK`` is not set
+    """
+
+    def __init__(self):
+        assert dist.is_initialized(
+        ), 'Distributed is not initialized. Please call `torch.distributed.init_process_group` or `colossalai.launch` first.'
+        self._rank = dist.get_rank()
+        self._world_size = dist.get_world_size()
+        # LOCAL_RANK is often passed by launchers such as torchrun. Environment values are
+        # always strings, so convert to int: otherwise `local_rank >= 0` raises TypeError
+        # and `local_rank == 0` can never be true. -1 marks "not set".
+        self._local_rank = int(os.environ.get('LOCAL_RANK', -1))
+
+    @property
+    def rank(self) -> int:
+        """Rank of the current process in the default process group, captured at construction."""
+        return self._rank
+
+    @property
+    def world_size(self) -> int:
+        """Size of the default process group, captured at construction."""
+        return self._world_size
+
+    @property
+    def local_rank(self) -> int:
+        """Value of the ``LOCAL_RANK`` environment variable as an int, or -1 if it was not set."""
+        return self._local_rank
+
+    def _assert_local_rank_set(self):
+        """
+        Assert that the local rank is set. This is often passed by launchers such as torchrun.
+        """
+        assert self.local_rank >= 0, 'The environment variable LOCAL_RANK is not set, thus the coordinator is not aware of the local rank of the current process.'
+
+    def is_master(self, process_group: ProcessGroup = None) -> bool:
+        """
+        Check if the current process is the master process (rank is 0). It can accept a sub process group to check the rank 0 with respect to the process.
+
+        Args:
+            process_group (ProcessGroup, optional): process group to use for the rank 0 check. Defaults to None, which refers to the default process group.
+
+        Returns:
+            bool: True if the current process is the master process, False otherwise
+        """
+        rank = dist.get_rank(group=process_group)
+        return rank == 0
+
+    def is_node_master(self) -> bool:
+        """
+        Check if the current process is the master process on the current node (local rank is 0).
+
+        Returns:
+            bool: True if the current process is the master process on the current node, False otherwise
+
+        Raises:
+            AssertionError: if ``LOCAL_RANK`` was not set when the coordinator was created
+        """
+        self._assert_local_rank_set()
+        return self.local_rank == 0
+
+    def is_last_process(self, process_group: ProcessGroup = None) -> bool:
+        """
+        Check if the current process is the last process (rank is world size - 1). It can accept a sub process group to check the last rank with respect to the process.
+
+        Args:
+            process_group (ProcessGroup, optional): process group to use for the last rank check. Defaults to None, which refers to the default process group.
+
+        Returns:
+            bool: True if the current process is the last process, False otherwise
+        """
+        rank = dist.get_rank(group=process_group)
+        world_size = dist.get_world_size(group=process_group)
+        return rank == world_size - 1
+
+    def print_on_master(self, msg: str, process_group: ProcessGroup = None):
+        """
+        Print message only from rank 0.
+
+        Args:
+            msg (str): message to print
+            process_group (ProcessGroup, optional): process group to use for the rank 0 check. Defaults to None, which refers to the default process group.
+        """
+        rank = dist.get_rank(group=process_group)
+        if rank == 0:
+            print(msg)
+
+    def print_on_node_master(self, msg: str):
+        """
+        Print message only from local rank 0. Local rank 0 refers to the 0th process running the current node.
+
+        Args:
+            msg (str): message to print
+
+        Raises:
+            AssertionError: if ``LOCAL_RANK`` was not set when the coordinator was created
+        """
+        self._assert_local_rank_set()
+        if self.local_rank == 0:
+            print(msg)
+
+    @contextmanager
+    def priority_execution(self, executor_rank: int = 0, process_group: ProcessGroup = None):
+        """
+        This context manager is used to allow one process to execute while blocking all
+        other processes in the same process group. This is often useful when downloading is required
+        as we only want to download in one process to prevent file corruption.
+
+        Example:
+            >>> from colossalai.cluster import DistCoordinator
+            >>> dist_coordinator = DistCoordinator()
+            >>> with dist_coordinator.priority_execution():
+            >>>     dataset = CIFAR10(root='./data', download=True)
+
+        Args:
+            executor_rank (int): the process rank to execute without blocking, all other processes will be blocked
+            process_group (ProcessGroup, optional): process group to use for the executor rank check. Defaults to None, which refers to the default process group.
+        """
+        rank = dist.get_rank(group=process_group)
+        should_block = rank != executor_rank
+
+        # non-executor ranks wait at the barrier before running the body ...
+        if should_block:
+            self.block_all(process_group)
+
+        yield
+
+        # ... and the executor joins the same barrier once its body has finished
+        if not should_block:
+            self.block_all(process_group)
+
+    def destroy(self, process_group: ProcessGroup = None):
+        """
+        Destroy the distributed process group.
+
+        Args:
+            process_group (ProcessGroup, optional): process group to destroy. Defaults to None, which refers to the default process group.
+        """
+        dist.destroy_process_group(process_group)
+
+    def block_all(self, process_group: ProcessGroup = None):
+        """
+        Block all processes in the process group.
+
+        Args:
+            process_group (ProcessGroup, optional): process group to block. Defaults to None, which refers to the default process group.
+        """
+        dist.barrier(group=process_group)
+
+    def on_master_only(self, process_group: ProcessGroup = None):
+        """
+        A function wrapper that only executes the wrapped function on the master process (rank 0).
+        On every other process the wrapped function is skipped and the wrapper returns None.
+
+        Example:
+            >>> from colossalai.cluster import DistCoordinator
+            >>> dist_coordinator = DistCoordinator()
+            >>>
+            >>> @dist_coordinator.on_master_only()
+            >>> def print_on_master(msg):
+            >>>     print(msg)
+
+        Args:
+            process_group (ProcessGroup, optional): process group to use for the rank 0 check. Defaults to None, which refers to the default process group.
+
+        Returns:
+            Callable: a decorator to apply to the target function
+        """
+        # the master check is evaluated once, when the decorator is created
+        is_master = self.is_master(process_group)
+
+        # define an inner function
+        def decorator(func):
+
+            @functools.wraps(func)
+            def wrapper(*args, **kwargs):
+                if is_master:
+                    return func(*args, **kwargs)
+
+            return wrapper
+
+        return decorator
--- a/colossalai/cluster/process_group_manager.py
+++ b/colossalai/cluster/process_group_manager.py
+from typing import List
+
+import torch.distributed as dist
+from torch.distributed import ProcessGroup
+
+
+class ProcessGroupManager:
+    """
+    Registry of named ``torch.distributed`` process groups.
+
+    Terms used in this class:
+        - pg: the short name for process group
+        - pg_name: the name of the process group
+        - pg_size: the world size of the process group
+        - rank: the rank of the current process in the process group
+        - world_size: the total number of processes in the process group
+    """
+
+    def __init__(self):
+        # maps process group name -> ProcessGroup
+        self.pg_store = {}
+
+    def create_process_group(self, name: str, ranks: List[int], backend: str = 'nccl') -> ProcessGroup:
+        """
+        Create a new process group and register it under ``name``.
+
+        Args:
+            name (str): name of the process group
+            ranks (List[int]): ranks of the process group
+            backend (str, optional): backend of the process group. Defaults to 'nccl'.
+
+        Returns:
+            ProcessGroup: the newly created process group
+
+        Raises:
+            ValueError: if a process group named ``name`` is already registered
+        """
+        if name in self.pg_store:
+            raise ValueError(f'Process group {name} already exists.')
+
+        group = dist.new_group(ranks=ranks, backend=backend)
+        self.pg_store[name] = group
+        return group
+
+    def get(self, name: str) -> ProcessGroup:
+        """
+        Look up a registered process group.
+
+        Args:
+            name (str): name of the process group
+
+        Returns:
+            ProcessGroup: the process group
+
+        Raises:
+            ValueError: if no process group named ``name`` is registered
+        """
+        if name not in self.pg_store:
+            raise ValueError(f'Process group {name} does not exist.')
+        return self.pg_store[name]
+
+    def destroy(self, name: str) -> None:
+        """
+        Destroy the process group registered under ``name`` and forget it.
+
+        Args:
+            name (str): name of the process group
+
+        Raises:
+            ValueError: if no process group named ``name`` is registered
+        """
+        if name not in self.pg_store:
+            raise ValueError(f'Process group {name} does not exist.')
+
+        dist.destroy_process_group(self.pg_store[name])
+        del self.pg_store[name]
+
+    def destroy_all(self) -> None:
+        """
+        Destroy every registered process group and empty the registry.
+        """
+        for group in self.pg_store.values():
+            dist.destroy_process_group(group)
+        self.pg_store.clear()
--- a/colossalai/communication/__init__.py
+++ b/colossalai/communication/__init__.py
+from .collective import all_gather, reduce_scatter, all_reduce, broadcast, reduce
+from .p2p import (send_forward, send_forward_recv_forward, send_backward_recv_forward, send_backward,
+                  send_backward_recv_backward, send_forward_recv_backward, send_forward_backward_recv_forward_backward,
+                  recv_forward, recv_backward)
+from .ring import ring_forward
+from .utils import send_obj_meta, recv_obj_meta
+
+# Names re-exported from the .collective, .p2p, .ring and .utils submodules.
+__all__ = [
+    'all_gather',
+    'reduce_scatter',
+    'all_reduce',
+    'broadcast',
+    'reduce',
+    'send_forward',
+    'send_forward_recv_forward',
+    'send_forward_backward_recv_forward_backward',
+    'send_backward',
+    'send_backward_recv_backward',
+    'send_backward_recv_forward',
+    'send_forward_recv_backward',
+    'recv_backward',
+    'recv_forward',
+    'ring_forward',
+    'send_obj_meta',
+    'recv_obj_meta',
+]
--- a/colossalai/communication/collective.py
+++ b/colossalai/communication/collective.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import torch
+import torch.distributed as dist
+from torch import Tensor
+from torch.distributed import ReduceOp
+
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+
+# Prefer the public ``all_gather_into_tensor`` / ``reduce_scatter_tensor`` entry points when
+# ``torch.distributed`` provides them; otherwise fall back to the private ``_*_base`` variants.
+if "all_gather_into_tensor" in dir(dist):
+    _all_gather_func = dist.all_gather_into_tensor
+else:
+    _all_gather_func = dist._all_gather_base
+
+if "reduce_scatter_tensor" in dir(dist):
+    _reduce_scatter_func = dist.reduce_scatter_tensor
+else:
+    _reduce_scatter_func = dist._reduce_scatter_base
+
+
+def all_gather(tensor: Tensor, dim: int, parallel_mode: ParallelMode, async_op: bool = False) -> Tensor:
+    r"""Gathers the tensors of every rank in the parallel group and concatenates them along ``dim``.
+
+    When the group has a single member the input tensor is returned untouched and no
+    communication happens.
+
+    Note:
+        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+
+    Args:
+        tensor (:class:`torch.Tensor`): Tensor to be gathered.
+        dim (int): The dimension to concatenate along.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
+        async_op (bool, optional): Whether operations are asynchronous.
+
+    Returns:
+        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of all-gather only,
+        if async_op is set to False. A tuple of output of all-gather and Async work handle, if async_op is set to True.
+    """
+    world_size = gpc.get_world_size(parallel_mode)
+    if world_size == 1:
+        gathered, handle = tensor, None
+    else:
+        # the gather works on dim 0, so move ``dim`` to the front first
+        needs_transpose = dim != 0
+        send_buf = tensor.transpose(0, dim).contiguous() if needs_transpose else tensor.contiguous()
+        recv_shape = (send_buf.shape[0] * world_size,) + send_buf.shape[1:]
+        recv_buf = torch.empty(recv_shape, dtype=tensor.dtype, device=tensor.device)
+        if tensor.device.type == "cpu":
+            group = gpc.get_cpu_group(parallel_mode)
+        else:
+            group = gpc.get_group(parallel_mode)
+        handle = _all_gather_func(recv_buf, send_buf, group=group, async_op=async_op)
+        # undo the transpose so the gathered dimension is ``dim`` again
+        gathered = recv_buf.transpose(0, dim) if needs_transpose else recv_buf
+    return (gathered, handle) if async_op else gathered
+
+
+def reduce_scatter(tensor: Tensor,
+                   dim: int,
+                   parallel_mode: ParallelMode,
+                   op: ReduceOp = ReduceOp.SUM,
+                   async_op: bool = False) -> Tensor:
+    r"""Reduces the tensors of every rank in the parallel group, then scatters the result
+    along ``dim`` so that each member receives one slice.
+
+    When the group has a single member the input tensor is returned untouched and no
+    communication happens.
+
+    Note:
+        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+
+    Args:
+        tensor (:class:`torch.Tensor`): Tensor to be reduce_scattered.
+        dim (int): The dimension to scatter along.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
+        op (torch.distributed.ReduceOp, optional): The type of reduce operation,
+            should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
+            More details about ReduceOp please refer to
+            `ReduceOp <https://pytorch.org/docs/stable/distributed.html#torch.distributed.ReduceOp>`_.
+        async_op (bool, optional): Whether operations are asynchronous.
+
+    Returns:
+        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of reduce_scatter only,
+        if async_op is set to False. A tuple of output of reduce_scatter and Async work handle, if async_op is set to True.
+    """
+    world_size = gpc.get_world_size(parallel_mode)
+    if world_size == 1:
+        scattered, handle = tensor, None
+    else:
+        # the scatter works on dim 0, so move ``dim`` to the front first
+        needs_transpose = dim != 0
+        send_buf = tensor.transpose(0, dim).contiguous() if needs_transpose else tensor.contiguous()
+        recv_shape = (send_buf.shape[0] // world_size,) + send_buf.shape[1:]
+        recv_buf = torch.empty(recv_shape, dtype=tensor.dtype, device=tensor.device)
+        if tensor.device.type == "cpu":
+            group = gpc.get_cpu_group(parallel_mode)
+        else:
+            group = gpc.get_group(parallel_mode)
+        handle = _reduce_scatter_func(recv_buf, send_buf, op=op, group=group, async_op=async_op)
+        # undo the transpose so the scattered dimension is ``dim`` again
+        scattered = recv_buf.transpose(0, dim) if needs_transpose else recv_buf
+    return (scattered, handle) if async_op else scattered
+
+
+def all_reduce(tensor: Tensor,
+               parallel_mode: ParallelMode,
+               op: ReduceOp = ReduceOp.SUM,
+               async_op: bool = False) -> Tensor:
+    r"""Reduces the tensor data across the whole parallel group so that every member ends up
+    with the final result.
+
+    The reduction is applied in place to ``tensor.contiguous()``. When the group has a single
+    member the input tensor is returned untouched and no communication happens.
+
+    Note:
+        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+
+    Args:
+        tensor (:class:`torch.Tensor`): Tensor to be all-reduced.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
+        op (torch.distributed.ReduceOp, optional): The type of reduce operation,
+            should be included in [SUM, AVG, PRODUCT, MIN, MAX, BAND, BOR, BXOR].
+            More details about ReduceOp please refer to
+            `ReduceOp <https://pytorch.org/docs/stable/distributed.html#torch.distributed.ReduceOp>`_.
+        async_op (bool, optional): Whether operations are asynchronous.
+
+    Returns:
+        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of all-reduce only,
+        if async_op is set to False. A tuple of output of all-reduce and Async work handle, if async_op is set to True.
+    """
+    if gpc.get_world_size(parallel_mode) == 1:
+        reduced, handle = tensor, None
+    else:
+        reduced = tensor.contiguous()
+        if tensor.device.type == "cpu":
+            group = gpc.get_cpu_group(parallel_mode)
+        else:
+            group = gpc.get_group(parallel_mode)
+        handle = dist.all_reduce(reduced, op=op, group=group, async_op=async_op)
+    return (reduced, handle) if async_op else reduced
+
+
+def broadcast(tensor: Tensor, src: int, parallel_mode: ParallelMode, async_op: bool = False):
+    r"""Broadcasts a tensor from rank ``src`` to the whole parallel group. The tensor must have
+    the same number of elements in all processes participating in the collective.
+
+    The broadcast operates on ``tensor.contiguous()``. When the group has a single member the
+    input tensor is returned untouched and no communication happens.
+
+    Note:
+        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+
+    Args:
+        tensor (:class:`torch.Tensor`): Tensor to be broadcast.
+        src (int): Source rank.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
+        async_op (bool, optional): Whether operations are asynchronous.
+
+    Returns:
+        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The broadcast tensor only,
+        if async_op is set to False. A tuple of the broadcast tensor and Async work handle, if async_op is set to True.
+    """
+    if gpc.get_world_size(parallel_mode) == 1:
+        result, handle = tensor, None
+    else:
+        result = tensor.contiguous()
+        if tensor.device.type == "cpu":
+            group = gpc.get_cpu_group(parallel_mode)
+        else:
+            group = gpc.get_group(parallel_mode)
+        handle = dist.broadcast(result, src=src, group=group, async_op=async_op)
+    return (result, handle) if async_op else result
+
+
+def reduce(tensor: Tensor, dst: int, parallel_mode: ParallelMode, op: ReduceOp = ReduceOp.SUM, async_op: bool = False):
+    r"""Reduces tensors across the whole parallel group. Only the process with
+    rank ``dst`` is going to receive the final result.
+
+    The reduction operates on ``tensor.contiguous()``. When the group has a single member the
+    input tensor is returned untouched and no communication happens.
+
+    Note:
+        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+
+    Args:
+        tensor (:class:`torch.Tensor`): Tensor to be reduced.
+        dst (int): Destination rank.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): Parallel group mode used in this communication.
+        op (torch.distributed.ReduceOp, optional): The type of reduce operation. Defaults to ``ReduceOp.SUM``.
+        async_op (bool, optional): Whether operations are asynchronous.
+
+    Returns:
+        Union[tuple(:class:`torch.Tensor`, work handle), :class:`torch.Tensor`]: The result of reduce only,
+        if async_op is set to False. A tuple of output of reduce and Async work handle, if async_op is set to True.
+    """
+    if gpc.get_world_size(parallel_mode) == 1:
+        result, handle = tensor, None
+    else:
+        result = tensor.contiguous()
+        if tensor.device.type == "cpu":
+            group = gpc.get_cpu_group(parallel_mode)
+        else:
+            group = gpc.get_group(parallel_mode)
+        handle = dist.reduce(result, dst=dst, op=op, group=group, async_op=async_op)
+    return (result, handle) if async_op else result
+
+
+def scatter_object_list(scatter_object_output_list, scatter_object_input_list, src=0, group=None) -> None:
+    r"""Modified from `torch.distributed.scatter_object_list
+    <https://pytorch.org/docs/stable/_modules/torch/distributed/distributed_c10d.html#scatter_object_list>` to fix issues
+
+    Scatters serialized Python objects from rank ``src`` to the members of ``group``. The object
+    received by the calling rank is stored in ``scatter_object_output_list[0]``.
+
+    Args:
+        scatter_object_output_list (list): non-empty list; its first element is overwritten with the received object.
+        scatter_object_input_list (list): objects to scatter; only read on the ``src`` rank.
+        src (int, optional): source rank, compared against the global rank of the caller. Defaults to 0.
+        group (ProcessGroup, optional): process group to communicate in. Defaults to None.
+
+    Raises:
+        RuntimeError: if ``scatter_object_output_list`` is not a list with at least one element.
+    """
+    # ranks outside the group do not take part in the collective
+    if dist.distributed_c10d._rank_not_in_group(group):
+        return
+
+    if (not isinstance(scatter_object_output_list, list) or len(scatter_object_output_list) < 1):
+        raise RuntimeError("Expected argument scatter_object_output_list to be a list of size at least 1.")
+
+    # set tensor device to cuda if backend is nccl
+    device = torch.cuda.current_device() if dist.get_backend(group) == 'nccl' else torch.device("cpu")
+
+    my_rank = dist.get_rank()    # use global rank
+    if my_rank == src:
+        # serialize every object into a (byte tensor, size tensor) pair and move both to the comm device
+        tensor_list, tensor_sizes = zip(
+            *[dist.distributed_c10d._object_to_tensor(obj) for obj in scatter_object_input_list])
+        tensor_list = list(map(lambda x: x.to(device), tensor_list))
+        tensor_sizes = list(map(lambda x: x.to(device), tensor_sizes))
+
+    # Src rank broadcasts the maximum tensor size. This is because all ranks are
+    # expected to call into scatter() with equal-sized tensors.
+    if my_rank == src:
+        max_tensor_size = max(tensor_sizes)
+        for tensor in tensor_list:
+            tensor.resize_(max_tensor_size)
+    else:
+        # placeholder that is filled in by the broadcast below
+        max_tensor_size = torch.tensor([0], dtype=torch.long).to(device)
+
+    dist.broadcast(max_tensor_size, src=src, group=group)
+
+    # Scatter actual serialized objects
+    output_tensor = torch.empty(max_tensor_size.item(), dtype=torch.uint8).to(device)
+    dist.scatter(
+        output_tensor,
+        scatter_list=None if my_rank != src else tensor_list,
+        src=src,
+        group=group,
+    )
+
+    # Scatter per-object sizes to trim tensors when deserializing back to object
+    obj_tensor_size = torch.tensor([0], dtype=torch.long).to(device)
+    dist.scatter(
+        obj_tensor_size,
+        scatter_list=None if my_rank != src else tensor_sizes,
+        src=src,
+        group=group,
+    )
+
+    # move the received buffers to CPU before deserializing
+    output_tensor, obj_tensor_size = output_tensor.cpu(), obj_tensor_size.cpu()
+    # Deserialize back to object
+    scatter_object_output_list[0] = dist.distributed_c10d._tensor_to_object(output_tensor, obj_tensor_size)
--- a/colossalai/communication/p2p.py
+++ b/colossalai/communication/p2p.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+from typing import List, Tuple, Union
+import torch
+import torch.distributed as dist
+
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.utils import get_current_device
+from functools import reduce
+import operator
+from .utils import split_tensor_into_1d_equal_chunks, gather_split_1d_tensor
+
+# Shape-like values accepted by the helpers in this module.
+TensorShape = Union[torch.Size, List[int], Tuple[int]]
+
+
+def _get_tensor_shape(tensor_shape: TensorShape, chunk_tensor: bool = False) -> Tuple[TensorShape, bool]:
+    """Work out the shape actually used on the wire and whether the tensor is sent as a chunk.
+
+    When chunking is requested and the total element count is divisible by the tensor-parallel
+    world size, the returned "shape" is the per-rank element count (an int). Otherwise the
+    original shape is returned and chunking is disabled.
+
+    Args:
+        tensor_shape (:class:`torch.Size`): shape of tensor
+        chunk_tensor (bool, optional): whether to chunk tensor, defaults to False
+
+    Returns:
+        Tuple[Union[:class:`torch.Size`, List[int], Tuple[int]], bool]: exact tensor shape, whether to chunk tensor
+    """
+    if not chunk_tensor:
+        return tensor_shape, chunk_tensor
+
+    numel = reduce(operator.mul, tensor_shape, 1)
+    tp_world_size = gpc.get_world_size(ParallelMode.TENSOR)
+    if numel % tp_world_size != 0:
+        # cannot be split evenly: fall back to sending the whole tensor
+        return tensor_shape, False
+    return numel // tp_world_size, chunk_tensor
+
+
+def create_recv_buffer_with_shapes(recv_shapes, dtype, scatter_gather_tensors):
+    """Allocate empty receive buffer(s) on the current device for the given shape(s).
+
+    Args:
+        recv_shapes: a single :class:`torch.Size`, or an iterable of shapes.
+        dtype (torch.dtype): data type of the buffers.
+        scatter_gather_tensors (bool): whether the tensors may be received as 1D chunks.
+
+    Returns:
+        tuple: ``(buffer, split)``. ``buffer`` is one tensor for a single shape, otherwise a list
+        of tensors. ``split`` is the chunk flag; for a list of shapes it reflects the last shape only.
+    """
+
+    def _allocate(shape):
+        # one buffer plus the flag telling whether it holds a chunk
+        chunk_shape, is_split = _get_tensor_shape(shape, scatter_gather_tensors)
+        buf = torch.empty(chunk_shape, requires_grad=True, device=get_current_device(), dtype=dtype)
+        return buf, is_split
+
+    if isinstance(recv_shapes, torch.Size):
+        return _allocate(recv_shapes)
+
+    buffer_recv = []
+    for recv_shape in recv_shapes:
+        tensor_recv, recv_split = _allocate(recv_shape)
+        buffer_recv.append(tensor_recv)
+    return buffer_recv, recv_split
+
+
+def process_object_to_send(object_send, scatter_gather_tensors):
+    """Prepare a tensor, or an iterable of tensors, for sending.
+
+    Each tensor that ``_get_tensor_shape`` marks as chunkable is replaced by the result of
+    ``split_tensor_into_1d_equal_chunks``; the others are passed through unchanged.
+
+    Args:
+        object_send: a :class:`torch.Tensor` or an iterable of tensors.
+        scatter_gather_tensors (bool): whether tensors may be sent as 1D chunks.
+
+    Returns:
+        A tensor when a single tensor was given, otherwise a tuple of tensors.
+    """
+
+    def _maybe_split(tensor):
+        _, should_split = _get_tensor_shape(tensor.shape, scatter_gather_tensors)
+        return split_tensor_into_1d_equal_chunks(tensor) if should_split else tensor
+
+    if isinstance(object_send, torch.Tensor):
+        return _maybe_split(object_send)
+
+    return tuple(_maybe_split(tensor_send) for tensor_send in object_send)
+
+
+def filling_ops_queue(obj, comm_op, comm_rank, ops_queue):
+    """Append one ``dist.P2POp`` per tensor in ``obj`` to ``ops_queue``.
+
+    Args:
+        obj: a :class:`torch.Tensor` or an iterable of tensors to communicate.
+        comm_op: the p2p operation passed to ``dist.P2POp`` (e.g. ``dist.isend`` or ``dist.irecv``).
+        comm_rank (int): peer rank passed to ``dist.P2POp``.
+        ops_queue (list): list that collects the created ops; modified in place.
+    """
+    # treat a single tensor as a one-element collection
+    tensors = (obj,) if isinstance(obj, torch.Tensor) else obj
+    for tensor_to_comm in tensors:
+        ops_queue.append(dist.P2POp(comm_op, tensor_to_comm, comm_rank))
+
+
+def _communicate(object_send_next: Union[torch.Tensor, List[torch.Tensor]] = None,
+                 object_send_prev: Union[torch.Tensor, List[torch.Tensor]] = None,
+                 recv_prev: bool = False,
+                 recv_next: bool = False,
+                 recv_prev_shape: Union[torch.Size, List[torch.Size]] = None,
+                 recv_next_shape: Union[torch.Size, List[torch.Size]] = None,
+                 prev_rank: int = None,
+                 next_rank: int = None,
+                 dtype: torch.dtype = None,
+                 scatter_gather_tensors: bool = False) -> Tuple[Union[torch.Tensor, List[torch.Tensor]]]:
+    """
+    Adapted from megatron.p2p_communication.
+    Communicate tensors between stages. Used as helper method in other
+    communication methods that are used in pipeline schedule.
+    Takes the following arguments:
+        object_send_next (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): tensor to send to next rank (no tensor sent if
+                          set to None).
+        object_send_prev (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): tensor to send to prev rank (no tensor sent if
+                          set to None).
+        recv_prev (bool): boolean for whether tensor should be received from
+                   previous rank.
+        recv_next (bool): boolean for whether tensor should be received from
+                   next rank.
+        recv_prev_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the previous stage, defaults to None.
+            Must not be None when ``recv_prev`` is True.
+        recv_next_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): shape of the tensor to be received from the next stage, defaults to None.
+            Must not be None when ``recv_next`` is True.
+        prev_rank (int): the rank of the previous pipeline stage, defaults to None,
+            in which case it is looked up from the global context when needed.
+        next_rank (int): the rank of the next pipeline stage, defaults to None,
+            in which case it is looked up from the global context when needed.
+        dtype (torch.dtype): data type of intermediate buffers, defaults to None
+        scatter_gather_tensors (bool): whether to scatter and gather tensor between pipeline stages, defaults to False
+
+    Returns:
+        Tuple[Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]]: returns tensor_recv_prev, tensor_recv_next
+        (each is None when nothing was received from that direction)
+    """
+
+    # Create placeholder tensors for receive in forward and backward directions
+    # if needed.
+    tensor_recv_prev = None
+    tensor_recv_next = None
+
+    if recv_prev:
+        assert recv_prev_shape is not None
+        tensor_recv_prev, recv_prev_split = create_recv_buffer_with_shapes(recv_prev_shape, dtype,
+                                                                           scatter_gather_tensors)
+
+    if recv_next:
+        assert recv_next_shape is not None
+        tensor_recv_next, recv_next_split = create_recv_buffer_with_shapes(recv_next_shape, dtype,
+                                                                           scatter_gather_tensors)
+
+    # Resolve peer ranks from the pipeline context only when that direction is actually used.
+    if object_send_prev is not None or recv_prev:
+        if prev_rank is None:
+            prev_rank = gpc.get_prev_global_rank(ParallelMode.PIPELINE)
+
+    if object_send_next is not None or recv_next:
+        if next_rank is None:
+            next_rank = gpc.get_next_global_rank(ParallelMode.PIPELINE)
+
+    # Split outgoing tensors into 1D chunks where applicable.
+    if object_send_prev is not None:
+        object_send_prev = process_object_to_send(object_send_prev, scatter_gather_tensors)
+
+    if object_send_next is not None:
+        object_send_next = process_object_to_send(object_send_next, scatter_gather_tensors)
+
+    # Queue the p2p ops in a fixed order: send prev, recv prev, recv next, send next.
+    ops = []
+    if object_send_prev is not None:
+        filling_ops_queue(object_send_prev, dist.isend, prev_rank, ops)
+
+    if tensor_recv_prev is not None:
+        filling_ops_queue(tensor_recv_prev, dist.irecv, prev_rank, ops)
+
+    if tensor_recv_next is not None:
+        filling_ops_queue(tensor_recv_next, dist.irecv, next_rank, ops)
+
+    if object_send_next is not None:
+        filling_ops_queue(object_send_next, dist.isend, next_rank, ops)
+
+    if len(ops) > 0:
+        reqs = dist.batch_isend_irecv(ops)
+        for req in reqs:
+            req.wait()
+    # To protect against race condition when using batch_isend_irecv().
+    torch.cuda.synchronize()
+
+    # Reassemble received chunks into full tensors of the requested shape.
+    # `recv_prev_split` / `recv_next_split` are only bound when the matching recv flag is set,
+    # so the order of the `and` operands matters.
+    if recv_prev and recv_prev_split:
+        if isinstance(tensor_recv_prev, torch.Tensor):
+            tensor_recv_prev = gather_split_1d_tensor(tensor_recv_prev).view(recv_prev_shape).requires_grad_()
+        else:
+            for index in range(len(tensor_recv_prev)):
+                tensor_recv_prev[index] = gather_split_1d_tensor(tensor_recv_prev[index]).view(
+                    recv_prev_shape[index]).requires_grad_()
+
+    if recv_next and recv_next_split:
+        if isinstance(tensor_recv_next, torch.Tensor):
+            tensor_recv_next = gather_split_1d_tensor(tensor_recv_next).view(recv_next_shape).requires_grad_()
+        else:
+            for index in range(len(tensor_recv_next)):
+                tensor_recv_next[index] = gather_split_1d_tensor(tensor_recv_next[index]).view(
+                    recv_next_shape[index]).requires_grad_()
+
+    return tensor_recv_prev, tensor_recv_next
+
+
+def recv_forward(input_tensor_shape,
+                 prev_rank=None,
+                 dtype=torch.float,
+                 scatter_gather_tensors=False) -> Union[torch.Tensor, List[torch.Tensor]]:
+    """Receive the forward output of the previous pipeline stage as the input of this stage.
+
+    The first pipeline stage has no predecessor, so it receives nothing and gets ``None``.
+
+    Args:
+        input_tensor_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor to be received.
+        prev_rank (int, optional): The rank of the source of the tensor.
+        dtype (torch.dtype, optional): data type of the receive buffer, defaults to ``torch.float``.
+        scatter_gather_tensors (bool, optional): whether the tensor may be transferred as 1D chunks, defaults to False.
+
+    Returns:
+        Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input tensor or input tensor list.
+    """
+    if gpc.is_pipeline_first_stage():
+        return None
+
+    received, _ = _communicate(recv_prev=True,
+                               recv_prev_shape=input_tensor_shape,
+                               prev_rank=prev_rank,
+                               dtype=dtype,
+                               scatter_gather_tensors=scatter_gather_tensors)
+    return received
+
+
+def recv_backward(output_grad_shape,
+                  next_rank=None,
+                  dtype=torch.float,
+                  scatter_gather_tensors=False) -> Union[torch.Tensor, List[torch.Tensor]]:
+    """Copy the gradient tensor from the next stage in pipeline as the input gradient of this stage.
+
+    Args:
+        output_grad_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor to be received.
+        next_rank (int, optional): The rank of the source of the tensor.
+
+    Returns:
+        Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input gradient tensor or gradident tensor list.
+    """
+    if gpc.is_pipeline_last_stage():
+        output_tensor_grad = None
+    else:
+        _, output_tensor_grad = _communicate(recv_next=True,
+                                             recv_next_shape=output_grad_shape,
+                                             next_rank=next_rank,
+                                             dtype=dtype,
+                                             scatter_gather_tensors=scatter_gather_tensors)
+    return output_tensor_grad
+
+
+def send_forward(output_tensor, next_rank=None, scatter_gather_tensors=False) -> None:
+    """Sends the input tensor to the next stage in pipeline.
+
+    Args:
+        output_tensor (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent.
+        next_rank (int, optional): The rank of the recipient of the tensor.
+    """
+    if not gpc.is_pipeline_last_stage():
+        _communicate(object_send_next=output_tensor, next_rank=next_rank, scatter_gather_tensors=scatter_gather_tensors)
+
+
+def send_backward(input_tensor_grad, prev_rank=None, scatter_gather_tensors=False) -> None:
+    """Sends the gradient tensor to the previous stage in pipeline.
+
+    Args:
+        input_tensor_grad (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent
+        prev_rank (int, optional): The rank of the recipient of the tensor
+    """
+    if not gpc.is_pipeline_first_stage():
+        _communicate(object_send_prev=input_tensor_grad,
+                     prev_rank=prev_rank,
+                     scatter_gather_tensors=scatter_gather_tensors)
+
+
+def send_forward_recv_backward(output_tensor,
+                               output_grad_shape,
+                               recv_next=True,
+                               next_rank=None,
+                               dtype=torch.float,
+                               scatter_gather_tensors=False) -> Union[torch.Tensor, List[torch.Tensor]]:
+    """Batched communication operation. Sends the input tensor to the 
+    next stage in pipeline, while receives the gradient tensor from the
+    next stage in pipeline as the input gradient tensor of this stage.
+
+    Args:
+        output_tensor (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent.
+        output_grad_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor to be received.
+
+    Returns:
+        Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input gradient tensor.
+    """
+    if gpc.is_pipeline_last_stage():
+        output_tensor_grad = None
+    else:
+        _, output_tensor_grad = _communicate(object_send_next=output_tensor,
+                                             recv_next=recv_next,
+                                             recv_next_shape=output_grad_shape,
+                                             next_rank=next_rank,
+                                             dtype=dtype,
+                                             scatter_gather_tensors=scatter_gather_tensors)
+    return output_tensor_grad
+
+
+def send_backward_recv_forward(input_tensor_grad,
+                               input_tensor_shape,
+                               recv_prev=True,
+                               prev_rank=None,
+                               dtype=torch.float,
+                               scatter_gather_tensors=False) -> Union[torch.Tensor, List[torch.Tensor]]:
+    """Batched communication operation. Sends the gradient tensor to the
+    previous stage in pipeline, while receives the output tensor from the
+    previous stage in pipeline as the input of this stage.
+
+    Args:
+        input_tensor_grad (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent.
+        input_tensor_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor to be received.
+
+    Returns:
+        Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input tensor.
+    """
+    if gpc.is_pipeline_first_stage():
+        input_tensor = None
+    else:
+        input_tensor, _ = _communicate(object_send_prev=input_tensor_grad,
+                                       recv_prev=recv_prev,
+                                       recv_prev_shape=input_tensor_shape,
+                                       prev_rank=prev_rank,
+                                       dtype=dtype,
+                                       scatter_gather_tensors=scatter_gather_tensors)
+    return input_tensor
+
+
+def send_forward_recv_forward(output_tensor,
+                              input_tensor_shape,
+                              recv_prev=True,
+                              prev_rank=None,
+                              next_rank=None,
+                              dtype=torch.float,
+                              scatter_gather_tensors=False) -> Union[torch.Tensor, List[torch.Tensor]]:
+    """Batched communication operation. Sends the input tensor to the 
+    next stage in pipeline, while receives the output tensor from the
+    previous stage in pipeline as the input of this stage.
+
+    Args:
+        output_tensor (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent.
+        input_tensor_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor to be received.
+
+    Returns:
+        Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input tensor.
+    """
+    input_tensor, _ = _communicate(object_send_next=output_tensor,
+                                   recv_prev=recv_prev,
+                                   recv_prev_shape=input_tensor_shape,
+                                   prev_rank=prev_rank,
+                                   next_rank=next_rank,
+                                   dtype=dtype,
+                                   scatter_gather_tensors=scatter_gather_tensors)
+    return input_tensor
+
+
+def send_backward_recv_backward(input_tensor_grad,
+                                output_grad_shape,
+                                recv_next=True,
+                                prev_rank=None,
+                                next_rank=None,
+                                dtype=torch.float,
+                                scatter_gather_tensors=False) -> Union[torch.Tensor, List[torch.Tensor]]:
+    """Batched communication operation. Sends the gradient tensor to the
+    previous stage in pipeline, while receives the gradient tensor from the
+    next member in pipeline as the input of this stage.
+
+    Args:
+        input_tensor_grad (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor to be sent.
+        output_grad_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor to be received.
+
+    Returns:
+        Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]: The input gradient tensor.
+    """
+    _, output_tensor_grad = _communicate(object_send_prev=input_tensor_grad,
+                                         recv_next=recv_next,
+                                         recv_next_shape=output_grad_shape,
+                                         prev_rank=prev_rank,
+                                         next_rank=next_rank,
+                                         dtype=dtype,
+                                         scatter_gather_tensors=scatter_gather_tensors)
+    return output_tensor_grad
+
+
+def send_forward_backward_recv_forward_backward(
+        output_tensor,
+        input_tensor_grad,
+        input_tensor_shape,
+        output_grad_shape,
+        recv_prev=True,
+        recv_next=True,
+        prev_rank=None,
+        next_rank=None,
+        dtype=torch.float,
+        scatter_gather_tensors=False) -> Tuple[Union[torch.Tensor, List[torch.Tensor]]]:
+    """Batched communication operation. Sends the input tensor to the next stage in pipeline and
+    the gradient tensor to the previous stage, while receives the input gradient tensor from the
+    next stage and the input tensor from the previous stage.
+
+    Args:
+        output_tensor (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor sent to the next.
+        input_tensor_grad (Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): Tensor sent to the previous.
+        input_tensor_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor received from the previous.
+        output_grad_shape (Union[:class:`torch.Size`, List[:class:`torch.Size`]]): The shape of the tensor received from the next.
+
+    Returns:
+        Tuple(Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]], Union[:class:`torch.Tensor`, List[:class:`torch.Tensor`]]): (the input tensor, the input gradient tensor)
+    """
+    input_tensor, output_tensor_grad = _communicate(object_send_next=output_tensor,
+                                                    object_send_prev=input_tensor_grad,
+                                                    recv_prev=recv_prev,
+                                                    recv_next=recv_next,
+                                                    recv_prev_shape=input_tensor_shape,
+                                                    recv_next_shape=output_grad_shape,
+                                                    prev_rank=prev_rank,
+                                                    next_rank=next_rank,
+                                                    dtype=dtype,
+                                                    scatter_gather_tensors=scatter_gather_tensors)
+    return input_tensor, output_tensor_grad