Migrated project

404ecbdc · zbian · 2ebaefc5 · 404ecbdc · 404ecbdc · 404ecbdc
Commit 404ecbdc authored Oct 28, 2021 by zbian
20 changed files
--- a/.gitignore
+++ b/.gitignore
+# Byte-compiled / optimized / DLL files
+__pycache__/
+*.py[cod]
+*$py.class
+
+# C extensions
+*.so
+
+# Distribution / packaging
+.Python
+build/
+develop-eggs/
+dist/
+downloads/
+eggs/
+.eggs/
+lib/
+lib64/
+parts/
+sdist/
+var/
+wheels/
+pip-wheel-metadata/
+share/python-wheels/
+*.egg-info/
+.installed.cfg
+*.egg
+MANIFEST
+
+# PyInstaller
+#  Usually these files are written by a python script from a template
+#  before PyInstaller builds the exe, so as to inject date/other infos into it.
+*.manifest
+*.spec
+
+# Installer logs
+pip-log.txt
+pip-delete-this-directory.txt
+
+# Unit test / coverage reports
+htmlcov/
+.tox/
+.nox/
+.coverage
+.coverage.*
+.cache
+nosetests.xml
+coverage.xml
+*.cover
+*.py,cover
+.hypothesis/
+.pytest_cache/
+
+# Translations
+*.mo
+*.pot
+
+# Django stuff:
+*.log
+local_settings.py
+db.sqlite3
+db.sqlite3-journal
+
+# Flask stuff:
+instance/
+.webassets-cache
+
+# Scrapy stuff:
+.scrapy
+
+# Sphinx documentation
+docs/_build/
+docs/.build/
+
+# PyBuilder
+target/
+
+# Jupyter Notebook
+.ipynb_checkpoints
+
+# IPython
+profile_default/
+ipython_config.py
+
+# pyenv
+.python-version
+
+# pipenv
+#   According to pypa/pipenv#598, it is recommended to include Pipfile.lock in version control.
+#   However, in case of collaboration, if having platform-specific dependencies or dependencies
+#   having no cross-platform support, pipenv may install dependencies that don't work, or not
+#   install all needed dependencies.
+#Pipfile.lock
+
+# PEP 582; used by e.g. github.com/David-OConnor/pyflow
+__pypackages__/
+
+# Celery stuff
+celerybeat-schedule
+celerybeat.pid
+
+# SageMath parsed files
+*.sage.py
+
+# Environments
+.env
+.venv
+env/
+venv/
+ENV/
+env.bak/
+venv.bak/
+
+# Spyder project settings
+.spyderproject
+.spyproject
+
+# Rope project settings
+.ropeproject
+
+# mkdocs documentation
+/site
+
+# mypy
+.mypy_cache/
+.dmypy.json
+dmypy.json
+
+# Pyre type checker
+.pyre/
+
+# IDE
+.idea/
+.vscode/
+
+# macos
+.DS_Store
+#data/
+
+# launcher setting
+tests/launcher/log
+tests/launcher/personal
+
+docs/.build
--- a/MANIFEST.in
+++ b/MANIFEST.in
+include *.txt README.md
+recursive-include requirements *.txt
+recursive-include colossalai *.cpp *.h *.cu *.tr *.cuh *.cc
+recursive-include csrc *.cpp *.h *.cu *.tr *.cuh *.cc
\ No newline at end of file
--- a/README.md
+++ b/README.md
+# ColossalAI
+
+An integrated large-scale model training framework with efficient parallelization techniques
+
+## Installation
+
+### PyPI
+
+```bash
+pip install colossalai
+```
+
+### Install From Source
+
+```shell
+git clone git@github.com:hpcaitech/ColossalAI.git
+cd ColossalAI
+# install dependency
+pip install -r requirements/requirements.txt
+
+# install colossalai
+pip install .
+```
+
+Install and enable CUDA kernel fusion (required when using the fused optimizer):
+
+```shell
+pip install -v --no-cache-dir --global-option="--cuda_ext" .
+```
+
+## Documentation
+
+- [Documentation](https://www.colossalai.org/)
+
+## Quick View
+
+### Start Distributed Training in Lines
+
+```python
+import colossalai
+from colossalai.engine import Engine
+from colossalai.trainer import Trainer
+from colossalai.core import global_context as gpc
+
+model, train_dataloader, test_dataloader, criterion, optimizer, schedule, lr_scheduler = colossalai.initialize()
+engine = Engine(
+    model=model,
+    criterion=criterion,
+    optimizer=optimizer,
+    lr_scheduler=lr_scheduler,
+    schedule=schedule
+)
+
+trainer = Trainer(engine=engine,
+                  hooks_cfg=gpc.config.hooks,
+                  verbose=True)
+trainer.fit(
+    train_dataloader=train_dataloader,
+    test_dataloader=test_dataloader,
+    max_epochs=gpc.config.num_epochs,
+    display_progress=True,
+    test_interval=5
+)
+```
+
+### Write a Simple 2D Parallel Model
+
+Let's say we have a huge MLP model whose very large hidden size makes it difficult to fit into a single GPU. We can
+then distribute the model weights across GPUs in a 2D mesh while still writing the model in a familiar way.
+
+```python
+from colossalai.nn import Linear2D
+import torch.nn as nn
+
+
+class MLP_2D(nn.Module):
+
+    def __init__(self):
+        super().__init__()
+        self.linear_1 = Linear2D(in_features=1024, out_features=16384)
+        self.linear_2 = Linear2D(in_features=16384, out_features=1024)
+
+    def forward(self, x):
+        x = self.linear_1(x)
+        x = self.linear_2(x)
+        return x
+
+```
+
+## Features
+
+ColossalAI provides a collection of parallel training components for you. We aim to let you write your
+distributed deep learning models just as you would write a single-GPU model. We provide friendly tools to kickstart
+distributed training in a few lines.
+
+- [Data Parallelism](./docs/parallelization.md)
+- [Pipeline Parallelism](./docs/parallelization.md)
+- [1D, 2D, 2.5D, 3D and sequence parallelism](./docs/parallelization.md)
+- [Friendly trainer and engine](./docs/trainer_engine.md)
+- [Extensible for new parallelism](./docs/add_your_parallel.md)
+- [Mixed Precision Training](./docs/amp.md)
+- [Zero Redundancy Optimizer (ZeRO)](./docs/zero.md)
+
+
--- a/colossalai/__init__.py
+++ b/colossalai/__init__.py
+from .initialize import init_dist, initialize
+from .nn import *
+
+# Version string of the colossalai package.
+__version__ = '0.0.1'
--- a/colossalai/builder/__init__.py
+++ b/colossalai/builder/__init__.py
+from .builder import *
+from .pipeline import ModelInitializer
--- a/colossalai/builder/builder.py
+++ b/colossalai/builder/builder.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import inspect
+from collections.abc import Iterable
+
+from colossalai.registry import *
+
+
+def build_from_config(module, config: dict):
+    """Instantiates :class:`module`, forwarding the items of `config` as keyword arguments.
+
+    :param module: A python or user-defined class
+    :type module: class
+    :param config: A python dict whose items are passed to the constructor of `module`
+    :type config: dict
+    :raises AssertionError: Raises an AssertionError if `module` is not a class
+    :return: An object of :class:`module`
+    :rtype: :class:`module`
+    """
+    is_class = inspect.isclass(module)
+    assert is_class, 'module must be a class'
+    instance = module(**config)
+    return instance
+
+
+def build_from_registry(config, registry: Registry):
+    """Builds the object described by `config`, looking up its class in `registry`.
+
+    The ``type`` entry of `config` selects the class; every other entry is passed
+    to the constructor as a keyword argument.
+
+    :param config: A python dict or a :class:`colossalai.context.Config` object
+        containing information used in the construction of the return object
+    :type config: dict or :class:`colossalai.context.Config`
+    :param registry: A registry specifying the type of the return object
+    :type registry: :class:`Registry`
+    :raises AssertionError: Raises an AssertionError if `registry` is not an object
+        of :class:`Registry` or `type` in `config` is not found in `registry`
+    :raises Exception: Re-raises any exception that occurred when building
+        from registry, after printing a short notice
+    :return: An object specified by `registry`
+    :rtype: Python object specified by `registry`
+    """
+    # pop 'type' from a copy so that the caller's config stays untouched
+    kwargs = config.copy()
+    assert isinstance(registry, Registry), f'Expected type Registry but got {type(registry)}'
+
+    mod_type = kwargs.pop('type')
+    assert registry.has(mod_type), f'{mod_type} is not found in registry {registry.name}'
+    try:
+        return registry.get_module(mod_type)(**kwargs)
+    except Exception as e:
+        print(f'An error occurred when building {mod_type} from registry {registry.name}', flush=True)
+        raise e
+
+
+def build_layer(config):
+    """Builds a layer from `config` using the ``LAYERS`` registry.
+
+    :param config: A python dict or a :class:`colossalai.context.Config` object
+        containing information used in the construction of the return object
+    :type config: dict or :class:`colossalai.context.Config`
+    :return: The object built from the ``LAYERS`` registry, expected to be an :class:`nn.Module`
+    :rtype: :class:`nn.Module`
+    """
+    layer = build_from_registry(config, LAYERS)
+    return layer
+
+
+def build_loss(config):
+    """Builds a loss function from `config` using the ``LOSSES`` registry.
+
+    :param config: A python dict or a :class:`colossalai.context.Config` object
+        containing information used in the construction of the return object
+    :type config: dict or :class:`colossalai.context.Config`
+    :return: The object built from the ``LOSSES`` registry, expected to be a
+        :class:`torch.autograd.Function`
+    :rtype: :class:`torch.autograd.Function`
+    """
+    loss = build_from_registry(config, LOSSES)
+    return loss
+
+
+def build_model(config):
+    """Builds a model from `config` using the ``MODELS`` registry.
+
+    :param config: A python dict or a :class:`colossalai.context.Config` object
+        containing information used in the construction of the return object
+    :type config: dict or :class:`colossalai.context.Config`
+    :return: The object built from the ``MODELS`` registry, expected to be an :class:`nn.Module`
+    :rtype: :class:`nn.Module`
+    """
+    model = build_from_registry(config, MODELS)
+    return model
+
+
+def build_dataset(config):
+    """Builds a dataset from `config` using the ``DATASETS`` registry.
+
+    :param config: A python dict or a :class:`colossalai.context.Config` object
+        containing information used in the construction of the return object
+    :type config: dict or :class:`colossalai.context.Config`
+    :return: The object built from the ``DATASETS`` registry, expected to be a
+        :class:`torch.utils.data.Dataset`
+    :rtype: :class:`torch.utils.data.Dataset`
+    """
+    dataset = build_from_registry(config, DATASETS)
+    return dataset
+
+
+def build_optimizer(config, model, params: Iterable = None, need_module=False):
+    """Returns an optimizer object of :class:`torch.optim.Optimizer` constructed from `config`,
+    `model` and `params`.
+
+    Depending on the arguments, either the model itself (key ``module``) or an iterable
+    of parameters (key ``params``) is added to a copy of `config` before building, so the
+    caller's `config` is never modified.
+
+    :param config: A python dict or a :class:`colossalai.context.Config` object
+        containing information used in the construction of the return object
+    :type config: dict or :class:`colossalai.context.Config`
+    :param model: A model containing parameters for the optimizer
+    :type model: :class:`nn.Module`
+    :param params: An iterable of parameters for the optimizer, only used when `model`
+        is None and `need_module` is False
+    :type params: Iterable, optional
+    :param need_module: Indicates whether the optimizer needs a module; if True, `model`
+        is passed as ``module`` instead of passing its parameters
+    :type need_module: bool, optional
+    :raises AssertionError: Raises an AssertionError if both `model` and `params` are None
+    :return: An object of :class:`torch.optim.Optimizer`
+    :rtype: :class:`torch.optim.Optimizer`
+    """
+    assert model is not None or params is not None, 'arguments model and params can not both be None'
+    # copy first: the injected model / parameter iterator must not leak into the caller's config
+    config_ = config.copy()
+    if need_module:
+        config_['module'] = model
+    elif model is not None:
+        config_['params'] = model.parameters()
+    elif params is not None:
+        config_['params'] = params
+
+    return build_from_registry(config_, OPTIMIZERS)
+
+
+def build_gradient_handler(config, model, optimizer):
+    """Builds a gradient handler from `config`, `model` and `optimizer`.
+
+    The ``type`` entry of `config` selects the class in the ``GRADIENT_HANDLER`` registry;
+    `model` and `optimizer` are passed positionally, the remaining entries as keywords.
+
+    :param config: A python dict or a :class:`colossalai.context.Config` object
+        containing information used in the construction of the return object
+    :type config: dict or :class:`colossalai.context.Config`
+    :param model: A model containing parameters for the gradient handler
+    :type model: :class:`nn.Module`
+    :param optimizer: An optimizer object containing parameters for the gradient handler
+    :type optimizer: :class:`torch.optim.Optimizer`
+    :return: An object of :class:`BaseGradientHandler`
+    :rtype: :class:`BaseGradientHandler`
+    """
+    kwargs = config.copy()
+    handler_cls = GRADIENT_HANDLER.get_module(kwargs.pop('type'))
+    return handler_cls(model, optimizer, **kwargs)
+
+
+def build_hooks(config, trainer):
+    """Returns a hook object of :class:`BaseHook` constructed from `config` and `trainer`.
+
+    `trainer` is added under the key ``trainer`` to a copy of `config` before building,
+    so the caller's `config` is never modified.
+
+    :param config: A python dict or a :class:`colossalai.context.Config` object
+        containing information used in the construction of the return object
+    :type config: dict or :class:`colossalai.context.Config`
+    :param trainer: A :class:`Trainer` object containing parameters for the hook
+    :type trainer: :class:`Trainer`
+    :return: An object of :class:`BaseHook`
+    :rtype: :class:`BaseHook`
+    """
+    # copy first: the trainer reference must not leak into the caller's config
+    config_ = config.copy()
+    config_['trainer'] = trainer
+    return build_from_registry(config_, HOOKS)
+
+
+def build_transform(config):
+    """Builds a transformation from `config` using the ``TRANSFORMS`` registry.
+
+    :param config: A python dict or a :class:`colossalai.context.Config` object
+        containing information used in the construction of the return object
+    :type config: dict or :class:`colossalai.context.Config`
+    :return: The object built from the ``TRANSFORMS`` registry, expected to be a
+        :class:`torchvision.transforms` object
+    :rtype: :class:`torchvision.transforms`
+    """
+    transform = build_from_registry(config, TRANSFORMS)
+    return transform
+
+
+def build_pipe_alloc_policy(config):
+    """Builds a pipeline allocation policy from `config` using the ``PIPE_ALLOC_POLICY`` registry.
+
+    :param config: A python dict or a :class:`colossalai.context.Config` object
+        containing information used in the construction of the return object
+    :type config: dict or :class:`colossalai.context.Config`
+    :return: A pipeline allocation policy object
+    :rtype: Python object specified by the ``PIPE_ALLOC_POLICY`` registry
+    """
+    policy = build_from_registry(config, PIPE_ALLOC_POLICY)
+    return policy
+
+
+def build_data_sampler(config, dataset):
+    """Builds a data sampler from `config` and `dataset`.
+
+    The ``type`` entry of `config` selects the class in the ``SAMPLERS`` registry;
+    `dataset` is passed positionally, the remaining entries as keywords.
+
+    :param config: A python dict or a :class:`colossalai.context.Config` object
+        containing information used in the construction of the return object
+    :type config: dict or :class:`colossalai.context.Config`
+    :param dataset: An object of :class:`torch.utils.data.Dataset` containing information
+        used in the construction of the return object
+    :type dataset: :class:`torch.utils.data.Dataset`
+    :return: An object of :class:`colossalai.nn.data.sampler.BaseSampler`
+    :rtype: :class:`colossalai.nn.data.sampler.BaseSampler`
+    """
+    kwargs = config.copy()
+    sampler_cls = SAMPLERS.get_module(kwargs.pop('type'))
+    return sampler_cls(dataset, **kwargs)
+
+
+def build_optimizer_wrapper(config, optimizer, model=None):
+    """Builds an optimizer wrapper from `config`, `optimizer` and, for one wrapper type, `model`.
+
+    The ``type`` entry of `config` selects the class in the ``OPTIMIZER_WRAPPERS`` registry.
+
+    :param config: A python dict or a :class:`colossalai.context.Config` object
+        containing information used in the construction of the return object
+    :type config: dict or :class:`colossalai.context.Config`
+    :param optimizer: The optimizer object to be wrapped
+    :type optimizer: :class:`torch.optim.Optimizer`
+    :param model: A model, only passed on when the type is ``ZeroRedundancyOptimizer_Level_3``
+    :type model: :class:`nn.Module`, optional
+    :return: An object of :class:`torch.optim.Optimizer`
+    :rtype: :class:`torch.optim.Optimizer`
+    """
+    kwargs = config.copy()
+    mod_type = kwargs.pop('type')
+
+    # LSG: special treatment for zero level 3, whose wrapper also takes the model
+    needs_model = mod_type == 'ZeroRedundancyOptimizer_Level_3'
+    wrapper_cls = OPTIMIZER_WRAPPERS.get_module(mod_type)
+    if needs_model:
+        return wrapper_cls(model, optimizer, **kwargs)
+    return wrapper_cls(optimizer, **kwargs)
+
+
+def build_lr_scheduler(config, optimizer, total_steps, num_steps_per_epoch):
+    """Builds a learning rate scheduler from `config`, `optimizer`, `total_steps`
+    and `num_steps_per_epoch`.
+
+    The ``type`` entry of `config` selects the class in the ``LR_SCHEDULERS`` registry.
+    If `config` contains ``warmup_epochs``, it is converted into ``warmup_steps``
+    (replacing any ``warmup_steps`` given in `config`).
+
+    :param config: A python dict or a :class:`colossalai.context.Config` object
+        containing information used in the construction of the return object
+    :type config: dict or :class:`colossalai.context.Config`
+    :param optimizer: An optimizer object containing parameters for the learning rate
+        scheduler
+    :type optimizer: :class:`torch.optim.Optimizer`
+    :param total_steps: Number of total steps of the learning rate scheduler
+    :type total_steps: int
+    :param num_steps_per_epoch: number of steps per epoch of the learning rate scheduler
+    :type num_steps_per_epoch: int
+    :return: An object of :class:`torch.optim.lr_scheduler`
+    :rtype: :class:`torch.optim.lr_scheduler`
+    """
+    settings = config.copy()
+    scheduler_type = settings.pop('type')
+    # warmup epochs will overwrite warmup steps
+    if 'warmup_epochs' in settings:
+        settings['warmup_steps'] = int(num_steps_per_epoch * settings.pop('warmup_epochs'))
+    scheduler_cls = LR_SCHEDULERS.get_module(scheduler_type)
+    return scheduler_cls(optimizer, total_steps, num_steps_per_epoch=num_steps_per_epoch, **settings)
--- a/colossalai/builder/pipeline.py
+++ b/colossalai/builder/pipeline.py
+import copy
+import heapq
+
+from colossalai.builder import build_model, build_layer
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.logging import get_global_dist_logger
+from colossalai.utils import set_to_cuda
+
+
+def _binary_partition(weights, st, ed):
+    """Finds the position that splits the interval `[st, ed)` into two halves whose
+    weights are as close as possible.
+
+    `weights` is read as a list of inclusive prefix sums: the weight of `[a, b)` is
+    computed as ``weights[b - 1] - weights[a - 1]``. When several positions are
+    equally good, the smallest one is chosen.
+
+    :param weights: A python list of prefix sums to be binary partitioned
+    :type weights: list
+    :param st: the start position of the binary partition
+    :type st: int
+    :param ed: the end position of the binary partition
+    :type ed: int
+    :return: the tuple `(st, pos, ed)`, where `pos` is the split position
+    :rtype: tuple
+    """
+    total = weights[ed - 1]
+    offset = 0
+    if st > 0:
+        offset = weights[st - 1]
+        total -= offset
+
+    best_gap = float("inf")
+    for split in range(st + 1, ed):
+        left_sum = weights[split - 1] - offset
+        # |right - left| == |total - 2 * left|
+        gap = abs(total - 2 * left_sum)
+        if gap < best_gap:
+            best_gap, pos = gap, split
+
+    return st, pos, ed
+
+
+def _heap_addition(weights, intervals, add_cnt):
+    """Splits the heaviest intervals until `add_cnt` additional intervals were created.
+
+    `weights` is read as a list of inclusive prefix sums, i.e. the weight of `[st, ed)`
+    is ``weights[ed - 1] - weights[st - 1]``.
+
+    :param weights: A python list of prefix sums
+    :type weights: list
+    :param intervals: The `(st, ed)` intervals to start from
+    :type intervals: list
+    :param add_cnt: The number of splits to perform
+    :type add_cnt: int
+    :return: The resulting `(st, ed)` intervals in sorted order
+    :rtype: list
+    """
+    def _heap_push(heap, st, ed):
+        # weight of the interval [st, ed) taken from the prefix sums
+        value = weights[ed - 1]
+        if st > 0:
+            value -= weights[st - 1]
+        # heapq is a min-heap, so the weight is negated to pop the heaviest interval first
+        heapq.heappush(heap, (-value, st, ed))
+
+    ret_intervals = []
+    heap = []
+
+    for st, ed in intervals:
+        _heap_push(heap, st, ed)
+
+    # NOTE(review): heappop raises IndexError if the heap runs empty before add_cnt
+    # reaches 0 (every interval already has length 1) - confirm callers rule this out
+    while add_cnt > 0:
+        _, st, ed = heapq.heappop(heap)
+        if ed - st == 1:
+            # a single item cannot be split; it is final and does not count as a split
+            ret_intervals.append((st, ed))
+        else:
+            l, m, r = _binary_partition(weights, st, ed)
+            _heap_push(heap, l, m)
+            _heap_push(heap, m, r)
+            add_cnt -= 1
+
+    # everything left on the heap is kept as is
+    while heap:
+        _, st, ed = heapq.heappop(heap)
+        ret_intervals.append((st, ed))
+
+    ret_intervals.sort()
+    return ret_intervals
+
+
+def _calc_partitions(weights, value):
+    """Greedily cuts the items into consecutive intervals whose weight does not exceed `value`.
+
+    `weights` is read as a list of inclusive prefix sums. A new interval is started at
+    the first item that would push the current interval's weight above `value`.
+
+    NOTE(review): if the very first entry already exceeds `value`, the cut happens at
+    index 0 and ``weights[idx - 1]`` wraps around to the last entry - presumably callers
+    always pass a `value` that is at least the largest single item; confirm.
+
+    :param weights: A python list of prefix sums
+    :type weights: list
+    :param value: The maximum weight allowed for one interval
+    :type value: int
+    :return: the number of intervals and the list of `(st, ed)` intervals
+    :rtype: tuple
+    """
+    cuts = []
+    start = 0
+    consumed = 0
+
+    for idx, cumulative in enumerate(weights):
+        if cumulative - consumed > value:
+            cuts.append((start, idx))
+            start = idx
+            consumed = weights[idx - 1]
+
+    cuts.append((start, len(weights)))
+    return len(cuts), cuts
+
+
+def _binary_search(weights, num):
+    """Partitions the items into `num` consecutive intervals, searching for the smallest
+    per-interval weight limit for which the greedy partition needs at most `num` intervals.
+
+    Items of weight 0 are counted with weight 1. If the greedy partition yields fewer
+    than `num` intervals, the heaviest intervals are split further.
+
+    :param weights: A python list with the weight of every item
+    :type weights: list
+    :param num: The number of intervals wanted
+    :type num: int
+    :return: the list of `(st, ed)` intervals
+    :rtype: list
+    """
+    # inclusive prefix sums, counting zero-weight items as 1
+    cumulative = []
+    running = 0
+    for w in weights:
+        running += 1 if w == 0 else w
+        cumulative.append(running)
+
+    lo = max(weights)
+    hi = cumulative[len(weights) - 1]
+
+    while hi > lo:
+        limit = (hi + lo) // 2
+        needed, _ = _calc_partitions(cumulative, limit)
+        if needed > num:
+            lo = limit + 1
+        else:
+            hi = limit
+
+    num_block, intervals = _calc_partitions(cumulative, hi)
+    missing = num - num_block
+    if missing > 0:
+        intervals = _heap_addition(cumulative, intervals, missing)
+
+    return intervals
+
+
+def _partition_uniform(num_items, num_parts, num_chunks):
+    """Splits `num_items` items into `num_chunks` equal chunks and distributes every
+    chunk as evenly as possible over `num_parts` parts.
+
+    :param num_items: The number of items, must be divisible by `num_chunks`
+    :type num_items: int
+    :param num_parts: The number of parts
+    :type num_parts: int
+    :param num_chunks: The number of chunks
+    :type num_chunks: int
+    :raises AssertionError: Raises an AssertionError if `num_items` is not divisible by `num_chunks`
+    :return: a list with one entry per part, each holding one `(st, ed)` interval per chunk
+    :rtype: list
+    """
+    assert num_items % num_chunks == 0, \
+        "Layer length should be divided by the number of chunks, otherwise parameter method is recomended"
+
+    logger = get_global_dist_logger()
+    parts = [[] for _ in range(num_parts)]
+    items_per_chunk = num_items // num_chunks
+    for chunk_idx in range(num_chunks):
+        cursor = chunk_idx * items_per_chunk
+        base_size, remainder = divmod(items_per_chunk, num_parts)
+        # the last `remainder` parts each take one extra item
+        first_bigger = num_parts - remainder
+        if base_size == 0:
+            logger.warning("Some nodes in Pipeline have no requests")
+
+        for p in range(num_parts):
+            size = base_size + (1 if p >= first_bigger else 0)
+            parts[p].append((cursor, cursor + size))
+            cursor += size
+
+    return parts
+
+
+def _partition_balanced(weights, num_parts, num_chunks):
+    """Partitions the items into `num_parts * num_chunks` weight-balanced intervals and
+    hands them out to the parts in round-robin order.
+
+    Falls back to the uniform partition when there are not more items than intervals.
+
+    :param weights: A python list with the weight of every item
+    :type weights: list
+    :param num_parts: The number of parts
+    :type num_parts: int
+    :param num_chunks: The number of chunks per part
+    :type num_chunks: int
+    :return: a list with one entry per part, each holding its `(st, ed)` intervals
+    :rtype: list
+    """
+    num_items = len(weights)
+    num_total = num_parts * num_chunks
+    if num_items <= num_total:
+        return _partition_uniform(num_items, num_parts, num_chunks)
+
+    parts = [[] for _ in range(num_parts)]
+    for position, interval in enumerate(_binary_search(weights, num_total)):
+        parts[position % num_parts].append(interval)
+
+    return parts
+
+
+class ModelInitializer():
+    """Builds a model from `config` and partitions its layers for pipeline parallelism.
+
+    :param config: Config passed to :func:`build_model`; the model built from it must
+        provide the attribute ``layers_cfg``
+    :type config: dict or :class:`colossalai.context.Config`
+    :param num_chunks: The number of chunks used when partitioning the layers
+    :type num_chunks: int
+    :param verbose: Whether the layer allocation is logged on global rank 0, defaults to False
+    :type verbose: bool, optional
+    """
+
+    def __init__(self, config, num_chunks, verbose=False):
+        self.num_chunks = num_chunks
+        self.ori_model = build_model(config)
+        self.layers = self.ori_model.layers_cfg
+        layer_length = len(self.layers)
+        self.verbose = verbose
+        self._logger = get_global_dist_logger()
+        self._logger.info(f"The total length of layers is {layer_length}", ranks=[0])
+
+    def model_initialize(self, partition_method='parameter'):
+        """Partitions the layers, builds the models of the local pipeline rank and
+        passes them to `set_to_cuda`.
+
+        :param partition_method: Either 'layer' or 'parameter' (case-insensitive),
+            defaults to 'parameter'
+        :type partition_method: str, optional
+        :raises AssertionError: Raises an AssertionError if `partition_method` is unknown
+        :return: The result of `set_to_cuda` applied to the list of built models
+        """
+        # Some space for initializing communication groups
+        self._interval = None
+        self._partition_layers(method=partition_method)
+        models = self._build()
+        model = set_to_cuda(models)
+
+        return model
+
+    def _partition_layers(self, method):
+        """Computes the layer intervals of every pipeline stage and stores the ones of
+        the local pipeline rank in `self._interval`.
+
+        :param method: 'layer' partitions by number of layers, 'parameter' by number
+            of trainable parameters (case-insensitive)
+        :type method: str
+        :raises AssertionError: Raises an AssertionError if `method` is unknown
+        """
+        pipeline_parallel_size = gpc.get_world_size(ParallelMode.PIPELINE)
+        pipeline_rank = gpc.get_local_rank(ParallelMode.PIPELINE)
+
+        method = method.lower()
+        # Make a partition
+        if method == 'layer':
+            num_layers = len(self.layers)
+            self.parts = _partition_uniform(num_layers, pipeline_parallel_size, self.num_chunks)
+        elif method == 'parameter':
+            param_counts = self._count_layer_params()
+            self.parts = _partition_balanced(param_counts, pipeline_parallel_size, self.num_chunks)
+        else:
+            # raised explicitly instead of using `assert`, so the check is not stripped
+            # under `python -O`; the exception type seen by callers is unchanged
+            raise AssertionError("Method should be a pre-set string")
+
+        # Display the partition
+        if gpc.get_global_rank() == 0 and self.verbose:
+            log_str = 'Layer allocation after partitioning: \n'
+            for stage in range(pipeline_parallel_size):
+
+                num_layers = 0
+                for st, ed in self.parts[stage]:
+                    num_layers += ed - st
+
+                log_str += f'\n===== stage={stage}, layers={num_layers} =====\n'
+                for st, ed in self.parts[stage]:
+                    for idx, layer in enumerate(self.layers[st: ed]):
+                        log_str += f'\t{idx + st:2d}: {layer}\n'
+            self._logger.info(log_str)
+
+        # Save the partition
+        self._interval = self.parts[pipeline_rank]
+
+    def _build(self):
+        """Build model from the layer cfg according to the partition
+
+        :return: One shallow copy of the original model per local interval, each
+            built via ``build_from_cfg(st, ed)``
+        :rtype: list
+        """
+        models = []
+        for st, ed in self._interval:
+            # copy.copy is shallow: every copy starts from the same original model object
+            model = copy.copy(self.ori_model)
+            model.build_from_cfg(st, ed)
+            models.append(model)
+
+        return models
+
+    def _count_layer_params(self):
+        """Count the number of parameters in each layer
+
+        Every layer is built once from its config; only parameters with
+        ``requires_grad`` set are counted.
+
+        :return: The parameter count of every layer, in layer order
+        :rtype: list
+        """
+        param_counts = [0] * len(self.layers)
+        for idx, cfg in enumerate(self.layers):
+            layer = build_layer(cfg)
+            params = filter(lambda p: p.requires_grad, layer.parameters())
+            param_counts[idx] = sum(p.numel() for p in params)
+
+        return param_counts
--- a/colossalai/checkpointing.py
+++ b/colossalai/checkpointing.py
+import os
+import os.path as osp
+import re
+from typing import Tuple
+
+import torch
+
+from .context import Config
+from .context.parallel_mode import ParallelMode
+from .core import global_context as gpc
+
+# Names exported by `from colossalai.checkpointing import *`.
+__all__ = [
+    'get_checkpoint_path',
+    'get_latest_checkpoint_path',
+    'get_latest_checkpoint_pattern',
+    'save_checkpoint',
+    'load_checkpoint'
+]
+
+
+def unwrap_config(config: Config):
+    '''Recursively converts a Config object into normal dicts.
+
+    Every value that is a dict (or a subclass of dict) is unwrapped in turn;
+    all other values are kept as they are.
+
+    :param config: the config to be unwrapped
+    :type config: :class:`Config`
+    :return: a plain dict with the same keys
+    :rtype: dict
+    '''
+    return {
+        key: unwrap_config(value) if isinstance(value, dict) else value
+        for key, value in config.items()
+    }
+
+
+def _get_ranks_name():
+    '''Builds the tag `tp{rank}-pp{rank}` from the local tensor parallel and pipeline
+    parallel ranks; a rank is 0 when its parallel mode is not initialized.
+
+    :return: the rank tag used in checkpoint file names
+    :rtype: str
+    '''
+    def _local_rank_or_zero(mode):
+        return gpc.get_local_rank(mode) if gpc.is_initialized(mode) else 0
+
+    tp_local_rank = _local_rank_or_zero(ParallelMode.TENSOR)
+    pp_local_rank = _local_rank_or_zero(ParallelMode.PIPELINE)
+    return f'tp{tp_local_rank}-pp{pp_local_rank}'
+
+
+def _get_standard_checkpoint_filename(epoch: int, suffix: str = ''):
+    '''Builds the checkpoint file name `epoch{epoch}-{ranks}{suffix}.pt` for the current ranks.
+
+    :param epoch: epoch number
+    :type epoch: int
+    :param suffix: additional notation to specify the model or checkpoint, defaults to ''
+    :type suffix: str, optional
+    :return: the checkpoint file name
+    :rtype: str
+    '''
+    stem = f'epoch{epoch}-{_get_ranks_name()}{suffix}'
+    return f'{stem}.pt'
+
+
+def get_checkpoint_path(checkpoint_dir: str, epoch: int, suffix: str = ''):
+    '''Generates the checkpoint path from the (checkpoint_dir, epoch, suffix, gpu_parallel_rank) tuple.
+    This is useful both when saving a checkpoint and when restoring from it.
+
+    :param checkpoint_dir: the directory for saving checkpoints
+    :type checkpoint_dir: str
+    :param epoch: epoch number (indicates how many epochs this model has been trained for)
+    :type epoch: int
+    :param suffix: additional notation to specify the model or checkpoint, defaults to ''
+    :type suffix: str, optional
+    :return: the generated checkpoint path
+    :rtype: str
+    '''
+    return os.path.join(checkpoint_dir, _get_standard_checkpoint_filename(epoch, suffix))
+
+
+def _ensure_directory_exists(filename: str):
+    '''Creates the parent directory of `filename` (including missing ancestors) if needed.
+
+    :param filename: path of the file whose parent directory must exist
+    :type filename: str
+    '''
+    parent_dir = os.path.dirname(filename)
+    # a bare file name has no directory part, and os.makedirs('') would raise
+    if parent_dir:
+        # exist_ok avoids failing when another process creates the directory concurrently
+        os.makedirs(parent_dir, exist_ok=True)
+
+
+def get_latest_checkpoint_pattern(suffix: str = ''):
+    '''Generates the regular expression matching the checkpoint file names of the current ranks.
+
+    The first capture group of the pattern is the epoch number. The rank tag and
+    `suffix` are matched literally.
+
+    :param suffix: additional notation to specify the model or checkpoint, defaults to ''
+    :type suffix: str, optional
+    :return: checkpoint pattern
+    :rtype: compiled regular expression
+    '''
+    ranks_name = _get_ranks_name()
+    # raw string: `\d` and `\.` are regex escapes, not (invalid) string escapes;
+    # re.escape keeps special characters of the interpolated parts from being read as regex syntax
+    ckpt_pattern = re.compile(rf'epoch(\d+)-{re.escape(ranks_name)}{re.escape(suffix)}\.pt')
+    return ckpt_pattern
+
+
+def get_latest_checkpoint_path(checkpoint_dir: str, suffix: str = ''):
+    '''Retrieves the latest checkpoint path from the (checkpoint_dir, suffix, gpu_parallel_rank) tuple.
+    This is useful when restoring from a checkpoint, especially when you do not know the epoch number.
+
+    Only files whose complete name matches the checkpoint pattern are considered.
+
+    :param checkpoint_dir: directory for saving checkpoints
+    :type checkpoint_dir: str
+    :param suffix: additional notation to specify the model or checkpoint, defaults to ''
+    :type suffix: str, optional
+    :raises AssertionError: raise error when `checkpoint_dir` is not a directory
+    :raises FileNotFoundError: raise error when we cannot find the latest checkpoint file with inputs given
+    :return: the latest checkpoint path to be retrieved
+    :rtype: str
+    '''
+    ckpt_name_pat = get_latest_checkpoint_pattern(suffix=suffix)
+
+    last_epoch = -1
+    assert osp.isdir(checkpoint_dir), f'{checkpoint_dir} is not a directory'
+
+    for filename in os.listdir(checkpoint_dir):
+        # fullmatch: a name with trailing characters (e.g. '....pt.bak') is not a checkpoint
+        # and must not make us return a path that does not exist
+        ret = ckpt_name_pat.fullmatch(filename)
+        if ret:
+            # group 1 of the pattern is the epoch number
+            last_epoch = max(last_epoch, int(ret.group(1)))
+
+    if last_epoch == -1:
+        ranks_name = _get_ranks_name()
+        raise FileNotFoundError(f"Cannot find the latest checkpoint file for {ranks_name} in {checkpoint_dir}")
+
+    target_file = _get_standard_checkpoint_filename(last_epoch, suffix=suffix)
+    return osp.join(checkpoint_dir, target_file)
+
+
+def save_checkpoint(checkpoint_path: str,
+                    epoch: int,
+                    model: torch.nn.Module,
+                    optimizer: torch.optim.Optimizer,
+                    lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None,
+                    **kwargs):
+    '''Collects the state of the training components (model, optimizer, lr_scheduler, etc.)
+    into a checkpoint dictionary and saves it to `checkpoint_path`.
+
+    This method can be used for both colossalai nn.BaseModel and normal pytorch nn.Module.
+    The parent directory of `checkpoint_path` is created if it does not exist.
+
+    :param checkpoint_path: path of the checkpoint file to be written
+    :type checkpoint_path: str
+    :param epoch: epoch number (indicates how many epochs this model has been trained for)
+    :type epoch: int
+    :param model: model to be registered
+    :type model: torch.nn.Module
+    :param optimizer: optimizer to be registered
+    :type optimizer: torch.optim.Optimizer
+    :param lr_scheduler: lr_scheduler to be registered, defaults to None
+    :type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
+    :param kwargs: additional entries stored in the checkpoint dictionary
+    '''
+    # for compatibility with normal pytorch nn.Module
+    use_custom_sd = hasattr(model, 'state_dict_for_save_checkpoint')
+    model_sd = model.state_dict_for_save_checkpoint() if use_custom_sd else model.state_dict()
+
+    # ckpt container; entries of kwargs replace the standard entries of the same name
+    checkpoint = dict(epoch=epoch, model=model_sd, optimizer=optimizer.state_dict())
+    checkpoint.update(kwargs)
+    if lr_scheduler is not None:
+        checkpoint['lr_scheduler'] = lr_scheduler.state_dict()
+
+    _ensure_directory_exists(checkpoint_path)
+    torch.save(checkpoint, checkpoint_path)
+
+
+def load_checkpoint(checkpoint_path: str,
+                    model: torch.nn.Module,
+                    optimizer: torch.optim.Optimizer,
+                    lr_scheduler: torch.optim.lr_scheduler._LRScheduler = None,
+                    finetune: bool = False,
+                    strict: bool = True) -> Tuple:
+    '''Loads the checkpoint file.
+    If finetune is False, the training process is resumed from the given checkpoint: the
+    states of model, optimizer and (if given and stored) lr_scheduler are restored.
+    If finetune is True, only the weights and buffers of the model are reloaded and the
+    returned epoch is 0.
+    If strict is True, the keys of the stored model state must exactly match the keys
+    returned by the model's state_dict() function.
+
+    :param checkpoint_path: path of the checkpoint file to be loaded
+    :type checkpoint_path: str
+    :param model: model to reload parameters and buffers
+    :type model: torch.nn.Module
+    :param optimizer: optimizer to restore
+    :type optimizer: torch.optim.Optimizer
+    :param lr_scheduler: lr_scheduler to restore, defaults to None
+    :type lr_scheduler: torch.optim.lr_scheduler._LRScheduler, optional
+    :param finetune: whether to finetune the model with new dataset or continue the pre-training, defaults to False
+    :type finetune: bool, optional
+    :param strict: whether to strictly enforce that the keys in
+        :attr:`state_dict` of the checkpoint match the names of
+        parameters and buffers in model, defaults to True
+    :type strict: bool, optional
+    :raises ValueError: raise error if the model/optimizer cannot successfully be restored
+    :return: (the epoch number of the checkpoint retrieved, the remaining entries of the checkpoint)
+    :rtype: Tuple
+
+    '''
+    # Load the checkpoint.
+    checkpoint = torch.load(checkpoint_path, map_location='cpu')
+    resume = not finetune
+    try:
+        last_epoch = checkpoint.pop('epoch') if resume else 0
+        model.load_state_dict(checkpoint.pop('model'), strict=strict)
+    except KeyError:
+        raise ValueError('Checkpoint is corrupted')
+
+    # finetuning only needs the model weights
+    if finetune:
+        return last_epoch, checkpoint
+
+    try:
+        optimizer.load_state_dict(checkpoint.pop('optimizer'))
+    except KeyError:
+        raise ValueError('Checkpoint is corrupted')
+
+    if lr_scheduler is not None and 'lr_scheduler' in checkpoint:
+        lr_scheduler.load_state_dict(checkpoint.pop('lr_scheduler'))
+
+    return last_epoch, checkpoint
--- a/colossalai/communication/__init__.py
+++ b/colossalai/communication/__init__.py
+from .collective import all_gather, reduce_scatter, scatter
+from .p2p import (send_forward, send_forward_recv_forward, send_backward_recv_forward,
+                  send_backward, send_backward_recv_backward, send_forward_recv_backward,
+                  send_forward_backward_recv_forward_backward, recv_forward, recv_backward)
+from .ring import ring_forward
+from .utils import send_tensor_meta, recv_tensor_meta
+
+# Names re-exported by this package: the collectives from .collective, the
+# pipeline send/recv helpers from .p2p, ring_forward from .ring and the
+# tensor-meta helpers from .utils.
+__all__ = [
+    'all_gather', 'reduce_scatter', 'scatter',
+    'send_forward', 'send_forward_recv_forward', 'send_forward_backward_recv_forward_backward',
+    'send_backward', 'send_backward_recv_backward', 'send_backward_recv_forward',
+    'send_forward_recv_backward', 'recv_backward', 'recv_forward',
+    'ring_forward', 'send_tensor_meta', 'recv_tensor_meta'
+]
\ No newline at end of file
--- a/colossalai/communication/collective.py
+++ b/colossalai/communication/collective.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import torch
+import torch.distributed as dist
+from torch import Tensor
+
+from colossalai.context import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.utils import get_current_device
+
+
+def all_gather(tensor: Tensor, dim: int,
+               parallel_mode: ParallelMode) -> Tensor:
+    """Collects the tensor of every member of the parallel group and joins
+    the pieces along one dimension.
+
+    :param tensor: Tensor to be gathered
+    :param dim: The dimension along which the gathered tensors are concatenated
+    :param parallel_mode: Parallel group mode used in this communication
+    :type tensor: Tensor
+    :type dim: int
+    :type parallel_mode: ParallelMode
+    :return: The tensor generated by all-gather
+    :rtype: Tensor
+    """
+    world_size = gpc.get_world_size(parallel_mode)
+    local_copy = tensor.clone()
+
+    # allocate one buffer of the gathered size, then cut it into one
+    # contiguous receive slot per group member
+    gathered_shape = list(local_copy.shape)
+    gathered_shape[dim] *= world_size
+    buffer = torch.empty(gathered_shape, dtype=local_copy.dtype, device=get_current_device())
+    slots = [slot.contiguous() for slot in torch.chunk(buffer, world_size, dim=dim)]
+
+    dist.all_gather(slots, local_copy, group=gpc.get_group(parallel_mode))
+    return torch.cat(slots, dim=dim)
+
+
+def reduce_scatter(tensor: Tensor, dim: int,
+                   parallel_mode: ParallelMode) -> Tensor:
+    """Splits the tensor along one dimension, reduces the pieces across the
+    parallel group and leaves each member with one reduced piece.
+
+    :param tensor: Tensor to be reduced and scattered
+    :param dim: The dimension along which the tensor is split
+    :param parallel_mode: Parallel group mode used in this communication
+    :type tensor: Tensor
+    :type dim: int
+    :type parallel_mode: ParallelMode
+    :return: The tensor generated by reduce-scatter
+    :rtype: Tensor
+    """
+    world_size = gpc.get_world_size(parallel_mode)
+
+    # one contiguous input piece per group member
+    pieces = [piece.contiguous() for piece in torch.chunk(tensor, world_size, dim=dim)]
+
+    # the result has the shape and dtype of a single piece
+    first = pieces[0]
+    result = torch.empty(first.shape, dtype=first.dtype, device=get_current_device())
+
+    dist.reduce_scatter(output=result, input_list=pieces, group=gpc.get_group(parallel_mode))
+    return result
+
+
+def scatter(tensor: Tensor, src: int, dim: int,
+            parallel_mode: ParallelMode) -> Tensor:
+    """Distributes a tensor from the source rank so that every member of the
+    parallel group ends up with its own slice along one dimension.
+
+    The whole tensor is broadcast from ``src``; each member then keeps only
+    the chunk indexed by its local rank.
+
+    :param tensor: Tensor to be scattered
+    :param src: The rank passed as broadcast source
+    :param dim: The dimension along which the tensor is split
+    :param parallel_mode: Parallel group mode used in this communication
+    :type tensor: Tensor
+    :type src: int
+    :type dim: int
+    :type parallel_mode: ParallelMode
+    :return: The tensor generated by scatter
+    :rtype: Tensor
+    """
+    world_size = gpc.get_world_size(parallel_mode)
+
+    # broadcast into a copy so the caller's tensor is not overwritten
+    full = tensor.clone()
+    dist.broadcast(full, src=src, group=gpc.get_group(parallel_mode))
+
+    local_rank = gpc.get_local_rank(parallel_mode)
+    pieces = torch.chunk(full, world_size, dim=dim)
+    return pieces[local_rank].contiguous()
--- a/colossalai/communication/p2p.py
+++ b/colossalai/communication/p2p.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import torch
+import torch.distributed as dist
+
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.utils import get_current_device
+
+
+def _communicate(tensor_send_next=None,
+                 tensor_send_prev=None,
+                 recv_prev=False,
+                 recv_next=False,
+                 recv_prev_shape=None,
+                 recv_next_shape=None,
+                 prev_rank=None,
+                 next_rank=None,
+                 up_group=None,
+                 down_group=None,
+                 dtype=None):
+    """
+    Adapted from megatron.p2p_communication.
+    Communicate tensors between stages. Used as helper method in other
+    communication methods that are used in pipeline schedule.
+    Every transfer is issued as an asynchronous broadcast on ``up_group``
+    (towards the previous rank) or ``down_group`` (towards the next rank).
+    Takes the following arguments:
+        tensor_send_next: tensor to send to next rank (no tensor sent if
+                          set to None).
+        tensor_send_prev: tensor to send to prev rank (no tensor sent if
+                          set to None).
+        recv_prev: boolean for whether tensor should be received from
+                   previous rank.
+        recv_next: boolean for whether tensor should be received from
+                   next rank.
+        recv_prev_shape: shape of the tensor received from the previous
+                         rank; must not be None when recv_prev is True.
+        recv_next_shape: shape of the tensor received from the next rank;
+                         must not be None when recv_next is True.
+        prev_rank: global rank used as broadcast source when receiving
+                   from the previous rank; looked up from the PIPELINE
+                   mode when None.
+        next_rank: global rank used as broadcast source when receiving
+                   from the next rank; looked up from the PIPELINE mode
+                   when None.
+        up_group: process group used for traffic with the previous rank;
+                  defaults to the PIPELINE_PREV group.
+        down_group: process group used for traffic with the next rank;
+                    defaults to the PIPELINE_NEXT group.
+        dtype: dtype of the receive buffers (passed to torch.empty).
+    Returns:
+        (tensor_recv_prev, tensor_recv_next); an entry is None when the
+        corresponding receive was not requested.
+    """
+
+    # Create placeholder tensors for receive in forward and backward directions
+    # if needed.
+    tensor_recv_prev = None
+    tensor_recv_next = None
+
+    if recv_prev:
+        assert recv_prev_shape is not None
+        tensor_recv_prev = torch.empty(recv_prev_shape,
+                                       requires_grad=True,
+                                       device=get_current_device(),
+                                       dtype=dtype)
+    if recv_next:
+        assert recv_next_shape is not None
+        tensor_recv_next = torch.empty(recv_next_shape,
+                                       requires_grad=True,
+                                       device=get_current_device(),
+                                       dtype=dtype)
+
+    # Resolve the peer rank and group for the "previous" direction only when
+    # that direction is actually used.
+    if tensor_send_prev is not None or recv_prev:
+        if prev_rank is None:
+            prev_rank = gpc.get_prev_global_rank(
+                ParallelMode.PIPELINE)
+        if up_group is None:
+            up_group = gpc.get_group(ParallelMode.PIPELINE_PREV)
+
+    # Same for the "next" direction.
+    if tensor_send_next is not None or recv_next:
+        if next_rank is None:
+            next_rank = gpc.get_next_global_rank(
+                ParallelMode.PIPELINE)
+        if down_group is None:
+            down_group = gpc.get_group(ParallelMode.PIPELINE_NEXT)
+
+    # rank = dist.get_rank()
+    rank = gpc.get_global_rank()
+
+    # Sends are broadcasts with this rank as source; receives are broadcasts
+    # with the peer rank as source. NOTE(review): the issue order below
+    # presumably has to match the order used by the peer ranks - do not
+    # reorder without checking.
+    ops = []
+    if tensor_send_prev is not None:
+        send_prev_op = dist.broadcast(tensor_send_prev,
+                                      src=rank,
+                                      group=up_group,
+                                      async_op=True)
+        ops.append(send_prev_op)
+    if tensor_recv_prev is not None:
+        recv_prev_op = dist.broadcast(tensor_recv_prev,
+                                      src=prev_rank,
+                                      group=up_group,
+                                      async_op=True)
+        ops.append(recv_prev_op)
+    if tensor_recv_next is not None:
+        recv_next_op = dist.broadcast(tensor_recv_next,
+                                      src=next_rank,
+                                      group=down_group,
+                                      async_op=True)
+        ops.append(recv_next_op)
+    if tensor_send_next is not None:
+        send_next_op = dist.broadcast(tensor_send_next,
+                                      src=rank,
+                                      group=down_group,
+                                      async_op=True)
+        ops.append(send_next_op)
+    # Block until every issued broadcast has completed.
+    for req in ops:
+        req.wait()
+    # To protect against race condition when using batch_isend_irecv().
+    # NOTE(review): this function issues broadcasts, not batch_isend_irecv;
+    # the comment above appears to be inherited from the Megatron original.
+    torch.cuda.synchronize()
+    return tensor_recv_prev, tensor_recv_next
+
+
+def recv_forward(input_tensor_shape, prev_rank=None, up_group=None):
+    """Receives the input tensor from the previous member in pipeline.
+
+    Nothing is received (and None is returned) when this process is the
+    first rank of the pipeline group.
+
+    :param input_tensor_shape: The shape of the tensor to be received
+    :param prev_rank: The rank of the source of the tensor
+    :param up_group: Communication group including the previous member in pipeline parallel group
+    :type input_tensor_shape: torch.Size
+    :type prev_rank: int, optional
+    :type up_group: ProcessGroup, optional
+    :return: The input tensor in forward step
+    :rtype: Tensor
+    """
+    if gpc.is_first_rank(ParallelMode.PIPELINE):
+        return None
+
+    received, _ = _communicate(recv_prev=True,
+                               recv_prev_shape=input_tensor_shape,
+                               prev_rank=prev_rank,
+                               up_group=up_group)
+    return received
+
+
+def recv_backward(output_grad_shape, next_rank=None, down_group=None):
+    """Receives the grad tensor from the next member in pipeline.
+
+    Nothing is received (and None is returned) when this process is the
+    last rank of the pipeline group.
+
+    :param output_grad_shape: The shape of the tensor to be received
+    :param next_rank: The rank of the source of the tensor
+    :param down_group: Communication group including the next member in pipeline parallel group
+    :type output_grad_shape: torch.Size
+    :type next_rank: int, optional
+    :type down_group: ProcessGroup, optional
+    :return: The grad of output tensor in forward step
+    :rtype: Tensor
+    """
+    if gpc.is_last_rank(ParallelMode.PIPELINE):
+        return None
+
+    _, received_grad = _communicate(recv_next=True,
+                                    recv_next_shape=output_grad_shape,
+                                    next_rank=next_rank,
+                                    down_group=down_group)
+    return received_grad
+
+
+def send_forward(output_tensor,
+                 next_rank=None,
+                 down_group=None):
+    """Sends the output tensor to the next member in pipeline.
+
+    Does nothing when this process is the last rank of the pipeline group.
+
+    :param output_tensor: Tensor to be sent
+    :param next_rank: The rank of the recipient of the tensor
+    :param down_group: Communication group including the next member in pipeline parallel group
+    :type output_tensor: Tensor
+    :type next_rank: int, optional
+    :type down_group: ProcessGroup, optional
+    """
+    if gpc.is_last_rank(ParallelMode.PIPELINE):
+        return
+
+    _communicate(tensor_send_next=output_tensor,
+                 next_rank=next_rank,
+                 down_group=down_group)
+
+
+def send_backward(input_tensor_grad,
+                  prev_rank=None,
+                  up_group=None):
+    """Sends the grad tensor to the previous member in pipeline.
+
+    Does nothing when this process is the first rank of the pipeline group.
+
+    :param input_tensor_grad: Tensor to be sent
+    :param prev_rank: The rank of the recipient of the tensor
+    :param up_group: Communication group including the previous member in pipeline parallel group
+    :type input_tensor_grad: Tensor
+    :type prev_rank: int, optional
+    :type up_group: ProcessGroup, optional
+    """
+    if gpc.is_first_rank(ParallelMode.PIPELINE):
+        return
+
+    _communicate(tensor_send_prev=input_tensor_grad,
+                 prev_rank=prev_rank,
+                 up_group=up_group)
+
+
+def send_forward_recv_backward(output_tensor,
+                               output_grad_shape,
+                               recv_next=True,
+                               next_rank=None,
+                               down_group=None):
+    """Batched communication operation. Sends the output tensor to the
+    next member in pipeline, while receiving the grad tensor from the
+    next member in pipeline.
+
+    Nothing is communicated (and None is returned) when this process is the
+    last rank of the pipeline group.
+
+    :param output_tensor: Tensor to be sent
+    :param output_grad_shape: The shape of the tensor to be received
+    :param recv_next: Whether a grad tensor is received from the next member
+    :param next_rank: The rank of the next member in pipeline
+    :param down_group: Communication group including the next member in pipeline parallel group
+    :type output_tensor: Tensor
+    :type output_grad_shape: torch.Size
+    :type recv_next: bool, optional
+    :type next_rank: int, optional
+    :type down_group: ProcessGroup, optional
+    :return: The grad of output tensor in forward step
+    :rtype: Tensor
+    """
+    if gpc.is_last_rank(ParallelMode.PIPELINE):
+        return None
+
+    _, received_grad = _communicate(tensor_send_next=output_tensor,
+                                    recv_next=recv_next,
+                                    recv_next_shape=output_grad_shape,
+                                    next_rank=next_rank,
+                                    down_group=down_group)
+    return received_grad
+
+
+def send_backward_recv_forward(input_tensor_grad,
+                               input_tensor_shape,
+                               recv_prev=True,
+                               prev_rank=None,
+                               up_group=None):
+    """Batched communication operation. Sends the grad tensor to the
+    previous member in pipeline, while receiving the input tensor from the
+    previous member in pipeline.
+
+    Nothing is communicated (and None is returned) when this process is the
+    first rank of the pipeline group.
+
+    :param input_tensor_grad: Tensor to be sent
+    :param input_tensor_shape: The shape of the tensor to be received
+    :param recv_prev: Whether an input tensor is received from the previous member
+    :param prev_rank: The rank of the previous member in pipeline
+    :param up_group: Communication group including the previous member in pipeline parallel group
+    :type input_tensor_grad: Tensor
+    :type input_tensor_shape: torch.Size
+    :type recv_prev: bool, optional
+    :type prev_rank: int, optional
+    :type up_group: ProcessGroup, optional
+    :return: The input tensor in forward step
+    :rtype: Tensor
+    """
+    if gpc.is_first_rank(ParallelMode.PIPELINE):
+        return None
+
+    received, _ = _communicate(tensor_send_prev=input_tensor_grad,
+                               recv_prev=recv_prev,
+                               recv_prev_shape=input_tensor_shape,
+                               prev_rank=prev_rank,
+                               up_group=up_group)
+    return received
+
+
+def send_forward_recv_forward(output_tensor,
+                              input_tensor_shape,
+                              recv_prev=True,
+                              prev_rank=None,
+                              next_rank=None,
+                              up_group=None,
+                              down_group=None):
+    """Batched communication operation. Sends the output tensor to the
+    next member in pipeline, while receiving the input tensor from the
+    previous member in pipeline.
+
+    :param output_tensor: Tensor to be sent
+    :param input_tensor_shape: The shape of the tensor to be received
+    :param recv_prev: Whether an input tensor is received from the previous member
+    :param prev_rank: The rank of the previous member in pipeline
+    :param next_rank: The rank of the next member in pipeline
+    :param up_group: Communication group including the previous member in pipeline parallel group
+    :param down_group: Communication group including the next member in pipeline parallel group
+    :type output_tensor: Tensor
+    :type input_tensor_shape: torch.Size
+    :type recv_prev: bool, optional
+    :type prev_rank: int, optional
+    :type next_rank: int, optional
+    :type up_group: ProcessGroup, optional
+    :type down_group: ProcessGroup, optional
+    :return: The input tensor in forward step
+    :rtype: Tensor
+    """
+    # only the tensor coming from the previous member is of interest here
+    from_prev, _ = _communicate(tensor_send_next=output_tensor,
+                                recv_prev=recv_prev,
+                                recv_prev_shape=input_tensor_shape,
+                                prev_rank=prev_rank,
+                                next_rank=next_rank,
+                                up_group=up_group,
+                                down_group=down_group)
+    return from_prev
+
+
+def send_backward_recv_backward(input_tensor_grad,
+                                output_grad_shape,
+                                recv_next=True,
+                                prev_rank=None,
+                                next_rank=None,
+                                up_group=None,
+                                down_group=None):
+    """Batched communication operation. Sends the grad tensor to the
+    previous member in pipeline, while receiving the grad tensor from the
+    next member in pipeline.
+
+    :param input_tensor_grad: Tensor to be sent
+    :param output_grad_shape: The shape of the tensor to be received
+    :param recv_next: Whether a grad tensor is received from the next member
+    :param prev_rank: The rank of the previous member in pipeline
+    :param next_rank: The rank of the next member in pipeline
+    :param up_group: Communication group including the previous member in pipeline parallel group
+    :param down_group: Communication group including the next member in pipeline parallel group
+    :type input_tensor_grad: Tensor
+    :type output_grad_shape: torch.Size
+    :type recv_next: bool, optional
+    :type prev_rank: int, optional
+    :type next_rank: int, optional
+    :type up_group: ProcessGroup, optional
+    :type down_group: ProcessGroup, optional
+    :return: The grad of output tensor in forward step
+    :rtype: Tensor
+    """
+    # only the tensor coming from the next member is of interest here
+    _, from_next = _communicate(tensor_send_prev=input_tensor_grad,
+                                recv_next=recv_next,
+                                recv_next_shape=output_grad_shape,
+                                prev_rank=prev_rank,
+                                next_rank=next_rank,
+                                up_group=up_group,
+                                down_group=down_group)
+    return from_next
+
+
+def send_forward_backward_recv_forward_backward(output_tensor,
+                                                input_tensor_grad,
+                                                input_tensor_shape,
+                                                output_grad_shape,
+                                                recv_prev=True,
+                                                recv_next=True,
+                                                prev_rank=None,
+                                                next_rank=None,
+                                                up_group=None,
+                                                down_group=None):
+    """Batched communication operation. Sends the output tensor to the next and
+    the grad tensor to the previous, while receiving the grad tensor from the
+    next and the input tensor from the previous.
+
+    :param output_tensor: Tensor sent to the next
+    :param input_tensor_grad: Tensor sent to the previous
+    :param input_tensor_shape: The shape of the tensor received from the previous
+    :param output_grad_shape: The shape of the tensor received from the next
+    :param recv_prev: Whether a tensor is received from the previous member
+    :param recv_next: Whether a tensor is received from the next member
+    :param prev_rank: The rank of the previous member in pipeline
+    :param next_rank: The rank of the next member in pipeline
+    :param up_group: Communication group including the previous member in pipeline parallel group
+    :param down_group: Communication group including the next member in pipeline parallel group
+    :type output_tensor: Tensor
+    :type input_tensor_grad: Tensor
+    :type input_tensor_shape: torch.Size
+    :type output_grad_shape: torch.Size
+    :type recv_prev: bool, optional
+    :type recv_next: bool, optional
+    :type prev_rank: int, optional
+    :type next_rank: int, optional
+    :type up_group: ProcessGroup, optional
+    :type down_group: ProcessGroup, optional
+    :return: (the input tensor in forward step, the grad of output tensor in forward step)
+    :rtype: (Tensor, Tensor)
+    """
+    from_prev, from_next = _communicate(tensor_send_next=output_tensor,
+                                        tensor_send_prev=input_tensor_grad,
+                                        recv_prev=recv_prev,
+                                        recv_next=recv_next,
+                                        recv_prev_shape=input_tensor_shape,
+                                        recv_next_shape=output_grad_shape,
+                                        prev_rank=prev_rank,
+                                        next_rank=next_rank,
+                                        up_group=up_group,
+                                        down_group=down_group)
+    return from_prev, from_next
--- a/colossalai/communication/ring.py
+++ b/colossalai/communication/ring.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import torch
+
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.utils import get_current_device, synchronize
+
+
+def ring_forward(tensor_send_next: torch.Tensor, parallel_mode: ParallelMode):
+    """Sends a tensor to the next member and receives a tensor from the previous member.
+    This function returns the received tensor from the previous member.
+
+    The receive buffer has the same size and dtype as ``tensor_send_next``.
+
+    :param tensor_send_next: Tensor sent to next member
+    :param parallel_mode: Parallel group mode used in this communication
+    :type tensor_send_next: Tensor
+    :type parallel_mode: ParallelMode
+    :return: The tensor received from the previous
+    :rtype: Tensor
+    """
+    buffer_shape = tensor_send_next.size()
+
+    ops = []
+    current_rank = gpc.get_global_rank()
+
+    # placeholder that the irecv below fills in
+    tensor_recv_prev = torch.empty(buffer_shape,
+                                   requires_grad=True,
+                                   device=get_current_device(),
+                                   dtype=tensor_send_next.dtype)
+
+    # send to next rank
+    send_next_op = torch.distributed.P2POp(
+        torch.distributed.isend, tensor_send_next,
+        gpc.get_next_global_rank(parallel_mode))
+    ops.append(send_next_op)
+
+    # receive from prev rank
+    recv_prev_op = torch.distributed.P2POp(
+        torch.distributed.irecv, tensor_recv_prev,
+        gpc.get_prev_global_rank(parallel_mode))
+    ops.append(recv_prev_op)
+
+    # ranks with an even global rank post the receive before the send
+    # NOTE(review): presumably to avoid every rank blocking on a send at the
+    # same time - confirm before changing the order
+    if current_rank % 2 == 0:
+        ops = ops[::-1]
+
+    reqs = torch.distributed.batch_isend_irecv(ops)
+    for req in reqs:
+        req.wait()
+
+    # To protect against race condition when using batch_isend_irecv().
+    synchronize()
+
+    return tensor_recv_prev
--- a/colossalai/communication/utils.py
+++ b/colossalai/communication/utils.py
+import torch
+import torch.distributed as dist
+
+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
+from colossalai.utils import get_current_device
+
+
+def send_tensor_meta(tensor, need_meta=True, down_group=None):
+    """Sends tensor meta information before sending a specific tensor.
+    The recipient of a p2p communication has to know the shape of the tensor
+    in advance, so the number of dimensions and then the shape itself are
+    broadcast first. This function synchronizes with :func:`recv_tensor_meta`.
+
+    :param tensor: Tensor to be sent
+    :param need_meta: If False, meta information won't be sent
+    :param down_group: Communication group including the next member in pipeline parallel group
+    :type tensor: Tensor
+    :type need_meta: bool, optional
+    :type down_group: ProcessGroup, optional
+    :return: False
+    :rtype: bool
+    """
+    if not need_meta:
+        return False
+
+    src_rank = gpc.get_global_rank()
+    if down_group is None:
+        down_group = gpc.get_group(ParallelMode.PIPELINE_NEXT)
+
+    long_on_device = dict(dtype=torch.long, device=get_current_device())
+    shape = tensor.size()
+    shape_msg = torch.tensor(shape, **long_on_device)
+    ndims_msg = torch.tensor(len(shape), **long_on_device)
+
+    # the receiver needs the dimension count first to size its shape buffer
+    dist.broadcast(ndims_msg, src=src_rank, group=down_group)
+    dist.broadcast(shape_msg, src=src_rank, group=down_group)
+
+    return False
+
+
+def recv_tensor_meta(tensor_shape, prev_rank=None, up_group=None):
+    """Receives tensor meta information before receiving a specific tensor.
+    The recipient of a p2p communication has to know the shape of the tensor
+    in advance, so the number of dimensions and then the shape itself are
+    received first. This function synchronizes with :func:`send_tensor_meta`.
+
+    When ``tensor_shape`` is already known (not None) it is returned as is
+    and nothing is communicated.
+
+    :param tensor_shape: The shape of the tensor to be received
+    :param prev_rank: The rank of the source of the tensor
+    :param up_group: Communication group including the previous member in pipeline parallel group
+    :type tensor_shape: torch.Size
+    :type prev_rank: int, optional
+    :type up_group: ProcessGroup, optional
+    :return: The shape of the tensor to be received
+    :rtype: torch.Size
+    """
+    if tensor_shape is not None:
+        return tensor_shape
+
+    if prev_rank is None:
+        prev_rank = gpc.get_prev_global_rank(ParallelMode.PIPELINE)
+    if up_group is None:
+        up_group = gpc.get_group(ParallelMode.PIPELINE_PREV)
+
+    long_on_device = dict(dtype=torch.long, device=get_current_device())
+
+    # first the number of dimensions ...
+    ndims_msg = torch.empty((), **long_on_device)
+    dist.broadcast(ndims_msg, src=prev_rank, group=up_group)
+
+    # ... which sizes the buffer for the shape itself
+    shape_msg = torch.empty(ndims_msg, **long_on_device)
+    dist.broadcast(shape_msg, src=prev_rank, group=up_group)
+
+    return torch.Size(shape_msg)
--- a/colossalai/constants.py
+++ b/colossalai/constants.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+# accepted values for the mode setting
+ALLOWED_MODES = [None, '1d', '2d', '2.5d', '3d', 'sequence']
+
+# initializer: maps a configuration key to the name of an initializer
+INITIALIZER_MAPPING = {
+    'data': 'Initializer_Data',
+    'tensor': 'Initializer_Tensor',
+    'pipeline': 'Initializer_Pipeline',
+    'embedding': 'Initializer_Embedding',
+    '1d': 'Initializer_1D',
+    '2d': 'Initializer_2D',
+    '2.5d': 'Initializer_2p5D',
+    '3d': 'Initializer_3D',
+    'sequence': 'Initializer_Sequence'
+}
+
+# 2D parallel
+SUMMA_DIM = 'SUMMA_DIM'
+
+# 2.5D parallel
+TESSERACT_DIM = 'TESSERACT_DIM'
+TESSERACT_DEP = 'TESSERACT_DEP'
+
+# 3D parallel
+DEPTH_3D = 'DEPTH_3D'
+
+# Tensor parallel attributes
+IS_TENSOR_PARALLEL = 'is_tensor_parallel'
+TENSOR_PARALLEL_ATTRIBUTES = [IS_TENSOR_PARALLEL]
--- a/colossalai/context/__init__.py
+++ b/colossalai/context/__init__.py
+from .config import Config
+from .parallel_context import ParallelContext
+from .parallel_context import ParallelMode
+from .process_group_initializer import *
+from .random import *
--- a/colossalai/context/_utils.py
+++ b/colossalai/context/_utils.py
+import math
+
+
+def set_parallel_size(obj, config: dict, key: str, attr_name: str):
+    """Copies a parallel size from ``config`` onto an attribute of ``obj``.
+
+    Nothing happens when ``key`` is absent from ``config``. An int entry is
+    used directly; a dict entry contributes its ``'size'`` item.
+
+    :param obj: Object that receives the attribute
+    :param config: Configuration to read the size from
+    :param key: Key of the entry in ``config``
+    :param attr_name: Name of the attribute set on ``obj``
+    :type config: dict
+    :type key: str
+    :type attr_name: str
+    :raises NotImplementedError: If the entry is neither an int nor a dict
+    """
+    if key not in config:
+        return
+
+    entry = config[key]
+    if isinstance(entry, int):
+        size = entry
+    elif isinstance(entry, dict):
+        size = entry['size']
+    else:
+        raise NotImplementedError(
+            f"Parallel configuration does not support this kind of argument, please use int or dict"
+        )
+    setattr(obj, attr_name, size)
+
+
+def add_tensor_pg(pg_init, mode, size, depth=None):
+    """Appends the process-group initializer settings for a tensor parallel
+    mode to ``pg_init``.
+
+    :param pg_init: List that the initializer dicts are appended to
+    :param mode: One of '1d', '2d', '2.5d' or '3d'
+    :param size: Tensor parallel size the dimensions are derived from
+    :param depth: Depth used by the '2.5d' mode, defaults to None
+    :type pg_init: list
+    :type mode: str
+    :type size: int
+    :type depth: int, optional
+    :raises NotImplementedError: If ``mode`` is not one of the modes above
+    """
+    if mode == '1d':
+        entries = [dict(type='Initializer1D', parallel_size=size)]
+    elif mode == '2d':
+        # side length of the square grid
+        summa_dim = math.floor(math.sqrt(size))
+        entries = [dict(type=name, summa_dim=summa_dim)
+                   for name in ('Initializer2D_Col', 'Initializer2D_Row')]
+    elif mode == '2.5d':
+        # side length of each of the `depth` square layers
+        tesseract_dim = math.floor(math.sqrt(size // depth))
+        entries = [dict(type=name, tesseract_dim=tesseract_dim, tesseract_dep=depth)
+                   for name in ('Initializer_Tesseract_ROW',
+                                'Initializer_Tesseract_COL',
+                                'Initializer_Tesseract_DEP',
+                                'Initializer_Tesseract_XZ')]
+    elif mode == '3d':
+        # cube root of size, rounded to the nearest integer
+        cube_dim = math.floor(math.pow(size, 1.0 / 3.0) + 0.5)
+        entries = [dict(type=name, depth=cube_dim)
+                   for name in ('ParallelInitializer3D_Input',
+                                'ParallelInitializer3D_Weight',
+                                'ParallelInitializer3D_Output')]
+    else:
+        raise NotImplementedError("This kind of tensor splitting has not been implemented yet")
+
+    pg_init.extend(entries)
--- a/colossalai/context/config.py
+++ b/colossalai/context/config.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import inspect
+import sys
+from importlib.machinery import SourceFileLoader
+from pathlib import Path
+
+
+class Config(dict):
+    """This is a wrapper class for dict objects so that values of which can be
+    accessed as attributes. Nested dicts are wrapped into :class:`Config`
+    objects as well.
+
+    :param config: The dict object to be wrapped
+    :type config: dict
+    """
+
+    def __init__(self, config: dict = None):
+        if config is not None:
+            for k, v in config.items():
+                self._add_item(k, v)
+
+    def __missing__(self, key):
+        raise KeyError(key)
+
+    def __getattr__(self, key):
+        # attribute access falls back to item lookup; a missing key has to
+        # surface as AttributeError so that hasattr()/getattr() behave
+        try:
+            value = super(Config, self).__getitem__(key)
+            return value
+        except KeyError:
+            raise AttributeError(key)
+
+    def __setattr__(self, key, value):
+        # attributes are stored as dict items
+        super(Config, self).__setitem__(key, value)
+
+    def _add_item(self, key, value):
+        """Stores ``value`` under ``key``, wrapping plain dicts into Config."""
+        if isinstance(value, dict):
+            self.__setattr__(key, Config(value))
+        else:
+            self.__setattr__(key, value)
+
+    def update(self, config):
+        """Adds all items of ``config`` to this object and returns ``self``.
+
+        :param config: Items to add
+        :type config: dict or :class:`Config`
+        :raises AssertionError: If ``config`` is neither a dict nor a Config
+        :return: This object
+        :rtype: :class:`Config`
+        """
+        assert isinstance(config, (Config, dict)), 'can only update dictionary or Config objects.'
+        for k, v in config.items():
+            self._add_item(k, v)
+        return self
+
+    @staticmethod
+    def from_file(filename: str):
+        """Reads a python file and constructs a corresponding :class:`Config` object.
+
+        The file is executed as a module. While it is loaded its directory is
+        on ``sys.path`` so that it can import modules located next to it.
+        Names starting with ``__``, modules and classes are not copied into
+        the result.
+
+        :param filename: Name of the file to construct the return object
+        :type filename: str or pathlib.Path
+        :raises TypeError: If ``filename`` is neither a str nor a path-like object
+        :raises AssertionError: Raises an AssertionError if the file does not exist, or the file
+            is not .py file
+        :return: A :class:`Config` object constructed with information in the file
+        :rtype: :class:`Config`
+        """
+
+        # check config path; Path() accepts str and Path alike and rejects
+        # other types with a TypeError
+        filepath = Path(filename).absolute()
+
+        assert filepath.exists(), f'{filename} is not found, please check your configuration path'
+
+        # check extension
+        extension = filepath.suffix
+        assert extension == '.py', 'only .py files are supported'
+
+        # import the config as module
+        # sys.path holds strings, so the directory (not the file, and not a
+        # Path object) has to be compared and inserted as str
+        config_dir = str(filepath.parent)
+        remove_path = False
+        if config_dir not in sys.path:
+            sys.path.insert(0, config_dir)
+            remove_path = True
+
+        module_name = filepath.stem
+        source_file = SourceFileLoader(fullname=str(module_name), path=str(filepath))
+        try:
+            module = source_file.load_module()
+
+            # load into config
+            config = Config()
+
+            for k, v in module.__dict__.items():
+                if k.startswith('__') or inspect.ismodule(v) or inspect.isclass(v):
+                    continue
+                config._add_item(k, v)
+        finally:
+            # undo the import side effects even when loading the file failed
+            sys.modules.pop(module_name, None)
+            if remove_path and config_dir in sys.path:
+                sys.path.remove(config_dir)
+
+        # TODO: replace with logger warning here when logger is done
+        print('warning: variables which starts with __, is a module or class declaration are omitted')
+
+        return config
--- a/colossalai/context/parallel_context.py
+++ b/colossalai/context/parallel_context.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+import os
+import random
+from typing import Union
+
+import numpy as np
+import torch
+import torch.distributed as dist
+
+from colossalai.constants import ALLOWED_MODES, INITIALIZER_MAPPING
+from colossalai.context.config import Config
+from colossalai.registry import DIST_GROUP_INITIALIZER
+from ._utils import set_parallel_size
+from .parallel_mode import ParallelMode
+from .random import add_seed, get_seeds, set_mode
+
+
+class ParallelContext:
+    """This class provides interface functions for users to get the parallel context, 
+    such as the global rank, the local rank, the world size, etc. of each device.
+
+    :param args: The distributed arguments in the system
+    :type args: dict
+    """
+
+    def __init__(self, args=None):
+        # distributed arguments as passed in; the config stays unset until
+        # load_config is called
+        self._dist_args = args
+        self._config = None
+
+        # bookkeeping tables, all empty at construction
+        self._global_ranks = {}
+        self._local_ranks = {}
+        self._world_sizes = {}
+        self._groups = {}
+        self._ranks_in_group = {}
+
+        # default 3D parallel args, will be overwritten during process group initialization
+        self.world_size = 1
+        self.data_parallel_size = 1
+        self.pipeline_parallel_size = 1
+        self.tensor_parallel_size = 1
+
+    @property
+    def config(self):
+        return self._config
+
+    def load_config(self, config: Union[dict, str]):
+        """Loads the configuration from either a dict or a file.
+
+        :param config: Either a dict containing the configuration information or the filename
+            of a file containing the configuration information
+        :type config: dict or str
+        :raises TypeError: Raises a TypeError if `config` is neither a dict or a str
+        """
+        if isinstance(config, str):
+            self._config = Config.from_file(config)
+        elif isinstance(config, dict):
+            self._config = Config(config)
+        else:
+            raise TypeError("Invalid type for config, only dictionary or string is supported")
+
+    def set_dist_args(self, args):
+        """Sets the distributed arguments.
+
+        :param args: The distributed arguments in the system
+        :type args: dict
+        """
+        self._dist_args = args
+
+    @staticmethod
+    def _check_parallel_mode(parallel_mode: ParallelMode):
+        assert isinstance(parallel_mode, ParallelMode)
+
+    def get_global_rank(self):
+        """Returns the global rank of the current device.
+
+        :return: The global rank of the current device
+        :rtype: int
+        """
+        return self._global_ranks[ParallelMode.GLOBAL]
+
+    def add_global_rank(self, parallel_mode: ParallelMode, rank: int):
+        """Adds the global rank of the current device for `parallel_mode` to the context.
+
+        :param parallel_mode: The parallel mode for the rank
+        :type parallel_mode: :class:`colossalai.context.ParallelMode`
+        :param rank: The rank to be added
+        :type rank: int
+        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
+            of :class:`colossalai.context.ParallelMode`
+        """
+        self._check_parallel_mode(parallel_mode)
+        self._global_ranks[parallel_mode] = rank
+
+    def get_local_rank(self, parallel_mode: ParallelMode):
+        """Returns the local rank of the current device.
+
+        :param parallel_mode: The chosen parallel mode
+        :type parallel_mode: :class:`colossalai.context.ParallelMode`
+        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
+            of :class:`colossalai.context.ParallelMode`
+        :return: The local rank of the current device for `parallel_mode`
+        :rtype: int
+        """
+        self._check_parallel_mode(parallel_mode)
+        return self._local_ranks[parallel_mode]
+
+    def add_local_rank(self, parallel_mode: ParallelMode, rank: int):
+        """Adds the local rank of the current device for `parallel_mode` to the context.
+
+        :param parallel_mode: The parallel mode for the rank
+        :type parallel_mode: :class:`colossalai.context.ParallelMode`
+        :param rank: The rank to be added
+        :type rank: int
+        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
+            of :class:`colossalai.context.ParallelMode`
+        """
+        self._check_parallel_mode(parallel_mode)
+        self._local_ranks[parallel_mode] = rank
+
+    def get_next_global_rank(self, parallel_mode: ParallelMode):
+        """Returns the global rank of the next device.
+
+        :param parallel_mode: The chosen parallel mode
+        :type parallel_mode: :class:`colossalai.context.ParallelMode`
+        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
+            of :class:`colossalai.context.ParallelMode`
+        :return: The global rank of the next device for `parallel_mode`
+        :rtype: int
+        """
+        self._check_parallel_mode(parallel_mode)
+
+        # get rank and world size
+        local_rank = self.get_local_rank(parallel_mode)
+        world_size = self.get_world_size(parallel_mode)
+        ranks_in_group = self.get_ranks_in_group(parallel_mode)
+
+        return ranks_in_group[(local_rank + 1) % world_size]
+
+    def get_prev_global_rank(self, parallel_mode: ParallelMode):
+        """Returns the global rank of the previous device.
+
+        :param parallel_mode: The chosen parallel mode
+        :type parallel_mode: :class:`colossalai.context.ParallelMode`
+        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
+            of :class:`colossalai.context.ParallelMode`
+        :return: The global rank of the previous device for `parallel_mode`
+        :rtype: int
+        """
+        self._check_parallel_mode(parallel_mode)
+
+        # get rank and world size
+        local_rank = self.get_local_rank(parallel_mode)
+        world_size = self.get_world_size(parallel_mode)
+        ranks_in_group = self.get_ranks_in_group(parallel_mode)
+
+        return ranks_in_group[(local_rank - 1) % world_size]
+
+    def is_first_rank(self, parallel_mode: ParallelMode):
+        """Returns a boolean value indicating whether the current device is the first one
+        among its group for `parallel_mode`.
+
+        :param parallel_mode: The chosen parallel mode
+        :type parallel_mode: :class:`colossalai.context.ParallelMode`
+        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
+            of :class:`colossalai.context.ParallelMode`
+        :return: a boolean value indicating whether the current device is the first one
+            among its group for `parallel_mode`
+        :rtype: bool
+        """
+        rank = self.get_local_rank(parallel_mode)
+        return rank == 0
+
+    def is_last_rank(self, parallel_mode: ParallelMode):
+        """Returns a boolean value indicating whether the current device is the last one
+        among its group for `parallel_mode`.
+
+        :param parallel_mode: The chosen parallel mode
+        :type parallel_mode: :class:`colossalai.context.ParallelMode`
+        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
+            of :class:`colossalai.context.ParallelMode`
+        :return: a boolean value indicating whether the current device is the last one
+            among its group for `parallel_mode`
+        :rtype: bool
+        """
+        rank = self.get_local_rank(parallel_mode)
+        world_size = self.get_world_size(parallel_mode)
+        return rank == world_size - 1
+
+    def get_world_size(self, parallel_mode: ParallelMode):
+        """Returns the world size for `parallel_mode`.
+
+        :param parallel_mode: The chosen parallel mode
+        :type parallel_mode: :class:`colossalai.context.ParallelMode`
+        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
+            of :class:`colossalai.context.ParallelMode`
+        :return: The world size for `parallel_mode`
+        :rtype: int
+        """
+        self._check_parallel_mode(parallel_mode)
+        return self._world_sizes[parallel_mode]
+
+    def add_world_size(self, parallel_mode: ParallelMode, world_size: int):
+        """Adds world size for `parallel_mode`.
+
+        :param parallel_mode: The chosen parallel mode
+        :type parallel_mode: :class:`colossalai.context.ParallelMode`
+        :param world_size: The world size to be added
+        :type world_size: int
+        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
+            of :class:`colossalai.context.ParallelMode`
+        """
+        self._check_parallel_mode(parallel_mode)
+        self._world_sizes[parallel_mode] = world_size
+
+    def get_group(self, parallel_mode: ParallelMode):
+        """Returns the group of the current device for `parallel_mode`.
+
+        :param parallel_mode: The chosen parallel mode
+        :type parallel_mode: :class:`colossalai.context.ParallelMode`
+        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
+            of :class:`colossalai.context.ParallelMode`
+        :return: The group of the current device for `parallel_mode`
+        :rtype: torch.distributed.ProcessGroup
+        """
+        self._check_parallel_mode(parallel_mode)
+        return self._groups[parallel_mode]
+
+    def add_group(self, parallel_mode: ParallelMode, group: dist.ProcessGroup):
+        """Adds the group of the current device for `parallel_mode`.
+
+        :param parallel_mode: The chosen parallel mode
+        :type parallel_mode: :class:`colossalai.context.ParallelMode`
+        :param group: The group to be added
+        :type group: torch.distributed.ProcessGroup
+        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
+            of :class:`colossalai.context.ParallelMode`
+        """
+        self._check_parallel_mode(parallel_mode)
+        self._groups[parallel_mode] = group
+
+    def get_ranks_in_group(self, parallel_mode: ParallelMode):
+        """Returns the rank of the current device for `parallel_mode` in the group.
+
+        :param parallel_mode: The chosen parallel mode
+        :type parallel_mode: :class:`colossalai.context.ParallelMode`
+        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
+            of :class:`colossalai.context.ParallelMode`
+        :return: the rank of the current device for `parallel_mode` in the group
+        :rtype: int
+        """
+        self._check_parallel_mode(parallel_mode)
+        return self._ranks_in_group[parallel_mode]
+
+    def add_ranks_in_group(self, parallel_mode: ParallelMode, ranks: list):
+        """Adds the ranks of the current device for `parallel_mode` in the group.
+
+        :param parallel_mode: The chosen parallel mode
+        :type parallel_mode: :class:`colossalai.context.ParallelMode`
+        :param ranks: List of ranks to be added
+        :type ranks: list
+        :raises AssertionError: Raises an AssertionError if `parallel_mode` is not an instance
+            of :class:`colossalai.context.ParallelMode`
+        """
+        self._check_parallel_mode(parallel_mode)
+        self._ranks_in_group[parallel_mode] = ranks
+
+    def init_global_dist(self, addr=None, port=None):
+        """Initializes the global distributed environment.
+
+        :param addr: The IP address of the current device
+        :type addr: str, optional
+        :param port: The port to be used in the system of the current device
+        :type port: int, optional
+        """
+        # get config
+        rank = self._dist_args.local_rank
+        world_size = self._dist_args.world_size
+        # default env config, overwrite by exporting
+        # them in your bash script
+        addr = os.getenv('MASTER_ADDR', 'localhost') if addr is None else addr
+        port = os.getenv('MASTER_PORT', '8008') if port is None else port
+        init_method = f'tcp://{addr}:{port}'
+
+        dist.init_process_group(backend=self._dist_args.backend,
+                                rank=rank,
+                                world_size=world_size,
+                                init_method=init_method)
+
+        # None will give the default global process group for pytorch dist operations
+        self._register_dist(rank, world_size, None,
+                            list(range(world_size)), ParallelMode.GLOBAL)
+        self._global_ranks[ParallelMode.GLOBAL] = rank
+
+    def _register_dist(self, local_rank, world_size,
+                       process_group, ranks_in_group, mode):
+        self.add_local_rank(mode, local_rank)
+        self.add_world_size(mode, world_size)
+        self.add_group(mode, process_group)
+        self.add_ranks_in_group(mode, ranks_in_group)
+
+    def check_sanity(self):
+        """Checks sanity of the parallel context.
+
+        :raises AssertionError: Raises an AssertionError if the world size does not equal to the product
+            of data paralle size, pipeline parallel size and tensor parallel size
+        """
+        dps = self.data_parallel_size
+        pps = self.pipeline_parallel_size
+        tps = self.tensor_parallel_size
+        ws = self.world_size
+        assert ws == dps * pps * tps, f"Expected the world size {ws} to be equal to data parallel size ({dps}) * pipeline parallel size ({pps}) * tensor parallel size ({tps})"
+
+    def init_parallel_groups(self):
+        """Initializes the parallel groups.
+
+        :raises AssertionError: Raises an AssertionError if the field paralle is not present in the config file
+        """
+
+        # get rank and world size
+        rank = self.get_global_rank()
+        world_size = self.get_world_size(ParallelMode.GLOBAL)
+        self.world_size = world_size
+
+        assert hasattr(self.config, 'parallel'), 'Expected the field parallel to be present in the config file'
+
+        # set parallel size as attributes for global context
+        parallel_config = self.config.parallel
+        set_parallel_size(self, parallel_config, 'pipeline',
+                          'pipeline_parallel_size')
+        set_parallel_size(self, parallel_config, 'tensor',
+                          'tensor_parallel_size')
+
+        # the user should not set the data parallel size manually
+        # instead, it should be calculated based on other parallel config
+        self.data_parallel_size = self.world_size // (self.pipeline_parallel_size * self.tensor_parallel_size)
+
+        # get the tensor parallel mode and check
+        tensor_parallel_mode = parallel_config['tensor'].get('mode', None)
+        assert tensor_parallel_mode in ALLOWED_MODES, f"mode in the parallel config must be set to one of {ALLOWED_MODES}"
+        self.check_sanity()
+
+        pg_init = []
+        # LSG: init data parallel process group for compatibility with other parallel module such as zero
+        pg_init.append(dict(type=INITIALIZER_MAPPING['data']))
+
+        if self.pipeline_parallel_size > 1:
+            pg_init.append(dict(type=INITIALIZER_MAPPING['pipeline']))
+        pg_init.append(dict(type=INITIALIZER_MAPPING['tensor']))
+
+        # init specific tensor parallel group
+        if tensor_parallel_mode is not None:
+            tensor_parallel_cfg = parallel_config['tensor'].copy()
+
+            # remove duplicate parameters
+            tensor_parallel_cfg.pop('mode')
+            tensor_parallel_cfg.pop('size')
+
+            # add this config to initialize later
+            pg_init.append(dict(type=INITIALIZER_MAPPING[tensor_parallel_mode.lower()], **tensor_parallel_cfg))
+
+        # run initialization of different process groups
+        for initializer_cfg in pg_init:
+            cfg = initializer_cfg.copy()
+            initializer_type = cfg.pop('type')
+            initializer = DIST_GROUP_INITIALIZER.get_module(initializer_type)(
+                rank, world_size, self.config,
+                self.data_parallel_size,
+                self.pipeline_parallel_size,
+                self.tensor_parallel_size,
+                **cfg)
+            parallel_setting = initializer.init_dist_group()
+            if isinstance(parallel_setting, list):
+                for args in parallel_setting:
+                    self._register_dist(*args)
+            else:
+                self._register_dist(*parallel_setting)
+
+    def is_initialized(self, parallel_mode: ParallelMode):
+        """Returns a boolean value indicating whether `parallel_mode` is initialized
+        in the current system.
+
+        :param parallel_mode: The chosen parallel mode
+        :type parallel_mode: :class:`colossalai.context.ParallelMode`
+        :return: a boolean value indicating whether `parallel_mode` is initialized
+            in the current system
+        :rtype: bool
+        """
+        return parallel_mode in self._groups
+
+    def destroy(self):
+        """Destroys the current distributed parallel environment.
+        """
+        for mode, group in self._groups.items():
+            if mode is not ParallelMode.GLOBAL:
+                dist.destroy_process_group(group)
+        # destroy global process group
+        dist.destroy_process_group()
+
+    def set_device(self):
+        """Sets distributed processes to be bound to devices.
+        """
+        devices_per_node = torch.cuda.device_count()
+        global_rank = self.get_global_rank()
+        device = global_rank % devices_per_node
+        torch.cuda.set_device(device)
+        print(f'process rank {global_rank} is bound to device {device}')
+
+    def set_seed(self):
+        """Sets seeds for all random libraries.
+        """
+        if hasattr(self.config, 'seed'):
+            seed = getattr(self.config, 'seed')
+        else:
+            seed = 2  # default seed
+
+        random.seed(seed)
+        np.random.seed(seed)
+        torch.manual_seed(seed)
+
+        global_rank = self.get_global_rank()
+
+        if torch.cuda.is_available():
+            # create random seed for different parallel modes
+            # data parallel seed are kept the same
+            parallel_seed = seed
+            add_seed(ParallelMode.DATA, parallel_seed)
+
+            # model parallel seeds are different across ranks
+            pipeline_offset = self._local_ranks.get(ParallelMode.PIPELINE, 0)
+
+            # add seed for data parallel and tensor parallel only
+            if self.is_initialized(ParallelMode.TENSOR):
+                tp_rank = self.get_local_rank(ParallelMode.TENSOR)
+                # 100 is only to increase the diff in seeds between pipeline stages
+                tp_rank_with_offset = tp_rank + pipeline_offset * 1024
+                tp_seed = seed + tp_rank_with_offset
+                add_seed(ParallelMode.TENSOR, tp_seed)
+
+            set_mode(ParallelMode.DATA)
+            seeds = get_seeds()
+            seed_str = ', '.join([f'{k}: {v}' for k, v in seeds.items()])
+
+            print(f"initialized seed on rank {global_rank}, "
+                  f"numpy: {seed}, python random: {seed}, {seed_str},"
+                  f"the default parallel seed is {ParallelMode.DATA}.", flush=True)
+        else:
+            print(f"initialized seed on rank {global_rank}, "
+                  f"numpy: {seed}, python random: {seed}, pytorch: {seed}", flush=True)
+            print('WARNING: CUDA is not available, thus CUDA RNG cannot be used to track CUDA random number states',
+                  flush=True)
--- a/colossalai/context/parallel_mode.py
+++ b/colossalai/context/parallel_mode.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+
+from enum import Enum
+
+
+# parallel modes
+class ParallelMode(Enum):
+    """Enumeration of all parallel modes known to the parallel context.
+
+    Each member's value is a short string identifier of the mode.
+    """
+
+    # the default group spanning all processes
+    GLOBAL = 'global'
+
+    # common parallel
+    DATA = 'data'
+
+    # pipeline parallel
+    PIPELINE = 'pipe'
+    PIPELINE_PREV = 'pipe_prev'
+    PIPELINE_NEXT = 'pipe_next'
+
+    # containing all ranks in tensor parallel
+    TENSOR = 'tensor'
+
+    # sequence parallel
+    SEQUENCE = 'sequence'
+
+    # 1D parallel
+    PARALLEL_1D = '1d'
+
+    # 2D parallel
+    PARALLEL_2D_ROW = '2d_row'
+    PARALLEL_2D_COL = '2d_col'
+
+    # 3D parallel
+    PARALLEL_3D_INPUT = '3d_input'
+    PARALLEL_3D_WEIGHT = '3d_weight'
+    PARALLEL_3D_OUTPUT = '3d_output'
+
+    # 2.5D parallel
+    PARALLEL_2P5D_ROW = '2p5d_row'
+    PARALLEL_2P5D_COL = '2p5d_col'
+    PARALLEL_2P5D_DEP = '2p5d_dep'
+    PARALLEL_2P5D_XZ = '2p5d_xz'
--- a/colossalai/context/process_group_initializer/__init__.py
+++ b/colossalai/context/process_group_initializer/__init__.py
+from .initializer_1d import Initializer_1D
+from .initializer_2d import Initializer_2D
+from .initializer_2p5d import Initializer_2p5D
+from .initializer_3d import Initializer_3D
+from .initializer_data import Initializer_Data
+from .initializer_pipeline import Initializer_Pipeline
+from .initializer_sequence import Initializer_Sequence
+from .initializer_tensor import Initializer_Tensor
+from .process_group_initializer import ProcessGroupInitializer
+
+# Names exported by `from <this package> import *`; every entry is imported above.
+__all__ = [
+    'Initializer_Tensor', 'Initializer_Sequence', 'Initializer_Pipeline',
+    'Initializer_Data', 'Initializer_2p5D', 'Initializer_2D', 'Initializer_3D',
+    'Initializer_1D', 'ProcessGroupInitializer'
+]