init colossalai, support dtk2304

08f2920e · zhuwenwen · da3f0934 · 08f2920e · 08f2920e · 08f2920e
Commit 08f2920e authored Apr 23, 2023 by zhuwenwen
20 changed files
--- a/colossalai/context/process_group_initializer/process_group_initializer.py
+++ b/colossalai/context/process_group_initializer/process_group_initializer.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+from abc import ABC, abstractmethod
+from colossalai.context import Config
+class ProcessGroupInitializer(ABC):
+    """Abstract base class for objects that build parallel process groups.
+    An initializer only records the parallelism layout it is given; subclasses
+    create the actual groups in :meth:`init_dist_group`.
+    Args:
+        rank (int): Rank of the current process.
+        world_size (int): Number of processes in the whole communication world.
+        config (Config): Running configuration.
+        data_parallel_size (int): Size of the data parallel dimension.
+        pipeline_parallel_size (int): Size of the pipeline parallel dimension.
+        tensor_parallel_size (int): Size of the tensor parallel dimension.
+    """
+    def __init__(self, rank: int, world_size: int, config: Config, data_parallel_size: int, pipeline_parallel_size: int,
+                 tensor_parallel_size: int):
+        super().__init__()
+        # identity of this process inside the world
+        self.rank, self.world_size = rank, world_size
+        self.config = config
+        # sizes of the three parallel dimensions
+        self.data_parallel_size = data_parallel_size
+        self.pipeline_parallel_size = pipeline_parallel_size
+        self.tensor_parallel_size = tensor_parallel_size
+    @abstractmethod
+    def init_dist_group(self):
+        """Create the process group(s) of this initializer; must be overridden by subclasses."""
--- a/colossalai/context/random/__init__.py
+++ b/colossalai/context/random/__init__.py
+from ._helper import (seed, set_mode, with_seed, add_seed, get_seeds, get_states, get_current_mode, set_seed_states,
+                      sync_states, moe_set_seed, reset_seeds)
+__all__ = [
+    'seed', 'set_mode', 'with_seed', 'add_seed', 'get_seeds', 'get_states', 'get_current_mode', 'set_seed_states',
+    'sync_states', 'moe_set_seed', 'reset_seeds'
+]
--- a/colossalai/context/random/_helper.py
+++ b/colossalai/context/random/_helper.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+import functools
+from contextlib import contextmanager
+import torch.cuda
+from torch import Tensor
+from .seed_manager import SeedManager
+from ..parallel_mode import ParallelMode
+# Single module-level SeedManager instance that every helper function in this module operates on.
+_SEED_MANAGER = SeedManager()
+def get_seeds():
+    """Fetch the seeds registered in the module-level seed manager.
+    Returns:
+        dict: The manager's ``seeds`` mapping (the mapping itself, not a copy).
+    """
+    registered_seeds = _SEED_MANAGER.seeds
+    return registered_seeds
+def get_states(copy=False):
+    """Fetch the saved RNG states of the module-level seed manager.
+    Args:
+        copy (bool, optional): When truthy, build a new dict whose values are ``clone()`` copies of the
+            saved states; otherwise the manager's own mapping is returned. Defaults to False.
+    Returns:
+        dict: The seed states keyed by parallel mode.
+    """
+    if not copy:
+        return _SEED_MANAGER.seed_states
+    # cloning detaches the snapshot from states the manager may overwrite later
+    return {mode: rng_state.clone() for mode, rng_state in _SEED_MANAGER.seed_states.items()}
+def get_current_mode():
+    """Report which parallel mode the module-level seed manager is currently using.
+    Returns:
+        The manager's ``current_mode``: the mode passed to the most recent ``set_mode`` call,
+        or ``None`` if no mode has been set (or the manager has been reset).
+    """
+    active_mode = _SEED_MANAGER.current_mode
+    return active_mode
+def add_seed(parallel_mode: ParallelMode, seed: int, overwrite: bool = False):
+    """Register ``seed`` for ``parallel_mode`` in the module-level seed manager.
+    Args:
+        parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode the seed belongs to.
+        seed (int): The seed to register.
+        overwrite (bool, optional): Whether a seed that was already registered for ``parallel_mode``
+            may be replaced. Defaults to False.
+    Raises:
+        AssertionError: If ``parallel_mode`` is not an instance of :class:`colossalai.context.ParallelMode`,
+            or if a seed for ``parallel_mode`` already exists and ``overwrite`` is False.
+    Note:
+        ``parallel_mode`` must be a member of ``ParallelMode``; see
+        `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+    """
+    manager = _SEED_MANAGER
+    # forwarded positionally: the manager spells its third parameter differently
+    manager.add_seed(parallel_mode, seed, overwrite)
+def set_mode(parallel_mode: ParallelMode):
+    """Make ``parallel_mode`` the active mode of the module-level seed manager.
+    Args:
+        parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode to switch to.
+    Note:
+        ``parallel_mode`` must be a member of ``ParallelMode``; see
+        `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+    """
+    manager = _SEED_MANAGER
+    manager.set_mode(parallel_mode)
+def set_seed_states(parallel_mode: ParallelMode, state: Tensor):
+    """Replace the RNG state stored for ``parallel_mode`` in the module-level seed manager.
+    Args:
+        parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode whose state is replaced.
+        state (:class:`torch.Tensor`): The state to store.
+    Raises:
+        AssertionError: If ``parallel_mode`` has no state in the seed manager yet.
+    """
+    manager = _SEED_MANAGER
+    manager.set_state(parallel_mode, state)
+def sync_states():
+    """Store the current CUDA RNG state as the saved state of the currently active parallel mode."""
+    set_seed_states(get_current_mode(), torch.cuda.get_rng_state())
+@contextmanager
+def seed(parallel_mode: ParallelMode):
+    """Context manager that runs its body with the seed manager switched to ``parallel_mode``.
+    The mode that was active on entry is switched back to on exit, including when the body raises.
+    The value bound by ``as`` is whatever ``set_mode`` returns.
+    Examples:
+        >>> with seed(ParallelMode.DATA):
+        >>>     output = F.dropout(input)
+    Note:
+        ``parallel_mode`` must be a member of ``ParallelMode``; see
+        `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+    """
+    # remember where to return to before touching anything
+    previous_mode = _SEED_MANAGER.current_mode
+    try:
+        yield _SEED_MANAGER.set_mode(parallel_mode)
+    finally:
+        _SEED_MANAGER.set_mode(previous_mode)
+def with_seed(func, parallel_mode: ParallelMode):
+    """Wrap ``func`` so that every call runs with the seed manager switched to ``parallel_mode``.
+    The wrapper switches to ``parallel_mode`` before calling ``func`` and switches back to the
+    previously active mode afterwards, even when ``func`` raises.
+    Args:
+        func (Callable): The function to wrap.
+        parallel_mode (:class:`colossalai.context.ParallelMode`): The mode that is active while ``func`` runs.
+    Returns:
+        Callable: A wrapper carrying the metadata of ``func`` (via ``functools.wraps``) that forwards
+        all arguments to ``func`` and returns its result.
+    Examples:
+        >>> def forward(input):
+        >>>     return F.dropout(input)
+        >>> wrapped_forward = with_seed(forward, ParallelMode.DATA)
+        >>> out = wrapped_forward(input)
+    Note:
+        ``func`` is the first positional parameter, so the bare form ``@with_seed(ParallelMode.DATA)``
+        cannot be used as a decorator.
+        ``parallel_mode`` must be a member of ``ParallelMode``; see
+        `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+    """
+    @functools.wraps(func)
+    def wrapper(*args, **kwargs):
+        # switch mode
+        current_mode = _SEED_MANAGER.current_mode
+        _SEED_MANAGER.set_mode(parallel_mode)
+        try:
+            return func(*args, **kwargs)
+        finally:
+            # recover the previous mode even if func raised, so a failure cannot leak the RNG mode
+            _SEED_MANAGER.set_mode(current_mode)
+    return wrapper
+def moe_set_seed(seed):
+    """Register a rank-dependent tensor-parallel seed for MoE; does nothing when CUDA is unavailable.
+    The seed registered for ``ParallelMode.TENSOR`` is ``seed`` plus the global rank of this process,
+    and an existing tensor seed is overwritten.
+    Args:
+        seed (int): Base seed shared by all ranks.
+    """
+    if not torch.cuda.is_available():
+        return
+    # imported lazily, exactly when needed
+    from colossalai.core import global_context as gpc
+    global_rank = gpc.get_global_rank()
+    diff_seed = seed + global_rank
+    add_seed(ParallelMode.TENSOR, diff_seed, True)
+    print(f"moe seed condition: {global_rank} with tensor seed {diff_seed}", flush=True)
+def reset_seeds():
+    """Clear the module-level seed manager: current mode, seeds and saved states."""
+    manager = _SEED_MANAGER
+    manager.reset()
--- a/colossalai/context/random/seed_manager.py
+++ b/colossalai/context/random/seed_manager.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+import torch
+from torch import Tensor
+from colossalai.context.parallel_mode import ParallelMode
+class SeedManager:
+    """Keeps one CUDA RNG state per parallel mode and switches between them.
+    For every registered parallel mode the manager stores the seed and the CUDA RNG state derived
+    from it. Switching modes saves the live CUDA RNG state under the mode being left and installs
+    the saved state of the mode being entered.
+    Note:
+        ``parallel_mode`` arguments must be members of ``ParallelMode``; see
+        `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+    """
+    def __init__(self):
+        # mode whose RNG state is currently installed; None until set_mode succeeds
+        self._current_mode = None
+        # parallel mode -> seed passed to add_seed
+        self._seeds = dict()
+        # parallel mode -> saved CUDA RNG state
+        self._seed_states = dict()
+    @property
+    def current_mode(self):
+        """The parallel mode currently in effect, or ``None`` if none has been set."""
+        return self._current_mode
+    @property
+    def seeds(self):
+        """Mapping from parallel mode to the seed registered for it."""
+        return self._seeds
+    @property
+    def seed_states(self):
+        """Mapping from parallel mode to its saved CUDA RNG state."""
+        return self._seed_states
+    def set_state(self, parallel_mode: ParallelMode, state: Tensor):
+        """Sets the state of the seed manager for `parallel_mode`.
+        Args:
+            parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
+            state (:class:`torch.Tensor`): the state to be set.
+        Raises:
+            AssertionError: Raises an AssertionError if `parallel_mode` is not found in the seed manager.
+        """
+        assert parallel_mode in self._seed_states, f'Parallel mode {parallel_mode} is not found in the seed manager'
+        self._seed_states[parallel_mode] = state
+    def set_mode(self, parallel_mode: ParallelMode):
+        """Sets the current mode of the seed manager.
+        Args:
+            parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
+        Raises:
+            KeyError: If no seed has been added for `parallel_mode`. The manager is left untouched in
+                that case.
+        """
+        # Validate before mutating anything: otherwise a failed switch would leave `_current_mode`
+        # pointing at an unregistered mode, and the next switch would save an RNG state under it.
+        if parallel_mode not in self._seed_states:
+            raise KeyError(parallel_mode)
+        if self._current_mode is not None:
+            # save the current state for current mode
+            self._seed_states[self._current_mode] = torch.cuda.get_rng_state()
+        # set the new state for new mode; the lookup happens after the save above so that
+        # re-entering the current mode keeps the live RNG state
+        self._current_mode = parallel_mode
+        torch.cuda.set_rng_state(self._seed_states[parallel_mode])
+    def add_seed(self, parallel_mode: ParallelMode, seed: int, overwrtie: bool = False):
+        """Adds a seed to the seed manager for `parallel_mode`.
+        The live CUDA RNG state is not disturbed: it is captured first, the generator is seeded only
+        to record the resulting state, and the captured state is installed again.
+        Args:
+            parallel_mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
+            seed (int): The seed to be added.
+            overwrtie (bool, optional): Whether allows to overwrite the seed that has been set already.
+                (The parameter name is misspelled; it is kept for compatibility with keyword callers.)
+        Raises:
+            AssertionError: Raises an AssertionError if `parallel_mode` is not an instance of :class:`colossalai.context.ParallelMode`
+                or the seed for `parallel_mode` has been added.
+        """
+        assert isinstance(parallel_mode, ParallelMode), 'A valid ParallelMode must be provided'
+        if overwrtie is False:
+            assert parallel_mode not in self._seed_states, f'The seed for {parallel_mode} has been added'
+        elif parallel_mode in self._seed_states:
+            print(f"Warnning: {parallel_mode} seed has been overwritten.", flush=True)
+        current_state = torch.cuda.get_rng_state()
+        torch.cuda.manual_seed(seed)
+        self._seed_states[parallel_mode] = torch.cuda.get_rng_state()
+        self._seeds[parallel_mode] = seed
+        torch.cuda.set_rng_state(current_state)
+    def reset(self):
+        """Forget the current mode, all seeds and all saved states."""
+        self._current_mode = None
+        self._seeds = dict()
+        self._seed_states = dict()
--- a/colossalai/context/singleton_meta.py
+++ b/colossalai/context/singleton_meta.py
+class SingletonMeta(type):
+    """Metaclass that gives every class using it at most one instance.
+    A singleton can be built with a base class, a decorator or a metaclass;
+    the metaclass form is used here. Instances are kept in one registry shared
+    by all classes that use this metaclass, keyed by class.
+    """
+    _instances = {}
+    def __call__(cls, *args, **kwargs):
+        """Return the single instance of ``cls``, creating it on the first call.
+        Once the instance exists, constructor arguments can no longer influence it,
+        so later calls must not pass any.
+        Raises:
+            AssertionError: If arguments are passed after the instance has been created.
+        """
+        registry = cls._instances
+        if cls in registry:
+            assert not args and not kwargs, f'{cls.__name__} is a singleton class and a instance has been created.'
+            return registry[cls]
+        # first call: build the instance with the given arguments and remember it
+        created = super().__call__(*args, **kwargs)
+        registry[cls] = created
+        return registry[cls]
--- a/colossalai/core.py
+++ b/colossalai/core.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+from colossalai.context.parallel_context import global_context
+__all__ = ['global_context']
\ No newline at end of file
--- a/colossalai/device/__init__.py
+++ b/colossalai/device/__init__.py
+from .calc_pipeline_strategy import alpa_dp
+from .profile_alpha_beta import profile_alpha_beta
+__all__ = ['profile_alpha_beta', 'alpa_dp']
--- a/colossalai/device/calc_pipeline_strategy.py
+++ b/colossalai/device/calc_pipeline_strategy.py
+from math import pow
+import numpy as np
+def get_submesh_choices(num_hosts, num_devices_per_host, mode="new"):
+    """Enumerate candidate submesh shapes for a cluster.
+    Let ``p = log2(num_devices_per_host)``.
+    Args:
+        num_hosts (int): Number of hosts.
+        num_devices_per_host (int): Number of devices on each host; must be a power of two.
+        mode (str, optional): Enumeration scheme.
+            ``"alpa"``: ``(1, 2**i)`` for ``i`` in ``0..p``, followed by ``(i, num_devices_per_host)``
+            for ``i`` in ``2..num_hosts``.
+            ``"new"`` (default): ``(2**i, 2**j)`` for all ``i <= j`` with ``i + j <= p``; ``num_hosts``
+            is not used.
+            Any other value yields an empty list.
+    Returns:
+        List[Tuple[int, int]]: The submesh shapes. All entries are integers so that device counts
+        derived from them can be used as array indices.
+    Raises:
+        AssertionError: If ``num_devices_per_host`` is not a power of two.
+    """
+    submesh_choices = []
+    # p becomes floor(log2(num_devices_per_host)), or stays -1 when num_devices_per_host < 1
+    i = 1
+    p = -1
+    while i <= num_devices_per_host:
+        i *= 2
+        p += 1
+    # integer exponentiation instead of math.pow, which would turn every shape entry into a float
+    assert 2**p == num_devices_per_host, ("Only supports the cases where num_devices_per_host is power of two, "
+                                          f"while now num_devices_per_host = {num_devices_per_host}")
+    if mode == "alpa":
+        # single-host submeshes first, then whole-host multiples
+        submesh_choices.extend((1, 1 << i) for i in range(p + 1))
+        submesh_choices.extend((i, num_devices_per_host) for i in range(2, num_hosts + 1))
+    elif mode == "new":
+        for i in range(p // 2 + 1):
+            for j in range(i, p - i + 1):
+                submesh_choices.append((1 << i, 1 << j))
+    return submesh_choices
+def alpa_dp_impl(num_layers, num_devices, num_microbatches, submesh_choices, compute_cost, max_stage_cost,
+                 best_configs):
+    """Implementation of Alpa DP for pipeline strategy.
+    Paper reference: https://www.usenix.org/system/files/osdi22-zheng-lianmin.pdf
+    The DP state ``f[s, k, d]`` is the minimal summed stage cost of slicing layers ``k..num_layers-1``
+    into ``s`` stages that use exactly ``d`` devices, where no single stage costs more than
+    ``max_stage_cost``.
+    Arguments:
+        num_layers: K, number of layers.
+        num_devices: N*M, total number of devices; all of them must be used.
+        num_microbatches: B, number of microbatches.
+        submesh_choices: List[(n_i, m_i)], candidate submesh shapes.
+        compute_cost: t_intra, indexed ``[start_layer, end_layer, submesh index]`` where the stage
+            holds layers ``start_layer..end_layer`` inclusive.
+        max_stage_cost: Upper bound on the cost of any single stage.
+        best_configs: Indexed like ``compute_cost``; the autosharding config chosen for that entry.
+    Returns:
+        ``(np.inf, None)`` if no valid slicing exists. Otherwise ``(total_cost, stages)`` where
+        ``total_cost = sum of stage costs + (B - 1) * max stage cost`` and ``stages`` is a list of
+        ``((start_layer, end_layer_exclusive), submesh index, autosharding config index)``.
+    """
+    # Device counts are used as array indices, so force plain ints (shape entries may be floats).
+    submesh_sizes = [int(np.prod(np.array(submesh))) for submesh in submesh_choices]
+    # For f, layer ID start from 0
+    # f[#pipeline stages, layer id that is currently being considered, number of devices used]
+    f = np.full((num_layers + 1, num_layers + 1, num_devices + 1), np.inf, dtype=np.float32)
+    f_stage_max = np.full((num_layers + 1, num_layers + 1, num_devices + 1), 0.0, dtype=np.float32)
+    f_argmin = np.full((num_layers + 1, num_layers + 1, num_devices + 1, 3), -1, dtype=np.int32)
+    # base case: zero stages cover the empty suffix with zero devices
+    f[0, num_layers, 0] = 0
+    for s in range(1, num_layers + 1):
+        for k in range(num_layers - 1, -1, -1):
+            for d in range(1, num_devices + 1):
+                for m, n_submesh_devices in enumerate(submesh_sizes):
+                    if n_submesh_devices > d:
+                        continue
+                    # TODO: [luzgh]: Why alpa needs max_n_succ_stages? Delete.
+                    # i is the exclusive end of the first stage, i.e. the start layer of the rest.
+                    for i in range(num_layers, k, -1):
+                        # the stage holds layers k..i-1, hence the inclusive end index i - 1
+                        stage_cost = compute_cost[k, i - 1, m]
+                        # the remaining s - 1 stages start at layer i with the leftover devices
+                        new_cost = f[s - 1, i, d - n_submesh_devices] + stage_cost
+                        if (stage_cost <= max_stage_cost and new_cost < f[s, k, d]):
+                            f[s, k, d] = new_cost
+                            f_stage_max[s, k, d] = max(stage_cost, f_stage_max[s - 1, i, d - n_submesh_devices])
+                            f_argmin[s, k, d] = (i, m, best_configs[k, i - 1, m])
+    # pick the number of stages with the smallest summed cost over all layers and all devices
+    best_s = -1
+    best_total_cost = np.inf
+    for s in range(1, num_layers + 1):
+        if f[s, 0, num_devices] < best_total_cost:
+            best_s = s
+            best_total_cost = f[s, 0, num_devices]
+    if np.isinf(best_total_cost):
+        return np.inf, None
+    # pipeline latency: one pass through all stages plus (B - 1) times the slowest stage
+    total_cost = f[best_s, 0, num_devices] + (num_microbatches - 1) * f_stage_max[best_s, 0, num_devices]
+    # walk the argmin table forward to recover the chosen stages
+    current_s = best_s
+    current_layer = 0
+    current_devices = num_devices
+    res = []
+    while current_s > 0 and current_layer < num_layers and current_devices > 0:
+        next_start_layer, submesh_choice, autosharding_choice = (int(v) for v in
+                                                                 f_argmin[current_s, current_layer, current_devices])
+        assert next_start_layer != -1 and current_devices != -1
+        res.append(((current_layer, next_start_layer), submesh_choice, autosharding_choice))
+        current_s -= 1
+        current_layer = next_start_layer
+        current_devices -= submesh_sizes[submesh_choice]
+    assert (current_s == 0 and current_layer == num_layers and current_devices == 0)
+    return total_cost, res
+def alpa_dp(num_layers,
+            num_devices,
+            num_microbatches,
+            submesh_choices,
+            num_autosharding_configs,
+            compute_cost,
+            gap=1e-6):
+    """Alpa auto stage dynamic programming.
+    Code reference: https://github.com/alpa-projects/alpa/blob/main/alpa/pipeline_parallel/stage_construction.py
+    Every distinct value in ``compute_cost`` is tried, in ascending order, as the upper bound on the
+    cost of a single stage; ``alpa_dp_impl`` is run for each bound and the cheapest result is kept.
+    Bounds closer than ``gap`` to the last bound tried are skipped, and the search stops once
+    ``bound * num_microbatches`` can no longer beat the best cost found.
+    Arguments:
+        num_layers: Number of layers.
+        num_devices: Total number of devices.
+        num_microbatches: Number of microbatches.
+        submesh_choices: List[(int,int)]
+        num_autosharding_configs: Max number of t_intra(start_layer, end_layer, LogicalMesh)
+        compute_cost: np.array(num_layers,num_layers,num_submesh_choices,num_autosharding_configs)
+        gap: Minimal distance between two consecutive stage-cost bounds that are tried.
+    Returns:
+        ``(best_cost, best_solution)`` as produced by ``alpa_dp_impl``; ``(np.inf, None)`` if no bound
+        gave a solution.
+    """
+    expected_shape = (num_layers, num_layers, len(submesh_choices), num_autosharding_configs)
+    assert np.shape(compute_cost) == expected_shape, "Cost shape wrong."
+    candidate_bounds = np.sort(np.unique(compute_cost))
+    # TODO: [luzgh]: Why alpa needs the num_autosharding_configs dimension in compute_cost?
+    # In dp_impl it seems the argmin n_config will be chosen. Just amin here.
+    best_configs = np.argmin(compute_cost, axis=3)
+    best_compute_cost = np.amin(compute_cost, axis=3)
+    assert len(candidate_bounds), "no solution in auto stage construction."
+    best_cost, best_solution = np.inf, None
+    previous_bound = 0.0
+    for max_stage_cost in candidate_bounds:
+        # even a perfectly balanced pipeline under this bound cannot win any more
+        if max_stage_cost * num_microbatches >= best_cost:
+            break
+        # too close to the last bound that was actually evaluated
+        if max_stage_cost - previous_bound < gap:
+            continue
+        cost, solution = alpa_dp_impl(num_layers, num_devices, num_microbatches, submesh_choices, best_compute_cost,
+                                      max_stage_cost, best_configs)
+        if cost < best_cost:
+            best_cost, best_solution = cost, solution
+        previous_bound = max_stage_cost
+    return best_cost, best_solution
--- a/colossalai/device/device_mesh.py
+++ b/colossalai/device/device_mesh.py
+import operator
+from functools import reduce
+import torch
+import torch.distributed as dist
+class DeviceMesh:
+    """A logical view of a physical mesh. The logical view is used in the
+    search process.
+    A physical mesh can have multiple logical views. (e.g., a 2x8 physical mesh
+    can be viewed as a 1x16 or a 4x4 logical mesh). Each mesh dimension has its
+    own latency and bandwidth. We use alpha-beta model to model the
+    communication cost.
+    Arguments:
+        physical_mesh_id (torch.Tensor): physical view of the devices in global rank.
+        mesh_shape (torch.Size): shape of logical view.
+        mesh_alpha (List[float], optional): per-dimension latency coefficients used for computing
+            communication cost (default: None, meaning 1 for every dimension)
+        mesh_beta (List[float], optional): per-dimension bandwidth coefficients used for computing
+            communication cost (default: None, meaning 1 for every dimension)
+        init_process_group (bool, optional): initialize logical process group
+            during initializing the DeviceMesh instance if the init_process_group set to True.
+            Otherwise, users need to call create_process_groups_for_logical_mesh manually to init logical process group.
+            (default: False)
+        need_flatten(bool, optional): initialize flatten_device_mesh during initializing the DeviceMesh instance
+            if the need_flatten set to True and the logical mesh has more than one dimension. (default: True)
+    """
+    def __init__(self,
+                 physical_mesh_id,
+                 mesh_shape,
+                 mesh_alpha=None,
+                 mesh_beta=None,
+                 init_process_group=False,
+                 need_flatten=True):
+        self.physical_mesh_id = physical_mesh_id
+        self.mesh_shape = mesh_shape
+        self._logical_mesh_id = self.physical_mesh_id.reshape(self.mesh_shape)
+        # map global rank into logical rank
+        self.convert_map = {}
+        self._global_rank_to_logical_rank_map(self._logical_mesh_id, [])
+        # coefficient for alpha-beta communication model
+        if mesh_alpha is None:
+            mesh_alpha = [1] * len(self.mesh_shape)
+        if mesh_beta is None:
+            mesh_beta = [1] * len(self.mesh_shape)
+        self.mesh_alpha = tuple(mesh_alpha)
+        self.mesh_beta = tuple(mesh_beta)
+        self.init_process_group = init_process_group
+        self.need_flatten = need_flatten
+        if self.init_process_group:
+            self.process_groups_dict = self.create_process_groups_for_logical_mesh()
+        if self.need_flatten and self._logical_mesh_id.dim() > 1:
+            self.flatten_device_mesh = self.flatten()
+            # `flatten_device_meshes` is kept separate from `flatten_device_mesh` because other code
+            # may still rely on the result of self.flatten().
+            self.flatten_device_meshes = FlattenDeviceMesh(self.physical_mesh_id, self.mesh_shape, self.mesh_alpha,
+                                                           self.mesh_beta)
+    @property
+    def shape(self):
+        """Shape of the logical mesh, as passed to the constructor."""
+        return self.mesh_shape
+    @property
+    def num_devices(self):
+        """Total number of devices, i.e. the product of the physical mesh dimensions."""
+        return reduce(operator.mul, self.physical_mesh_id.shape, 1)
+    @property
+    def logical_mesh_id(self):
+        """Tensor of global ranks arranged in the logical mesh shape."""
+        return self._logical_mesh_id
+    def __deepcopy__(self, memo):
+        """Deep-copy every attribute except ``process_groups_dict``, which is shared by reference."""
+        import copy
+        cls = self.__class__
+        result = cls.__new__(cls)
+        memo[id(self)] = result
+        for k, v in self.__dict__.items():
+            if k != 'process_groups_dict':
+                setattr(result, k, copy.deepcopy(v, memo))
+            else:
+                # process group handles are not copied; the copy uses the same groups
+                setattr(result, k, v)
+        return result
+    def flatten(self):
+        """
+        Flatten the logical mesh into an effective 1d logical mesh.
+        The single dimension of the result uses the largest alpha and the smallest beta of this mesh.
+        """
+        flatten_mesh_shape = [self.num_devices]
+        # exactly one coefficient per dimension of the flattened (1d) mesh
+        return DeviceMesh(self.physical_mesh_id,
+                          tuple(flatten_mesh_shape),
+                          mesh_alpha=[max(self.mesh_alpha)],
+                          mesh_beta=[min(self.mesh_beta)],
+                          init_process_group=self.init_process_group,
+                          need_flatten=False)
+    def _global_rank_to_logical_rank_map(self, tensor, index_list):
+        '''
+        This method is a helper function to build convert_map recursively.
+        ``index_list`` holds the coordinates of ``tensor`` inside the logical mesh; every scalar
+        reached is recorded in ``convert_map`` as global rank -> full logical coordinate.
+        '''
+        for index, inner_tensor in enumerate(tensor):
+            # Stop only at real scalars: a size-1 trailing dimension still contributes a coordinate.
+            if inner_tensor.dim() == 0:
+                self.convert_map[int(inner_tensor)] = index_list + [index]
+            else:
+                self._global_rank_to_logical_rank_map(inner_tensor, index_list + [index])
+    def create_process_groups_for_logical_mesh(self):
+        '''
+        This method is used to initialize the logical process groups which will be used in communications
+        among logical device mesh.
+        Note: if init_process_group set to False, you have to call this method manually. Otherwise,
+        the communication related function, such as ShapeConsistencyManager.apply will raise errors.
+        Returns a dict mapping each mesh axis to a list of (global rank list, process group handle).
+        A given rank list is created only once, for the first axis it is seen on.
+        '''
+        process_groups_dict = {}
+        seen_groups = set()
+        global_rank_flatten_list = self.physical_mesh_id.view(-1).tolist()
+        for global_rank in global_rank_flatten_list:
+            process_groups = self.global_rank_to_process_groups_with_global_rank(global_rank)
+            for axis, process_group in process_groups.items():
+                if axis not in process_groups_dict:
+                    process_groups_dict[axis] = []
+                group_key = tuple(process_group)
+                if group_key not in seen_groups:
+                    seen_groups.add(group_key)
+                    process_group_handler = dist.new_group(process_group)
+                    process_groups_dict[axis].append((process_group, process_group_handler))
+        return process_groups_dict
+    def global_rank_to_logical_rank(self, rank):
+        """Return the logical coordinate (list of indices) of global rank ``rank``."""
+        return self.convert_map[rank]
+    def global_rank_to_process_groups_with_logical_rank(self, rank):
+        '''
+        Give a global rank and return all logical process groups of this rank.
+        for example:
+            physical_mesh_id = torch.arange(0, 16).reshape(2, 8)
+            mesh_shape = (4, 4)
+            # [[0, 1, 2, 3],
+            #  [4, 5, 6, 7],
+            #  [8, 9, 10,11],
+            #  [12,13,14,15]]
+            device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
+            print(device_mesh.global_rank_to_process_groups_with_logical_rank(0))
+        output:
+            # key is axis name
+            # value is a list of logical ranks in same axis with rank 0
+            {0: [[0, 0], [1, 0], [2, 0], [3, 0]], 1: [[0, 0], [0, 1], [0, 2], [0, 3]]}
+        '''
+        process_groups = {}
+        own_coordinate = self.convert_map[rank]
+        for d in range(self.logical_mesh_id.dim()):
+            members = []
+            for replacer in range(self.logical_mesh_id.shape[d]):
+                # vary only coordinate d, keep the others of this rank
+                process_group_member = own_coordinate.copy()
+                process_group_member[d] = replacer
+                members.append(process_group_member)
+            if members:
+                process_groups[d] = members
+        return process_groups
+    def global_rank_to_process_groups_with_global_rank(self, rank):
+        '''
+        Give a global rank and return all process groups of this rank.
+        for example:
+            physical_mesh_id = torch.arange(0, 16).reshape(2, 8)
+            mesh_shape = (4, 4)
+            # [[0, 1, 2, 3],
+            #  [4, 5, 6, 7],
+            #  [8, 9, 10,11],
+            #  [12,13,14,15]]
+            device_mesh = DeviceMesh(physical_mesh_id, mesh_shape)
+            print(device_mesh.global_rank_to_process_groups_with_global_rank(0))
+        output:
+            # key is axis name
+            # value is a list of global ranks in same axis with rank 0
+            {0: [0, 4, 8, 12], 1: [0, 1, 2, 3]}
+        '''
+        logical_process_groups = self.global_rank_to_process_groups_with_logical_rank(rank)
+        # reverse lookup built once instead of scanning convert_map for every logical rank
+        logical_to_global = {tuple(l_rank): g_rank for g_rank, l_rank in self.convert_map.items()}
+        process_groups = {}
+        for dim, logical_ranks in logical_process_groups.items():
+            process_groups[dim] = [
+                logical_to_global[tuple(logical_rank)]
+                for logical_rank in logical_ranks
+                if tuple(logical_rank) in logical_to_global
+            ]
+        return process_groups
+    def all_gather_cost(self, num_bytes, mesh_dim):
+        """Alpha-beta cost of an all-gather of ``num_bytes`` along ``mesh_dim`` (plus a 0.1 offset)."""
+        num_devices = self.logical_mesh_id.shape[mesh_dim]
+        return (self.mesh_alpha[mesh_dim] + self.mesh_beta[mesh_dim] * (num_devices - 1) / num_devices * num_bytes +
+                0.1)
+    def all_reduce_cost(self, num_bytes, mesh_dim):
+        """Alpha-beta cost of an all-reduce of ``num_bytes`` along ``mesh_dim`` (plus a 0.01 offset)."""
+        num_devices = self.logical_mesh_id.shape[mesh_dim]
+        return (self.mesh_alpha[mesh_dim] + self.mesh_beta[mesh_dim] * 2 * (num_devices - 1) / num_devices * num_bytes +
+                0.01)
+    def reduce_scatter_cost(self, num_bytes, mesh_dim):
+        """Alpha-beta cost of a reduce-scatter of ``num_bytes`` along ``mesh_dim`` (plus a 0.001 offset)."""
+        num_devices = self.logical_mesh_id.shape[mesh_dim]
+        return (self.mesh_alpha[mesh_dim] + self.mesh_beta[mesh_dim] * (num_devices - 1) / num_devices * num_bytes +
+                0.001)
+    def all_to_all_cost(self, num_bytes, mesh_dim):
+        """Alpha-beta cost of an all-to-all of ``num_bytes`` along ``mesh_dim``.
+        The bandwidth term is scaled by a penalty factor of ``num_devices / 2`` (plus a 0.001 offset).
+        """
+        num_devices = self.logical_mesh_id.shape[mesh_dim]
+        penalty_factor = num_devices / 2.0
+        return (self.mesh_alpha[mesh_dim] + self.mesh_beta[mesh_dim] *
+                (num_devices - 1) / num_devices / num_devices * num_bytes * penalty_factor + 0.001)
+class FlattenDeviceMesh(DeviceMesh):
+    """Device mesh variant that keeps the logical shape but uses scalar alpha/beta coefficients.
+    No process groups are created and no nested flattened mesh is built.
+    """
+    def __init__(self, physical_mesh_id, mesh_shape, mesh_alpha=None, mesh_beta=None):
+        super().__init__(physical_mesh_id,
+                         mesh_shape,
+                         mesh_alpha,
+                         mesh_beta,
+                         init_process_group=False,
+                         need_flatten=False)
+        # Unlike flatten(), mesh_shape is left as is; the per-dimension coefficients collapse to
+        # one scalar each: the largest alpha and the smallest beta.
+        self.mesh_alpha = max(self.mesh_alpha)
+        self.mesh_beta = min(self.mesh_beta)
+        # Unlike process_groups_dict, only the process numbers are stored, no group handles.
+        self.process_number_dict = self.create_process_numbers_for_logical_mesh()
+    def create_process_numbers_for_logical_mesh(self):
+        '''
+        Build the 1d process orderings in column-major (key 0) and row-major (key 1) order.
+        for example:
+            mesh_shape = (2,4)
+            # [[0, 1, 2, 3],
+            #  [4, 5, 6, 7]]
+            # return {0: [0, 4, 1, 5, 2, 6, 3, 7], 1: [0, 1, 2, 3, 4, 5, 6, 7]}
+        '''
+        total = reduce(operator.mul, self.mesh_shape, 1)
+        grid = torch.arange(total).reshape(self.mesh_shape)
+        column_major = grid.transpose(1, 0).flatten().tolist()
+        row_major = grid.flatten().tolist()
+        return {0: column_major, 1: row_major}
+    def mix_gather_cost(self, num_bytes):
+        """Alpha-beta cost of gathering ``num_bytes`` across all devices of the mesh (plus a 0.1 offset)."""
+        num_devices = reduce(operator.mul, self.mesh_shape, 1)
+        bandwidth_term = self.mesh_beta * (num_devices - 1) / num_devices * num_bytes
+        return (self.mesh_alpha + bandwidth_term + 0.1)
--- a/colossalai/device/profile_alpha_beta.py
+++ b/colossalai/device/profile_alpha_beta.py
+import fcntl
+import math
+import os
+import time
+import torch
+import torch.distributed as dist
+import torch.multiprocessing as mp
+# NOTE(review): MB and GB mix a binary and a decimal factor, so they are neither exact
+# MiB/GiB nor 10**6/10**9 — confirm this is intended before reusing them elsewhere.
+MB = int((1 << 10) * 1e3)    # 1_024_000
+GB = int((1 << 20) * 1e3)    # 1_048_576_000
+Byte = 4    # base message size in bytes; profile_latency shifts it left once per iteration
+FRAMEWORK = 0    # subtracted from the average time in profile(); presumably a framework-overhead correction
+NON_SENSE = (0.1, 0.1)    # placeholder returned by profile() on ranks other than the source rank
+def printflock(*msgs):
+    """Print ``msgs`` while holding an exclusive ``flock`` on this source file.
+    Serialising on the file lock keeps the output of several processes from interleaving.
+    """
+    with open(__file__, "r") as lock_file:
+        fcntl.flock(lock_file, fcntl.LOCK_EX)
+        try:
+            print(*msgs)
+        finally:
+            # always release, even if printing failed
+            fcntl.flock(lock_file, fcntl.LOCK_UN)
+def profile(device1d, nbytes, ctype):
+    """Time one collective over the ranks in ``device1d`` and derive its bandwidth.
+    Args:
+        device1d: List of global ranks forming the process group; the first entry is the source rank.
+        nbytes: Message size in bytes.
+        ctype: ``"a"`` for all-reduce (sum) or ``"b"`` for broadcast from the source rank.
+    Returns:
+        ``(avg_time_s, alg_band)`` on the source rank: the average seconds per call and
+        ``nbytes / avg_time_s``. Every other rank returns the ``NON_SENSE`` placeholder.
+    """
+    # untimed warm-up iterations, then timed iterations
+    warmup = 5
+    repeat = 25
+    rank = dist.get_rank()
+    src_device_num = device1d[0]
+    wsize = len(device1d)
+    group = dist.new_group(device1d)
+    # the global rank is used directly as the CUDA device index
+    torch.cuda.set_device(rank)
+    device = torch.device("cuda", rank)
+    # nbytes // 4 elements — assumes the default 4-byte float dtype of torch.randn
+    buf = torch.randn(nbytes // 4).to(device)
+    torch.cuda.synchronize()
+    # warmup
+    for _ in range(warmup):
+        if ctype == "a":
+            dist.all_reduce(buf, op=dist.ReduceOp.SUM, group=group)
+        elif ctype == "b":
+            dist.broadcast(buf, src=src_device_num, group=group)
+    torch.cuda.synchronize()
+    # line all ranks up before the clock starts
+    dist.barrier()
+    begin = time.perf_counter()
+    for _ in range(repeat):
+        if ctype == "a":
+            dist.all_reduce(buf, op=dist.ReduceOp.SUM, group=group)
+        elif ctype == "b":
+            dist.broadcast(buf, src=src_device_num, group=group)
+    # wait for the queued GPU work so the measured interval covers it
+    torch.cuda.synchronize()
+    end = time.perf_counter()
+    dist.barrier()
+    if rank == src_device_num:
+        avg_time_s = (end - begin) / repeat - FRAMEWORK
+        alg_band = nbytes / avg_time_s
+        # NOTE(review): bus_band stays unbound if ctype is neither "a" nor "b", and the print below would fail
+        if ctype == "b":
+            bus_band = alg_band
+        elif ctype == "a":
+            # all-reduce is scaled by 2 * (wsize - 1) / wsize
+            bus_band = 2 * (wsize - 1) / wsize * alg_band
+        print(
+            f"GPU:{rank}, Bytes: {nbytes} B,Time: {round(avg_time_s * 1e6,2)} us, Bus bandwidth: {round(bus_band / GB,2)} GB/s"
+        )
+        return (avg_time_s, alg_band)
+    else:
+        return NON_SENSE    # Just a placeholder
+def profile_latency(device1d, it=3, ctype="a"):
+    """Estimate latency as the fastest average time over ``it`` small message sizes.
+    Iteration ``i`` profiles a message of ``Byte << i`` bytes.
+    Args:
+        device1d: List of global ranks forming the process group.
+        it: Number of message sizes to try.
+        ctype: Collective type passed on to ``profile``.
+    Returns:
+        The smallest average time (first element of the ``profile`` result) that was measured.
+    """
+    timings = [profile(device1d, int(Byte << shift), ctype)[0] for shift in range(it)]
+    return min(timings)
+def profile_bandwidth(device1d, maxbytes, ctype="a"):
+    """Profile one message of ``maxbytes`` bytes and return the bandwidth part of the ``profile`` result."""
+    measurement = profile(device1d, maxbytes, ctype)
+    return measurement[1]
+def profile_ab(rank, *args):
+    """Worker entry point: profile alpha (latency) and beta (inverse bandwidth) on one process.
+    Args:
+        rank: Rank of this process; also used as its CUDA device index.
+        *args: ``(device1d, return_dict, ctype)`` — the ranks to profile, a dict that receives the
+            result, and the collective type passed on to the profiling helpers.
+    The result ``(alpha, beta)`` is stored in ``return_dict[rank]``; nothing is returned.
+    """
+    # the world spans every visible CUDA device, not only the ranks in device1d
+    wsize = int(torch.cuda.device_count())
+    device1d = args[0]
+    return_dict = args[1]
+    ctype = args[2]
+    # single-node rendezvous on a fixed local port
+    os.environ['MASTER_ADDR'] = 'localhost'
+    os.environ['MASTER_PORT'] = '29020'
+    dist.init_process_group(backend=dist.Backend.NCCL, init_method='env://', world_size=wsize, rank=rank)
+    device = torch.device("cuda", rank)
+    # first element of mem_get_info — the free device memory in bytes per the PyTorch docs
+    max_nbytes = torch.tensor(torch.cuda.mem_get_info(device)[0]).to(device)
+    # power-of-two multiple of GB derived from that amount, capped at 4 * GB
+    # NOTE(review): if less than GB is available the int() truncation yields GB itself — confirm this is acceptable
+    max_nbytes = min(int(4 * GB), int(GB << int(math.log2(max_nbytes.item() / GB))))
+    if rank == device1d[0]:
+        print(f"max_nbytes: {max_nbytes} B")
+    # alpha: best small-message time; beta: reciprocal of the large-message bandwidth
+    alpha = profile_latency(device1d, it=5, ctype=ctype)
+    beta = 1 / profile_bandwidth(device1d, maxbytes=max_nbytes, ctype=ctype)
+    if rank == device1d[0]:
+        print(f"alpha(us): {round(alpha * 1e6,2)}, beta(us/GB): {round(beta * 1e6 * GB,2)}")
+    return_dict[rank] = (alpha, beta)
+def profile_alpha_beta(device1d):
+    assert torch.cuda.is_available()
+    assert len(device1d) > 0 and len(device1d) <= int(torch.cuda.device_count())
+    manager = mp.Manager()
+    return_dict = manager.dict()
+    ctype = "a"
+    mp.spawn(profile_ab, args=[device1d, return_dict, ctype], nprocs=int(torch.cuda.device_count()))
+    return return_dict[device1d[0]]
--- a/colossalai/engine/__init__.py
+++ b/colossalai/engine/__init__.py
+from ._base_engine import Engine
+from .gradient_handler import *
+__all__ = ['Engine']
--- a/colossalai/engine/_base_engine.py
+++ b/colossalai/engine/_base_engine.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+from typing import List, Iterable
+from torch.nn import Module
+from torch.nn.modules.loss import _Loss
+from colossalai.logging import get_dist_logger
+from torch import Tensor
+from colossalai.gemini.ophooks import register_ophooks_recursively, BaseOpHook
+from colossalai.engine.schedule import BaseSchedule, NonPipelineSchedule, PipelineSchedule, InterleavedPipelineSchedule
+from typing import Optional, Type
+from colossalai.engine.gradient_handler import BaseGradientHandler
+from colossalai.logging import get_dist_logger
+class Engine:
+    """Basic engine class for training and evaluation. It runs a specific process method
+    :meth:`step` which is based on the given :attr:`schedule` over each batch of a dataset.
+    It controls a iteration in training.
+    Args:
+        model (``torch.nn.Module``): The neural network model.
+        optimizer (``colossalai.nn.optimizer.ColossalaiOptimizer``): Optimizer for updating the parameters.
+        criterion (``torch.nn.modules.loss._Loss``, optional): Loss function for calculating loss.
+        gradient_handlers (List[``BaseGradientHandler``], optional): A list of gradient handler used in backward.
+        clip_grad_norm (float, optional): The norm of gradient clipping.
+        ophook_list (list): List of ophook.
+        verbose (bool): whether to display log info.
+        schedule (''BaseSchedule''): Runtime schedule.
+    Examples:
+        >>> # define model, criterion, optimizer, lr_scheduler, train_dataloader for your training
+        >>> model = ...
+        >>> criterion = ...
+        >>> optimizer = ...
+        >>> train_dataloader = ...
+        >>> engine, _, _, _ = colossalai.initialize(model, optimizer, criterion)
+        >>> engine.train()
+        >>> for inputs, labels in train_dataloader
+        >>>     # set gradients to zero
+        >>>     engine.zero_grad()
+        >>>     # run forward pass
+        >>>     outputs = engine(inputs)
+        >>>     # compute loss value and run backward pass
+        >>>     loss = engine.criterion(outputs, labels)
+        >>>     engine.backward(loss)
+        >>>     # update parameters
+        >>>     engine.step()
+    The example of using Engine in training could be find in
+    `Training with engine and trainer <https://www.colossalai.org/docs/basics/engine_trainer>`_. and
+    `Run resnet cifar10 with engine <https://github.com/hpcaitech/ColossalAI-Examples/blob/main/image/resnet/run_resnet_cifar10_with_engine.py>`_.
+    """
+    def __init__(self,
+                 model: Module,
+                 optimizer: "ColossalaiOptimizer",
+                 criterion: Optional[_Loss] = None,
+                 gradient_handlers: Optional[List[BaseGradientHandler]] = None,
+                 clip_grad_norm: float = 0.0,
+                 ophook_list: Optional[List[BaseOpHook]] = None,
+                 verbose: bool = True,
+                 schedule: Optional[BaseSchedule] = None):
+        self._model = model
+        self._optimizer = optimizer
+        self._criterion = criterion
+        self._clip_grad_norm = clip_grad_norm
+        self._verbose = verbose
+        self._logger = get_dist_logger()
+        # state
+        self.training = True    # default
+        # build gradient handler
+        if gradient_handlers:
+            self._gradient_handlers = gradient_handlers
+        else:
+            self._gradient_handlers = []
+        if ophook_list is None:
+            self._ophook_list = []
+        else:
+            self._ophook_list = ophook_list
+        # build schedule
+        if schedule:
+            assert isinstance(schedule, BaseSchedule), \
+                f'expected schedule to be of type BaseSchedule, but got {type(schedule)}'
+            self._schedule = schedule
+        else:
+            self._schedule = NonPipelineSchedule()
+        if self.uses_pipeline:
+            self._schedule.pre_processing(self)
+        #register hook if any
+        if len(self._ophook_list) > 0:
+            register_ophooks_recursively(self._model, self._ophook_list)
+    @property
+    def ophooks(self):
+        """show current activated ophooks"""
+        return self._ophook_list
+    @property
+    def model(self):
+        """Model attached to the engine"""
+        return self._model
+    @property
+    def optimizer(self):
+        """Optimizer attached to the engine"""
+        return self._optimizer
+    @property
+    def criterion(self):
+        """Criterion attached to the engine"""
+        return self._criterion
+    @property
+    def schedule(self):
+        """Schedule attached to the engine"""
+        return self._schedule
+    @property
+    def uses_pipeline(self):
+        """show the pipeline parallel used or not"""
+        return isinstance(self._schedule, (PipelineSchedule, InterleavedPipelineSchedule))
+    def add_hook(self, ophook: Type[BaseOpHook]) -> None:
+        """add necessary hook"""
+        # whether this hook exist
+        for h in self._ophook_list:
+            if type(h) == type(ophook):
+                logger = get_dist_logger()
+                logger.warning(f"duplicate hooks, at least two instance of {type(ophook)}")
+        self._ophook_list.append(ophook)
+        register_ophooks_recursively(self._model, self._ophook_list)
+    def remove_hook(self, ophook: Type[BaseOpHook]) -> None:
+        """remove hook"""
+        logger = get_dist_logger()
+        logger.warning(f"removing hooks is currently not supported")
+    def zero_grad(self):
+        """Set the gradient of parameters to zero
+        """
+        self.optimizer.zero_grad()
+    def step(self):
+        """Execute parameter update
+        """
+        self._all_reduce_gradients()
+        self.optimizer.clip_grad_norm(self.model, self._clip_grad_norm)
+        return self.optimizer.step()
+    def backward(self, loss: Tensor):
+        """Start backward propagation given the loss value computed by a loss function.
+        Args:
+            loss (:class:`torch.Tensor`): Loss value computed by a loss function.
+        """
+        ret = self.optimizer.backward(loss)
+        for ophook in self._ophook_list:
+            ophook.post_iter()
+        return ret
+    def backward_by_grad(self, tensor, grad):
+        """Start backward propagation given the gradient of the output tensor.
+        Args:
+            tensor (:class:`torch.Tensor`): Output tensor.
+            grad (:class:`torch.Tensor`): Gradient passed back to the output.
+        """
+        ret = self.optimizer.backward_by_grad(tensor, grad)
+        for ophook in self._ophook_list:
+            ophook.post_iter()
+        return ret
+    def __call__(self, *args, **kwargs):
+        """Run the forward step for the model.
+        Returns:
+            Tuple[:class:`torch.Tensor`] or :class:`torch.Tensor`: Output of the model.
+        """
+        return self.model(*args, **kwargs)
+    def _all_reduce_gradients(self):
+        """Handles all-reduce operations of gradients across different parallel groups.
+        """
+        for handler in self._gradient_handlers:
+            handler.handle_gradient()
+    def execute_schedule(self, data_iter: Iterable, **kwargs):
+        """Run the forward, loss computation, and backward for the model.
+        Returns a tuple of (output, label, loss).
+        Returns:
+            Tuple[:class:`torch.Tensor`]: A tuple of (output, label, loss).
+        """
+        output, label, loss = self._schedule.forward_backward_step(self, data_iter, **kwargs)
+        return output, label, loss
+    def train(self):
+        """Sets the model to training mode.
+        """
+        self.training = True
+        self._model.train()
+    def eval(self):
+        """Sets the model to evaluation mode.
+        """
+        self.training = False
+        self._model.eval()
--- a/colossalai/engine/gradient_accumulation/__init__.py
+++ b/colossalai/engine/gradient_accumulation/__init__.py
+import torch.nn as nn
+from typing import List
+from colossalai.engine import BaseGradientHandler
+from typing import Iterable
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
+from ._gradient_accumulation import GradAccumDataloader, GradAccumOptimizer, GradAccumLrSchedulerByStep, GradAccumGradientHandler
+__all__ = [
+    'accumulate_gradient', 'GradAccumDataloader', 'GradAccumOptimizer', 'GradAccumLrSchedulerByStep',
+    'GradAccumGradientHandler'
+]
+def accumulate_gradient(model: nn.Module,
+                        optimizer: Optimizer,
+                        dataloader: Iterable,
+                        accumulate_size: int,
+                        gradient_handlers: List[BaseGradientHandler] = None,
+                        lr_scheduler: _LRScheduler = None):
+    r"""Turning model, optimizer, dataloader into corresponding object for gradient accumulation.
+    Args:
+        model (:class:`torch.nn.Module`): your model object for gradient accumulation.
+        optimizer (:class:`torch.optim.Optimizer`): your optimizer object for gradient accumulation.
+        dataloader (:class:`torch.utils.data.DataLoader` or iterable objects):
+            your dataloader object, would be called like iter(dataloader)
+        accumulate_size (int): the number of steps to accumulate gradients
+        gradient_handlers (List[:class:`colossalai.engine.BaseGradientHandler`]):
+            list of gradient handler objects. Default is None.
+        lr_scheduler (`torch.optim.lr_scheduler` or `colossalai.nn.lr_scheduler`):
+            your ``lr_scheduler`` object for gradient accumulation. Defaults to None.
+    More details about `gradient_handlers` could be found in
+    `Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.
+    More details about `lr_scheduler` could be found
+    `lr_scheduler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/nn/lr_scheduler>`_. and
+    `how to adjust learning rate <https://pytorch.org/docs/stable/optim.html#how-to-adjust-learning-rate>`_.
+    """
+    optimizer = GradAccumOptimizer(optimizer, accumulate_size=accumulate_size, model=model)
+    dataloader = GradAccumDataloader(dataloader, accumulate_size=accumulate_size)
+    if gradient_handlers is not None:
+        gradient_handlers = [GradAccumGradientHandler(handler, accumulate_size) for handler in gradient_handlers]
+    if lr_scheduler is not None:
+        lr_scheduler = GradAccumLrSchedulerByStep(lr_scheduler, accumulate_size=accumulate_size)
+    return optimizer, dataloader, gradient_handlers, lr_scheduler
--- a/colossalai/engine/gradient_accumulation/_gradient_accumulation.py
+++ b/colossalai/engine/gradient_accumulation/_gradient_accumulation.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+from typing import Union
+import torch.nn as nn
+from torch import Tensor
+from typing import Iterable, Any, Tuple
+from colossalai.nn.optimizer import ColossalaiOptimizer
+from torch.nn.parallel.distributed import DistributedDataParallel
+from torch.optim import Optimizer
+from torch.optim.lr_scheduler import _LRScheduler
+from torch.utils.data import DataLoader
+from colossalai.utils import conditional_context
+from colossalai.engine import BaseGradientHandler
+class GradAccumOptimizer(ColossalaiOptimizer):
+    """A wrapper for the optimizer to enable gradient accumulation by skipping the steps 
+    before accumulation size is reached.
+    Args:
+        optim (:class:`torch.optim.Optimizer`): Your optimizer object for gradient accumulation.
+        accumulate_size (int): The number of steps to accumulate gradients.
+        model (:class:`torch.nn.Module`):
+            Your model object to check if it is DistributedDataParallel for special handling of no_sync() context.
+    """
+    def __init__(self, optim: Optimizer, accumulate_size: int, model: nn.Module = None):
+        super().__init__(optim)
+        self.accumulate_size = accumulate_size
+        self.accumulate_step = 0
+        # handle pytorch ddp auto all reduce
+        self.model = model
+        self.is_torch_ddp = isinstance(self.model, DistributedDataParallel)
+    def zero_grad(self, *args, **kwargs) -> None:
+        """
+        Set all gradients to zero.
+        Args:
+            *args: positional arguments for the optimizer wrapped
+            **kwargs: keyword arguments for the optimizer wrapped
+        """
+        if self.accumulate_step == 0:
+            self.optim.zero_grad(*args, **kwargs)
+    def step(self, *args, **kwargs) -> None:
+        """
+        Update the model parameters.
+        Args:
+            *args: positional arguments for the optimizer wrapped
+            **kwargs: keyword arguments for the optimizer wrapped
+        """
+        if self.accumulate_step < self.accumulate_size:
+            return None
+        else:
+            self.accumulate_step = 0
+            return self.optim.step(*args, **kwargs)
+    def clip_grad_norm(self, model: nn.Module, max_norm: float) -> None:
+        """
+        Clip gradients by norm.
+        Args:
+            model (:class:`torch.nn.Module`): a torch module instance
+            max_norm (float): the max norm for gradient clipping
+        """
+        if self.accumulate_step < self.accumulate_size:
+            pass
+        else:
+            self.optim.clip_grad_norm(model, max_norm)
+    def backward(self, loss: Tensor) -> None:
+        """Execute backward pass.
+        Args:
+            loss (:class:`torch.Tensor`): the loss value.
+        """
+        self.accumulate_step += 1
+        if self.is_torch_ddp:
+            no_sync = self.accumulate_step < self.accumulate_size
+            with conditional_context(self.model.no_sync(), enable=no_sync):
+                scaled_loss = loss / self.accumulate_size
+                self.optim.backward(scaled_loss)
+        else:
+            scaled_loss = loss / self.accumulate_size
+            self.optim.backward(scaled_loss)
+    def backward_by_grad(self, tensor: Tensor, grad: Tensor) -> None:
+        """Execute backward pass given the gradients of the output.
+        Args:
+            loss (:class:`torch.Tensor`): the loss value.
+            grad (:class:`torch.Tensor`): the output gradient.
+        """
+        self.accumulate_step += 1
+        no_sync = self.is_torch_ddp and self.accumulate_step < self.accumulate_size
+        if no_sync:
+            with self.model.no_sync():
+                self.optim.backward_by_grad(tensor, grad)
+        else:
+            self.optim.backward_by_grad(tensor, grad)
+class GradAccumDataloader:
+    """A wrapper for dataloader to enable gradient accumulation by dropping the last incomplete steps.
+    Note:
+        The dataloader would drop the last incomplete steps for gradient accumulation.
+        For example, if a dataloader has 10 batches of data and accumulate size is 4. The model parameters will
+        be updated only twice at step 4 and step 8. The last two batches of data do not form a complete 4-step cycle.
+        Thus, they will be automatically skipped by this class. If the dataloader is not standard PyTorch dataloader,
+        (e.g. Dali dataloader), this class will automatically consume (load data for nothing) the remaining 2 batches.
+    Args:
+        dataloader (``Iterable``): Your dataloader object for gradient accumulation.
+        accumulate_size (int): The number of steps to accumulate gradients.
+    """
+    def __init__(self, dataloader: Iterable, accumulate_size: int) -> None:
+        self.dataloader = dataloader
+        self.consume_remain_data = not isinstance(dataloader, DataLoader)
+        self.steps_per_epoch = len(dataloader) - len(dataloader) % accumulate_size
+    def __getattr__(self, __name: str) -> Any:
+        return getattr(self.dataloader, __name)
+    def __len__(self) -> int:
+        return self.steps_per_epoch
+    def __iter__(self) -> Iterable:
+        self._cur_step = 0
+        self._dataiter = iter(self.dataloader)
+        return self
+    def __next__(self) -> Union[Tensor, Tuple[Tensor]]:
+        if self._cur_step < self.steps_per_epoch:
+            self._cur_step += 1
+            data = next(self._dataiter)
+            if self._cur_step == self.steps_per_epoch and self.consume_remain_data:
+                # this is to handle non standard pytorch dataloader
+                # such as dali dataloader
+                while True:
+                    try:
+                        _ = next(self._dataiter)
+                    except StopIteration:
+                        break
+            return data
+        else:
+            raise StopIteration
+class GradAccumLrSchedulerByStep(_LRScheduler):
+    """A wrapper for the LR scheduler to enable gradient accumulation by skipping the steps 
+    before accumulation size is reached.
+    Args:
+        lr_scheduler (:class:`torch.optim.lr_scheduler._LRScheduler`):
+            Your ``lr_scheduler`` object for gradient accumulation.
+        accumulate_size (int): The number of steps to accumulate gradients.
+    """
+    def __init__(self, lr_scheduler: _LRScheduler, accumulate_size: int) -> None:
+        self.lr_scheduler = lr_scheduler
+        self.accumulate_size = accumulate_size
+        self.accumulate_step = 0
+    @staticmethod
+    def compute_effective_steps_per_epoch(dataloader: Iterable, accumulate_size: int) -> int:
+        """
+        Computes the number of effective training iterations. An effective iteration is defined
+        as the the aggregation of <accumulate_size> iterations. For examples, if accumulate_size = 4,
+        then 4 iterations are considered as one effective iteration.
+        Args:
+            dataloader (``Iterable``): Your dataloader object for gradient accumulation.
+            accumulate_size (int): The number of steps to accumulate gradients.
+        """
+        return len(dataloader) // accumulate_size
+    def __getattr__(self, __name: str) -> Any:
+        return getattr(self.lr_scheduler, __name)
+    def step(self, *args, **kwargs) -> None:
+        """
+        Update the learning rate.
+        Args:
+            *args: positional arguments for the lr scheduler wrapped.
+            **kwargs: keyword arguments for the lr scheduler wrapped.
+        """
+        self.accumulate_step += 1
+        if self.accumulate_step < self.accumulate_size:
+            pass
+        else:
+            self.accumulate_step = 0
+            self.lr_scheduler.step(*args, **kwargs)
+    def get_lr(self) -> Tensor:
+        """
+        Compute the next learning rate.
+        Returns:
+            Tensor: the upcoming learning rate.
+        """
+        return self.lr_scheduler.get_lr()
+    def get_last_lr(self) -> Tensor:
+        """
+        Returns the current learning rate.
+        Returns:
+            Tensor: the current learning rate.
+        """
+        return self.lr_scheduler.get_last_lr()
+    def print_lr(self, *args, **kwargs) -> None:
+        """
+        Print he learning rate.
+        Args:
+            *args: positional arguments for the lr scheduler wrapped.
+            **kwargs: keyword arguments for the lr scheduler wrapped.
+        """
+        self.lr_scheduler.print_lr(*args, **kwargs)
+    def state_dict(self) -> dict:
+        """
+        Returns the states of the lr scheduler as dictionary.
+        Returns:
+            dict: the states of the lr scheduler.
+        """
+        return self.lr_scheduler.state_dict()
+    def load_state_dict(self, state_dict: dict) -> None:
+        """
+        Load the states of the lr scheduler from a dictionary object.
+        Returns:
+            dict: the states of the lr scheduler.
+        """
+        self.lr_scheduler.load_state_dict(state_dict)
+class GradAccumGradientHandler:
+    r"""A wrapper for the gradient handler to enable gradient accumulation by skipping the steps
+    before accumulation size is reached.
+    Args:
+        grad_handler (:class:`colossalai.engine.BaseGradientHandler`):
+            Your ``gradient_handler`` object for gradient accumulation, would be called when achieving `accumulate_size`.
+        accumulate_size (int): The number of steps to accumulate gradients.
+    More details about ``gradient_handlers`` could be found in
+    `Gradient_handler <https://github.com/hpcaitech/ColossalAI/tree/main/colossalai/engine/gradient_handler>`_.
+    """
+    def __init__(self, grad_handler: BaseGradientHandler, accumulate_size: int) -> None:
+        assert isinstance(grad_handler, BaseGradientHandler), \
+            f'expected grad_handler to be type BaseGradientHandler, but got {type(grad_handler)}'
+        self.grad_handler = grad_handler
+        self.accumulate_size = accumulate_size
+        self.accumulate_step = 0
+    def handle_gradient(self) -> None:
+        """
+        Handle gradients reduction only in the last gradient accumulation step.
+        """
+        self.accumulate_step += 1
+        if self.accumulate_step < self.accumulate_size:
+            pass
+        else:
+            self.accumulate_step = 0
+            self.grad_handler.handle_gradient()
--- a/colossalai/engine/gradient_handler/__init__.py
+++ b/colossalai/engine/gradient_handler/__init__.py
+from ._base_gradient_handler import BaseGradientHandler
+from ._data_parallel_gradient_handler import DataParallelGradientHandler
+from ._zero_gradient_handler import ZeROGradientHandler
+from ._sequence_parallel_gradient_handler import SequenceParallelGradientHandler
+from ._pipeline_parallel_gradient_handler import PipelineSharedModuleGradientHandler
+from ._moe_gradient_handler import MoeGradientHandler
+from ._sequence_parallel_gradient_handler import SequenceParallelGradientHandler
+__all__ = [
+    'BaseGradientHandler', 'DataParallelGradientHandler', 'ZeROGradientHandler', 'PipelineSharedModuleGradientHandler',
+    'MoeGradientHandler', 'SequenceParallelGradientHandler'
+]
--- a/colossalai/engine/gradient_handler/_base_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_base_gradient_handler.py
+#!/usr/bin/env python
+# -*- encoding: utf-8 -*-
+from abc import ABC, abstractmethod
+class BaseGradientHandler(ABC):
+    """A basic helper class to handle all-reduce operations of gradients across different parallel groups 
+    before optimization.
+    Args:
+        model (Module): Model where the gradients accumulate.
+        optimizer (Optimizer): Optimizer for updating the parameters.
+    """
+    def __init__(self, model, optimizer):
+        self._model = model
+        self._optimizer = optimizer
+    @abstractmethod
+    def handle_gradient(self):
+        """A method to accumulate gradients across different parallel groups. Users should
+        write their own functions or just use the functions in pre-defined subclasses.
+        """
+        pass
--- a/colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_data_parallel_gradient_handler.py
+from colossalai.core import global_context as gpc
+from colossalai.registry import GRADIENT_HANDLER
+from ._base_gradient_handler import BaseGradientHandler
+from ...context.parallel_mode import ParallelMode
+from .utils import bucket_allreduce
+@GRADIENT_HANDLER.register_module
+class DataParallelGradientHandler(BaseGradientHandler):
+    """A helper class to handle all-reduce operations in a data parallel group.
+    A all-reduce collective communication will be operated in 
+    :func:`handle_gradient` among a data parallel group.
+    For better performance, it bucketizes the gradients of all parameters that are 
+    the same type to improve the efficiency of communication.
+    Args:
+        model (Module): Model where the gradients accumulate.
+        optimizer (Optimizer): Optimizer for updating the parameters.
+    """
+    def handle_gradient(self):
+        """A method running a all-reduce operation in a data parallel group.
+        """
+        # TODO: add memory buffer
+        if gpc.data_parallel_size > 1:
+            bucket_allreduce(param_list=self._model.parameters(), group=gpc.get_group(ParallelMode.DATA))
--- a/colossalai/engine/gradient_handler/_moe_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_moe_gradient_handler.py
+from colossalai.core import global_context as gpc
+from colossalai.registry import GRADIENT_HANDLER
+from colossalai.utils.moe import get_moe_epsize_param_dict
+from ._base_gradient_handler import BaseGradientHandler
+from ...context.parallel_mode import ParallelMode
+from .utils import bucket_allreduce
+from colossalai.context.moe_context import MOE_CONTEXT
+@GRADIENT_HANDLER.register_module
+class MoeGradientHandler(BaseGradientHandler):
+    """A helper class to handle all-reduce operations in a data parallel group and
+    moe model parallel. A all-reduce collective communication will be operated in
+    :func:`handle_gradient` among a data parallel group.
+    For better performance, it bucketizes the gradients of all parameters that are
+    the same type to improve the efficiency of communication.
+    Args:
+        model (Module): Model where the gradients accumulate.
+        optimizer (Optimizer): Optimizer for updating the parameters.
+    """
+    def __init__(self, model, optimizer=None):
+        super().__init__(model, optimizer)
+    def handle_gradient(self):
+        """A method running an all-reduce operation in a data parallel group.
+        Then running an all-reduce operation for all parameters in experts
+        across moe model parallel group
+        """
+        global_data = gpc.data_parallel_size
+        if global_data > 1:
+            epsize_param_dict = get_moe_epsize_param_dict(self._model)
+            # epsize is 1, indicating the params are replicated among processes in data parallelism
+            # use the ParallelMode.DATA to get data parallel group
+            # reduce gradients for all parameters in data parallelism
+            if 1 in epsize_param_dict:
+                bucket_allreduce(param_list=epsize_param_dict[1], group=gpc.get_group(ParallelMode.DATA))
+            for ep_size in epsize_param_dict:
+                if ep_size != 1 and ep_size != MOE_CONTEXT.world_size:
+                    bucket_allreduce(param_list=epsize_param_dict[ep_size],
+                                     group=MOE_CONTEXT.parallel_info_dict[ep_size].dp_group)
--- a/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_pipeline_parallel_gradient_handler.py
+#!/usr/bin/env python
+from collections import defaultdict
+import torch
+import torch.distributed as dist
+from colossalai.core import global_context as gpc
+from colossalai.registry import GRADIENT_HANDLER
+from torch._utils import _flatten_dense_tensors, _unflatten_dense_tensors
+from ._base_gradient_handler import BaseGradientHandler
+@GRADIENT_HANDLER.register_module
+class PipelineSharedModuleGradientHandler(BaseGradientHandler):
+    """A helper class to handle all-reduce operations in sub parallel groups.
+    A all-reduce collective communication will be operated in 
+    :func:`handle_gradient` among all sub pipeline parallel groups.
+    For better performance, it bucketizes the gradients of all parameters that are 
+    the same type to improve the efficiency of communication.
+    Args:
+        model (Module): Model where the gradients accumulate.
+        optimizer (Optimizer): Optimizer for updating the parameters.
+    """
+    def handle_gradient(self):
+        """A method running a all-reduce operation in sub pipeline parallel groups.
+        """
+        if gpc.pipeline_parallel_size > 1:
+            # bucketize and all-reduce
+            buckets = defaultdict(lambda: defaultdict(list))
+            # Pack the buckets.
+            for param in self._model.parameters():
+                group = getattr(param, 'pipeline_shared_module_pg', None)
+                if param.requires_grad and group is not None and (
+                    (hasattr(param, 'colo_attr') and not param.colo_attr.saved_grad.is_null())
+                        or param.grad is not None):
+                    tp = param.data.type()
+                    buckets[group][tp].append(param)
+            # For each bucket, all-reduce and copy all-reduced grads.
+            for group, group_buckets in buckets.items():
+                for tp, bucket in group_buckets.items():
+                    grads = [
+                        param.colo_attr.grad_payload if hasattr(param, 'colo_attr') else param.grad.data
+                        for param in bucket
+                    ]
+                    coalesced = _flatten_dense_tensors(grads).to(torch.cuda.current_device())
+                    dist.all_reduce(coalesced, op=dist.ReduceOp.SUM, group=group)
+                    for buf, synced in zip(grads, _unflatten_dense_tensors(coalesced, grads)):
+                        buf.copy_(synced)
--- a/colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py
+++ b/colossalai/engine/gradient_handler/_sequence_parallel_gradient_handler.py
+from colossalai.core import global_context as gpc
+from colossalai.registry import GRADIENT_HANDLER
+from ._base_gradient_handler import BaseGradientHandler
+from ...context.parallel_mode import ParallelMode
+from .utils import bucket_allreduce
+@GRADIENT_HANDLER.register_module
+class SequenceParallelGradientHandler(BaseGradientHandler):
+    """A helper class to handle all-reduce operations in a data parallel group.
+    A all-reduce collective communication will be operated in 
+    :func:`handle_gradient` among a data parallel group.
+    For better performance, it bucketizes the gradients of all parameters that are 
+    the same type to improve the efficiency of communication.
+    Args:
+        model (Module): Model where the gradients accumulate.
+        optimizer (Optimizer): Optimizer for updating the parameters.
+    """
+    def handle_gradient(self):
+        """A method running a all-reduce operation in a data parallel group.
+        """
+        if gpc.get_world_size(ParallelMode.SEQUENCE_DP) > 1:
+            bucket_allreduce(param_list=self._model.parameters(), group=gpc.get_group(ParallelMode.SEQUENCE_DP))