Unverified commit 0fedef4f authored by アマデウス, committed by GitHub

Layer integration (#83)



* integrated parallel layers for ease of building models

* integrated 2.5d layers

* cleaned up code and unit tests

* added a log-metric-by-step hook; updated the ImageNet benchmark; fixed some bugs

* reworked initialization; cleaned up code
Co-authored-by: BoxiangW <45734921+BoxiangW@users.noreply.github.com>
parent 5c3843dc
import math
from typing import Callable
import torch
import torch.distributed as dist
from torch import Tensor
from torch.nn import Parameter, init as init
from colossalai.context import seed, ParallelMode
import torch.nn as nn
import torch.nn.functional as F
from colossalai.communication import broadcast
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.nn import init as init
from colossalai.registry import LAYERS
from colossalai.utils import get_current_device
from ._operation import Matmul_AB_2D, Add_Bias_2D, _LayerNorm_2D
from ._utils import get_summa_dim_from_env, assert_summa_initialization
from .._common_utils import divide, set_tensor_parallel_attribute_by_partition
from torch import Tensor, dtype
from torch.nn import Parameter
from .._common_utils import (divide, set_tensor_parallel_attribute_by_partition, to_2tuple)
from ..base_layer import ParallelLayer
from ._operation import (Matmul_AB_2D, add_bias_2d, all_gather_weight_2d, classifier_2d, layernorm_2d, split_batch_2d)
from ._utils import assert_summa_initialization, get_summa_dim_from_env
@LAYERS.register_module
@@ -30,15 +34,14 @@ class Linear2D(ParallelLayer):
:param skip_bias_add: If set to ``True``, bias addition is skipped in this linear layer so it can be fused into a later kernel, defaults to False
:type skip_bias_add: bool, optional
"""
def __init__(self,
in_features: int,
out_features: int,
bias: bool = True,
dtype=None,
skip_bias_add: bool = False,
init_weight='torch',
init_bias='torch'):
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
super().__init__()
self.in_features = in_features
@@ -52,118 +55,57 @@ class Linear2D(ParallelLayer):
self.summa_dim = get_summa_dim_from_env()
# partitioning dimension
self.input_size_per_partition = divide(
self.in_features, self.summa_dim)
self.hidden_size_per_partition = divide(
self.out_features, self.summa_dim)
self.input_size_per_partition = divide(self.in_features, self.summa_dim)
self.hidden_size_per_partition = divide(self.out_features, self.summa_dim)
# create weight, shape: [k/q, h/q]
factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
self.weight = Parameter(torch.empty(
self.input_size_per_partition,
self.hidden_size_per_partition,
**factory_kwargs))
self.weight = Parameter(
torch.empty(self.input_size_per_partition, self.hidden_size_per_partition, **factory_kwargs))
# create bias, shape: [h/q]
if bias:
self.bias = Parameter(torch.empty(
self.hidden_size_per_partition,
**factory_kwargs))
self.bias = Parameter(torch.empty(divide(self.out_features, self.summa_dim**2), **factory_kwargs))
else:
self.register_parameter('bias', None)
# initialize parameters
with seed(ParallelMode.TENSOR):
self.reset_parameters(init_weight, init_bias)
self.reset_parameters(weight_initializer, bias_initializer)
self._set_tensor_parallel_attributes()
def _set_tensor_parallel_attributes(self):
num_partition = gpc.get_world_size(ParallelMode.TENSOR)
set_tensor_parallel_attribute_by_partition(self.weight, num_partition)
set_tensor_parallel_attribute_by_partition(self.weight, self.summa_dim**2)
if self.bias is not None:
set_tensor_parallel_attribute_by_partition(self.bias, num_partition)
set_tensor_parallel_attribute_by_partition(self.bias, self.summa_dim**2)
def reset_parameters(self, init_weight, init_bias) -> None:
assert init_weight in ('torch', 'jax', 'zero')
assert init_bias in ('torch', 'jax', 'zero')
# setting
def reset_parameters(self, weight_initializer, bias_initializer) -> None:
fan_in, fan_out = self.in_features, self.out_features
# init weight
if init_weight == 'torch':
a = math.sqrt(5)
nonlinearity = 'leaky_relu'
std = init.calculate_gain(nonlinearity, a) / math.sqrt(fan_in)
bound = math.sqrt(3.0) * std
init.uniform_(self.weight, -bound, bound)
elif init_weight == 'jax':
std = math.sqrt(2.0 / float(fan_in + fan_out))
a = math.sqrt(3.0) * std
init.uniform_(self.weight, -a, a)
elif init_weight == 'zero':
init.zeros_(self.weight)
# init bias
weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
if self.bias is not None:
if init_bias == 'torch':
bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
init.uniform_(self.bias, -bound, bound)
elif init_bias == 'jax':
init.normal_(self.bias, std=1e-6)
elif init_bias == 'zero':
init.zeros_(self.bias)
bias_initializer(self.bias, fan_in=fan_in)
def forward(self, x: Tensor) -> Tensor:
# input: [m/q, n/q, k/q]
# output: [m/q, n/q, h/q]
out_shape = x.shape[:-1] + (self.hidden_size_per_partition,)
output = Matmul_AB_2D.apply(
x,
self.weight,
self.summa_dim,
out_shape,
self.row_rank,
self.col_rank,
ParallelMode.PARALLEL_2D_ROW,
ParallelMode.PARALLEL_2D_COL,
self.data_parallel_rank,
self.pipeline_parallel_rank,
self.pipeline_parallel_size,
self.tensor_parallel_size)
out_shape = x.shape[:-1] + (self.hidden_size_per_partition, )
output = Matmul_AB_2D.apply(x, self.weight, self.summa_dim, out_shape, self.row_rank, self.col_rank,
ParallelMode.PARALLEL_2D_ROW, ParallelMode.PARALLEL_2D_COL, self.data_parallel_rank,
self.pipeline_parallel_rank, self.pipeline_parallel_size, self.tensor_parallel_size)
if self.bias is not None:
if self.skip_bias_add:
bias = Add_Bias_2D.apply(
None,
self.bias,
self.hidden_size_per_partition,
self.row_rank,
self.col_rank,
ParallelMode.PARALLEL_2D_ROW,
ParallelMode.PARALLEL_2D_COL,
True,
self.data_parallel_rank,
self.pipeline_parallel_rank,
self.pipeline_parallel_size,
self.tensor_parallel_size
)
bias = add_bias_2d.apply(None, self.bias, self.hidden_size_per_partition, self.row_rank, self.col_rank,
ParallelMode.PARALLEL_2D_ROW, ParallelMode.PARALLEL_2D_COL, True,
self.data_parallel_rank, self.pipeline_parallel_rank,
self.pipeline_parallel_size, self.tensor_parallel_size)
return output, bias
else:
output = Add_Bias_2D.apply(
output,
self.bias,
self.hidden_size_per_partition,
self.row_rank,
self.col_rank,
ParallelMode.PARALLEL_2D_ROW,
ParallelMode.PARALLEL_2D_COL,
False,
self.data_parallel_rank,
self.pipeline_parallel_rank,
self.pipeline_parallel_size,
self.tensor_parallel_size
)
output = add_bias_2d.apply(output, self.bias, self.hidden_size_per_partition, self.row_rank,
self.col_rank, ParallelMode.PARALLEL_2D_ROW, ParallelMode.PARALLEL_2D_COL,
False, self.data_parallel_rank, self.pipeline_parallel_rank,
self.pipeline_parallel_size, self.tensor_parallel_size)
return output
else:
return output
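
# A minimal single-device sketch (illustration only; names are local to this example):
# Matmul_AB_2D realises a blocked SUMMA product over a q x q grid, where rank (i, j)
# holds blocks X[i][j] and W[i][j] and accumulates C[i][j] = sum_k X[i][k] @ W[k][j].
# Emulating the grid with nested loops reproduces the dense matmul exactly.
import torch

def summa_ab_reference(x: torch.Tensor, w: torch.Tensor, q: int) -> torch.Tensor:
    x_blocks = [row.chunk(q, dim=1) for row in x.chunk(q, dim=0)]  # x_blocks[i][k]
    w_blocks = [row.chunk(q, dim=1) for row in w.chunk(q, dim=0)]  # w_blocks[k][j]
    out_rows = []
    for i in range(q):
        # block row i of the output, one block per column rank j
        blocks = [sum(x_blocks[i][k] @ w_blocks[k][j] for k in range(q)) for j in range(q)]
        out_rows.append(torch.cat(blocks, dim=1))
    return torch.cat(out_rows, dim=0)

x_demo, w_demo = torch.randn(8, 12), torch.randn(12, 16)
assert torch.allclose(summa_ab_reference(x_demo, w_demo, q=2), x_demo @ w_demo, atol=1e-5)
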
@@ -183,12 +125,7 @@ class LayerNorm2D(ParallelLayer):
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
"""
def __init__(self,
normalized_shape: int,
eps: float = 1e-05,
dtype=None
):
def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None):
super().__init__()
# layer norm config
@@ -202,63 +139,252 @@ class LayerNorm2D(ParallelLayer):
self.summa_dim = get_summa_dim_from_env()
# partitioning dimension
self.partitioned_partition = divide(normalized_shape, self.summa_dim)
self.partitioned_partition = divide(normalized_shape, self.summa_dim**2)
# create parameters
factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
self.gamma = Parameter(torch.ones(
self.partitioned_partition,
**factory_kwargs))
self.beta = Parameter(torch.zeros(
self.partitioned_partition,
**factory_kwargs))
self.gamma = Parameter(torch.ones(self.partitioned_partition, **factory_kwargs))
self.beta = Parameter(torch.zeros(self.partitioned_partition, **factory_kwargs))
self._set_tensor_parallel_attributes()
def _set_tensor_parallel_attributes(self):
num_partition = gpc.get_world_size(ParallelMode.TENSOR)
set_tensor_parallel_attribute_by_partition(self.gamma, num_partition)
set_tensor_parallel_attribute_by_partition(self.beta, num_partition)
set_tensor_parallel_attribute_by_partition(self.gamma, self.summa_dim**2)
set_tensor_parallel_attribute_by_partition(self.beta, self.summa_dim**2)
def forward(self, x: Tensor) -> Tensor:
with torch.no_grad():
E_x = torch.sum(x, dim=-1, keepdim=True) # [b/q, s, 1]
torch.distributed.all_reduce(
E_x, group=gpc.get_group(ParallelMode.PARALLEL_2D_ROW))
torch.distributed.all_reduce(E_x, group=gpc.get_group(ParallelMode.PARALLEL_2D_ROW))
E_x /= self.normalized_shape
# Var_x in the block below is the sum of input^2
Var_x = torch.sum(x * x, dim=-1, keepdim=True) # [b/q, s, 1]
torch.distributed.all_reduce(
Var_x, group=gpc.get_group(ParallelMode.PARALLEL_2D_ROW))
torch.distributed.all_reduce(Var_x, group=gpc.get_group(ParallelMode.PARALLEL_2D_ROW))
Var_x /= self.normalized_shape
Var_x = Var_x - E_x * E_x # variance of x [b/q, s, 1]
# Var_x now holds 1 / sqrt(Var[x] + epsilon)
Var_x = 1.0 / torch.sqrt(Var_x + self.variance_epsilon)
output = _LayerNorm_2D.apply(x, E_x, Var_x, self.normalized_shape,
ParallelMode.PARALLEL_2D_ROW, ParallelMode.PARALLEL_2D_COL)
bias = Add_Bias_2D.apply(
None, self.beta, self.partitioned_partition,
self.row_rank, self.col_rank,
ParallelMode.PARALLEL_2D_ROW, ParallelMode.PARALLEL_2D_COL,
True,
self.data_parallel_rank,
self.pipeline_parallel_rank,
self.pipeline_parallel_size,
self.tensor_parallel_size
)
scale = Add_Bias_2D.apply(
None, self.gamma, self.partitioned_partition,
self.row_rank, self.col_rank,
ParallelMode.PARALLEL_2D_ROW, ParallelMode.PARALLEL_2D_COL,
True,
self.data_parallel_rank,
self.pipeline_parallel_rank,
self.pipeline_parallel_size,
self.tensor_parallel_size
)
output = layernorm_2d.apply(x, E_x, Var_x, self.normalized_shape, ParallelMode.PARALLEL_2D_ROW,
ParallelMode.PARALLEL_2D_COL)
bias = add_bias_2d.apply(None, self.beta, self.partitioned_partition, self.row_rank, self.col_rank,
ParallelMode.PARALLEL_2D_ROW, ParallelMode.PARALLEL_2D_COL, True,
self.data_parallel_rank, self.pipeline_parallel_rank, self.pipeline_parallel_size,
self.tensor_parallel_size)
scale = add_bias_2d.apply(None, self.gamma, self.partitioned_partition, self.row_rank, self.col_rank,
ParallelMode.PARALLEL_2D_ROW, ParallelMode.PARALLEL_2D_COL, True,
self.data_parallel_rank, self.pipeline_parallel_rank, self.pipeline_parallel_size,
self.tensor_parallel_size)
output = torch.addcmul(bias, scale, output)
return output
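
# A minimal single-device sketch (illustration only; names are local to this example):
# each rank in a tensor-parallel row holds a 1/q slice of the hidden dimension, so the
# E_x and Var_x in the forward above are per-slice sums followed by an all-reduce over
# PARALLEL_2D_ROW. Emulating the q slices with chunk() recovers standard layer norm.
import torch
import torch.nn.functional as F

def layernorm_2d_stats_reference(x: torch.Tensor, q: int, eps: float = 1e-5) -> torch.Tensor:
    hidden = x.shape[-1]
    slices = x.chunk(q, dim=-1)                                              # what each row rank holds
    e_x = sum(s.sum(dim=-1, keepdim=True) for s in slices) / hidden          # all-reduced E[x]
    e_x2 = sum((s * s).sum(dim=-1, keepdim=True) for s in slices) / hidden   # all-reduced E[x^2]
    var_x = e_x2 - e_x * e_x                                                 # Var[x]
    return (x - e_x) / torch.sqrt(var_x + eps)                               # gamma/beta scaling follows

x_demo = torch.randn(4, 7, 12)
assert torch.allclose(layernorm_2d_stats_reference(x_demo, q=2), F.layer_norm(x_demo, (12,)), atol=1e-5)
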
@LAYERS.register_module
class PatchEmbedding2D(ParallelLayer):
""" 2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param in_chans: number of channels of the input image
:type in_chans: int
:param embed_size: dimension of the embedding
:type embed_size: int
:param flatten: whether to flatten output tensor, defaults to True
:type flatten: bool, optional
"""
def __init__(self,
img_size: int,
patch_size: int,
in_chans: int,
embed_size: int,
dtype: dtype = None,
flatten: bool = True,
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
position_embed_initializer: Callable = init.zeros_()):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
assert_summa_initialization()
self.summa_dim = get_summa_dim_from_env()
self.img_size = img_size
self.patch_size = patch_size
self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.flatten = flatten
self.embed_size = embed_size
self.embed_size_per_partition = embed_size // (self.summa_dim**2)
with seed(ParallelMode.TENSOR):
self.weight = Parameter(
torch.empty((self.embed_size_per_partition, in_chans, *self.patch_size),
device=get_current_device(),
dtype=dtype))
self.bias = Parameter(torch.empty(self.embed_size_per_partition, device=get_current_device(), dtype=dtype))
self.cls_token = Parameter(
torch.zeros((1, 1, self.embed_size_per_partition), device=get_current_device(), dtype=dtype))
self.pos_embed = Parameter(
torch.zeros((1, self.num_patches + 1, self.embed_size_per_partition),
device=get_current_device(),
dtype=dtype))
self.reset_parameters(weight_initializer, bias_initializer, position_embed_initializer)
self._set_tensor_parallel_attribute()
def _set_tensor_parallel_attribute(self):
set_tensor_parallel_attribute_by_partition(self.weight, self.summa_dim**2)
set_tensor_parallel_attribute_by_partition(self.bias, self.summa_dim**2)
set_tensor_parallel_attribute_by_partition(self.cls_token, self.summa_dim**2)
set_tensor_parallel_attribute_by_partition(self.pos_embed, self.summa_dim**2)
def reset_parameters(self, weight_initializer, bias_initializer, position_embed_initializer):
with seed(ParallelMode.TENSOR):
fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
fan_out = self.embed_size
weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
bias_initializer(self.bias, fan_in=fan_in)
position_embed_initializer(self.pos_embed)
def forward(self, input_: Tensor) -> Tensor:
B, C, H, W = input_.shape
assert H == self.img_size[0] and W == self.img_size[1], \
f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
input_ = split_batch_2d(input_)
weight = all_gather_weight_2d.apply(self.weight, 0, self.summa_dim, ParallelMode.PARALLEL_2D_COL)
bias = all_gather_weight_2d.apply(self.bias, 0, self.summa_dim, ParallelMode.PARALLEL_2D_COL)
output = F.conv2d(input_, weight, bias, stride=self.patch_size)
if self.flatten:
output = output.flatten(2).transpose(1, 2) # BCHW -> BNC
cls_token = all_gather_weight_2d.apply(self.cls_token, -1, self.summa_dim, ParallelMode.PARALLEL_2D_COL)
pos_embed = all_gather_weight_2d.apply(self.pos_embed, -1, self.summa_dim, ParallelMode.PARALLEL_2D_COL)
cls_token = cls_token.expand(output.shape[0], -1, -1)
output = torch.cat((cls_token, output), dim=1)
output = output + pos_embed
return output
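
# A minimal single-device sketch (illustration only; names are local to this example):
# the shape bookkeeping behind PatchEmbedding2D. A convolution whose kernel and stride
# both equal the patch size maps [B, C, H, W] to [B, E, H/P, W/P]; flattening and
# transposing yields the [B, N, E] patch tokens that the cls token and position
# embedding then extend. In the parallel layer, E is sharded summa_dim**2 ways and the
# weight/bias are all-gathered along the column group before F.conv2d.
import torch
import torch.nn.functional as F

B, C, H, W, P, E = 2, 3, 32, 32, 16, 24
images = torch.randn(B, C, H, W)
conv_weight = torch.randn(E, C, P, P)
conv_bias = torch.randn(E)
patches = F.conv2d(images, conv_weight, conv_bias, stride=P)   # [B, E, H/P, W/P]
tokens = patches.flatten(2).transpose(1, 2)                    # [B, N, E], N = (H/P) * (W/P)
assert tokens.shape == (B, (H // P) * (W // P), E)
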
@LAYERS.register_module
class Embedding2D(ParallelLayer):
def __init__(self,
num_embeddings: int,
embedding_dim: int,
padding_idx: int = None,
dtype: dtype = None,
weight_initializer: Callable = init.normal_(),
*args,
**kwargs):
super().__init__()
assert_summa_initialization()
self.summa_dim = get_summa_dim_from_env()
self.num_embeddings = num_embeddings
self.embed_dim = embedding_dim
embed_dim_per_partition = divide(embedding_dim, self.summa_dim**2)
self.padding_idx = padding_idx
self.embed_args = args
self.embed_kwargs = kwargs
self.weight = Parameter(
torch.empty((num_embeddings, embed_dim_per_partition), device=get_current_device(), dtype=dtype))
self.reset_parameters(weight_initializer)
self._set_tensor_parallel_attributes()
def _set_tensor_parallel_attributes(self):
set_tensor_parallel_attribute_by_partition(self.weight, self.summa_dim**2)
def reset_parameters(self, weight_initializer) -> None:
with seed(ParallelMode.TENSOR):
fan_in, fan_out = self.num_embeddings, self.embed_dim
weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
self._fill_padding_idx_with_zero()
def _fill_padding_idx_with_zero(self) -> None:
if self.padding_idx is not None:
with torch.no_grad():
self.weight[self.padding_idx].fill_(0)
def forward(self, input_: Tensor) -> Tensor:
input_ = split_batch_2d(input_)
weight = all_gather_weight_2d.apply(self.weight, -1, self.summa_dim, ParallelMode.PARALLEL_2D_COL)
output = F.embedding(input_, weight, self.padding_idx, *self.embed_args, **self.embed_kwargs)
return output
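
# A minimal single-device sketch (illustration only; names are local to this example):
# why Embedding2D may all-gather its weight along the embedding dimension before the
# lookup. Row lookups commute with concatenation along the last dimension, so looking
# up in per-rank shards and concatenating equals looking up in the gathered table.
import torch
import torch.nn.functional as F

vocab, dim, shards = 11, 8, 4
table = torch.randn(vocab, dim)
ids = torch.randint(0, vocab, (3, 5))
full_lookup = F.embedding(ids, table)
sharded_lookup = torch.cat([F.embedding(ids, shard) for shard in table.chunk(shards, dim=-1)], dim=-1)
assert torch.allclose(full_lookup, sharded_lookup)
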
@LAYERS.register_module
class Classifier2D(ParallelLayer):
def __init__(self,
in_features: int,
num_classes: int,
weight: Parameter = None,
bias: bool = True,
dtype: dtype = None,
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
super().__init__()
self.in_features = in_features
self.num_classes = num_classes
assert_summa_initialization()
self.row_rank = gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL)
self.col_rank = gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW)
self.summa_dim = get_summa_dim_from_env()
# partitioning dimension
self.input_size_per_partition = divide(self.in_features, self.summa_dim**2)
if weight is not None:
self.weight = weight
self.has_weight = False
else:
self.weight = Parameter(
torch.empty(self.num_classes, self.input_size_per_partition, device=get_current_device(), dtype=dtype))
self.has_weight = True
if bias:
self.bias = Parameter(torch.zeros(self.num_classes, device=get_current_device(), dtype=dtype))
else:
self.bias = None
self.reset_parameters(weight_initializer, bias_initializer)
self._set_tensor_parallel_attributes()
def _set_tensor_parallel_attributes(self):
if self.has_weight:
set_tensor_parallel_attribute_by_partition(self.weight, self.summa_dim**2)
def reset_parameters(self, weight_initializer, bias_initializer) -> None:
with seed(ParallelMode.TENSOR):
fan_in, fan_out = self.in_features, self.num_classes
col_src_rank = gpc.get_ranks_in_group(ParallelMode.PARALLEL_2D_COL)[0]
row_src_rank = gpc.get_ranks_in_group(ParallelMode.PARALLEL_2D_ROW)[0]
if self.has_weight:
weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
if self.bias is not None:
bias_initializer(self.bias, fan_in=fan_in)
broadcast(self.bias, col_src_rank, ParallelMode.PARALLEL_2D_COL)
broadcast(self.bias, row_src_rank, ParallelMode.PARALLEL_2D_ROW)
def forward(self, input_: Tensor) -> Tensor:
out_shape = input_.shape[:-1] + (self.num_classes, )
return classifier_2d.apply(input_, self.weight, self.bias, self.summa_dim, out_shape, self.row_rank,
self.col_rank, ParallelMode.PARALLEL_2D_ROW, ParallelMode.PARALLEL_2D_COL,
self.data_parallel_rank, self.pipeline_parallel_rank, self.pipeline_parallel_size,
self.tensor_parallel_size)
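
# A minimal single-device sketch (illustration only; names are local to this example):
# the reduction behind classifier_2d. The feature dimension of the activations and of
# the class weight is sharded across the grid, so the logits are the sum of per-shard
# partial products (realised in the parallel kernel by an all-gather along the column
# group followed by an all-reduce along the row group), plus a replicated bias.
import torch
import torch.nn.functional as F

def classifier_reference(x: torch.Tensor, w: torch.Tensor, b: torch.Tensor, shards: int) -> torch.Tensor:
    x_shards = x.chunk(shards, dim=-1)     # activation feature shards
    w_shards = w.chunk(shards, dim=-1)     # weight feature shards, w: [num_classes, in_features]
    partial = sum(xs @ ws.t() for xs, ws in zip(x_shards, w_shards))
    return partial + b

x_demo, w_demo, b_demo = torch.randn(5, 16), torch.randn(10, 16), torch.randn(10)
assert torch.allclose(classifier_reference(x_demo, w_demo, b_demo, shards=4),
                      F.linear(x_demo, w_demo, b_demo), atol=1e-5)
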
from ._operation import Matmul_AB_2p5D, Matmul_ABT_2p5D, Matmul_ATB_2p5D, Add_Bias_2p5D
from ._transformer import TransformerMLP2p5D, TransformerSelfAttention2p5D, TransformerLayer2p5D
from ._vit import ViTMLP2p5D, ViTSelfAttention2p5D, ViTHead2p5D, ViTPatchEmbedding2p5D, ViTTokenFuser2p5D, ViTInputSplitter2p5D
from .layers import Linear2p5D, LayerNorm2p5D
from ._operation import reduce_by_batch_2p5d, split_batch_2p5d
from .layers import Classifier2p5D, Embedding2p5D, LayerNorm2p5D, Linear2p5D, PatchEmbedding2p5D
__all__ = [
'Matmul_AB_2p5D', 'Matmul_ABT_2p5D', 'Matmul_ATB_2p5D', 'Add_Bias_2p5D',
'TransformerMLP2p5D', 'TransformerSelfAttention2p5D', 'TransformerLayer2p5D',
'ViTMLP2p5D', 'ViTSelfAttention2p5D', 'ViTHead2p5D', 'ViTPatchEmbedding2p5D', 'ViTTokenFuser2p5D',
'ViTInputSplitter2p5D',
'Linear2p5D', 'LayerNorm2p5D'
'split_batch_2p5d', 'reduce_by_batch_2p5d', 'Linear2p5D', 'LayerNorm2p5D', 'Classifier2p5D', 'PatchEmbedding2p5D',
'Embedding2p5D'
]
@@ -2,11 +2,11 @@ from typing import Any, Tuple
import torch
import torch.distributed as dist
from torch import Tensor
from colossalai.communication.collective import (all_gather, all_reduce, reduce_scatter)
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.utils import get_current_device
from torch import Tensor
from torch.cuda.amp import custom_bwd, custom_fwd
@@ -22,25 +22,92 @@ def get_parallel_rank(parallel_mode: ParallelMode):
return gpc.get_local_rank(parallel_mode)
class Matmul_AB_2p5D(torch.autograd.Function):
def split_batch_2p5d(input_: Tensor, dim: int = 0) -> Tensor:
return torch.chunk(input_, gpc.get_world_size(ParallelMode.PARALLEL_2P5D_COL),
dim=dim)[gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)].contiguous()
class classifier_2p5d(torch.autograd.Function):
"""Matrix multiplication for :math:`C = AB`
"""
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any,
def forward(
ctx: Any,
A: Tensor,
B: Tensor,
bias,
tesseract_dim: int,
out_shape: Tuple[int, ...],
row_rank: int,
col_rank: int,
dep_rank: int,
row_parallel_mode: ParallelMode,
col_parallel_mode: ParallelMode,
data_parallel_rank: int,
pipeline_parallel_rank: int,
pipeline_parallel_size: int,
tensor_parallel_size: int,
) -> Tensor:
A_shape = A.shape
A = A.reshape((-1, A_shape[-1]))
B_shape = B.shape
B = B.reshape((-1, B_shape[-1]))
B_temp = all_gather(B, -1, col_parallel_mode)
if ctx:
ctx.save_for_backward(A, B_temp)
C = torch.matmul(A, B_temp.transpose(0, 1))
C = all_reduce(C, row_parallel_mode)
ctx.use_bias = bias is not None
if bias is not None:
C = C + bias
out = C.reshape(out_shape)
if ctx:
ctx.tesseract_dim = tesseract_dim
ctx.row_rank = row_rank
ctx.col_rank = col_rank
ctx.row_parallel_mode = row_parallel_mode
ctx.col_parallel_mode = col_parallel_mode
ctx.A_shape = A_shape
ctx.B_shape = B_shape
ctx.data_parallel_rank = data_parallel_rank
ctx.pipeline_parallel_rank = pipeline_parallel_rank
ctx.pipeline_parallel_size = pipeline_parallel_size
ctx.tensor_parallel_size = tensor_parallel_size
return out
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
A, B = ctx.saved_tensors
with torch.no_grad():
A_grad = torch.matmul(output_grad, B)
A_grad = A_grad.reshape(ctx.A_shape)
B_grad = torch.matmul(output_grad.reshape(-1, output_grad.shape[-1]).transpose(0, 1), A)
B_grad = reduce_scatter(B_grad, -1, ctx.col_parallel_mode)
B_grad = B_grad.reshape(ctx.B_shape)
bias_grad = torch.sum(output_grad, dim=tuple(range(output_grad.ndim - 1)))
bias_grad = all_reduce(bias_grad, ctx.col_parallel_mode)
return A_grad, B_grad, bias_grad, None, None, None, None, None, None, None, None, None, None
class Matmul_AB_2p5D(torch.autograd.Function):
"""Matrix multiplication for :math:`C = AB`
"""
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any, A: Tensor, B: Tensor, tesseract_dim: int, out_shape: Tuple[int, ...], row_rank: int,
col_rank: int, dep_rank: int, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode,
data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
tensor_parallel_size: int) -> Tensor:
# A: [b / dq, s, h / q] -> [(b * s) / dq, h / q]
# B: [h / dq, s / q]
@@ -59,8 +126,8 @@ class Matmul_AB_2p5D(torch.autograd.Function):
C_shape = (A.shape[0], B.shape[-1])
C = torch.zeros(C_shape, dtype=A.dtype, device=get_current_device())
A_list = [torch.empty_like(A) for _ in range(gpc.get_world_size(row_parallel_mode)-1)]
B_list = [torch.empty_like(B) for _ in range(gpc.get_world_size(col_parallel_mode)-1)]
A_list = [torch.empty_like(A) for _ in range(gpc.get_world_size(row_parallel_mode) - 1)]
B_list = [torch.empty_like(B) for _ in range(gpc.get_world_size(col_parallel_mode) - 1)]
A_list.insert(gpc.get_local_rank(row_parallel_mode), A)
B_list.insert(gpc.get_local_rank(col_parallel_mode), B)
op_a = dist.all_gather(A_list, A, group=gpc.get_group(row_parallel_mode), async_op=True)
@@ -100,52 +167,26 @@ class Matmul_AB_2p5D(torch.autograd.Function):
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
A, B = ctx.saved_tensors
with torch.no_grad():
A_grad = Matmul_ABT_2p5D.apply(
output_grad, B,
ctx.tesseract_dim, ctx.A_shape,
ctx.row_rank, ctx.col_rank, ctx.dep_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
B_grad = Matmul_ATB_2p5D.apply(
A, output_grad,
ctx.tesseract_dim, ctx.B_shape,
ctx.row_rank, ctx.col_rank, ctx.dep_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
A_grad = Matmul_ABT_2p5D.apply(output_grad, B, ctx.tesseract_dim, ctx.A_shape, ctx.row_rank, ctx.col_rank,
ctx.dep_rank, ctx.row_parallel_mode, ctx.col_parallel_mode,
ctx.data_parallel_rank, ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size, ctx.tensor_parallel_size)
B_grad = Matmul_ATB_2p5D.apply(A, output_grad, ctx.tesseract_dim, ctx.B_shape, ctx.row_rank, ctx.col_rank,
ctx.dep_rank, ctx.row_parallel_mode, ctx.col_parallel_mode,
ctx.data_parallel_rank, ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size, ctx.tensor_parallel_size)
return A_grad, B_grad, None, None, None, None, None, None, None, None, None, None, None, None, None
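
# A minimal single-device sketch (illustration only): the identities the backward above
# relies on. For C = A @ B, dL/dA = dL/dC @ B.T and dL/dB = A.T @ dL/dC, which is why
# the gradients of Matmul_AB_2p5D are expressed with Matmul_ABT_2p5D and Matmul_ATB_2p5D
# on the same process grid; checked here against autograd.
import torch

A_demo = torch.randn(4, 6, requires_grad=True)
B_demo = torch.randn(6, 3, requires_grad=True)
dC = torch.randn(4, 3)
(A_demo @ B_demo).backward(dC)
assert torch.allclose(A_demo.grad, dC @ B_demo.detach().t(), atol=1e-6)
assert torch.allclose(B_demo.grad, A_demo.detach().t() @ dC, atol=1e-6)
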
class Matmul_ABT_2p5D(torch.autograd.Function):
"""Matrix multiplication for :math:`C = AB^T`
"""
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any,
A: Tensor,
B: Tensor,
tesseract_dim: int,
out_shape: Tuple[int, ...],
row_rank: int,
col_rank: int,
dep_rank: int,
row_parallel_mode: ParallelMode,
col_parallel_mode: ParallelMode,
data_parallel_rank: int,
pipeline_parallel_rank: int,
pipeline_parallel_size: int,
tensor_parallel_size: int
) -> Tensor:
def forward(ctx: Any, A: Tensor, B: Tensor, tesseract_dim: int, out_shape: Tuple[int, ...], row_rank: int,
col_rank: int, dep_rank: int, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode,
data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
tensor_parallel_size: int) -> Tensor:
assert A.shape[-1] == B.shape[-1], \
'Invalid shapes: A={}, B={} for ABT.'.format(A.shape, B.shape)
@@ -197,50 +238,25 @@ class Matmul_ABT_2p5D(torch.autograd.Function):
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
A, B = ctx.saved_tensors
with torch.no_grad():
A_grad = Matmul_AB_2p5D.apply(
output_grad, B,
ctx.tesseract_dim, ctx.A_shape,
ctx.row_rank, ctx.col_rank, ctx.dep_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
B_grad = Matmul_ATB_2p5D.apply(
output_grad, A,
ctx.tesseract_dim, ctx.B_shape,
ctx.row_rank, ctx.col_rank, ctx.dep_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
A_grad = Matmul_AB_2p5D.apply(output_grad, B, ctx.tesseract_dim, ctx.A_shape, ctx.row_rank, ctx.col_rank,
ctx.dep_rank, ctx.row_parallel_mode, ctx.col_parallel_mode,
ctx.data_parallel_rank, ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size, ctx.tensor_parallel_size)
B_grad = Matmul_ATB_2p5D.apply(output_grad, A, ctx.tesseract_dim, ctx.B_shape, ctx.row_rank, ctx.col_rank,
ctx.dep_rank, ctx.row_parallel_mode, ctx.col_parallel_mode,
ctx.data_parallel_rank, ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size, ctx.tensor_parallel_size)
return A_grad, B_grad, None, None, None, None, None, None, None, None, None, None, None, None, None
class Matmul_ATB_2p5D(torch.autograd.Function):
"""Matrix multiplication for :math:`C = A^TB`
"""
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any,
A: Tensor,
B: Tensor,
tesseract_dim: int,
out_shape: Tuple[int, ...],
row_rank: int,
col_rank: int,
dep_rank: int,
row_parallel_mode: ParallelMode,
col_parallel_mode: ParallelMode,
data_parallel_rank: int,
pipeline_parallel_rank: int,
pipeline_parallel_size: int,
def forward(ctx: Any, A: Tensor, B: Tensor, tesseract_dim: int, out_shape: Tuple[int, ...], row_rank: int,
col_rank: int, dep_rank: int, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode,
data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
tensor_parallel_size: int):
assert A.shape[-2] == B.shape[-2], \
@@ -261,14 +277,12 @@ class Matmul_ATB_2p5D(torch.autograd.Function):
src_a = i + row_rank * tesseract_dim + dep_rank * (
tesseract_dim ** 2) + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
pipeline_parallel_rank * tensor_parallel_size
dist.broadcast(A_temp, src=src_a,
group=get_parallel_group(row_parallel_mode))
dist.broadcast(A_temp, src=src_a, group=get_parallel_group(row_parallel_mode))
C_temp = torch.matmul(A_temp.transpose(0, 1), B)
src_c = col_rank + i * tesseract_dim + dep_rank * (
tesseract_dim ** 2) + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
pipeline_parallel_rank * tensor_parallel_size
dist.reduce(C_temp, dst=src_c,
group=get_parallel_group(col_parallel_mode))
dist.reduce(C_temp, dst=src_c, group=get_parallel_group(col_parallel_mode))
if i == row_rank:
C = C_temp.clone()
@@ -295,59 +309,30 @@ class Matmul_ATB_2p5D(torch.autograd.Function):
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
A, B = ctx.saved_tensors
with torch.no_grad():
A_grad = Matmul_ABT_2p5D.apply(
B, output_grad,
ctx.tesseract_dim, ctx.A_shape,
ctx.row_rank, ctx.col_rank, ctx.dep_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
B_grad = Matmul_AB_2p5D.apply(
A, output_grad,
ctx.tesseract_dim, ctx.B_shape,
ctx.row_rank, ctx.col_rank, ctx.dep_rank,
ctx.row_parallel_mode,
ctx.col_parallel_mode,
ctx.data_parallel_rank,
ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size,
ctx.tensor_parallel_size
)
A_grad = Matmul_ABT_2p5D.apply(B, output_grad, ctx.tesseract_dim, ctx.A_shape, ctx.row_rank, ctx.col_rank,
ctx.dep_rank, ctx.row_parallel_mode, ctx.col_parallel_mode,
ctx.data_parallel_rank, ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size, ctx.tensor_parallel_size)
B_grad = Matmul_AB_2p5D.apply(A, output_grad, ctx.tesseract_dim, ctx.B_shape, ctx.row_rank, ctx.col_rank,
ctx.dep_rank, ctx.row_parallel_mode, ctx.col_parallel_mode,
ctx.data_parallel_rank, ctx.pipeline_parallel_rank,
ctx.pipeline_parallel_size, ctx.tensor_parallel_size)
return A_grad, B_grad, None, None, None, None, None, None, None, None, None, None, None, None, None
class Add_Bias_2p5D(torch.autograd.Function):
"""Matrix add bias: :math:`C = A + b`
"""
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any,
input: Tensor,
bias: Tensor,
output_size_per_partition: int,
tesseract_dim: int,
row_rank: int,
col_rank: int,
dep_rank: int,
col_parallel_mode: ParallelMode,
skip_bias_add: bool,
data_parallel_rank: int,
pipeline_parallel_rank: int,
pipeline_parallel_size: int,
tensor_parallel_size: int
) -> Tensor:
def forward(ctx: Any, input: Tensor, bias: Tensor, output_size_per_partition: int, tesseract_dim: int,
row_rank: int, col_rank: int, dep_rank: int, col_parallel_mode: ParallelMode, skip_bias_add: bool,
data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
tensor_parallel_size: int) -> Tensor:
if row_rank == 0:
bias_temp = bias.clone()
else:
bias_temp = torch.zeros(
output_size_per_partition,
dtype=bias.dtype,
device=get_current_device())
bias_temp = torch.zeros(output_size_per_partition, dtype=bias.dtype, device=get_current_device())
src_rank = col_rank + dep_rank * (
tesseract_dim ** 2) + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
pipeline_parallel_rank * tensor_parallel_size
@@ -407,14 +392,10 @@ class Add_Bias_2p5D(torch.autograd.Function):
return output_grad, reduce_tmp, None, None, None, None, None, None, None, None, None, None, None, None, None, None, None
class _LayerNorm_2p5D(torch.autograd.Function):
class layernorm_2p5d(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx: Any,
input: Tensor,
E_x: Tensor,
Var_x: Tensor,
hidden_size: int,
def forward(ctx: Any, input: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int,
row_parallel_mode: ParallelMode) -> Tensor:
input = input - E_x
# in here, input = x - E[x], Var_x = 1 / sqrt(Var[x] + eps)
@@ -432,14 +413,11 @@ class _LayerNorm_2p5D(torch.autograd.Function):
# in here, Var_x = 1 / sqrt(Var[x] + eps), x = (x - E[x]) * Var_x
with torch.no_grad():
output_grad_sum = torch.sum(output_grad, dim=-1, keepdim=True)
torch.distributed.all_reduce(
output_grad_sum, group=get_parallel_group(row_parallel_mode))
torch.distributed.all_reduce(output_grad_sum, group=get_parallel_group(row_parallel_mode))
output_grad_sum /= ctx.hidden_size
output_grad_mul_x_sum = torch.sum(
output_grad * x, dim=-1, keepdim=True)
torch.distributed.all_reduce(
output_grad_mul_x_sum, group=get_parallel_group(row_parallel_mode))
output_grad_mul_x_sum = torch.sum(output_grad * x, dim=-1, keepdim=True)
torch.distributed.all_reduce(output_grad_mul_x_sum, group=get_parallel_group(row_parallel_mode))
output_grad_mul_x_sum /= ctx.hidden_size
input_grad = output_grad.clone()
@@ -450,105 +428,28 @@ class SplitFirst(torch.autograd.Function):
return input_grad, None, None, None, None, None, None
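
# A minimal single-device sketch (illustration only): the closed form used in the
# backward above. With x_hat = (x - E[x]) / sqrt(Var[x] + eps) and upstream gradient g,
# dL/dx = (g - mean(g) - x_hat * mean(g * x_hat)) / sqrt(Var[x] + eps), where the means
# run over the (row-parallel) hidden dimension; checked here against autograd.
import torch

x_demo = torch.randn(2, 5, 8, dtype=torch.double, requires_grad=True)
eps = 1e-5
mean = x_demo.mean(dim=-1, keepdim=True)
var = x_demo.var(dim=-1, unbiased=False, keepdim=True)
x_hat = (x_demo - mean) / torch.sqrt(var + eps)
g = torch.randn_like(x_hat)
x_hat.backward(g)
x_hat_d, var_d = x_hat.detach(), var.detach()
manual = (g - g.mean(dim=-1, keepdim=True)
          - x_hat_d * (g * x_hat_d).mean(dim=-1, keepdim=True)) / torch.sqrt(var_d + eps)
assert torch.allclose(x_demo.grad, manual, atol=1e-8)
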
# class Sum_2p5D(torch.autograd.Function):
# """Compute the sum of input tensors
# """
# @staticmethod
# def forward(ctx,
# inputs,
# dim,
# tesseract_dim,
# row_parallel_mode,
# keepdim=False):
# # input: [b/q, s, h/q]
# ctx.save_for_backward(inputs)
# # sum: [b/q, s]
# out = torch.sum(inputs, dim=dim, keepdim=keepdim)
# torch.distributed.all_reduce(
# out, group=gpc.get_group(row_parallel_mode))
# return out
# @staticmethod
# def backward(ctx, output_grad):
# with torch.no_grad():
# inputs = ctx.saved_tensors
# input_grad = torch.ones(inputs.shape, dtype=output_grad.dtype)
# return input_grad, None, None, None, None, None
# class _ViT_Split_2p5D(torch.autograd.Function):
# @staticmethod
# @custom_fwd(cast_inputs=torch.float16)
# def forward(ctx, inputs, batch_size,
# tesseract_dim, tesseract_dep,
# xz_parallel_mode):
# # inputs: [b, s, h/q]
# # output: [b/dq, s, h/q]
# ctx.BATCH_SIZE = batch_size
# ctx.tesseract_dim = tesseract_dim
# ctx.tesseract_dep = tesseract_dep
# ctx.xz_parallel_mode = xz_parallel_mode
# xz_rank = gpc.get_local_rank(xz_parallel_mode)
# output = torch.chunk(inputs, tesseract_dep *
# tesseract_dim, dim=0)[xz_rank]
# output = output.clone()
# return output
# @staticmethod
# @custom_bwd
# def backward(ctx, output_grad):
# # output_grad: [b/dq, s, h/q]
# # grads: [b, s, h/q]
# # *
# grads_shape = (ctx.BATCH_SIZE,) + output_grad.shape[1:]
# grads = torch.empty(grads_shape,
# dtype=output_grad.dtype,
# device=get_current_device())
# dist.all_gather(list(grads.chunk(ctx.tesseract_dim * ctx.tesseract_dep, dim=0)),
# output_grad.contiguous(),
# group=get_parallel_group(ctx.xz_parallel_mode))
# return grads, None, None, None, None
class AllGatherLast(torch.autograd.Function):
class all_gather_weight_2p5d(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any,
inputs: Tensor,
tesseract_dim: int,
col_parallel_mode: ParallelMode) -> Tensor:
def forward(ctx: Any, inputs: Tensor, dim: int, tesseract_dim: int, col_parallel_mode: ParallelMode) -> Tensor:
ctx.dim = dim
ctx.tesseract_dim = tesseract_dim
ctx.row_rank = gpc.get_local_rank(col_parallel_mode)
last_dim = tesseract_dim * inputs.size(-1)
outputs_shape = (last_dim,) + inputs.shape[:-1]
outputs = torch.empty(
outputs_shape, dtype=inputs.dtype, device=get_current_device())
dist.all_gather(
list(outputs.chunk(tesseract_dim, dim=0)),
inputs.permute(2, 0, 1).contiguous(),
group=gpc.get_group(col_parallel_mode)
)
outputs = outputs.permute(1, 2, 0).contiguous()
outputs = all_gather(inputs, dim, col_parallel_mode)
return outputs
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
grad = output_grad.chunk(ctx.tesseract_dim, dim=-1)[ctx.row_rank]
return grad.contiguous(), None, None
grad = output_grad.chunk(ctx.tesseract_dim, dim=ctx.dim)[ctx.row_rank]
return grad.contiguous(), None, None, None
class SplitFirst(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any,
inputs: Tensor,
tesseract_dim: int,
col_parallel_mode: ParallelMode) -> Tensor:
def forward(ctx: Any, inputs: Tensor, tesseract_dim: int, col_parallel_mode: ParallelMode) -> Tensor:
ctx.tesseract_dim = tesseract_dim
ctx.batch_size = inputs.size(0)
ctx.para_mode = col_parallel_mode
@@ -560,12 +461,33 @@ class SplitFirst(torch.autograd.Function):
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
grad_shape = (ctx.batch_size,) + output_grad.shape[1:]
grad = torch.empty(
grad_shape, dtype=output_grad.dtype, device=get_current_device())
dist.all_gather(
list(grad.chunk(ctx.tesseract_dim, dim=0)),
grad_shape = (ctx.batch_size, ) + output_grad.shape[1:]
grad = torch.empty(grad_shape, dtype=output_grad.dtype, device=get_current_device())
dist.all_gather(list(grad.chunk(ctx.tesseract_dim, dim=0)),
output_grad.contiguous(),
group=gpc.get_group(ctx.para_mode)
)
group=gpc.get_group(ctx.para_mode))
return grad, None, None
def split_batch_2p5d(input_: Tensor, dim: int = 0) -> Tensor:
return torch.chunk(input_, gpc.get_world_size(ParallelMode.PARALLEL_2P5D_COL),
dim=dim)[gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)].contiguous()
class reduce_by_batch_2p5d(torch.autograd.Function):
"""All-reduce the input from the model parallel region."""
@staticmethod
def symbolic(graph, input_):
dist.all_reduce(input_, group=gpc.get_group(ParallelMode.PARALLEL_2P5D_COL))
return input_
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, input_):
dist.all_reduce(input_, group=gpc.get_group(ParallelMode.PARALLEL_2P5D_COL))
return input_.clone()
@staticmethod
@custom_bwd
def backward(ctx, grad_output):
return grad_output
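
# A minimal single-device sketch (illustration only; names are local to this example):
# how split_batch_2p5d and reduce_by_batch_2p5d pair up. The batch is chunked over the
# column group, each rank computes on its own chunk, and per-rank scalars (a summed
# loss, a correct-prediction count, ...) are recombined with an all-reduce.
import torch

def emulated_split_then_reduce(batch: torch.Tensor, world_size: int) -> torch.Tensor:
    chunks = batch.chunk(world_size, dim=0)          # split_batch_2p5d on each rank
    per_rank = [chunk.sum() for chunk in chunks]     # local statistic per rank
    return torch.stack(per_rank).sum()               # reduce_by_batch_2p5d (all-reduce)

batch_demo = torch.randn(8, 4)
assert torch.allclose(emulated_split_then_reduce(batch_demo, world_size=4), batch_demo.sum())
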
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import math
import torch
from torch import nn as nn, Tensor
from colossalai.nn.layer._common_utils import divide
from colossalai.registry import LAYERS
from ._utils import assert_tesseract_initialization, \
get_tesseract_dim_dep_from_env
from .layers import Linear2p5D, LayerNorm2p5D
from .._common_utils import ACT2FN
from ..base_layer import ParallelLayer
@LAYERS.register_module
class TransformerMLP2p5D(ParallelLayer):
"""
The MLP takes an input of hidden size h, projects it to a hidden dimension of
mlp_ratio * h, applies a nonlinear transformation, and projects back to h.
Dropout is applied at the end.
:param in_features: the size of input tensor
:type in_features: int
:param mlp_ratio: hidden size of MLP divided by embedding dim, defaults to 4.0
:type mlp_ratio: float, optional
:param act_func: activation function, defaults to 'gelu'
:type act_func: str, optional
:param dropout_prob: dropout probability, defaults to 0.
:type dropout_prob: float, optional
:param dtype: dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
"""
def __init__(self,
in_features: int,
mlp_ratio: float = 4.0,
act_func: str = 'gelu',
dropout_prob: float = 0.,
dtype=None,
skip_bias_add: bool = False
):
super().__init__()
assert_tesseract_initialization()
self.tesseract_dim, self.tesseract_dep = get_tesseract_dim_dep_from_env()
self.in_features = in_features
self.skip_bias_add = skip_bias_add
# Project to h * mlp_ratio.
self.dense_1 = Linear2p5D(
in_features,
int(mlp_ratio * in_features),
dtype=dtype,
skip_bias_add=skip_bias_add
)
assert act_func in ACT2FN.keys(), f'Invalid value for argument act_func, ' \
f'activation function can only be {list(ACT2FN.keys())}'
self.activation_func = ACT2FN[act_func]
# Project back to h.
self.dense_2 = Linear2p5D(
int(mlp_ratio * in_features),
in_features,
dtype=dtype,
skip_bias_add=skip_bias_add
)
self.dropout = nn.Dropout(dropout_prob)
self.layernorm = LayerNorm2p5D(in_features, dtype=dtype)
def forward(self, x: Tensor) -> Tensor:
if self.skip_bias_add:
intermediate_output, _ = self.dense_1(x)
else:
intermediate_output = self.dense_1(x)
intermediate_output = self.activation_func(intermediate_output)
if self.skip_bias_add:
output, _ = self.dense_2(intermediate_output)
else:
output = self.dense_2(intermediate_output)
output = self.dropout(output)
output = self.layernorm(x + output)
return output
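
# A minimal single-device sketch (illustration only; an assumption, not code from this
# repo): the serial module TransformerMLP2p5D corresponds to, written with plain torch
# layers and the same post-residual layer norm as the forward above.
import torch
import torch.nn as nn
import torch.nn.functional as F

class SerialTransformerMLP(nn.Module):
    def __init__(self, in_features: int, mlp_ratio: float = 4.0, dropout_prob: float = 0.):
        super().__init__()
        self.dense_1 = nn.Linear(in_features, int(mlp_ratio * in_features))
        self.dense_2 = nn.Linear(int(mlp_ratio * in_features), in_features)
        self.dropout = nn.Dropout(dropout_prob)
        self.layernorm = nn.LayerNorm(in_features)

    def forward(self, x: torch.Tensor) -> torch.Tensor:
        out = self.dense_2(F.gelu(self.dense_1(x)))
        return self.layernorm(x + self.dropout(out))
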
@LAYERS.register_module
class TransformerSelfAttention2p5D(ParallelLayer):
"""Self attention layer for 2.5D parallel Transformer
:param hidden_size: hidden size
:type hidden_size: int
:param num_attention_heads: number of attention heads
:type num_attention_heads: int
:param attention_dropout_prob: dropout probability for attention layer
:type attention_dropout_prob: float
:param hidden_dropout_prob: dropout probability for hidden layer
:type hidden_dropout_prob: float
:param dtype: dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
"""
def __init__(self,
hidden_size: int,
num_attention_heads: int,
attention_dropout_prob: float,
hidden_dropout_prob: float,
dtype=None,
):
super().__init__()
assert_tesseract_initialization()
self.tesseract_dim, self.tesseract_dep = get_tesseract_dim_dep_from_env()
self.hidden_size = hidden_size
self.num_attention_heads = divide(
num_attention_heads, self.tesseract_dim) # *
self.attention_head_size = divide(hidden_size, num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.query_key_value = Linear2p5D(
hidden_size,
3 * hidden_size,
dtype=dtype,
)
self.attention_dropout = nn.Dropout(attention_dropout_prob)
self.dense = Linear2p5D(
hidden_size,
hidden_size,
dtype=dtype,
)
self.dropout = nn.Dropout(hidden_dropout_prob)
self.layernorm = LayerNorm2p5D(
hidden_size,
dtype=dtype)
def forward(self, hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
query_key_value = self.query_key_value(hidden_states)
new_qkv_shape = query_key_value.shape[:-1] + \
(self.num_attention_heads, 3 * self.attention_head_size)
query_key_value = query_key_value.view(new_qkv_shape)
query_key_value = query_key_value.permute((0, 2, 1, 3))
query_layer, key_layer, value_layer = torch.chunk(
query_key_value, 3, dim=-1)
attention_scores = torch.matmul(
query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / \
math.sqrt(self.attention_head_size)
attention_scores = attention_scores + attention_mask
attention_probs = nn.Softmax(dim=-1)(attention_scores)
attention_probs = self.attention_dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.permute((0, 2, 1, 3)).contiguous()
new_context_layer_shape = context_layer.size()[
:-2] + (self.all_head_size,)
context_layer = context_layer.view(*new_context_layer_shape)
output = self.dense(context_layer)
output = self.dropout(output)
attention_output = self.layernorm(hidden_states + output)
return attention_output
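
# A minimal single-device sketch (illustration only; names are local to this example):
# why dividing num_attention_heads by the tensor-parallel dimension is exact. Heads
# attend independently, so running a subset of heads per rank and concatenating the
# per-head contexts along the hidden dimension reproduces full multi-head attention.
import math
import torch

def attention(q, k, v):                                   # [B, heads, S, d] each
    scores = q @ k.transpose(-1, -2) / math.sqrt(q.shape[-1])
    return torch.softmax(scores, dim=-1) @ v

B, H, S, d = 2, 4, 6, 8
q_demo, k_demo, v_demo = (torch.randn(B, H, S, d) for _ in range(3))
full = attention(q_demo, k_demo, v_demo)
per_rank = [attention(q_demo[:, h:h + 2], k_demo[:, h:h + 2], v_demo[:, h:h + 2])
            for h in (0, 2)]                              # 2 heads per emulated rank
assert torch.allclose(torch.cat(per_rank, dim=1), full, atol=1e-6)
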
@LAYERS.register_module
class TransformerLayer2p5D(ParallelLayer):
"""Transformer layer which contains a self-attention layer and a MLP layer
:param hidden_size: hidden size
:type hidden_size: int
:param num_attention_heads: number of attention heads
:type num_attention_heads: int
:param act_func: activation function, defaults to 'gelu'
:type act_func: str, optional
:param mlp_ratio: hidden size of MLP divided by embedding dim, defaults to 4.0
:type mlp_ratio: float, optional
:param attention_dropout_prob: dropout probability for attention layer, defaults to 0.
:type attention_dropout_prob: float, optional
:param hidden_dropout_prob: dropout probability for hidden layers, defaults to 0.
:type hidden_dropout_prob: float, optional
:param dtype: dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
"""
def __init__(self,
hidden_size: int,
num_attention_heads: int,
act_func: str = 'gelu',
mlp_ratio: float = 4.0,
attention_dropout_prob: float = 0.,
hidden_dropout_prob: float = 0.,
dtype=None,
):
super().__init__()
self.attention = TransformerSelfAttention2p5D(
hidden_size=hidden_size,
num_attention_heads=num_attention_heads,
attention_dropout_prob=attention_dropout_prob,
hidden_dropout_prob=hidden_dropout_prob,
dtype=dtype,
)
self.mlp = TransformerMLP2p5D(
in_features=hidden_size,
dropout_prob=hidden_dropout_prob,
act_func=act_func,
mlp_ratio=mlp_ratio,
dtype=dtype,
)
def forward(self, hidden_states: Tensor, attention_mask: Tensor) -> Tensor:
attention_output = self.attention(hidden_states, attention_mask)
output = self.mlp(attention_output)
return output
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import math
import torch
from torch import nn as nn, Tensor, distributed as dist
from torch.nn.init import _calculate_fan_in_and_fan_out
from colossalai.context import seed, ParallelMode
from colossalai.core import global_context as gpc
from colossalai.registry import LAYERS
from colossalai.utils import checkpoint
from colossalai.utils import get_current_device
from ._operation import AllGatherLast, SplitFirst
from ._utils import assert_tesseract_initialization, \
get_tesseract_dim_dep_from_env
from .layers import Linear2p5D
from ..base_layer import ParallelLayer
from ..fused_bias_gelu import bias_gelu_impl
from .._common_utils import (ACT2FN, divide, to_2tuple,
set_tensor_parallel_attribute_by_partition)
@LAYERS.register_module
class ViTMLP2p5D(ParallelLayer):
"""MLP layer for 2.5D parallel Vision Transformer
:param in_features: size of each input sample
:type in_features: int
:param mlp_ratio: hidden size of MLP divided by embedding dim
:type mlp_ratio: int
:param act_func: activation function, defaults to 'gelu'
:type act_func: str, optional
:param dropout_prob: dropout probability, defaults to 0.
:type dropout_prob: float, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param checkpoint: If set to `True`, activation checkpointing is used, defaults to `False`
:type checkpoint: bool, optional
"""
def __init__(self,
in_features: int,
mlp_ratio: int,
act_func: str = 'gelu',
dropout_prob: float = 0.,
dtype=None,
checkpoint: bool = False,
weight_init='torch'
):
super().__init__()
assert_tesseract_initialization()
self.in_features = in_features
self.mlp_ratio = mlp_ratio
self.checkpoint = checkpoint
assert weight_init in ('torch', 'jax')
if act_func == 'fused_gelu':
self.act = bias_gelu_impl
skip_dense_1_add_bias = True
else:
self.act = ACT2FN[act_func]
skip_dense_1_add_bias = False
# Project to mlp_ratio * h.
self.dense_1 = Linear2p5D(
self.in_features,
self.mlp_ratio * self.in_features,
dtype=dtype,
init_weight=weight_init,
init_bias=weight_init,
skip_bias_add=skip_dense_1_add_bias
)
# Project back to h.
self.dense_2 = Linear2p5D(
self.mlp_ratio * self.in_features,
self.in_features,
dtype=dtype,
init_weight=weight_init,
init_bias=weight_init
)
self.dropout = nn.Dropout(dropout_prob)
def _forward(self, hidden_states: Tensor) -> Tensor:
if self.act == bias_gelu_impl:
intermediate_output, bias = self.dense_1(hidden_states)
intermediate_output = self.act(intermediate_output, bias)
else:
intermediate_output = self.dense_1(hidden_states)
intermediate_output = self.act(intermediate_output)
with seed(ParallelMode.TENSOR):
intermediate_output = self.dropout(intermediate_output)
output = self.dense_2(intermediate_output)
with seed(ParallelMode.TENSOR):
output = self.dropout(output)
return output
def _checkpoint_forward(self, hidden_states: Tensor) -> Tensor:
return checkpoint(self._forward, hidden_states)
def forward(self, hidden_states: Tensor) -> Tensor:
if self.checkpoint:
return self._checkpoint_forward(hidden_states)
else:
return self._forward(hidden_states)
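
# A minimal single-device sketch (illustration only; an assumption about the fused
# path, not the fused kernel itself): with skip_bias_add the linear layer returns its
# output and bias separately so a bias+GELU fusion can consume them in one pass; the
# result must match adding the bias inside the layer and applying GELU afterwards.
import torch
import torch.nn.functional as F

linear = torch.nn.Linear(16, 32)
x_demo = torch.randn(4, 16)
plain = F.gelu(linear(x_demo))                                        # bias added inside the layer
skipped = F.gelu(F.linear(x_demo, linear.weight) + linear.bias)       # bias returned separately, fused later
assert torch.allclose(plain, skipped, atol=1e-6)
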
@LAYERS.register_module
class ViTSelfAttention2p5D(ParallelLayer):
"""Self-attention layer for 2.5D parallel Vision Transformer
:param hidden_size: hidden size
:type hidden_size: int
:param num_attention_heads: number of attention heads
:type num_attention_heads: int
:param attention_dropout_prob: dropout probability for attention layers
:type attention_dropout_prob: float
:param hidden_dropout_prob: dropout probability for hidden layers
:type hidden_dropout_prob: float
:param dtype: dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param checkpoint: If set to `True`, activation checkpointing is used, defaults to `False`
:type checkpoint: bool, optional
"""
def __init__(self,
hidden_size,
num_attention_heads,
attention_dropout_prob,
hidden_dropout_prob,
dtype=None,
checkpoint: bool = False,
weight_init='torch'
):
super().__init__()
assert_tesseract_initialization()
self.tesseract_dim, self.tesseract_dep = get_tesseract_dim_dep_from_env()
self.hidden_size = hidden_size
self.num_attention_heads = divide(
num_attention_heads, self.tesseract_dim) # *
self.attention_head_size = divide(hidden_size, num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.checkpoint = checkpoint
assert weight_init in ('torch', 'jax')
if weight_init == 'jax':
self.init_bias = 'zero'
else:
self.init_bias = weight_init
self.query_key_value = Linear2p5D(
hidden_size,
3 * hidden_size,
dtype=dtype,
init_weight=weight_init,
init_bias=self.init_bias
)
self.attention_dropout = nn.Dropout(attention_dropout_prob)
self.dense = Linear2p5D(
hidden_size,
hidden_size,
dtype=dtype,
init_weight=weight_init,
init_bias=self.init_bias
)
self.dropout = nn.Dropout(hidden_dropout_prob)
self.softmax = nn.Softmax(dim=-1)
def _forward(self, hidden_states: Tensor) -> Tensor:
query_key_value = self.query_key_value(hidden_states)
new_qkv_shape = query_key_value.shape[:-1] + \
(self.num_attention_heads, 3 * self.attention_head_size)
query_key_value = query_key_value.view(new_qkv_shape)
query_key_value = query_key_value.permute((0, 2, 1, 3))
query_layer, key_layer, value_layer = torch.chunk(
query_key_value, 3, dim=-1)
attention_scores = torch.matmul(
query_layer, key_layer.transpose(-1, -2))
attention_scores = attention_scores / \
math.sqrt(self.attention_head_size)
attention_probs = self.softmax(attention_scores)
with seed(ParallelMode.TENSOR):
attention_probs = self.attention_dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.transpose(1, 2)
new_context_layer_shape = context_layer.size()[
:-2] + (self.all_head_size,)
context_layer = context_layer.reshape(new_context_layer_shape)
output = self.dense(context_layer)
with seed(ParallelMode.TENSOR):
output = self.dropout(output)
return output
def _checkpoint_forward(self, hidden_states: Tensor) -> Tensor:
return checkpoint(self._forward, hidden_states)
def forward(self, hidden_states: Tensor) -> Tensor:
if self.checkpoint:
return self._checkpoint_forward(hidden_states)
else:
return self._forward(hidden_states)
@LAYERS.register_module
class ViTHead2p5D(ParallelLayer):
"""Output layer for 2.5D parallel Vision Transformer
:param hidden_size: hidden size
:type hidden_size: int
:param num_classes: number of classes
:type num_classes: int
:param dtype: dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
"""
def __init__(self,
hidden_size,
num_classes,
dtype=None,
weight_init='torch'
):
super().__init__()
assert_tesseract_initialization()
assert weight_init in ('torch', 'jax')
if weight_init == 'jax':
self.init_weight = 'zero'
self.init_bias = 'zero'
else:
self.init_weight = weight_init
self.init_bias = weight_init
self.linear = Linear2p5D(
hidden_size,
num_classes,
dtype=dtype,
init_weight=self.init_weight,
init_bias=self.init_bias
)
def forward(self, x: Tensor) -> Tensor:
x = x[:, 0]
x = self.linear(x)
return x
@LAYERS.register_module
class ViTPatchEmbedding2p5D(ParallelLayer):
""" 2.5D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param embed_dim: dimension of embedding
:type embed_dim: int
:param in_chans: number of channels of input image, defaults to 3
:type in_chans: int, optional
:param flatten: whether to flatten output tensor, defaults to True
:type flatten: bool, optional
"""
def __init__(self,
img_size,
patch_size,
embed_dim,
in_chans=3,
flatten=True,
weight_init='torch'):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
assert_tesseract_initialization()
self.tesseract_dim, self.tesseract_dep = get_tesseract_dim_dep_from_env()
self.img_size = img_size
self.patch_size = patch_size
self.grid_size = (img_size[0] // patch_size[0],
img_size[1] // patch_size[1])
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.flatten = flatten
self.embed_dim = embed_dim // (self.tesseract_dep * self.tesseract_dim ** 2) # *
with seed(ParallelMode.TENSOR):
self.proj = nn.Conv2d(in_chans,
self.embed_dim,
kernel_size=patch_size,
stride=patch_size,
device=get_current_device()
)
self._set_tensor_parallel_attribute()
if weight_init == 'jax':
with seed(ParallelMode.TENSOR):
fan_in, _ = _calculate_fan_in_and_fan_out(self.proj.weight)
std = math.sqrt(1.0 / fan_in)
nn.init.trunc_normal_(self.proj.weight, std=std / .87962566103423978)
nn.init.zeros_(self.proj.bias)
def _set_tensor_parallel_attribute(self):
num_partition = gpc.get_world_size(ParallelMode.TENSOR)
set_tensor_parallel_attribute_by_partition(self.proj.weight, num_partition)
set_tensor_parallel_attribute_by_partition(self.proj.bias, num_partition)
def forward(self, x: Tensor) -> Tensor:
B, C, H, W = x.shape
assert H == self.img_size[0] and W == self.img_size[1], \
f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
x = self.proj(x)
if self.flatten:
x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
return x
@LAYERS.register_module
class ViTInputSplitter2p5D(ParallelLayer):
"""Split the input tensor for 2D parallel Vision Transformer
"""
def __init__(self):
super().__init__()
assert_tesseract_initialization()
self.tesseract_dim, _ = get_tesseract_dim_dep_from_env()
def forward(self, x: Tensor) -> Tensor:
x = AllGatherLast.apply(
x, self.tesseract_dim, ParallelMode.PARALLEL_2P5D_COL)
x = SplitFirst.apply(
x, self.tesseract_dim, ParallelMode.PARALLEL_2P5D_COL)
return x
@LAYERS.register_module
class ViTTokenFuser2p5D(ParallelLayer):
"""
Fuse cls token and pos embedding to the input
:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param embed_dim: dimension of embedding
:type embed_dim: int
:param drop_rate: dropout probability, defaults to 0.
:type drop_rate: float, optional
"""
def __init__(self,
img_size,
patch_size,
embed_dim,
drop_rate=0.
):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
assert_tesseract_initialization()
self.tesseract_dim, self.tesseract_dep = get_tesseract_dim_dep_from_env()
self.img_size = img_size
self.patch_size = patch_size
self.grid_size = (img_size[0] // patch_size[0],
img_size[1] // patch_size[1])
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.embed_dim = embed_dim
self.cls_token = nn.Parameter(torch.zeros(
(1, 1, self.embed_dim // (self.tesseract_dep * self.tesseract_dim ** 2)),
device=get_current_device()))
self.pos_embed = nn.Parameter(torch.empty(
(1, self.num_patches + 1, self.embed_dim // (self.tesseract_dep * self.tesseract_dim ** 2)),
device=get_current_device()))
with seed(ParallelMode.TENSOR):
nn.init.trunc_normal_(self.pos_embed, std=.02)
self.pos_drop = nn.Dropout(p=drop_rate)
self._set_tensor_parallel_attribute()
def _set_tensor_parallel_attribute(self):
num_partition = gpc.get_world_size(ParallelMode.TENSOR)
set_tensor_parallel_attribute_by_partition(self.cls_token, num_partition)
set_tensor_parallel_attribute_by_partition(self.pos_embed, num_partition)
def _broadcast_params(self, param) -> None:
" broadcast to all column ranks for data consistency "
if self.tesseract_dep > 1:
xz_rank = gpc.get_ranks_in_group(ParallelMode.PARALLEL_2P5D_XZ)
xz_group = gpc.get_group(ParallelMode.PARALLEL_2P5D_XZ)
dist.broadcast(param, src=xz_rank[0],
group=xz_group)
def _sync_grad_hook(self, grad) -> None:
dist.all_reduce(grad, group=gpc.get_group(
ParallelMode.PARALLEL_2P5D_XZ))
grad = grad / self.tesseract_dim # / self.tesseract_dep # *
return grad
def forward(self, x: Tensor) -> Tensor:
# stole cls_tokens impl from Phil Wang, thanks
cls_token = AllGatherLast.apply(
self.cls_token, self.tesseract_dim, ParallelMode.PARALLEL_2P5D_COL)
cls_token = cls_token.expand(x.shape[0], -1, -1)
x = torch.cat((cls_token, x), dim=1)
pos_embed = AllGatherLast.apply(
self.pos_embed, self.tesseract_dim, ParallelMode.PARALLEL_2P5D_COL)
x = x + pos_embed
with seed(ParallelMode.TENSOR):
x = self.pos_drop(x)
return x
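
# A minimal single-device sketch (illustration only; names are local to this example):
# the shape bookkeeping of the token fuser. A learnable cls token is prepended to the
# N patch tokens and a position embedding of length N + 1 is broadcast-added over the
# batch; in the parallel layer both are gathered along the column group first.
import torch

B, N, E = 2, 196, 192
patch_tokens = torch.randn(B, N, E)
cls_tok = torch.zeros(1, 1, E).expand(B, -1, -1)
pos = torch.randn(1, N + 1, E)
fused = torch.cat((cls_tok, patch_tokens), dim=1) + pos
assert fused.shape == (B, N + 1, E)
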
import math
from typing import Callable
import torch
from torch import Tensor
from torch.nn import Parameter, init as init
from colossalai.context import seed, ParallelMode
import torch.nn as nn
import torch.nn.functional as F
from colossalai.communication import broadcast
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.nn import init as init
from colossalai.registry import LAYERS
from colossalai.utils import get_current_device
from ._operation import Matmul_AB_2p5D, Add_Bias_2p5D, _LayerNorm_2p5D
from ._utils import get_tesseract_dim_dep_from_env, assert_tesseract_initialization
from .._common_utils import divide, set_tensor_parallel_attribute_by_partition
from torch import Tensor, dtype
from torch.nn import Parameter
from .._common_utils import (divide, set_tensor_parallel_attribute_by_partition, to_2tuple)
from ..base_layer import ParallelLayer
from ._operation import (Add_Bias_2p5D, Matmul_AB_2p5D, all_gather_weight_2p5d, classifier_2p5d, layernorm_2p5d,
split_batch_2p5d)
from ._utils import (assert_tesseract_initialization, get_tesseract_dim_dep_from_env)
@LAYERS.register_module
......@@ -27,16 +33,14 @@ class Linear2p5D(ParallelLayer):
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
"""
def __init__(self,
in_features: int,
out_features: int,
bias: bool = True,
dtype=None,
dtype: dtype = None,
skip_bias_add: bool = False,
init_weight='torch',
init_bias='torch'
):
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
super().__init__()
self.in_features = in_features
......@@ -52,76 +56,48 @@ class Linear2p5D(ParallelLayer):
# partitioning dimension
self.input_size_per_partition = divide(in_features, self.tesseract_dim)
self.hidden_size_per_partition = divide(
out_features, self.tesseract_dim)
self.hidden_size_per_partition = divide(out_features, self.tesseract_dim)
# create weight, shape: [k/q, h/q]
factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
self.weight = Parameter(torch.empty(
self.input_size_per_partition,
self.hidden_size_per_partition,
**factory_kwargs))
self.weight = Parameter(
torch.empty(self.input_size_per_partition, self.hidden_size_per_partition, **factory_kwargs))
# create bias, shape: [h/q]
if bias:
self.bias = Parameter(torch.empty(
self.hidden_size_per_partition,
**factory_kwargs))
self.bias = Parameter(torch.empty(self.hidden_size_per_partition, **factory_kwargs))
else:
self.register_parameter('bias', None)
# initialize parameters
with seed(ParallelMode.TENSOR):
self.reset_parameters(init_weight, init_bias)
self.reset_parameters(weight_initializer, bias_initializer)
self._set_tensor_parallel_attributes()
def _set_tensor_parallel_attributes(self):
num_partition = gpc.get_world_size(ParallelMode.TENSOR)
set_tensor_parallel_attribute_by_partition(self.weight, num_partition)
set_tensor_parallel_attribute_by_partition(self.weight, self.tesseract_dim**2)
if self.bias is not None:
set_tensor_parallel_attribute_by_partition(self.bias, num_partition)
set_tensor_parallel_attribute_by_partition(self.bias, self.tesseract_dim)
def reset_parameters(self, init_weight, init_bias) -> None:
assert init_weight in ('torch', 'jax', 'zero')
assert init_bias in ('torch', 'jax', 'zero')
# setting
def reset_parameters(self, weight_initializer, bias_initializer) -> None:
fan_in, fan_out = self.in_features, self.out_features
# init weight
if init_weight == 'torch':
a = math.sqrt(5)
nonlinearity = 'leaky_relu'
std = init.calculate_gain(nonlinearity, a) / math.sqrt(fan_in)
bound = math.sqrt(3.0) * std
init.uniform_(self.weight, -bound, bound)
elif init_weight == 'jax':
std = math.sqrt(2.0 / float(fan_in + fan_out))
a = math.sqrt(3.0) * std
init.uniform_(self.weight, -a, a)
elif init_weight == 'zero':
init.zeros_(self.weight)
# init bias
weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
if self.bias is not None:
if init_bias == 'torch':
bound = 1 / math.sqrt(fan_in) if fan_in > 0 else 0
init.uniform_(self.bias, -bound, bound)
elif init_bias == 'jax':
init.normal_(self.bias, std=1e-6)
elif init_bias == 'zero':
init.zeros_(self.bias)
bias_initializer(self.bias, fan_in=fan_in)
def forward(self, x: Tensor) -> Tensor:
# input: [m/dq, n/q, k/q]
# output: [m/dq, n/q, h/q]
out_shape = x.shape[:-1] + (self.hidden_size_per_partition,)
out_shape = x.shape[:-1] + (self.hidden_size_per_partition, )
output = Matmul_AB_2p5D.apply(
x,
self.weight,
self.tesseract_dim,
out_shape,
self.row_rank, self.col_rank, self.dep_rank,
self.row_rank,
self.col_rank,
self.dep_rank,
ParallelMode.PARALLEL_2P5D_ROW,
ParallelMode.PARALLEL_2P5D_COL,
self.data_parallel_rank,
......@@ -132,34 +108,17 @@ class Linear2p5D(ParallelLayer):
if self.bias is not None:
if self.skip_bias_add:
bias = Add_Bias_2p5D.apply(
None,
self.bias,
self.hidden_size_per_partition,
self.tesseract_dim,
self.row_rank, self.col_rank, self.dep_rank,
ParallelMode.PARALLEL_2P5D_COL,
True,
self.data_parallel_rank,
self.pipeline_parallel_rank,
self.pipeline_parallel_size,
self.tensor_parallel_size
)
bias = Add_Bias_2p5D.apply(None, self.bias, self.hidden_size_per_partition, self.tesseract_dim,
self.row_rank, self.col_rank, self.dep_rank, ParallelMode.PARALLEL_2P5D_COL,
True, self.data_parallel_rank, self.pipeline_parallel_rank,
self.pipeline_parallel_size, self.tensor_parallel_size)
return output, bias
else:
output = Add_Bias_2p5D.apply(
output,
self.bias,
self.hidden_size_per_partition,
self.tesseract_dim,
output = Add_Bias_2p5D.apply(output, self.bias, self.hidden_size_per_partition, self.tesseract_dim,
self.row_rank, self.col_rank, self.dep_rank,
ParallelMode.PARALLEL_2P5D_COL,
False,
self.data_parallel_rank,
self.pipeline_parallel_rank,
self.pipeline_parallel_size,
self.tensor_parallel_size
)
ParallelMode.PARALLEL_2P5D_COL, False, self.data_parallel_rank,
self.pipeline_parallel_rank, self.pipeline_parallel_size,
self.tensor_parallel_size)
return output
else:
return output
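# Hedged illustration (not part of the original diff): a single-device reference for
# what Linear2p5D computes once the SUMMA partitions are combined. The helper name
# `linear_reference` is made up for the example; `skip_bias_add` mirrors the
# (output, bias) return above that is kept for later kernel fusion.
import torch

def linear_reference(x, weight, bias=None, skip_bias_add=False):
    # weight is laid out as [in_features, out_features] in this layer family
    output = torch.matmul(x, weight)
    if bias is None:
        return output
    return (output, bias) if skip_bias_add else output + bias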
......@@ -179,12 +138,7 @@ class LayerNorm2p5D(ParallelLayer):
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
"""
def __init__(self,
normalized_shape: int,
eps: float = 1e-05,
dtype=None
):
def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None):
super().__init__()
# layer norm config
......@@ -199,66 +153,251 @@ class LayerNorm2p5D(ParallelLayer):
self.tesseract_dim, _ = get_tesseract_dim_dep_from_env()
# partitioning dimension
self.partitioned_partition = divide(
normalized_shape, self.tesseract_dim) # *
self.partitioned_partition = divide(normalized_shape, self.tesseract_dim) # *
# create parameters
factory_kwargs = {'device': get_current_device(), 'dtype': dtype}
self.gamma = Parameter(torch.ones(
self.partitioned_partition,
**factory_kwargs))
self.beta = Parameter(torch.zeros(
self.partitioned_partition,
**factory_kwargs))
self.gamma = Parameter(torch.ones(self.partitioned_partition, **factory_kwargs))
self.beta = Parameter(torch.zeros(self.partitioned_partition, **factory_kwargs))
self._set_tensor_parallel_attribute()
def _set_tensor_parallel_attribute(self):
num_partition = gpc.get_world_size(ParallelMode.TENSOR)
set_tensor_parallel_attribute_by_partition(self.gamma, num_partition)
set_tensor_parallel_attribute_by_partition(self.beta, num_partition)
set_tensor_parallel_attribute_by_partition(self.gamma, self.tesseract_dim)
set_tensor_parallel_attribute_by_partition(self.beta, self.tesseract_dim)
def forward(self, x: Tensor) -> Tensor:
with torch.no_grad():
E_x = torch.sum(x, dim=-1, keepdim=True) # [b/q, s, 1]
torch.distributed.all_reduce(
E_x, group=gpc.get_group(ParallelMode.PARALLEL_2P5D_ROW))
torch.distributed.all_reduce(E_x, group=gpc.get_group(ParallelMode.PARALLEL_2P5D_ROW))
E_x /= self.normalized_shape
# Var_x in the block below is the sum of input^2
Var_x = torch.sum(x * x, dim=-1, keepdim=True) # [b/q, s, 1]
torch.distributed.all_reduce(
Var_x, group=gpc.get_group(ParallelMode.PARALLEL_2P5D_ROW))
torch.distributed.all_reduce(Var_x, group=gpc.get_group(ParallelMode.PARALLEL_2P5D_ROW))
Var_x /= self.normalized_shape
Var_x = Var_x - E_x * E_x # variance of x [b/q, s, 1]
# this time 1/sqrt(Var_x + epsilon)
Var_x = 1.0 / torch.sqrt(Var_x + self.variance_epsilon)
output = _LayerNorm_2p5D.apply(x, E_x, Var_x, self.normalized_shape,
ParallelMode.PARALLEL_2P5D_ROW)
bias = Add_Bias_2p5D.apply(
None, self.beta, self.partitioned_partition,
self.tesseract_dim,
self.row_rank, self.col_rank, self.dep_rank,
ParallelMode.PARALLEL_2P5D_COL,
True,
self.data_parallel_rank,
self.pipeline_parallel_rank,
self.pipeline_parallel_size,
self.tensor_parallel_size
)
scale = Add_Bias_2p5D.apply(
None, self.gamma, self.partitioned_partition,
self.tesseract_dim,
self.row_rank, self.col_rank, self.dep_rank,
ParallelMode.PARALLEL_2P5D_COL,
True,
self.data_parallel_rank,
self.pipeline_parallel_rank,
self.pipeline_parallel_size,
self.tensor_parallel_size
)
output = layernorm_2p5d.apply(x, E_x, Var_x, self.normalized_shape, ParallelMode.PARALLEL_2P5D_ROW)
bias = Add_Bias_2p5D.apply(None, self.beta, self.partitioned_partition, self.tesseract_dim, self.row_rank,
self.col_rank, self.dep_rank, ParallelMode.PARALLEL_2P5D_COL, True,
self.data_parallel_rank, self.pipeline_parallel_rank, self.pipeline_parallel_size,
self.tensor_parallel_size)
scale = Add_Bias_2p5D.apply(None, self.gamma, self.partitioned_partition, self.tesseract_dim, self.row_rank,
self.col_rank, self.dep_rank, ParallelMode.PARALLEL_2P5D_COL, True,
self.data_parallel_rank, self.pipeline_parallel_rank, self.pipeline_parallel_size,
self.tensor_parallel_size)
output = torch.addcmul(bias, scale, output)
return output
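# Hedged illustration (not part of the original diff): the statistics above are plain
# layernorm statistics built from row-wise partial sums (Var[x] = E[x^2] - E[x]^2),
# then recombined with addcmul. A single-device sketch of the same algebra, with
# assumed sizes:
import torch

x = torch.randn(2, 5, 8)                                  # [batch, seq, hidden]
E_x = x.sum(dim=-1, keepdim=True) / x.shape[-1]           # E[x]
Var_x = (x * x).sum(dim=-1, keepdim=True) / x.shape[-1] - E_x * E_x
inv_std = 1.0 / torch.sqrt(Var_x + 1e-5)
gamma, beta = torch.ones(8), torch.zeros(8)
out = torch.addcmul(beta, gamma, (x - E_x) * inv_std)     # beta + gamma * normalized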
@LAYERS.register_module
class PatchEmbedding2p5D(ParallelLayer):
""" 2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param in_chans: number of channels of input image
:type in_chans: int
:param embed_size: dimension of embedding
:type embed_size: int
:param dtype: dtype of parameters, defaults to None
:type dtype: dtype, optional
:param flatten: whether to flatten output tensor, defaults to True
:type flatten: bool, optional
"""
def __init__(self,
img_size: int,
patch_size: int,
in_chans: int,
embed_size: int,
dtype: dtype = None,
flatten: bool = True,
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
position_embed_initializer: Callable = init.zeros_()):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
assert_tesseract_initialization()
self.tesseract_dim, self.tesseract_dep = get_tesseract_dim_dep_from_env()
self.img_size = img_size
self.patch_size = patch_size
self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.flatten = flatten
self.embed_size = embed_size
self.embed_size_per_partition = embed_size // (self.tesseract_dep * self.tesseract_dim**2)
with seed(ParallelMode.TENSOR):
self.weight = Parameter(
torch.empty((self.embed_size_per_partition, in_chans, *self.patch_size),
device=get_current_device(),
dtype=dtype))
self.bias = Parameter(torch.empty(self.embed_size_per_partition, device=get_current_device(), dtype=dtype))
self.cls_token = Parameter(
torch.zeros((1, 1, self.embed_size_per_partition), device=get_current_device(), dtype=dtype))
self.pos_embed = Parameter(
torch.zeros((1, self.num_patches + 1, self.embed_size_per_partition),
device=get_current_device(),
dtype=dtype))
self.reset_parameters(weight_initializer, bias_initializer, position_embed_initializer)
self._set_tensor_parallel_attribute()
def _set_tensor_parallel_attribute(self):
set_tensor_parallel_attribute_by_partition(self.weight, self.tesseract_dep * self.tesseract_dim**2)
set_tensor_parallel_attribute_by_partition(self.bias, self.tesseract_dep * self.tesseract_dim**2)
set_tensor_parallel_attribute_by_partition(self.cls_token, self.tesseract_dep * self.tesseract_dim**2)
set_tensor_parallel_attribute_by_partition(self.pos_embed, self.tesseract_dep * self.tesseract_dim**2)
def reset_parameters(self, weight_initializer, bias_initializer, position_embed_initializer):
with seed(ParallelMode.TENSOR):
fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
fan_out = self.embed_size
weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
bias_initializer(self.bias, fan_in=fan_in)
position_embed_initializer(self.pos_embed)
def forward(self, input_: Tensor) -> Tensor:
B, C, H, W = input_.shape
assert H == self.img_size[0] and W == self.img_size[1], \
f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
input_ = split_batch_2p5d(input_)
weight = all_gather_weight_2p5d.apply(self.weight, 0, self.tesseract_dim, ParallelMode.PARALLEL_2P5D_COL)
bias = all_gather_weight_2p5d.apply(self.bias, 0, self.tesseract_dim, ParallelMode.PARALLEL_2P5D_COL)
output = F.conv2d(input_, weight, bias, stride=self.patch_size)
if self.flatten:
output = output.flatten(2).transpose(1, 2) # BCHW -> BNC
cls_token = all_gather_weight_2p5d.apply(self.cls_token, -1, self.tesseract_dim, ParallelMode.PARALLEL_2P5D_COL)
pos_embed = all_gather_weight_2p5d.apply(self.pos_embed, -1, self.tesseract_dim, ParallelMode.PARALLEL_2P5D_COL)
cls_token = cls_token.expand(output.shape[0], -1, -1)
output = torch.cat((cls_token, output), dim=1)
output = output + pos_embed
return output
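# Hedged illustration (not part of the original diff): the single-device patchify that
# the forward above performs with the gathered weight and bias; all sizes are assumed.
import torch
import torch.nn.functional as F

img = torch.randn(2, 3, 224, 224)        # [B, C, H, W]
w = torch.randn(768, 3, 16, 16)          # [embed_size, in_chans, *patch_size]
b = torch.zeros(768)
out = F.conv2d(img, w, b, stride=16)     # [B, 768, 14, 14]
out = out.flatten(2).transpose(1, 2)     # BCHW -> BNC: [B, 196, 768]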
@LAYERS.register_module
class Embedding2p5D(ParallelLayer):
def __init__(self,
num_embeddings: int,
embedding_dim: int,
padding_idx: int = None,
dtype: dtype = None,
weight_initializer: Callable = init.normal_(),
*args,
**kwargs):
super().__init__()
assert_tesseract_initialization()
self.tesseract_dim, self.tesseract_dep = get_tesseract_dim_dep_from_env()
self.num_embeddings = num_embeddings
self.embed_dim = embedding_dim
embed_dim_per_partition = embedding_dim // (self.tesseract_dep * self.tesseract_dim**2)
self.padding_idx = padding_idx
self.embed_args = args
self.embed_kwargs = kwargs
self.weight = Parameter(
torch.empty((num_embeddings, embed_dim_per_partition), device=get_current_device(), dtype=dtype))
self.reset_parameters(weight_initializer)
self._set_tensor_parallel_attributes()
def _set_tensor_parallel_attributes(self):
set_tensor_parallel_attribute_by_partition(self.weight, self.tesseract_dep * self.tesseract_dim**2)
def reset_parameters(self, weight_initializer) -> None:
with seed(ParallelMode.TENSOR):
fan_in, fan_out = self.num_embeddings, self.embed_dim
weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
self._fill_padding_idx_with_zero()
def _fill_padding_idx_with_zero(self) -> None:
if self.padding_idx is not None:
with torch.no_grad():
self.weight[self.padding_idx].fill_(0)
def forward(self, input_: Tensor) -> Tensor:
input_ = split_batch_2p5d(input_)
weight = all_gather_weight_2p5d.apply(self.weight, -1, self.tesseract_dim, ParallelMode.PARALLEL_2P5D_COL)
output = F.embedding(input_, weight, self.padding_idx, *self.embed_args, **self.embed_kwargs)
return output
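# Hedged illustration (not part of the original diff): a single-device reference for
# the embedding lookup above once the column-partitioned weight has been gathered;
# vocabulary size, dimension and padding index are assumptions for the example.
import torch
import torch.nn.functional as F

vocab, dim, pad = 100, 32, 0
weight = torch.randn(vocab, dim)
weight[pad].zero_()                                # mirrors _fill_padding_idx_with_zero
ids = torch.tensor([[1, 2, pad], [4, 5, 6]])
emb = F.embedding(ids, weight, padding_idx=pad)    # [2, 3, 32]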
@LAYERS.register_module
class Classifier2p5D(ParallelLayer):
def __init__(self,
in_features: int,
num_classes: int,
weight: Parameter = None,
bias: bool = True,
dtype: dtype = None,
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
super().__init__()
self.in_features = in_features
self.num_classes = num_classes
assert_tesseract_initialization()
self.row_rank = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_COL)
self.col_rank = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW)
self.dep_rank = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_DEP)
self.tesseract_dim, self.tesseract_dep = get_tesseract_dim_dep_from_env()
# partitioning dimension
self.input_size_per_partition = divide(self.in_features, self.tesseract_dep * self.tesseract_dim**2)
if weight is not None:
self.weight = weight
self.has_weight = False
else:
self.weight = Parameter(
torch.empty(self.num_classes, self.input_size_per_partition, device=get_current_device(), dtype=dtype))
self.has_weight = True
if bias:
self.bias = Parameter(torch.zeros(self.num_classes, device=get_current_device(), dtype=dtype))
else:
self.bias = None
self.reset_parameters(weight_initializer, bias_initializer)
self._set_tensor_parallel_attributes()
def _set_tensor_parallel_attributes(self):
if self.has_weight:
set_tensor_parallel_attribute_by_partition(self.weight, self.tesseract_dep * self.tesseract_dim**2)
def reset_parameters(self, weight_initializer, bias_initializer) -> None:
with seed(ParallelMode.TENSOR):
fan_in, fan_out = self.in_features, self.num_classes
col_src_rank = gpc.get_ranks_in_group(ParallelMode.PARALLEL_2P5D_COL)[0]
row_src_rank = gpc.get_ranks_in_group(ParallelMode.PARALLEL_2P5D_ROW)[0]
if self.has_weight:
weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
if self.bias is not None:
bias_initializer(self.bias, fan_in=fan_in)
broadcast(self.bias, col_src_rank, ParallelMode.PARALLEL_2P5D_COL)
broadcast(self.bias, row_src_rank, ParallelMode.PARALLEL_2P5D_ROW)
def forward(self, input_: Tensor) -> Tensor:
out_shape = input_.shape[:-1] + (self.num_classes, )
return classifier_2p5d.apply(input_, self.weight, self.bias, self.tesseract_dim, out_shape, self.row_rank,
self.col_rank, ParallelMode.PARALLEL_2P5D_ROW, ParallelMode.PARALLEL_2P5D_COL,
self.data_parallel_rank, self.pipeline_parallel_rank, self.pipeline_parallel_size,
self.tensor_parallel_size)
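# Hedged illustration (not part of the original diff): a single-device reference for
# the classifier head above; classifier_2p5d sums the per-partition partial products
# so that the result matches x @ W^T + b. Sizes below are assumptions.
import torch

x = torch.randn(4, 1024)                 # [B, in_features]
W = torch.randn(1000, 1024)              # [num_classes, in_features]
b = torch.zeros(1000)
logits = torch.matmul(x, W.t()) + b      # [B, num_classes]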
from ._operation import Matmul_ABT_3D, Matmul_ATB_3D, Matmul_AB_3D, Mul_3D, Sum_3D, Add_3D, Reduce_3D
from ._vit import ViTHead3D, ViTMLP3D, ViTPatchEmbedding3D, ViTSelfAttention3D
from .layers import Linear3D, LayerNorm3D
from ._operation import reduce_by_batch_3d, split_batch_3d
from .layers import Classifier3D, Embedding3D, LayerNorm3D, Linear3D, PatchEmbedding3D
__all__ = [
'Matmul_ABT_3D', 'Matmul_ATB_3D', 'Matmul_AB_3D', 'Mul_3D', 'Sum_3D', 'Add_3D', 'Reduce_3D',
'ViTHead3D', 'ViTMLP3D', 'ViTPatchEmbedding3D', 'ViTSelfAttention3D',
'Linear3D', 'LayerNorm3D'
'reduce_by_batch_3d', 'split_batch_3d', 'Linear3D', 'LayerNorm3D', 'PatchEmbedding3D', 'Classifier3D', 'Embedding3D'
]
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
from typing import Any, Optional, Tuple
from typing import Optional, Tuple
import torch
import torch.distributed as dist
from colossalai.communication import all_gather, all_reduce, reduce_scatter
from colossalai.communication import all_gather, all_reduce, reduce_scatter, broadcast, reduce
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from torch import Tensor
......@@ -15,7 +14,7 @@ from torch.cuda.amp import custom_bwd, custom_fwd
class linear_3d(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any,
def forward(ctx,
input_: Tensor,
weight: Tensor,
bias: Optional[Tensor],
......@@ -25,33 +24,16 @@ class linear_3d(torch.autograd.Function):
input_dim: int = 0,
weight_dim: int = -1,
output_dim: int = 0) -> Tensor:
assert input_.shape[-1] == weight.shape[0], \
'Invalid shapes: input = {}, weight = {}.'.format(input_.shape, weight.shape)
ctx.use_bias = bias is not None
input_ = all_gather(input_, input_dim, input_parallel_mode)
input_ = torch.cat(input_, dim=input_dim)
# weight = all_gather(weight, weight_dim, weight_parallel_mode)
ctx.save_for_backward(input_, weight)
output = torch.matmul(input_, weight)
output = reduce_scatter(output, output_dim, output_parallel_mode)
if bias is not None:
# ranks_in_group = gpc.get_ranks_in_group(output_parallel_mode)
# src_rank = ranks_in_group[gpc.get_local_rank(input_parallel_mode)]
# dist.broadcast(bias,
# src=src_rank,
# group=gpc.get_group(output_parallel_mode))
# bias = all_gather(bias, -1, weight_parallel_mode)
output += bias
# ctx.src_rank = src_rank
# ctx.save_for_backward(input_, weight)
# output = torch.matmul(input_, weight)
# dist.all_reduce(output, group=gpc.get_group(output_parallel_mode))
# output += bias
ctx.input_parallel_mode = input_parallel_mode
ctx.weight_parallel_mode = weight_parallel_mode
......@@ -63,115 +45,105 @@ class linear_3d(torch.autograd.Function):
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
def backward(ctx, output_grad: Tensor) -> Tuple[Tensor, ...]:
input_, weight = ctx.saved_tensors
with torch.no_grad():
# input_grad = torch.matmul(output_grad, weight.transpose(0, 1))
# dist.all_reduce(input_grad,
# group=gpc.get_group(ctx.input_parallel_mode))
# weight_grad = torch.matmul(
# input_.reshape(-1, input_.shape[-1]).transpose(0, 1),
# output_grad.reshape(-1, output_grad.shape[-1]))
# dist.all_reduce(weight_grad,
# group=gpc.get_group(ctx.weight_parallel_mode))
# bias_grad = torch.sum(output_grad,
# dim=tuple(
# range(len(output_grad.shape))[:-1]))
# bias_grad = reduce_scatter(bias_grad, -1,
# ctx.weight_parallel_mode)
# dist.reduce(bias_grad,
# dst=ctx.src_rank,
# group=gpc.get_group(ctx.output_parallel_mode))
# if gpc.get_local_rank(
# ctx.output_parallel_mode) != gpc.get_local_rank(
# ctx.input_parallel_mode):
# bias_grad = None
# input_ = all_gather(input_, ctx.input_dim, ctx.input_parallel_mode)
# weight = all_gather(weight, ctx.weight_dim,
# ctx.weight_parallel_mode)
output_grad = all_gather(output_grad, ctx.output_dim,
ctx.output_parallel_mode)
output_grad = torch.cat(output_grad, dim=ctx.output_dim)
output_grad = all_gather(output_grad, ctx.output_dim, ctx.output_parallel_mode)
async_ops = list()
input_grad = torch.matmul(output_grad, weight.transpose(0, 1))
input_grad, op = reduce_scatter(input_grad, ctx.input_dim, ctx.input_parallel_mode, async_op=True)
async_ops.append(op)
input_grad, input_op = reduce_scatter(input_grad, ctx.input_dim,
ctx.input_parallel_mode,
async_op=True)
weight_grad = torch.matmul(
input_.reshape(-1, input_.shape[-1]).transpose(0, 1),
output_grad.reshape(-1, output_grad.shape[-1]))
# weight_grad = torch.matmul(
# input_.reshape(-1, input_.shape[-1]).transpose(0, 1),
# output_grad.reshape(-1, output_grad.shape[-1]))
# weight_grad = reduce_scatter(weight_grad, ctx.weight_dim,
# ctx.weight_parallel_mode)
if ctx.use_bias:
bias_grad = torch.sum(output_grad,
dim=tuple(
range(len(output_grad.shape))[:-1]))
# bias_grad =all_reduce(bias_grad, ctx.output_parallel_mode)
# dist.all_reduce(bias_grad,
# group=gpc.get_group(ctx.weight_parallel_mode))
weight_grad = torch.cat([weight_grad, torch.unsqueeze(bias_grad, dim=0)])
weight_grad, weight_op = all_reduce(weight_grad, ctx.weight_parallel_mode, async_op=True)
input_op.wait()
weight_op.wait()
input_.reshape(-1, input_.shape[-1]).transpose(0, 1), output_grad.reshape(-1, output_grad.shape[-1]))
weight_grad, op = all_reduce(weight_grad, ctx.weight_parallel_mode, async_op=True)
async_ops.append(op)
if ctx.use_bias:
bias_grad = weight_grad[-1]
weight_grad = weight_grad[:-1]
bias_grad = torch.sum(output_grad, dim=tuple(range(len(output_grad.shape))[:-1]))
bias_grad, op = all_reduce(bias_grad, ctx.weight_parallel_mode, async_op=True)
async_ops.append(op)
for op in async_ops:
if op is not None:
op.wait()
return input_grad, weight_grad, bias_grad, None, None, None, None, None, None
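# Hedged illustration (not part of the original diff): the backward above folds the
# bias gradient into an extra row of the weight gradient so a single all-reduce covers
# both. A single-device sketch of that pack/unpack, with assumed sizes:
import torch

out_grad = torch.randn(6, 16)                                # flattened [tokens, out]
inp = torch.randn(6, 8)                                      # flattened [tokens, in]
weight_grad = inp.t() @ out_grad                             # [in, out]
bias_grad = out_grad.sum(dim=0)                              # [out]
packed = torch.cat([weight_grad, bias_grad.unsqueeze(0)])    # [in + 1, out]
# ... a single collective would run on `packed` here ...
bias_grad, weight_grad = packed[-1], packed[:-1]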
class layer_norm_3d(torch.autograd.Function):
class classifier_3d(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any, input_: Tensor, weight: Tensor, bias: Tensor,
normalized_shape: int, eps: float,
input_parallel_mode: ParallelMode,
weight_parallel_mode: ParallelMode,
def forward(ctx, input_: Tensor, weight: Tensor, bias: Optional[Tensor], input_parallel_mode: ParallelMode,
weight_parallel_mode: ParallelMode, output_parallel_mode: ParallelMode) -> Tensor:
ctx.use_bias = bias is not None
ranks_in_group = gpc.get_ranks_in_group(input_parallel_mode)
src_rank = ranks_in_group[gpc.get_local_rank(output_parallel_mode)]
weight = broadcast(weight, src_rank, input_parallel_mode)
ctx.save_for_backward(input_, weight)
output = torch.matmul(input_, weight.transpose(0, 1))
output = all_reduce(output, output_parallel_mode)
if bias is not None:
output += bias
ctx.src_rank = src_rank
ctx.input_parallel_mode = input_parallel_mode
ctx.weight_parallel_mode = weight_parallel_mode
ctx.output_parallel_mode = output_parallel_mode
return output
@staticmethod
@custom_bwd
def backward(ctx, output_grad: Tensor) -> Tuple[Tensor, ...]:
input_, weight = ctx.saved_tensors
with torch.no_grad():
async_ops = list()
weight_grad = torch.matmul(
output_grad.reshape(-1, output_grad.shape[-1]).transpose(0, 1), input_.reshape(-1, input_.shape[-1]))
weight_grad = reduce(weight_grad, ctx.src_rank, ctx.input_parallel_mode)
if gpc.get_local_rank(ctx.input_parallel_mode) == gpc.get_local_rank(ctx.output_parallel_mode):
weight_grad, op = all_reduce(weight_grad, ctx.weight_parallel_mode, async_op=True)
async_ops.append(op)
else:
weight_grad = None
if ctx.use_bias:
bias_grad = torch.sum(output_grad, dim=tuple(range(len(output_grad.shape))[:-1]))
bias_grad = all_reduce(bias_grad, ctx.input_parallel_mode)
bias_grad, op = all_reduce(bias_grad, ctx.weight_parallel_mode, async_op=True)
async_ops.append(op)
input_grad = torch.matmul(output_grad, weight)
for op in async_ops:
if op is not None:
op.wait()
return input_grad, weight_grad, bias_grad, None, None, None, None, None, None
class layernorm_3d(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, input_: Tensor, weight: Tensor, bias: Tensor, normalized_shape: int, eps: float,
input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode,
output_parallel_mode: ParallelMode) -> Tensor:
# mean = torch.sum(input_, dim=-1)
# dist.all_reduce(mean, group=gpc.get_group(output_parallel_mode))
# mean /= normalized_shape
# mu = input_ - mean
# var = torch.sum(torch.pow(mu, 2), dim=-1)
# dist.all_reduce(var, group=gpc.get_group(output_parallel_mode))
# var /= normalized_shape
# std_dev = torch.sqrt(var + eps)
# ctx.save_for_backward(input_, mu, std_dev, weight)
# output = weight * mu / std_dev + bias
mean = all_reduce(torch.sum(input_, dim=-1, keepdim=True),
output_parallel_mode) / normalized_shape
mean = all_reduce(torch.sum(input_, dim=-1, keepdim=True), output_parallel_mode) / normalized_shape
mu = input_ - mean
var = all_reduce(torch.sum(mu**2, dim=-1, keepdim=True),
output_parallel_mode) / normalized_shape
var = all_reduce(torch.sum(mu**2, dim=-1, keepdim=True), output_parallel_mode) / normalized_shape
sigma = torch.sqrt(var + eps)
# ranks_in_group = gpc.get_ranks_in_group(input_parallel_mode)
# src_rank = ranks_in_group[gpc.get_local_rank(output_parallel_mode)]
# transforms = torch.stack([weight, bias]).contiguous()
# dist.broadcast(transforms,
# src=src_rank,
# group=gpc.get_group(input_parallel_mode))
# transforms = all_gather(transforms, -1, weight_parallel_mode)
# weight, bias = transforms[0], transforms[1]
ctx.save_for_backward(mu, sigma, weight)
z = mu / sigma
output = weight * z + bias
# ctx.src_rank = src_rank
ctx.normalized_shape = normalized_shape
ctx.input_parallel_mode = input_parallel_mode
ctx.weight_parallel_mode = weight_parallel_mode
......@@ -181,7 +153,7 @@ class layer_norm_3d(torch.autograd.Function):
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
def backward(ctx, output_grad: Tensor) -> Tuple[Tensor, ...]:
mu, sigma, weight = ctx.saved_tensors
with torch.no_grad():
bias_grad, weight_grad = output_grad, output_grad * mu / sigma
......@@ -191,373 +163,63 @@ class layer_norm_3d(torch.autograd.Function):
grads = all_reduce(grads, ctx.input_parallel_mode)
bias_grad, weight_grad = grads[0], grads[1]
# grads = reduce_scatter(grads, -1, ctx.weight_parallel_mode)
# dist.reduce(grads,
# dst=ctx.src_rank,
# group=gpc.get_group(ctx.input_parallel_mode))
# if gpc.get_local_rank(
# ctx.input_parallel_mode) == gpc.get_local_rank(
# ctx.output_parallel_mode):
# bias_grad, weight_grad = grads[0], grads[1]
# else:
# bias_grad, weight_grad = None, None
dz = output_grad * weight
dvar = dz * mu * (-0.5) * sigma**(-3)
dvar = all_reduce(torch.sum(dvar, dim=-1, keepdim=True), ctx.output_parallel_mode)
dmean = dz * (-1 / sigma) + dvar * -2 * mu / ctx.normalized_shape
dmean = all_reduce(torch.sum(dmean, dim=-1, keepdim=True), ctx.output_parallel_mode)
input_grad = dz / sigma + dvar * 2 * mu / ctx.normalized_shape + dmean / ctx.normalized_shape
input_grad = dz / sigma + dvar * 2 * mu / \
ctx.normalized_shape + dmean / ctx.normalized_shape
return input_grad, weight_grad, bias_grad, None, None, None, None, None
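# Hedged illustration (not part of the original diff): the single-device form of the
# layernorm backward used above, with the all-reduce over the hidden dimension
# replaced by a plain sum; sizes are assumptions for the example.
import torch

H = 8
x = torch.randn(2, 3, H)
mu = x - x.mean(dim=-1, keepdim=True)
sigma = torch.sqrt((mu ** 2).mean(dim=-1, keepdim=True) + 1e-12)
weight, grad_out = torch.ones(H), torch.randn(2, 3, H)

dz = grad_out * weight
dvar = (dz * mu * (-0.5) * sigma ** (-3)).sum(dim=-1, keepdim=True)
dmean = (dz * (-1 / sigma) + dvar * (-2) * mu / H).sum(dim=-1, keepdim=True)
dx = dz / sigma + dvar * 2 * mu / H + dmean / H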
class Matmul_AB_3D(torch.autograd.Function):
"""Matrix multiplication for :math:`C = AB`
"""
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any,
A: Tensor,
B: Tensor,
depth: int,
def split_batch_3d(input_: Tensor,
input_parallel_mode: ParallelMode,
weight_parallel_mode: ParallelMode,
output_parallel_mode: ParallelMode,
input_dim: int = 0,
weight_dim: int = -1,
output_dim: int = 0) -> Tensor:
# A: [m/q^2, n, k/q]
# B: [k/q, h/q^2]
# C: [m/q^2, n, h/q]
ctx.save_for_backward(A, B)
assert A.shape[-1] == B.shape[0], \
'Invalid shapes: A={}, B={}.'.format(A.shape, B.shape)
A_temp = all_gather(A, input_dim, input_parallel_mode)
B_temp = all_gather(B, weight_dim, weight_parallel_mode)
C = torch.matmul(A_temp, B_temp)
out = reduce_scatter(C, output_dim, output_parallel_mode)
ctx.depth = depth
ctx.A_group_parallel_mode = input_parallel_mode
ctx.B_group_parallel_mode = weight_parallel_mode
ctx.C_group_parallel_mode = output_parallel_mode
ctx.A_dim = input_dim
ctx.B_dim = weight_dim
ctx.C_dim = output_dim
return out
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
A, B = ctx.saved_tensors
with torch.no_grad():
A_grad = Matmul_ABT_3D.apply(output_grad, B, ctx.depth,
ctx.C_group_parallel_mode,
ctx.B_group_parallel_mode,
ctx.A_group_parallel_mode, ctx.C_dim,
ctx.B_dim, ctx.A_dim)
B_grad = Matmul_ATB_3D.apply(A, output_grad, ctx.depth,
ctx.A_group_parallel_mode,
ctx.C_group_parallel_mode,
ctx.B_group_parallel_mode, ctx.A_dim,
ctx.C_dim, ctx.B_dim)
return A_grad, B_grad, None, None, None, None, None, None, None
class Matmul_ABT_3D(torch.autograd.Function):
"""Matrix multiplication for :math:`C = AB^T`
"""
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any,
A: Tensor,
B: Tensor,
depth: int,
input_parallel_mode: ParallelMode,
weight_parallel_mode: ParallelMode,
output_parallel_mode: ParallelMode,
input_dim: int = 0,
weight_dim: int = -1,
output_dim: int = 0) -> Tensor:
# A: [m/q^2, n, h/q]
# B: [k/q, h/q^2]
# C: [m/q^2, n, k/q]
ctx.save_for_backward(A, B)
A_temp = all_gather(A, input_dim, input_parallel_mode)
B_temp = all_gather(B, weight_dim, weight_parallel_mode)
C = torch.matmul(A_temp, B_temp.transpose(0, 1))
out = reduce_scatter(C, output_dim, output_parallel_mode)
ctx.depth = depth
ctx.A_group_parallel_mode = input_parallel_mode
ctx.B_group_parallel_mode = weight_parallel_mode
ctx.C_group_parallel_mode = output_parallel_mode
ctx.A_dim = input_dim
ctx.B_dim = weight_dim
ctx.C_dim = output_dim
dim: int = 0) -> Tensor:
output = torch.chunk(input_, gpc.get_world_size(weight_parallel_mode),
dim=dim)[gpc.get_local_rank(weight_parallel_mode)].contiguous()
output = torch.chunk(output, gpc.get_world_size(input_parallel_mode),
dim=dim)[gpc.get_local_rank(input_parallel_mode)].contiguous()
return output
return out
class reduce_by_batch_3d(torch.autograd.Function):
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
A, B = ctx.saved_tensors
with torch.no_grad():
A_grad = Matmul_AB_3D.apply(output_grad, B, ctx.depth,
ctx.C_group_parallel_mode,
ctx.B_group_parallel_mode,
ctx.A_group_parallel_mode, ctx.C_dim,
ctx.B_dim, ctx.A_dim)
B_grad = Matmul_ATB_3D.apply(output_grad, A, ctx.depth,
ctx.C_group_parallel_mode,
ctx.A_group_parallel_mode,
ctx.B_group_parallel_mode, ctx.C_dim,
ctx.A_dim, ctx.B_dim)
return A_grad, B_grad, None, None, None, None, None, None, None
class Matmul_ATB_3D(torch.autograd.Function):
"""Matrix multiplication for :math:`C = A^TB`
"""
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any,
A: Tensor,
B: Tensor,
depth: int,
input_parallel_mode: ParallelMode,
weight_parallel_mode: ParallelMode,
output_parallel_mode: ParallelMode,
input_dim: int = 0,
weight_dim: int = 0,
output_dim: int = -1) -> Tensor:
# A: [m/q^2, n, k/q]
# B: [m/q^2, n, h/q]
# C: [k/q, h/q^2]
ctx.save_for_backward(A, B)
A_temp = all_gather(A, input_dim, input_parallel_mode)
A_temp = A_temp.reshape(-1, A.shape[-1])
B_temp = all_gather(B, weight_dim, weight_parallel_mode)
B_temp = B_temp.reshape(-1, B.shape[-1])
C = torch.matmul(A_temp.transpose(0, 1), B_temp)
out = reduce_scatter(C, output_dim, output_parallel_mode)
ctx.depth = depth
ctx.A_group_parallel_mode = input_parallel_mode
ctx.B_group_parallel_mode = weight_parallel_mode
ctx.C_group_parallel_mode = output_parallel_mode
ctx.A_dim = input_dim
ctx.B_dim = weight_dim
ctx.C_dim = output_dim
return out
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, input_: Tensor, input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode) -> Tensor:
output = all_reduce(input_, input_parallel_mode)
output = all_reduce(output, weight_parallel_mode)
return output.clone()
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
A, B = ctx.saved_tensors
with torch.no_grad():
A_grad = Matmul_ABT_3D.apply(B, output_grad, ctx.depth,
ctx.B_group_parallel_mode,
ctx.C_group_parallel_mode,
ctx.A_group_parallel_mode, ctx.B_dim,
ctx.C_dim, ctx.A_dim)
B_grad = Matmul_AB_3D.apply(A, output_grad, ctx.depth,
ctx.A_group_parallel_mode,
ctx.C_group_parallel_mode,
ctx.B_group_parallel_mode, ctx.A_dim,
ctx.C_dim, ctx.B_dim)
return A_grad, B_grad, None, None, None, None, None, None, None
class Add_3D(torch.autograd.Function):
"""Matrix add bias: :math:`C = A + b`
"""
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any, input_: Tensor, bias: Tensor, depth: int,
input_parallel_mode: ParallelMode,
weight_parallel_mode: ParallelMode,
output_parallel_mode: ParallelMode) -> Tensor:
# input: [m/q^2, n, h/q]
# bias: [h/q^2]
ranks_in_group = gpc.get_ranks_in_group(input_parallel_mode)
src_rank = ranks_in_group[gpc.get_local_rank(output_parallel_mode)]
bias_temp = bias.clone()
dist.broadcast(bias_temp,
src=src_rank,
group=gpc.get_group(input_parallel_mode))
# [h/q]
bias_temp = all_gather(bias_temp, -1, weight_parallel_mode)
out = input_ + bias_temp
ctx.depth = depth
ctx.src_rank = src_rank
ctx.A_group_parallel_mode = input_parallel_mode
ctx.B_group_parallel_mode = weight_parallel_mode
ctx.C_group_parallel_mode = output_parallel_mode
def backward(ctx, output_grad: Tensor) -> Tuple[Tensor, ...]:
return output_grad, None, None
return out
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
# output_grad: [m/q^2, n, h/q]
with torch.no_grad():
# [h/q]
grad = torch.sum(output_grad,
dim=tuple(range(len(output_grad.shape))[:-1]))
bias_grad = reduce_scatter(grad, -1, ctx.B_group_parallel_mode)
dist.reduce(bias_grad,
dst=ctx.src_rank,
group=gpc.get_group(ctx.A_group_parallel_mode))
if gpc.get_local_rank(
ctx.A_group_parallel_mode) != gpc.get_local_rank(
ctx.C_group_parallel_mode):
bias_grad = None
return output_grad, bias_grad, None, None, None, None
class Mul_3D(torch.autograd.Function):
"""Matrix multiplication for :math:`C = A * b`
"""
class broadcast_weight_3d_from_diagonal(torch.autograd.Function):
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any, input_: Tensor, bias: Tensor, depth: int,
input_parallel_mode: ParallelMode,
weight_parallel_mode: ParallelMode,
def forward(ctx, input_: Tensor, input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode,
output_parallel_mode: ParallelMode) -> Tensor:
# input: [m/q^2, n, h/q]
# bias: [h/q^2]
ranks_in_group = gpc.get_ranks_in_group(input_parallel_mode)
src_rank = ranks_in_group[gpc.get_local_rank(output_parallel_mode)]
# [h/q^2]
bias_temp = bias.clone()
dist.broadcast(bias_temp,
src=src_rank,
group=gpc.get_group(input_parallel_mode))
# [h/q]
bias_temp = all_gather(bias_temp, -1, weight_parallel_mode)
# empty_cache()
ctx.save_for_backward(input_, bias_temp)
out = torch.mul(input_, bias_temp)
ctx.depth = depth
output = broadcast(input_, src_rank, input_parallel_mode)
ctx.src_rank = src_rank
ctx.A_group_parallel_mode = input_parallel_mode
ctx.B_group_parallel_mode = weight_parallel_mode
ctx.C_group_parallel_mode = output_parallel_mode
return out
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
# output_grad: [m/q^2, n, h/q]
with torch.no_grad():
input_, bias = ctx.saved_tensors
# [m/q^2, n, h/q]
input_grad = torch.mul(output_grad, bias)
# [h/q]
grad = torch.mul(output_grad, input_)
grad = torch.sum(grad,
dim=tuple(range(len(output_grad.shape))[:-1]))
bias_grad = reduce_scatter(grad, -1, ctx.B_group_parallel_mode)
dist.reduce(bias_grad,
dst=ctx.src_rank,
group=gpc.get_group(ctx.A_group_parallel_mode))
if gpc.get_local_rank(
ctx.A_group_parallel_mode) != gpc.get_local_rank(
ctx.C_group_parallel_mode):
bias_grad = None
return input_grad, bias_grad, None, None, None, None
class Sum_3D(torch.autograd.Function):
"""Compute the sum of input tensors
"""
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any,
input_: Tensor,
dim: int,
depth: int,
parallel_mode: ParallelMode,
keepdim: bool = False) -> Tensor:
# input: [m/q^2, n, h/q]
out = torch.sum(input_, dim=dim, keepdim=keepdim)
dist.all_reduce(out, group=gpc.get_group(parallel_mode))
ctx.input_shape = input_.shape
ctx.depth = depth
ctx.group = parallel_mode
ctx.dim = dim
return out
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
with torch.no_grad():
output_grad = output_grad.contiguous()
dist.all_reduce(output_grad, group=gpc.get_group(ctx.group))
if len(output_grad.shape) < len(ctx.input_shape):
output_grad = torch.unsqueeze(output_grad, ctx.dim)
dims = [1 for _ in range(len(output_grad.shape))]
dims[ctx.dim] = ctx.input_shape[ctx.dim]
input_grad = output_grad.repeat(tuple(dims))
return input_grad, None, None, None, None, None
class Reduce_3D(torch.autograd.Function):
"""Reduce input tensors
"""
@staticmethod
@custom_fwd(cast_inputs=torch.float16)
def forward(ctx: Any, input_: Tensor, depth: int,
parallel_mode: ParallelMode) -> Tensor:
dist.all_reduce(input_, group=gpc.get_group(parallel_mode))
return input_.clone()
ctx.input_parallel_mode = input_parallel_mode
ctx.weight_parallel_mode = weight_parallel_mode
ctx.output_parallel_mode = output_parallel_mode
return output
@staticmethod
@custom_bwd
def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
return output_grad, None, None
# class Slice_3D(torch.autograd.Function):
# """Slice input tensor
# """
# @staticmethod
# @custom_fwd(cast_inputs=torch.float16)
# def forward(ctx: Any, input_: Tensor, dim: int, depth: int,
# parallel_mode: ParallelMode) -> Tensor:
# rank = gpc.get_local_rank(parallel_mode)
# out = torch.chunk(input_, depth, dim=dim)[rank].contiguous()
# ctx.depth = depth
# ctx.parallel_mode = parallel_mode
# ctx.dim = dim
# ctx.input_shape = input_.shape
# return out
# @staticmethod
# @custom_bwd
# def backward(ctx: Any, output_grad: Tensor) -> Tuple[Tensor, ...]:
# with torch.no_grad():
# input_grad = all_gather(output_grad, ctx.dim, ctx.parallel_mode)
# input_grad.reshape(ctx.input_shape)
# return input_grad, None, None, None
def backward(ctx, output_grad: Tensor) -> Tuple[Tensor, ...]:
input_grad = reduce(output_grad, ctx.src_rank, ctx.input_parallel_mode)
if gpc.get_local_rank(ctx.input_parallel_mode) == gpc.get_local_rank(ctx.output_parallel_mode):
input_grad = all_reduce(input_grad, ctx.weight_parallel_mode)
else:
input_grad = None
return input_grad, None, None, None
import math
import os
from typing import Tuple, Optional
import torch
import torch.distributed as dist
from colossalai.constants import (INPUT_GROUP_3D, OUTPUT_GROUP_3D,
WEIGHT_GROUP_3D)
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.registry import LAYERS
from colossalai.nn.init import init_bias_, init_weight_
from colossalai.utils import checkpoint, get_current_device
from torch import Tensor, dtype, nn
from .._common_utils import ACT2FN, divide, set_tensor_parallel_attribute_by_size, to_2tuple
from ._utils import get_depth_from_env, get_parallel_mode_from_env, get_last_group
from .layers import Linear3D
@LAYERS.register_module
class ViTPatchEmbedding3D(nn.Module):
""" 3D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param in_chans: number of channels of input image
:type in_chans: int
:param embed_size: dimension of embedding
:type embed_size: int
:param drop_prob: dropout probability
:type drop_prob: float
:param flatten: whether to flatten output tensor, defaults to True
:type flatten: bool, optional
"""
def __init__(self,
img_size: int,
patch_size: int,
in_chans: int,
embed_size: int,
drop_prob: float,
flatten: bool = True,
init_method: str = 'torch'):
super().__init__()
self.depth = get_depth_from_env()
self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
self.output_parallel_mode = get_last_group(self.input_parallel_mode,
self.weight_parallel_mode)
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.grid_size = (img_size[0] // patch_size[0],
img_size[1] // patch_size[1])
self.in_chans = in_chans
self.embed_size = embed_size
self.embed_size_per_partition = divide(self.embed_size, self.depth)
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.flatten = flatten
self.init_weight = 'torch'
self.init_bias = 'torch'
if init_method == 'jax':
self.init_weight = 'jax_embed'
self.init_bias = 'zero'
self.proj = nn.Conv2d(self.in_chans,
self.embed_size_per_partition,
kernel_size=patch_size,
stride=patch_size)
self.cls_token = nn.Parameter(
torch.zeros(1, 1, self.embed_size_per_partition))
self.pos_embed = nn.Parameter(
torch.zeros(1, self.num_patches + 1,
self.embed_size_per_partition))
self.pos_drop = nn.Dropout(drop_prob)
self.reset_parameters(self.init_weight, self.init_bias)
self._set_tensor_parallel_attributes()
def _set_tensor_parallel_attributes(self):
set_tensor_parallel_attribute_by_size(self.proj.weight, self.in_chans * self.embed_size * self.num_patches)
set_tensor_parallel_attribute_by_size(self.proj.bias, self.embed_size)
set_tensor_parallel_attribute_by_size(self.cls_token, 1 * 1 * self.embed_size)
set_tensor_parallel_attribute_by_size(self.pos_embed, 1 * (self.num_patches + 1) * self.embed_size)
def reset_parameters(self, init_weight, init_bias):
fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.proj.weight)
# std = math.sqrt(1.0 / fan_in)
# nn.init.trunc_normal_(self.proj.weight, std=std / .87962566103423978)
# nn.init.zeros_(self.proj.bias)
if init_weight != 'torch':
init_weight_(self.proj.weight, fan_in, init_method=init_weight)
init_bias_(self.pos_embed, fan_in, init_method=init_weight)
if init_bias != 'torch':
init_bias_(self.proj.bias, fan_in, init_method=init_bias)
self.to(get_current_device())
weight_src_rank = gpc.get_ranks_in_group(self.weight_parallel_mode)[0]
dist.broadcast(self.proj.weight,
src=weight_src_rank,
group=gpc.get_group(self.weight_parallel_mode))
dist.broadcast(self.proj.bias,
src=weight_src_rank,
group=gpc.get_group(self.weight_parallel_mode))
input_src_rank = gpc.get_ranks_in_group(self.input_parallel_mode)[0]
dist.broadcast(self.proj.weight,
src=input_src_rank,
group=gpc.get_group(self.input_parallel_mode))
dist.broadcast(self.proj.bias,
src=input_src_rank,
group=gpc.get_group(self.input_parallel_mode))
self.proj.weight.register_hook(self._sync_grad_hook)
self.proj.bias.register_hook(self._sync_grad_hook)
self.cls_token.register_hook(self._sync_grad_hook)
self.pos_embed.register_hook(self._sync_grad_hook)
def _sync_grad_hook(self, grad) -> Tensor:
dist.all_reduce(grad, group=gpc.get_group(self.input_parallel_mode))
dist.all_reduce(grad, group=gpc.get_group(self.weight_parallel_mode))
return grad
def forward(self, x: Tensor) -> Tensor:
# split a partition from inputs
x = torch.chunk(x, self.depth, dim=0)[gpc.get_local_rank(
self.weight_parallel_mode)].contiguous()
x = torch.chunk(x, self.depth, dim=0)[gpc.get_local_rank(
self.input_parallel_mode)].contiguous()
B, C, H, W = x.shape
assert H == self.img_size[0] and W == self.img_size[1], \
f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
x = self.proj(x)
if self.flatten:
x = x.flatten(2).transpose(1, 2) # BCHW -> BNC
# add cls token & pos embedding
# [b/q^2,s,h/q] --> [b/q^2, 1+s, h/q]
cls_token = self.cls_token.expand(x.shape[0], -1, -1)
x = torch.cat((cls_token, x), dim=1)
with seed(ParallelMode.TENSOR):
x = self.pos_drop(x + self.pos_embed)
return x
@LAYERS.register_module
class ViTSelfAttention3D(nn.Module):
"""Self-attention layer for 3D parallel Vision Transformer
:param hidden_size: hidden size
:type hidden_size: int
:param num_attention_heads: number of attention heads
:type num_attention_heads: int
:param attention_probs_dropout_prob: dropout probability for attention layers
:type attention_probs_dropout_prob: float
:param hidden_dropout_prob: dropout probability for hidden layers
:type hidden_dropout_prob: float
:param depth: the 3D parallelism depth
:type depth: int
:param input_parallel_mode: parallel mode of input tensor
:type input_parallel_mode: ParallelMode
:param weight_parallel_mode: parallel mode of weight
:type weight_parallel_mode: ParallelMode
:param dtype: dtype of parameters, defaults to None
:type dtype: dtype, optional
:param bias: whether to add bias, defaults to True
:type bias: bool, optional
"""
def __init__(self,
hidden_size: int,
num_attention_heads: int,
attention_probs_dropout_prob: float,
hidden_dropout_prob: float,
dtype: dtype = None,
bias: bool = True,
checkpoint: bool = False,
init_method: str = 'torch'):
super().__init__()
self.depth = get_depth_from_env()
# self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
# self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
# self.output_parallel_mode = get_last_group(self.input_parallel_mode,
# self.weight_parallel_mode)
self.hidden_size = hidden_size
self.num_attention_heads = divide(num_attention_heads, self.depth)
self.attention_head_size = divide(hidden_size, num_attention_heads)
self.all_head_size = self.num_attention_heads * self.attention_head_size
self.checkpoint = checkpoint
self.init_weight = 'torch'
self.init_bias = 'torch'
if init_method == 'jax':
self.init_weight = 'jax'
self.init_bias = 'zero'
self.query_key_value = Linear3D(self.hidden_size,
3 * self.hidden_size,
# self.input_parallel_mode,
# self.weight_parallel_mode,
dtype=dtype,
bias=bias,
init_weight=self.init_weight,
init_bias=self.init_bias)
self.attention_dropout = nn.Dropout(attention_probs_dropout_prob)
self.dense = Linear3D(self.hidden_size,
self.hidden_size,
# self.output_parallel_mode,
# self.weight_parallel_mode,
dtype=dtype,
bias=bias,
init_weight=self.init_weight,
init_bias=self.init_bias)
self.dropout = nn.Dropout(hidden_dropout_prob)
self.softmax = nn.Softmax(dim=-1)
# def groups_for_next_layer(self) -> Tuple[ParallelMode, ParallelMode]:
# return self.input_parallel_mode, self.weight_parallel_mode
def _forward(self, hidden_states: Tensor) -> Tensor:
query_key_value = self.query_key_value(hidden_states)
new_qkv_shape = query_key_value.shape[:-1] + \
(self.num_attention_heads, 3 * self.attention_head_size)
query_key_value = query_key_value.view(new_qkv_shape)
query_key_value = query_key_value.permute((0, 2, 1, 3))
query_layer, key_layer, value_layer = torch.chunk(query_key_value,
3,
dim=-1)
attention_scores = torch.matmul(query_layer,
key_layer.transpose(-1, -2))
attention_scores = attention_scores / math.sqrt(
self.attention_head_size)
attention_probs = self.softmax(attention_scores)
with seed(ParallelMode.TENSOR):
attention_probs = self.attention_dropout(attention_probs)
context_layer = torch.matmul(attention_probs, value_layer)
context_layer = context_layer.transpose(1, 2)
new_context_layer_shape = context_layer.size()[:-2] + (
self.all_head_size, )
context_layer = context_layer.reshape(new_context_layer_shape)
output = self.dense(context_layer)
with seed(ParallelMode.TENSOR):
output = self.dropout(output)
return output
def _checkpoint_forward(self, hidden_states: Tensor) -> Tensor:
return checkpoint(self._forward, hidden_states)
def forward(self, hidden_states: Tensor) -> Tensor:
if self.checkpoint:
return self._checkpoint_forward(hidden_states)
else:
return self._forward(hidden_states)
@LAYERS.register_module
class ViTMLP3D(nn.Module):
"""[summary]
:param hidden_size: hidden size
:type hidden_size: int
:param mlp_ratio: hidden size of MLP divided by embedding dim
:type mlp_ratio: int
:param hidden_dropout_prob: dropout probability for hidden layers
:type hidden_dropout_prob: float
:param hidden_act: activation function for hidden layers
:type hidden_act: str
:param depth: the 3D parallelism depth
:type depth: int
:param input_parallel_mode: parallel mode of input tensor
:type input_parallel_mode: ParallelMode
:param weight_parallel_mode: parallel mode of weight
:type weight_parallel_mode: ParallelMode
:param dtype: dtype of parameters, defaults to None
:type dtype: dtype, optional
:param bias: whether to add bias, defaults to True
:type bias: bool, optional
"""
def __init__(self,
hidden_size: int,
mlp_ratio: int,
hidden_dropout_prob: float,
hidden_act: str = 'gelu',
dtype: dtype = None,
bias: bool = True,
checkpoint: bool = False,
init_method: str = 'torch'):
super().__init__()
# self.depth = get_depth_from_env()
# self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
# self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
# self.output_parallel_mode = get_last_group(self.input_parallel_mode,
# self.weight_parallel_mode)
self.hidden_size = hidden_size
self.mlp_ratio = mlp_ratio
self.checkpoint = checkpoint
self.init_weight = init_method
self.init_bias = init_method
self.dense_1 = Linear3D(self.hidden_size,
self.mlp_ratio * self.hidden_size,
# self.input_parallel_mode,
# self.weight_parallel_mode,
dtype=dtype,
bias=bias,
init_weight=self.init_weight,
init_bias=self.init_bias)
self.activation_func = ACT2FN[hidden_act]
self.dense_2 = Linear3D(self.mlp_ratio * self.hidden_size,
self.hidden_size,
# self.output_parallel_mode,
# self.weight_parallel_mode,
dtype=dtype,
bias=bias,
init_weight=self.init_weight,
init_bias=self.init_bias)
self.dropout = nn.Dropout(hidden_dropout_prob)
# def groups_for_next_layer(self) -> Tuple[ParallelMode, ParallelMode]:
# return self.input_parallel_mode, self.weight_parallel_mode
def _forward(self, hidden_states: Tensor) -> Tensor:
intermediate_output = self.dense_1(hidden_states)
intermediate_output = self.activation_func(intermediate_output)
output = self.dense_2(intermediate_output)
with seed(ParallelMode.TENSOR):
output = self.dropout(output)
return output
def _checkpoint_forward(self, hidden_states: Tensor) -> Tensor:
return checkpoint(self._forward, hidden_states)
def forward(self, hidden_states: Tensor) -> Tensor:
if self.checkpoint:
return self._checkpoint_forward(hidden_states)
else:
return self._forward(hidden_states)
@LAYERS.register_module
class ViTHead3D(nn.Module):
"""Output layer for 3D parallel Vision Transformer
:param in_features: size of input tensor
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param depth: the 3D parallelism depth
:type depth: int
:param input_parallel_mode: parallel mode of input tensor
:type input_parallel_mode: ParallelMode
:param weight_parallel_mode: parallel mode of weight
:type weight_parallel_mode: ParallelMode
:param dtype: dtype of parameters, defaults to None
:type dtype: dtype, optional
:param bias: whether to add bias, defaults to True
:type bias: bool, optional
"""
def __init__(self,
in_features: int,
num_classes: int,
dtype: dtype = None,
bias: bool = True,
init_method: str = 'torch'):
super().__init__()
# self.depth = get_depth_from_env()
# self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
# self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
# self.output_parallel_mode = get_last_group(self.input_parallel_mode,
# self.weight_parallel_mode)
self.in_features = in_features
self.num_classes = num_classes
# out_features = math.ceil(self.num_classes /
# (self.depth**2)) * (self.depth**2)
# self.num_classes_per_partition = divide(self.num_classes, self.depth)
self.init_weight = 'torch'
self.init_bias = 'torch'
if init_method == 'jax':
self.init_weight = 'zero'
self.init_bias = 'zero'
self.linear = Linear3D(self.in_features,
self.num_classes,
# self.input_parallel_mode,
# self.weight_parallel_mode,
dtype=dtype,
bias=bias,
init_weight=self.init_weight,
init_bias=self.init_bias)
def forward(self, x: Tensor) -> Tensor:
# [b/q^2, s, h/q] --> [b/q^2, h/q]
x = x[:, 0]
# [b/q^2, h/q] --> [b/q^2, c/q]
x = self.linear(x)
# return x[:, :self.num_classes_per_partition]
return x
def extra_repr(self):
return 'in_features={}, num_classes={}'.format(self.in_features,
self.num_classes)
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import math
import os
from typing import Tuple
from typing import Callable
import torch
import torch.distributed as dist
import torch.nn as nn
from colossalai.constants import (INPUT_GROUP_3D, OUTPUT_GROUP_3D,
WEIGHT_GROUP_3D)
import torch.nn.functional as F
from colossalai.communication import all_reduce, broadcast
from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.nn.init import init_bias_, init_weight_
from colossalai.nn import init as init
from colossalai.nn.layer.base_layer import ParallelLayer
from colossalai.registry import LAYERS
from colossalai.utils import get_current_device
from torch import Tensor, dtype
from torch.nn import Parameter
from torch.nn import init as init
from .._common_utils import divide, set_tensor_parallel_attribute_by_size
from ._operation import (Add_3D, Matmul_AB_3D, Mul_3D, Sum_3D, layer_norm_3d,
linear_3d)
from ._utils import (get_depth_from_env, get_last_group,
get_parallel_mode_from_env, swap_in_out_group)
from .._common_utils import (divide, set_tensor_parallel_attribute_by_partition, to_2tuple)
from ._operation import *
from ._utils import (get_depth_from_env, get_last_group, get_parallel_mode_from_env, swap_in_out_group)
@LAYERS.register_module
class LayerNorm3D(nn.Module):
def __init__(
self,
normalized_shape: int,
# input_parallel_mode: ParallelMode,
# weight_parallel_mode: ParallelMode,
eps: float = 1e-12,
dtype: dtype = None,
):
class LayerNorm3D(ParallelLayer):
def __init__(self, normalized_shape: int, eps: float = 1e-12, dtype: dtype = None):
super().__init__()
self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
self.output_parallel_mode = get_last_group(self.input_parallel_mode,
self.weight_parallel_mode)
self.output_parallel_mode = get_last_group(self.input_parallel_mode, self.weight_parallel_mode)
self.depth = get_depth_from_env()
self.normalized_shape = normalized_shape
self.normalized_shape_per_partition = divide(normalized_shape, self.depth)
self.weight = Parameter(
torch.ones(self.normalized_shape_per_partition,
device=get_current_device(),
dtype=dtype))
self.bias = Parameter(
torch.zeros(self.normalized_shape_per_partition,
device=get_current_device(),
torch.ones(self.normalized_shape_per_partition, device=get_current_device(), dtype=dtype))
self.bias = Parameter(torch.zeros(self.normalized_shape_per_partition, device=get_current_device(),
dtype=dtype))
self.variance_epsilon = eps
self._set_tensor_parallel_attributes()
def _set_tensor_parallel_attributes(self):
set_tensor_parallel_attribute_by_size(self.weight, self.normalized_shape)
set_tensor_parallel_attribute_by_size(self.bias, self.normalized_shape)
def _set_tensor_parallel_attributes(self) -> None:
set_tensor_parallel_attribute_by_partition(self.weight, self.depth)
set_tensor_parallel_attribute_by_partition(self.bias, self.depth)
def reset_parameters(self):
init.zeros_(self.bias)
init.ones_(self.weight)
def reset_parameters(self) -> None:
init.zeros_()(self.bias)
init.ones_()(self.weight)
def forward(self, input_: Tensor) -> Tensor:
# '''x = weight * (x - mean) / sqrt(var + eps) + bias'''
# # input: [m/q^2, n, h/q]
# # [m/q^2, n, 1]
# mean = Sum_3D.apply(input_, -1, self.depth, self.output_parallel_mode,
# True) / self.normalized_shape
# # [m/q^2, n, 1]
# var = (input_ - mean).pow(2)
# var = Sum_3D.apply(var, -1, self.depth, self.output_parallel_mode,
# True) / self.normalized_shape
# output = (input_ - mean) / torch.sqrt(var + self.variance_epsilon)
# output = Mul_3D.apply(output, self.weight, self.depth,
# self.input_parallel_mode,
# self.weight_parallel_mode,
# self.output_parallel_mode)
# output = Add_3D.apply(output, self.bias, self.depth,
# self.input_parallel_mode,
# self.weight_parallel_mode,
# self.output_parallel_mode)
# return output
return layer_norm_3d.apply(input_, self.weight, self.bias,
self.normalized_shape,
self.variance_epsilon,
self.input_parallel_mode,
self.weight_parallel_mode,
self.output_parallel_mode)
def extra_repr(self):
return '{}, eps={}'.format(self.normalized_shape,
self.variance_epsilon)
return layernorm_3d.apply(input_, self.weight, self.bias, self.normalized_shape, self.variance_epsilon,
self.input_parallel_mode, self.weight_parallel_mode, self.output_parallel_mode)
@LAYERS.register_module
class Linear3D(nn.Module):
def __init__(
self,
class Linear3D(ParallelLayer):
def __init__(self,
in_features: int,
out_features: int,
# input_parallel_mode: ParallelMode,
# weight_parallel_mode: ParallelMode,
bias: bool = True,
dtype: dtype = None,
init_weight: str = 'torch',
init_bias: str = 'torch'):
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
super().__init__()
self.in_features = in_features
self.out_features = out_features
self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
self.output_parallel_mode = get_last_group(self.input_parallel_mode,
self.weight_parallel_mode)
# self.with_bias = bias
self.output_parallel_mode = get_last_group(self.input_parallel_mode, self.weight_parallel_mode)
self.depth = get_depth_from_env()
self.in_features_per_partition = divide(in_features, self.depth)
self.out_features_per_partition = divide(out_features, self.depth)
# [k/q, h/q]
self.weight = Parameter(
torch.empty(self.in_features_per_partition,
self.out_features_per_partition,
device=get_current_device(),
dtype=dtype))
# [h/q]
if bias:
self.bias = Parameter(
torch.zeros(self.out_features_per_partition,
device=get_current_device(),
self.bias = Parameter(torch.zeros(self.out_features_per_partition, device=get_current_device(),
dtype=dtype))
else:
self.register_parameter('bias', None)
self.bias = None
self.reset_parameters(init_weight, init_bias)
self.reset_parameters(weight_initializer, bias_initializer)
self._set_tensor_parallel_attributes()
swap_in_out_group()
def _set_tensor_parallel_attributes(self):
set_tensor_parallel_attribute_by_size(self.weight, self.in_features * self.out_features)
def _set_tensor_parallel_attributes(self) -> None:
set_tensor_parallel_attribute_by_partition(self.weight, self.depth**2)
if self.bias is not None:
set_tensor_parallel_attribute_by_size(self.bias, self.out_features)
set_tensor_parallel_attribute_by_partition(self.bias, self.depth)
def reset_parameters(self, init_weight, init_bias) -> None:
# setting
def reset_parameters(self, weight_initializer, bias_initializer) -> None:
with seed(ParallelMode.TENSOR):
fan_in, fan_out = self.in_features, self.out_features
weight_src_rank = gpc.get_ranks_in_group(self.weight_parallel_mode)[0]
output_src_rank = gpc.get_ranks_in_group(self.output_parallel_mode)[0]
# init weight
init_weight_(self.weight, fan_in, fan_out, init_method=init_weight)
dist.broadcast(self.weight,
src=weight_src_rank,
group=gpc.get_group(self.weight_parallel_mode))
# init bias
weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
broadcast(self.weight, weight_src_rank, self.weight_parallel_mode)
if self.bias is not None:
bias_initializer(self.bias, fan_in=fan_in)
broadcast(self.bias, weight_src_rank, self.weight_parallel_mode)
broadcast(self.bias, output_src_rank, self.output_parallel_mode)
def forward(self, input_: Tensor) -> Tensor:
return linear_3d.apply(input_, self.weight, self.bias, self.input_parallel_mode, self.weight_parallel_mode,
self.output_parallel_mode)
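# Usage sketch (not standalone): Linear3D assumes an initialized 3D tensor-parallel
# context, e.g. 8 ranks so that depth == 2; the feature sizes below are illustrative.
# Each rank holds an [in_features/depth, out_features/depth] weight shard and an
# [out_features/depth] bias shard, and linear_3d combines the partial products across
# the input, weight and output parallel groups.
#
#   fc = Linear3D(in_features=1024, out_features=4096, bias=True)
#   out = fc(local_input)  # local_input: this rank's shard of the activation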
@LAYERS.register_module
class Classifier3D(ParallelLayer):
def __init__(self,
in_features: int,
num_classes: int,
weight: Parameter = None,
bias: bool = True,
dtype: dtype = None,
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
super().__init__()
self.in_features = in_features
self.num_classes = num_classes
self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
self.output_parallel_mode = get_last_group(self.input_parallel_mode, self.weight_parallel_mode)
self.depth = get_depth_from_env()
self.in_features_per_partition = divide(in_features, self.depth)
if weight is not None:
self.weight = weight
self.has_weight = False
else:
self.weight = Parameter(
torch.empty(self.num_classes, self.in_features_per_partition, device=get_current_device(), dtype=dtype))
self.has_weight = True
if bias:
self.bias = Parameter(torch.zeros(self.num_classes, device=get_current_device(), dtype=dtype))
else:
self.bias = None
self.reset_parameters(weight_initializer, bias_initializer)
self._set_tensor_parallel_attributes()
def _set_tensor_parallel_attributes(self) -> None:
if self.has_weight:
set_tensor_parallel_attribute_by_partition(self.weight, self.depth)
def reset_parameters(self, weight_initializer, bias_initializer) -> None:
with seed(ParallelMode.TENSOR):
fan_in, fan_out = self.in_features, self.num_classes
weight_src_rank = gpc.get_ranks_in_group(self.weight_parallel_mode)[0]
output_src_rank = gpc.get_ranks_in_group(self.output_parallel_mode)[0]
input_src_rank = gpc.get_ranks_in_group(self.input_parallel_mode)[0]
if self.has_weight:
weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
broadcast(self.weight, weight_src_rank, self.weight_parallel_mode)
if self.bias is not None:
init_bias_(self.bias, fan_in, init_method=init_bias)
dist.broadcast(self.bias,
src=weight_src_rank,
group=gpc.get_group(self.weight_parallel_mode))
dist.broadcast(self.bias,
src=output_src_rank,
group=gpc.get_group(self.output_parallel_mode))
bias_initializer(self.bias, fan_in=fan_in)
broadcast(self.bias, weight_src_rank, self.weight_parallel_mode)
broadcast(self.bias, output_src_rank, self.output_parallel_mode)
broadcast(self.bias, input_src_rank, self.input_parallel_mode)
def forward(self, input_: Tensor) -> Tensor:
# # input: [m/q^2, n, k/q]
# # output: [m/q^2, n, h/q]
# output = Matmul_AB_3D.apply(input_, self.weight, self.depth,
# self.input_parallel_mode,
# self.weight_parallel_mode,
# self.output_parallel_mode)
# if self.bias is not None:
# output = Add_3D.apply(output, self.bias, self.depth,
# self.output_parallel_mode,
# self.weight_parallel_mode,
# self.input_parallel_mode)
# return output
return linear_3d.apply(input_, self.weight, self.bias,
self.input_parallel_mode,
self.weight_parallel_mode,
return classifier_3d.apply(input_, self.weight, self.bias, self.input_parallel_mode, self.weight_parallel_mode,
self.output_parallel_mode)
def extra_repr(self):
return 'in_features={}, out_features={}, bias={}'.format(
self.in_features, self.out_features, self.with_bias)
@LAYERS.register_module
class PatchEmbedding3D(ParallelLayer):
def __init__(self,
img_size: int,
patch_size: int,
in_chans: int,
embed_size: int,
dtype: dtype = None,
flatten: bool = True,
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
position_embed_initializer: Callable = init.zeros_()):
super().__init__()
self.depth = get_depth_from_env()
self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
self.output_parallel_mode = get_last_group(self.input_parallel_mode, self.weight_parallel_mode)
self.patch_size = to_2tuple(patch_size)
grid_size = to_2tuple(img_size // patch_size)
num_patches = grid_size[0] * grid_size[1]
self.embed_size = embed_size
embed_size_per_partition = divide(embed_size, self.depth)
self.flatten = flatten
self.weight = nn.Parameter(
torch.empty((embed_size_per_partition, in_chans, *self.patch_size),
device=get_current_device(),
dtype=dtype))
self.bias = nn.Parameter(torch.empty(embed_size_per_partition, device=get_current_device(), dtype=dtype))
self.cls_token = nn.Parameter(
torch.zeros((1, 1, embed_size_per_partition), device=get_current_device(), dtype=dtype))
self.pos_embed = nn.Parameter(
torch.zeros((1, num_patches + 1, embed_size_per_partition), device=get_current_device(), dtype=dtype))
self.reset_parameters(weight_initializer, bias_initializer, position_embed_initializer)
self._set_tensor_parallel_attributes()
def _set_tensor_parallel_attributes(self) -> None:
set_tensor_parallel_attribute_by_partition(self.weight, self.depth)
set_tensor_parallel_attribute_by_partition(self.bias, self.depth)
set_tensor_parallel_attribute_by_partition(self.cls_token, self.depth)
set_tensor_parallel_attribute_by_partition(self.pos_embed, self.depth)
def _sync_grad_hook(self, grad) -> None:
grad = all_reduce(grad, self.input_parallel_mode)
grad = all_reduce(grad, self.weight_parallel_mode)
return grad
def reset_parameters(self, weight_initializer, bias_initializer, position_embed_initializer) -> None:
with seed(ParallelMode.TENSOR):
fan_in, _ = nn.init._calculate_fan_in_and_fan_out(self.weight)
fan_out = self.embed_size
weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
bias_initializer(self.bias, fan_in=fan_in)
position_embed_initializer(self.pos_embed)
weight_src_rank = gpc.get_ranks_in_group(self.weight_parallel_mode)[0]
input_src_rank = gpc.get_ranks_in_group(self.input_parallel_mode)[0]
broadcast(self.weight, weight_src_rank, self.weight_parallel_mode)
broadcast(self.bias, weight_src_rank, self.weight_parallel_mode)
broadcast(self.pos_embed, weight_src_rank, self.weight_parallel_mode)
broadcast(self.bias, input_src_rank, self.input_parallel_mode)
broadcast(self.pos_embed, input_src_rank, self.input_parallel_mode)
self.bias.register_hook(self._sync_grad_hook)
self.cls_token.register_hook(self._sync_grad_hook)
self.pos_embed.register_hook(self._sync_grad_hook)
def forward(self, input_: Tensor) -> Tensor:
input_ = split_batch_3d(input_, self.input_parallel_mode, self.weight_parallel_mode)
weight = broadcast_weight_3d_from_diagonal.apply(self.weight, self.input_parallel_mode,
self.weight_parallel_mode, self.output_parallel_mode)
output = F.conv2d(input_, weight, self.bias, stride=self.patch_size)
if self.flatten:
output = output.flatten(2).transpose(1, 2) # BCHW -> BNC
cls_token = self.cls_token.expand(output.shape[0], -1, -1)
output = torch.cat((cls_token, output), dim=1)
output = output + self.pos_embed
return output
@LAYERS.register_module
class Embedding3D(ParallelLayer):
def __init__(self,
num_embeddings: int,
embedding_dim: int,
padding_idx: int = None,
dtype: dtype = None,
weight_initializer: Callable = init.normal_(),
*args,
**kwargs):
super().__init__()
self.depth = get_depth_from_env()
self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
self.output_parallel_mode = get_last_group(self.input_parallel_mode, self.weight_parallel_mode)
self.num_embeddings = num_embeddings
self.embed_dim = embedding_dim
embed_dim_per_partition = divide(embedding_dim, self.depth)
self.padding_idx = padding_idx
self.embed_args = args
self.embed_kwargs = kwargs
self.weight = nn.Parameter(
torch.empty((num_embeddings, embed_dim_per_partition), device=get_current_device(), dtype=dtype))
self.reset_parameters(weight_initializer)
self._set_tensor_parallel_attributes()
def _set_tensor_parallel_attributes(self) -> None:
set_tensor_parallel_attribute_by_partition(self.weight, self.depth)
def reset_parameters(self, weight_initializer) -> None:
with seed(ParallelMode.TENSOR):
fan_in, fan_out = self.num_embeddings, self.embed_dim
weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
self._fill_padding_idx_with_zero()
weight_src_rank = gpc.get_ranks_in_group(self.weight_parallel_mode)[0]
broadcast(self.weight, weight_src_rank, self.weight_parallel_mode)
def _fill_padding_idx_with_zero(self) -> None:
if self.padding_idx is not None:
with torch.no_grad():
self.weight[self.padding_idx].fill_(0)
def forward(self, input_: Tensor) -> Tensor:
input_ = split_batch_3d(input_, self.input_parallel_mode, self.weight_parallel_mode)
weight = broadcast_weight_3d_from_diagonal.apply(self.weight, self.input_parallel_mode,
self.weight_parallel_mode, self.output_parallel_mode)
output = F.embedding(input_, weight, self.padding_idx, *self.embed_args, **self.embed_kwargs)
return output
from .layers import DropPath, VanillaClassifier, VanillaPatchEmbedding
__all__ = ['VanillaPatchEmbedding', 'VanillaClassifier', 'DropPath']
import math
from typing import Callable
import torch
import torch.nn.functional as F
from colossalai.nn import init as init
from colossalai.registry import LAYERS
from colossalai.utils import get_current_device
from torch import Tensor, dtype
from torch import nn as nn
from .._common_utils import to_2tuple
def drop_path(x, drop_prob: float = 0., training: bool = False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument.
"""
if drop_prob == 0. or not training:
return x
keep_prob = 1 - drop_prob
shape = (x.shape[0], ) + (1, ) * (x.ndim - 1) # work with diff dim tensors, not just 2D ConvNets
random_tensor = keep_prob + torch.rand(shape, dtype=x.dtype, device=x.device)
random_tensor.floor_() # binarize
output = x.div(keep_prob) * random_tensor
return output
class DropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
"""
def __init__(self, drop_prob=None):
super(DropPath, self).__init__()
self.drop_prob = drop_prob
def forward(self, x):
return drop_path(x, self.drop_prob, self.training)
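# Minimal sanity check for drop_path / DropPath (illustrative values only). With
# drop_prob = 0.25, roughly a quarter of the samples in the batch are zeroed and the
# survivors are rescaled by 1 / keep_prob, so the expected activation is unchanged.
if __name__ == '__main__':
    x = torch.ones(8, 4, 4)
    out = drop_path(x, drop_prob=0.25, training=True)
    kept = out.flatten(1).abs().sum(dim=1) > 0
    print(f'{int(kept.sum())} of {x.size(0)} samples kept')  # surviving entries hold 1 / 0.75 ≈ 1.33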
@LAYERS.register_module
class VanillaPatchEmbedding(nn.Module):
""" 2D Image to Patch Embedding
"""
def __init__(self,
img_size: int,
patch_size: int,
in_chans: int,
embed_size: int,
dtype: dtype = None,
flatten: bool = True,
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
position_embed_initializer: Callable = init.zeros_()):
super().__init__()
img_size = to_2tuple(img_size)
patch_size = to_2tuple(patch_size)
self.img_size = img_size
self.patch_size = patch_size
self.grid_size = (img_size[0] // patch_size[0], img_size[1] // patch_size[1])
self.num_patches = self.grid_size[0] * self.grid_size[1]
self.flatten = flatten
self.weight = nn.Parameter(
torch.empty((embed_size, in_chans, *self.patch_size), device=get_current_device(), dtype=dtype))
self.bias = nn.Parameter(torch.empty(embed_size, device=get_current_device(), dtype=dtype))
self.cls_token = nn.Parameter(torch.zeros(1, 1, embed_size))
self.pos_embed = nn.Parameter(torch.zeros(1, self.num_patches + 1, embed_size))
self.reset_parameters(weight_initializer, bias_initializer, position_embed_initializer)
def reset_parameters(self, weight_initializer, bias_initializer, position_embed_initializer):
fan_in, fan_out = nn.init._calculate_fan_in_and_fan_out(self.weight)
weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
bias_initializer(self.bias, fan_in=fan_in)
position_embed_initializer(self.pos_embed)
def forward(self, input_: Tensor) -> Tensor:
B, C, H, W = input_.shape
assert H == self.img_size[0] and W == self.img_size[1], \
f"Input image size ({H}*{W}) doesn't match model ({self.img_size[0]}*{self.img_size[1]})."
output = F.conv2d(input_, self.weight, self.bias, stride=self.patch_size)
if self.flatten:
output = output.flatten(2).transpose(1, 2) # BCHW -> BNC
cls_token = self.cls_token.expand(output.shape[0], -1, -1)
output = torch.cat((cls_token, output), dim=1)
output = output + self.pos_embed
return output
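# Shape walk-through for VanillaPatchEmbedding (illustrative sizes): a 224x224 RGB image
# with 16x16 patches gives 14*14 = 196 patches plus one class token, so the output is
# [B, 197, embed_size]. Moving the module to one device first keeps the conv weight,
# class token and position embedding together.
if __name__ == '__main__':
    device = get_current_device()
    embed = VanillaPatchEmbedding(img_size=224, patch_size=16, in_chans=3, embed_size=768).to(device)
    images = torch.randn(2, 3, 224, 224, device=device)
    print(embed(images).shape)  # expected: torch.Size([2, 197, 768])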
@LAYERS.register_module
class VanillaClassifier(nn.Module):
def __init__(self,
in_features: int,
num_classes: int,
weight: nn.Parameter = None,
bias: bool = True,
dtype: dtype = None,
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1)):
super().__init__()
self.in_features = in_features
self.num_classes = num_classes
if weight is not None:
self.weight = weight
self.has_weight = False
else:
self.weight = nn.Parameter(
torch.empty(self.num_classes, self.in_features, device=get_current_device(), dtype=dtype))
self.has_weight = True
if bias:
self.bias = nn.Parameter(torch.zeros(self.num_classes, device=get_current_device(), dtype=dtype))
else:
self.bias = None
self.reset_parameters(weight_initializer, bias_initializer)
def reset_parameters(self, weight_initializer, bias_initializer):
fan_in, fan_out = self.in_features, self.num_classes
if self.has_weight:
weight_initializer(self.weight, fan_in=fan_in, fan_out=fan_out)
if self.bias is not None:
bias_initializer(self.bias, fan_in=fan_in)
def forward(self, input_: Tensor) -> Tensor:
return F.linear(input_, self.weight, self.bias)
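# Quick sketch of VanillaClassifier as a plain (non-parallel) classification head
# (illustrative sizes): features of shape [B, in_features] map to [B, num_classes] logits.
if __name__ == '__main__':
    device = get_current_device()
    head = VanillaClassifier(in_features=768, num_classes=1000).to(device)
    feats = torch.randn(4, 768, device=device)
    print(head(feats).shape)  # expected: torch.Size([4, 1000])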
from .cross_entropy_2d import CrossEntropyLoss2D
from .cross_entropy_2p5d import CrossEntropyLoss2p5D
from .cross_entropy_3d import CrossEntropyLoss3D
from torch import nn
from torch.nn.modules.loss import *
from torch.nn.modules.loss import _Loss
__all__ = ['CrossEntropyLoss2D', 'CrossEntropyLoss2p5D', 'CrossEntropyLoss3D']
from .loss_2d import CrossEntropyLoss2D
from .loss_2p5d import CrossEntropyLoss2p5D
from .loss_3d import CrossEntropyLoss3D
_parallel_cross_entropy = {
'2d': CrossEntropyLoss2D,
'2.5d': CrossEntropyLoss2p5D,
'3d': CrossEntropyLoss3D
}
class CrossEntropyLoss(_Loss):
def __init__(self, reduction: bool = True, tensor_parallel: str = None, *args, **kwargs):
super().__init__()
if tensor_parallel in [None, '1d']:
reduction = 'mean' if reduction else 'none'
self.loss = nn.CrossEntropyLoss(reduction=reduction, *args, **kwargs)
else:
self.loss = _parallel_cross_entropy[tensor_parallel](reduction=reduction, *args, **kwargs)
def forward(self, *args):
return self.loss(*args)
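# Dispatch sketch for the CrossEntropyLoss wrapper (illustrative): tensor_parallel=None
# or '1d' falls back to torch.nn.CrossEntropyLoss, while '2d', '2.5d' and '3d' select the
# matching parallel loss, which additionally requires that tensor-parallel context to be
# initialized before use.
if __name__ == '__main__':
    import torch
    criterion = CrossEntropyLoss(reduction=True, tensor_parallel=None)
    logits, targets = torch.randn(8, 10), torch.randint(0, 10, (8,))
    print(criterion(logits, targets))  # scalar mean loss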
import torch
import torch.distributed as dist
from torch.nn.modules.loss import _Loss
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization, get_summa_dim_from_env
from colossalai.registry import LOSSES
from colossalai.utils import get_current_device
from torch.cuda.amp import custom_bwd, custom_fwd
class _ParallelCrossEntropyLossFunction_2D(torch.autograd.Function):
### Modified based on megatron.mpu.cross_entropy ###
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, logits, targets):
# logits: [b/q, h/q]
# labels: [b/q]
logits_max = torch.max(logits, dim=-1)[0]
torch.distributed.all_reduce(
logits_max,
op=torch.distributed.ReduceOp.MAX,
group=gpc.get_group(ParallelMode.PARALLEL_2D_ROW))
# Subtract the maximum value.
# vocab_parallel_logits.sub_(logits_max.unsqueeze(dim=-1))
logits = logits - logits_max.unsqueeze(dim=-1)
vocab_size = logits.size(-1)
rank = gpc.get_local_rank(ParallelMode.PARALLEL_2D_ROW)
vocab_start = rank * (vocab_size)
vocab_end = (rank + 1) * (vocab_size) - 1
target_mask = (targets < vocab_start) | (targets > vocab_end)
masked_target = targets.clone() - vocab_start
masked_target[target_mask] = 0
arange_1d = torch.arange(
start=0, end=logits.size()[0],
)
predicted_logits = logits[arange_1d, masked_target]
predicted_logits[target_mask] = 0.
dist.all_reduce(predicted_logits, group=gpc.get_group(
ParallelMode.PARALLEL_2D_ROW))
exp_logits = torch.exp(logits)
sum_exp_logits = exp_logits.sum(dim=1)
dist.all_reduce(sum_exp_logits, group=gpc.get_group(
ParallelMode.PARALLEL_2D_ROW))
loss = torch.log(sum_exp_logits) - predicted_logits
exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
ctx.save_for_backward(exp_logits, target_mask, masked_target)
return loss
@staticmethod
@custom_bwd
def backward(ctx, output_grad):
# Retrieve tensors from the forward path.
softmax, target_mask, masked_target = ctx.saved_tensors
# All the inputs have softmax as their gradient.
grad_input = softmax
# For simplicity, work with the 2D gradient.
partition_vocab_size = softmax.size()[-1]
grad_2d = grad_input.view(-1, partition_vocab_size)
# Add the gradient from matching classes.
arange_1d = torch.arange(start=0, end=grad_2d.size()[0],
device=get_current_device())
grad_2d[arange_1d,
masked_target] -= (1.0 - target_mask.view(-1).float())
# Finally elementwise multiplication with the output gradients.
grad_input.mul_(output_grad.unsqueeze(dim=-1))
return grad_input, None
class _ReduceByColumn(torch.autograd.Function):
"""All-reduce the input from the model parallel region."""
@staticmethod
def symbolic(graph, input_):
dist.all_reduce(input_, group=gpc.get_group(
ParallelMode.PARALLEL_2D_COL))
return input_
@staticmethod
@custom_fwd(cast_inputs=torch.float32)
def forward(ctx, input_):
dist.all_reduce(input_, group=gpc.get_group(
ParallelMode.PARALLEL_2D_COL))
return input_
@staticmethod
@custom_bwd
def backward(ctx, grad_output):
return grad_output
@LOSSES.register_module
class CrossEntropyLoss2D(_Loss):
"""Cross entropy loss for 2D parallelism
:param reduction: whether to average the loss, defaults to True
:type reduction: bool, optional
"""
def __init__(self, reduction=True):
super().__init__()
assert_summa_initialization()
self.summa_dim = get_summa_dim_from_env()
self.row_rank = gpc.get_local_rank(ParallelMode.PARALLEL_2D_COL)
self.reduction_mean = reduction
def forward(self, logits, targets):
targets = targets.chunk(self.summa_dim, dim=0)[self.row_rank]
loss = _ParallelCrossEntropyLossFunction_2D.apply(
logits, targets,
)
if self.reduction_mean:
loss = _ReduceByColumn.apply(loss) / self.summa_dim
dist_loss = loss.mean()
return dist_loss
import torch
import torch.distributed as dist
from torch.nn.modules.loss import _Loss
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization, \
get_tesseract_dim_dep_from_env
from colossalai.registry import LOSSES
from colossalai.utils import get_current_device
class _ParallelCrossEntropyLossFunction_2p5D(torch.autograd.Function):
### Modified based on megatron.mpu.cross_entropy ###
@staticmethod
def forward(ctx, logits, targets):
# logits: [b/dq, h/q]
# loss: [b/dq]
# targets: [b/dq, h/q]
logits_max = torch.max(logits, dim=-1)[0]
torch.distributed.all_reduce(
logits_max,
op=torch.distributed.ReduceOp.MAX,
group=gpc.get_group(ParallelMode.PARALLEL_2P5D_ROW))
# Subtract the maximum value.
logits = logits - logits_max.unsqueeze(dim=-1)
vocab_size = logits.size(-1)
rank = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_ROW)
vocab_start = rank * (vocab_size)
vocab_end = (rank + 1) * (vocab_size) - 1
target_mask = (targets < vocab_start) | (targets > vocab_end)
masked_target = targets.clone() - vocab_start
masked_target[target_mask] = 0
arange_1d = torch.arange(
start=0, end=logits.size()[0],
)
predicted_logits = logits[arange_1d, masked_target]
predicted_logits[target_mask] = 0.
dist.all_reduce(predicted_logits, group=gpc.get_group(ParallelMode.PARALLEL_2P5D_ROW))
exp_logits = torch.exp(logits)
sum_exp_logits = exp_logits.sum(dim=1)
dist.all_reduce(sum_exp_logits, group=gpc.get_group(ParallelMode.PARALLEL_2P5D_ROW))
loss = torch.log(sum_exp_logits) - predicted_logits
exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
ctx.save_for_backward(exp_logits, target_mask, masked_target)
return loss
@staticmethod
def backward(ctx, output_grad):
# Retrieve tensors from the forward path.
softmax, target_mask, masked_target = ctx.saved_tensors
# All the inputs have softmax as their gradient.
grad_input = softmax
# For simplicity, work with the 2D gradient.
partition_vocab_size = softmax.size()[-1]
grad_2d = grad_input.view(-1, partition_vocab_size)
# Add the gradient from matching classes.
arange_1d = torch.arange(start=0, end=grad_2d.size()[0],
device=get_current_device())
grad_2d[arange_1d,
masked_target] -= (1.0 - target_mask.view(-1).float())
# Finally elementwise multiplication with the output gradients.
grad_input.mul_(output_grad.unsqueeze(dim=-1))
return grad_input, None
class _ReduceByColDep(torch.autograd.Function):
"""All-reduce the input from the model parallel region."""
@staticmethod
def symbolic(graph, input_):
dist.all_reduce(input_, group=gpc.get_group(ParallelMode.PARALLEL_2P5D_XZ))
return input_
@staticmethod
def forward(ctx, input_):
dist.all_reduce(input_, group=gpc.get_group(ParallelMode.PARALLEL_2P5D_XZ))
return input_
@staticmethod
def backward(ctx, grad_output):
return grad_output
@LOSSES.register_module
class CrossEntropyLoss2p5D(_Loss):
"""Cross entropy loss for 2.5D parallelism
:param reduction: whether to average the loss, defaults to True
:type reduction: bool, optional
"""
def __init__(self, reduction=True):
super().__init__()
assert_tesseract_initialization()
self.xz_rank = gpc.get_local_rank(ParallelMode.PARALLEL_2P5D_XZ)
self.tesseract_dim, self.tesseract_dep = get_tesseract_dim_dep_from_env()
self.reduction_mean = reduction
def forward(self, logits, targets):
targets = targets.chunk(self.tesseract_dim *
self.tesseract_dep, dim=0)[self.xz_rank]
loss = _ParallelCrossEntropyLossFunction_2p5D.apply(
logits, targets,
)
if self.reduction_mean:
loss = _ReduceByColDep.apply(
loss) / self.tesseract_dim / self.tesseract_dep
dist_loss = loss.mean()
return dist_loss
#!/usr/bin/env python
# -*- encoding: utf-8 -*-
import os
import torch
import torch.distributed as dist
from colossalai.constants import (INPUT_GROUP_3D, OUTPUT_GROUP_3D,
WEIGHT_GROUP_3D)
from colossalai.core import global_context as gpc
from colossalai.nn.layer.parallel_3d._operation import Reduce_3D
from colossalai.nn.layer.parallel_3d._utils import (get_depth_from_env,
get_last_group,
get_parallel_mode_from_env)
from colossalai.registry import LOSSES
from colossalai.utils import get_current_device
from torch.nn.modules.loss import _Loss
class _ParallelCrossEntropyLossFunction_3D(torch.autograd.Function):
"""
Adapted from megatron.mpu.cross_entropy
loss[i] = -logits[i][targets[i]] + log(sum(exp(logits[i])))
"""
@staticmethod
def forward(ctx, logits, targets, depth, output_parallel_mode):
# logits: [b/q^2, c/q]
# labels: [b/q^2]
# loss: [b/q^2]
logits_max = torch.max(logits, dim=-1)[0]
dist.all_reduce(logits_max,
op=torch.distributed.ReduceOp.MAX,
group=gpc.get_group(output_parallel_mode))
# Subtract the maximum value.
logits = logits - logits_max.unsqueeze(dim=-1)
vocab_size_per_partition = logits.size()[-1]
rank = gpc.get_local_rank(output_parallel_mode)
vocab_start = rank * vocab_size_per_partition
vocab_end = (rank + 1) * vocab_size_per_partition - 1
# loss[i] = 0 if targets[i] < vocab_start or targets[i] > vocab_end
target_mask = (targets < vocab_start) | (targets > vocab_end)
masked_target = targets.clone() - vocab_start
masked_target[target_mask] = 0
arange_1d = torch.arange(start=0,
end=logits.size()[0],
device=get_current_device())
predicted_logits = logits[arange_1d, masked_target]
predicted_logits = predicted_logits.clone().contiguous().view_as(
targets)
predicted_logits[target_mask] = 0.
dist.all_reduce(predicted_logits,
group=gpc.get_group(output_parallel_mode))
# Loss = log(sum(exp(logits))) - predicted-logit.
exp_logits = torch.exp(logits)
sum_exp_logits = exp_logits.sum(dim=-1)
dist.all_reduce(sum_exp_logits,
group=gpc.get_group(output_parallel_mode))
loss = torch.log(sum_exp_logits) - predicted_logits
exp_logits.div_(sum_exp_logits.unsqueeze(dim=-1))
ctx.save_for_backward(exp_logits, target_mask, masked_target)
return loss
@staticmethod
def backward(ctx, output_grad):
# Retrieve tensors from the forward path.
softmax, target_mask, masked_target = ctx.saved_tensors
# All the inputs have softmax as their gradient.
input_grad = softmax
# For simplicity, work with the 2D gradient.
partition_vocab_size = softmax.size()[-1]
grad_2d = input_grad.view(-1, partition_vocab_size)
# Add the gradient from matching classes.
arange_1d = torch.arange(start=0,
end=grad_2d.size()[0],
device=get_current_device())
grad_2d[arange_1d,
masked_target] -= (1.0 - target_mask.view(-1).float())
input_grad.mul_(output_grad.unsqueeze(dim=-1))
return input_grad, None, None, None
@LOSSES.register_module
class CrossEntropyLoss3D(_Loss):
"""Cross entropy loss for 3D parallelism
:param depth: depth for 3D parallelism
:type depth: int
:param input_parallel_mode: parallel mode for input tensor
:type input_parallel_mode: ParallelMode
:param weight_parallel_mode: parallel mode for weight
:type weight_parallel_mode: ParallelMode
:param reduction: whether to average the loss, defaults to True
:type reduction: bool, optional
"""
def __init__(
self,
# input_parallel_mode,
# weight_parallel_mode,
reduction=True,
label_smoothing=0.0):
super().__init__()
self.depth = get_depth_from_env()
self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
self.output_parallel_mode = get_last_group(self.input_parallel_mode,
self.weight_parallel_mode)
self.input_rank = gpc.get_local_rank(self.input_parallel_mode)
self.weight_rank = gpc.get_local_rank(self.weight_parallel_mode)
self.reduction_mean = reduction
def forward(self, logits, targets):
# split label partition from the entire batch
batch_size = targets.size(0)
targets = torch.chunk(targets, self.depth, dim=0)[self.weight_rank]
targets = torch.chunk(targets, self.depth, dim=0)[self.input_rank]
loss = _ParallelCrossEntropyLossFunction_3D.apply(
logits, targets, self.depth, self.output_parallel_mode)
if self.reduction_mean:
loss = loss.sum()
loss = Reduce_3D.apply(loss, self.depth, self.input_parallel_mode)
loss = Reduce_3D.apply(loss, self.depth, self.weight_parallel_mode)
loss /= batch_size
return loss
# @LOSSES.register_module
# class LabelSmoothingCrossEntropy3D(_Loss):
# """
# NLL loss with label smoothing, adapted from timm.loss.LabelSmoothingCrossEntropy
# :param input_parallel_mode: parallel mode for input tensor
# :type input_parallel_mode: ParallelMode
# :param weight_parallel_mode: parallel mode for weight
# :type weight_parallel_mode: ParallelMode
# :param smoothing: label smoothing value, defaults to 0.1
# :type smoothing: float
# :param reduction: whether to average the loss, defaults to True
# :type reduction: bool, optional
# """
# def __init__(self,
# input_parallel_mode,
# weight_parallel_mode,
# smoothing=0.1,
# reduction=True):
# super().__init__()
# assert smoothing < 1.0
# self.smoothing = smoothing
# self.confidence = 1. - smoothing
# self.depth = get_depth_from_env()
# self.input_parallel_mode = input_parallel_mode
# self.weight_parallel_mode = weight_parallel_mode
# self.output_parallel_mode = get_last_group(input_parallel_mode,
# weight_parallel_mode)
# self.reduction_mean = reduction
# def forward(self, logits, targets):
# # split label partition from the entire batch
# j = gpc.get_local_rank(self.input_parallel_mode)
# i = gpc.get_local_rank(self.weight_parallel_mode)
# targets = torch.chunk(targets, self.depth, dim=0)[i]
# targets = torch.chunk(targets, self.depth, dim=0)[j]
# exp_logits = torch.exp(logits)
# sum_exp_logits = Sum3D.apply(exp_logits, -1, depth,
# self.output_parallel_mode, False)
# log_probs = torch.log(sum_exp_logits) - logits
# nll_loss = _ParallelCrossEntropyLossFunction_3D.apply(
# logits, targets, self.depth, self.output_parallel_mode)
# smooth_loss = -log_probs.mean(dim=-1)
# loss = self.confidence * nll_loss + self.smoothing * smooth_loss
# if self.reduction_mean:
# loss = loss.sum()
# loss = Reduce_3D.apply(loss, self.depth, self.input_parallel_mode)
# loss = Reduce_3D.apply(loss, self.depth, self.weight_parallel_mode)
# loss /= batch_size
# return loss
from colossalai.nn.layer.parallel_2d import reduce_by_batch_2d, split_batch_2d
from colossalai.nn.layer.parallel_2d._utils import assert_summa_initialization
from colossalai.registry import LOSSES
from torch.nn.functional import cross_entropy
from torch.nn.modules.loss import _Loss
@LOSSES.register_module
class CrossEntropyLoss2D(_Loss):
"""Cross entropy loss for 2D parallelism
:param reduction: whether to average the loss, defaults to True
:type reduction: bool, optional
"""
def __init__(self, reduction=True, *args, **kwargs):
super().__init__()
assert_summa_initialization()
self.reduction_mean = reduction
self.loss_args = args
self.loss_kwargs = kwargs
def forward(self, logits, targets):
batch_size = targets.size(0)
targets = split_batch_2d(targets)
loss = cross_entropy(logits, targets, reduction='sum', *self.loss_args, **self.loss_kwargs)
if self.reduction_mean:
loss = loss.sum()
loss = reduce_by_batch_2d.apply(loss)
loss /= batch_size
return loss
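# Usage sketch (not standalone): CrossEntropyLoss2D assumes the 2D tensor-parallel context
# is already initialized, e.g. 4 ranks with summa_dim == 2. Every rank receives the full
# target batch; split_batch_2d slices out the matching partition, the per-rank losses are
# summed, reduce_by_batch_2d combines them across the batch-split group, and the result is
# divided by the global batch size to give the mean.
#
#   criterion = CrossEntropyLoss2D(reduction=True)
#   loss = criterion(local_logits, targets)  # targets: full [batch] label tensor on every rank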
from colossalai.nn.layer.parallel_2p5d import reduce_by_batch_2p5d, split_batch_2p5d
from colossalai.nn.layer.parallel_2p5d._utils import assert_tesseract_initialization
from colossalai.registry import LOSSES
from torch.nn.functional import cross_entropy
from torch.nn.modules.loss import _Loss
@LOSSES.register_module
class CrossEntropyLoss2p5D(_Loss):
"""Cross entropy loss for 2.5D parallelism
:param reduction: whether to average the loss, defaults to True
:type reduction: bool, optional
"""
def __init__(self, reduction=True, *args, **kwargs):
super().__init__()
assert_tesseract_initialization()
self.reduction_mean = reduction
self.loss_args = args
self.loss_kwargs = kwargs
def forward(self, logits, targets):
batch_size = targets.size(0)
targets = split_batch_2p5d(targets)
loss = cross_entropy(logits, targets, reduction='sum', *self.loss_args, **self.loss_kwargs)
if self.reduction_mean:
loss = loss.sum()
loss = reduce_by_batch_2p5d.apply(loss)
loss /= batch_size
return loss
from colossalai.constants import INPUT_GROUP_3D, WEIGHT_GROUP_3D
from colossalai.nn.layer.parallel_3d import reduce_by_batch_3d, split_batch_3d
from colossalai.nn.layer.parallel_3d._utils import get_parallel_mode_from_env
from colossalai.registry import LOSSES
from torch.nn.functional import cross_entropy
from torch.nn.modules.loss import _Loss
@LOSSES.register_module
class CrossEntropyLoss3D(_Loss):
"""Cross entropy loss for 3D parallelism
:param depth: depth for 3D parallelism
:type depth: int
:param input_parallel_mode: parallel mode for input tensor
:type input_parallel_mode: ParallelMode
:param weight_parallel_mode: parallel mode for weight
:type weight_parallel_mode: ParallelMode
:param reduction: whether to average the loss, defaults to True
:type reduction: bool, optional
"""
def __init__(self, reduction=True, *args, **kwargs):
super().__init__()
self.input_parallel_mode = get_parallel_mode_from_env(INPUT_GROUP_3D)
self.weight_parallel_mode = get_parallel_mode_from_env(WEIGHT_GROUP_3D)
self.reduction_mean = reduction
self.loss_args = args
self.loss_kwargs = kwargs
def forward(self, logits, targets):
batch_size = targets.size(0)
targets = split_batch_3d(targets, self.input_parallel_mode, self.weight_parallel_mode)
loss = cross_entropy(logits, targets, reduction='sum', *self.loss_args, **self.loss_kwargs)
if self.reduction_mean:
loss = loss.sum()
loss = reduce_by_batch_3d.apply(loss, self.input_parallel_mode, self.weight_parallel_mode)
loss /= batch_size
return loss
from torch import nn
from ._utils import calc_acc
from .accuracy_2d import Accuracy2D
from .accuracy_2p5d import Accuracy2p5D
from .accuracy_3d import Accuracy3D
_parallel_accuracy = {
'2d': Accuracy2D,
'2.5d': Accuracy2p5D,
'3d': Accuracy3D,
}
class Accuracy(nn.Module):
def __init__(self, tensor_parallel: str = None):
super().__init__()
if tensor_parallel in [None, '1d']:
self.acc = calc_acc
else:
self.acc = _parallel_accuracy[tensor_parallel]()
def forward(self, *args):
return self.acc(*args)
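# Dispatch sketch for the Accuracy wrapper (illustrative): with tensor_parallel=None or '1d'
# it calls the plain calc_acc helper on (logits, targets); the '2d'/'2.5d'/'3d' variants need
# their tensor-parallel context initialized. The exact value returned (correct count vs. ratio)
# follows calc_acc in ._utils, so treat the print below as illustrative only.
if __name__ == '__main__':
    import torch
    metric = Accuracy(tensor_parallel=None)
    logits, targets = torch.randn(8, 10), torch.randint(0, 10, (8,))
    print(metric(logits, targets))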