Unverified Commit 554aa959 authored by Hongxin Liu, committed by GitHub

[legacy] move communication and nn to legacy and refactor logger (#4671)

* [legacy] move communication to legacy (#4640)

* [legacy] refactor logger and clean up legacy codes (#4654)

* [legacy] make logger independent of gpc

* [legacy] make optim independent of registry

* [legacy] move test engine to legacy

* [legacy] move nn to legacy (#4656)

* [legacy] move nn to legacy

* [checkpointio] fix save hf config

* [test] remove useless rpc pp test

* [legacy] fix nn init

* [example] skip tutorial hybrid parallel example

* [devops] test doc check

* [devops] test doc check
parent 536397cc
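The diff below touches many files, but the recurring change is mechanical: the communication and nn modules now live under the `colossalai.legacy` namespace, so downstream imports shift accordingly. A hedged before/after sketch of that migration (both "after" paths appear verbatim in the hunks below):

# Before this commit:
#   from colossalai.communication import broadcast
#   from colossalai.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
# After this commit:
from colossalai.legacy.communication import broadcast
from colossalai.legacy.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D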
-import torch.nn.functional as F
from typing import Optional
+import torch.nn.functional as F
from torch import Tensor
+from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec, distspec
from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ComputePattern, ComputePattern, ComputeSpec, ColoTensor, distspec, ColoTensorSpec, \
-    ShardSpec, ReplicaSpec
from ._utils import GeneralTensor, convert_to_colo_tensor
...
from typing import List, Optional
import torch.nn.functional as F
+from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec, distspec
from colossalai.tensor.op_wrapper import colo_op_impl
-from colossalai.tensor import ColoTensor, distspec, ColoTensorSpec, ReplicaSpec
from ._utils import GeneralTensor, convert_to_colo_tensor
...
+from typing import Optional
import torch
import torch.nn.functional as F
-from typing import Optional
-from colossalai.tensor.op_wrapper import colo_op_impl
+from colossalai.legacy.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
from colossalai.tensor import ColoTensor, ColoTensorSpec
-from colossalai.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
+from colossalai.tensor.op_wrapper import colo_op_impl
from ._utils import GeneralTensor, convert_to_colo_tensor
...
from .colossalai_layer import *
from .parallel_1d import *
from .parallel_2d import *
from .parallel_2p5d import *
from .parallel_3d import *
from .parallel_sequence import *
from .utils import *
from .vanilla import *
from .wrapper import *
from ._utils import partition_batch
from .dropout import Dropout
from .embedding import Embedding, PatchEmbedding
from .linear import Classifier, Linear
from .normalization import LayerNorm

__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch']
import math
from typing import Callable

-from colossalai.utils import get_current_device
from torch import dtype, nn

-from ... import init as init
+from colossalai.nn import init
+from colossalai.utils import get_current_device

from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D
from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D
from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D
from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D
from ..utils import get_tensor_parallel_mode
from ..vanilla import VanillaPatchEmbedding
from ._utils import ColossalaiModule

_parallel_embedding = {
    '1d': Embedding1D,
    '2d': Embedding2D,
    '2.5d': Embedding2p5D,
    '3d': Embedding3D,
}

_vocab_parallel_embedding = {
    '1d': VocabParallelEmbedding1D,
    '2d': VocabParallelEmbedding2D,
    '2.5d': VocabParallelEmbedding2p5D,
    '3d': VocabParallelEmbedding3D
}

_parallel_patchembedding = {
    None: VanillaPatchEmbedding,
    '1d': PatchEmbedding1D,
    '2d': PatchEmbedding2D,
    '2.5d': PatchEmbedding2p5D,
    '3d': PatchEmbedding3D
}


class Embedding(ColossalaiModule):
    r"""Embedding for colossalai.

    Args:
        num_embeddings (int): number of embeddings.
        embedding_dim (int): dimension of embedding.
        padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
            therefore, the embedding vector at padding_idx is not updated during training,
            i.e. it remains as a fixed “pad”, defaults to None.
        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
        weight_initializer (:class:`typing.Callable`, optional):
            The initializer of weight, defaults to normal initializer.

    The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
    ::

        max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
            renormalized to have norm max_norm. Note: this will modify weight in-place.
        norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
            of frequency of the words in the mini-batch. Default False.
        sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.

    More details about ``args`` and ``kwargs`` could be found in
    `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.

    More details about ``initializer`` please refer to
    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
    """

    def __init__(self,
                 num_embeddings: int,
                 embedding_dim: int,
                 padding_idx: int = None,
                 dtype: dtype = None,
                 weight_initializer: Callable = init.normal_(),
                 vocab_parallel_limit: int = 2048,
                 *args,
                 **kwargs) -> None:
        tensor_parallel = get_tensor_parallel_mode()
        if tensor_parallel is None:
            embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args,
                                 **kwargs).to(dtype).to(get_current_device())
            weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim)
        elif num_embeddings <= vocab_parallel_limit:
            embed = _parallel_embedding[tensor_parallel](
                num_embeddings,
                embedding_dim,
                padding_idx=padding_idx,
                dtype=dtype,
                weight_initializer=weight_initializer,
                *args,
                **kwargs,
            )
        else:
            embed = _vocab_parallel_embedding[tensor_parallel](
                num_embeddings,
                embedding_dim,
                padding_idx=padding_idx,
                dtype=dtype,
                weight_initializer=weight_initializer,
                *args,
                **kwargs,
            )
        super().__init__(embed)


class PatchEmbedding(ColossalaiModule):
    """2D Image to Patch Embedding.

    Args:
        img_size (int): image size.
        patch_size (int): patch size.
        in_chans (int): number of channels of input image.
        embed_size (int): size of embedding.
        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
        flatten (bool, optional): whether to flatten output tensor, defaults to True.
        weight_initializer (:class:`typing.Callable`, optional):
            The initializer of weight, defaults to kaiming uniform initializer.
        bias_initializer (:class:`typing.Callable`, optional):
            The initializer of bias, defaults to xavier uniform initializer.
        position_embed_initializer (:class:`typing.Callable`, optional):
            The initializer of position embedding, defaults to zeros initializer.

    More details about ``initializer`` please refer to
    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
    """

    def __init__(
        self,
        img_size: int,
        patch_size: int,
        in_chans: int,
        embed_size: int,
        dtype: dtype = None,
        flatten: bool = True,
        weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
        bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
        position_embed_initializer: Callable = init.zeros_()
    ) -> None:
        tensor_parallel = get_tensor_parallel_mode()
        embed = _parallel_patchembedding[tensor_parallel](
            img_size,
            patch_size,
            in_chans,
            embed_size,
            dtype=dtype,
            flatten=flatten,
            weight_initializer=weight_initializer,
            bias_initializer=bias_initializer,
            position_embed_initializer=position_embed_initializer,
        )
        super().__init__(embed)
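The constructor above dispatches on the tensor-parallel mode: with no parallelism it wraps a plain torch.nn.Embedding; at or below `vocab_parallel_limit` it picks the mode-specific `_parallel_embedding` variant; above it, the `_vocab_parallel_embedding` variant, which partitions the vocabulary dimension instead. A minimal usage sketch, assuming a 1D tensor-parallel context has already been initialized and that `Embedding` is re-exported from the moved `colossalai.legacy.nn` package (both are assumptions for illustration):

# Minimal sketch; the 1D parallel context and the re-export path are assumptions.
from colossalai.legacy.nn import Embedding

# num_embeddings <= vocab_parallel_limit (default 2048):
# resolves to _parallel_embedding['1d'], i.e. Embedding1D.
small = Embedding(num_embeddings=1024, embedding_dim=512)

# num_embeddings > vocab_parallel_limit:
# resolves to _vocab_parallel_embedding['1d'], i.e. VocabParallelEmbedding1D.
large = Embedding(num_embeddings=50304, embedding_dim=512)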
@@ -4,9 +4,9 @@ from typing import Callable
from torch import dtype, nn

+from colossalai.nn import init
from colossalai.utils import get_current_device

-from ... import init as init
from ..parallel_1d import *
from ..parallel_2d import *
from ..parallel_2p5d import *
...
-from colossalai.utils import get_current_device
from torch import nn

+from colossalai.utils import get_current_device
+
from ..parallel_1d import LayerNorm1D
from ..parallel_2d import LayerNorm2D
from ..parallel_2p5d import LayerNorm2p5D
from ..parallel_3d import LayerNorm3D
from ..utils import get_tensor_parallel_mode
from ..vanilla import VanillaLayerNorm
from ._utils import ColossalaiModule

_parallel_layernorm = {
    None: VanillaLayerNorm,
    "1d": LayerNorm1D,
    "2d": LayerNorm2D,
    "2.5d": LayerNorm2p5D,
    "3d": LayerNorm3D,
}


class LayerNorm(ColossalaiModule):
    r"""Layer Normalization for colossalai.

    Args:
        normalized_shape (int): input shape from an expected input of size.
            :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
            \times \ldots \times \text{normalized_shape}[-1]]`
            If a single integer is used, it is treated as a singleton list, and this module will
            normalize over the last dimension which is expected to be of that specific size.
        eps (float): a value added to the denominator for numerical stability, defaults to 1e-05.
        bias (bool, optional): Whether to add a bias, defaults to ``True``.
        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
    """

    def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None:
        tensor_parallel = get_tensor_parallel_mode()
        if tensor_parallel is None:
            norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device())
        else:
            norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype)
        super().__init__(norm)
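A usage sketch of the branch above, assuming no tensor parallelism is configured and that `LayerNorm` is re-exported from the moved package (both assumptions are for illustration):

# With get_tensor_parallel_mode() returning None, the wrapper falls back to a
# plain torch.nn.LayerNorm moved to the current device.
from colossalai.legacy.nn import LayerNorm

norm = LayerNorm(normalized_shape=768, eps=1e-05)

# With a '1d' tensor-parallel config, the same call would instead construct
# _parallel_layernorm['1d'], i.e. LayerNorm1D(768, eps=1e-05, dtype=None).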
-from .layers import (Classifier1D, Dropout1D, Embedding1D, LayerNorm1D, Linear1D, Linear1D_Col, Linear1D_Row,
-                     PatchEmbedding1D, VocabParallelClassifier1D, VocabParallelEmbedding1D)
+from .layers import (
+    Classifier1D,
+    Dropout1D,
+    Embedding1D,
+    LayerNorm1D,
+    Linear1D,
+    Linear1D_Col,
+    Linear1D_Row,
+    PatchEmbedding1D,
+    VocabParallelClassifier1D,
+    VocabParallelEmbedding1D,
+)

__all__ = [
    'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D',
...
@@ -3,6 +3,7 @@
import torch
import torch.distributed as dist

from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env

@@ -124,7 +125,7 @@ class _ReduceInput(torch.autograd.Function):

class _SplitForwardGatherBackward(torch.autograd.Function):
    """
    Split the input and keep only the corresponding chunk to the rank.

    Args:
        input_: input matrix.
        parallel_mode: parallel mode.
...
@@ -10,11 +10,11 @@ import torch.nn.functional as F
from torch import Tensor
from torch.nn.parameter import Parameter

-from colossalai.communication import broadcast
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.kernel import LayerNorm
+from colossalai.legacy.communication import broadcast
from colossalai.legacy.registry import LAYERS
from colossalai.nn import init as init
from colossalai.utils.checkpointing import (
...
from ._operation import reduce_by_batch_2d, split_batch_2d
-from .layers import (Classifier2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D, VocabParallelClassifier2D,
-                     VocabParallelEmbedding2D)
+from .layers import (
+    Classifier2D,
+    Embedding2D,
+    LayerNorm2D,
+    Linear2D,
+    PatchEmbedding2D,
+    VocabParallelClassifier2D,
+    VocabParallelEmbedding2D,
+)

__all__ = [
    'split_batch_2d', 'reduce_by_batch_2d', 'Linear2D', 'LayerNorm2D', 'Classifier2D', 'PatchEmbedding2D',
...
@@ -2,13 +2,14 @@ from typing import Any, Optional, Tuple
import torch
import torch.distributed as dist
-from colossalai.communication.collective import (all_gather, all_reduce, reduce, reduce_scatter)
-from colossalai.context.parallel_mode import ParallelMode
-from colossalai.core import global_context as gpc
-from colossalai.utils import get_current_device
from torch import Tensor
from torch.cuda.amp import custom_bwd, custom_fwd

+from colossalai.context.parallel_mode import ParallelMode
+from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
+from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce, reduce_scatter
+from colossalai.utils import get_current_device

def matmul_2d(
...

@@ -226,9 +227,9 @@ class Matmul_AB_2D(torch.autograd.Function):
        col_group = gpc.get_group(col_parallel_mode)

        src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size
        src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size

        opa = [None] * 2
        opb = [None] * 2
...

@@ -351,9 +352,9 @@ class Matmul_ABT_2D(torch.autograd.Function):
        col_group = gpc.get_group(col_parallel_mode)

        src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size
        src_c = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size

        opb = [None] * 2
        opr = [None] * 2
...

@@ -484,9 +485,9 @@ class Matmul_ATB_2D(torch.autograd.Function):
        col_group = gpc.get_group(col_parallel_mode)

        src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size
        src_c = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
            pipeline_parallel_rank * tensor_parallel_size

        opa = [None] * 2
        opr = [None] * 2
...
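In the `Matmul_*_2D` hunks above, `src_a`/`src_b`/`src_c` compute the global rank holding the sub-block to be communicated at each SUMMA step: a grid-local offset (`summa_dim * row_rank` for a row's first column, `col_rank` for a column's first row) plus the first global rank of the enclosing data/pipeline-parallel group. A small sketch of that arithmetic; all concrete sizes here are assumptions for illustration, not from the commit:

# Illustrative re-derivation of the src_a/src_b rank arithmetic above.
summa_dim = 2                           # 2D grid is summa_dim x summa_dim
tensor_parallel_size = summa_dim ** 2   # = 4
pipeline_parallel_size = 2
data_parallel_rank = 1
pipeline_parallel_rank = 0
row_rank, col_rank = 1, 0               # this rank's coordinates in the 2D grid

# First global rank of this (data, pipeline) group.
group_start = (data_parallel_rank * pipeline_parallel_size * tensor_parallel_size
               + pipeline_parallel_rank * tensor_parallel_size)

src_a = summa_dim * row_rank + group_start    # grid-local 2 + offset 8 -> 10
src_b = col_rank + group_start                # grid-local 0 + offset 8 -> 8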