Unverified Commit 554aa959 authored by Hongxin Liu's avatar Hongxin Liu Committed by GitHub
Browse files

[legacy] move communication and nn to legacy and refactor logger (#4671)

* [legacy] move communication to legacy (#4640)

* [legacy] refactor logger and clean up legacy codes (#4654)

* [legacy] make logger independent to gpc

* [legacy] make optim independent to registry

* [legacy] move test engine to legacy

* [legacy] move nn to legacy (#4656)

* [legacy] move nn to legacy

* [checkpointio] fix save hf config

* [test] remove useledd rpc pp test

* [legacy] fix nn init

* [example] skip tutorial hybriad parallel example

* [devops] test doc check

* [devops] test doc check
parent 536397cc
import torch.nn.functional as F
from typing import Optional
import torch.nn.functional as F
from torch import Tensor
from colossalai.tensor import ColoTensor, ColoTensorSpec, ComputePattern, ComputeSpec, ReplicaSpec, ShardSpec, distspec
from colossalai.tensor.op_wrapper import colo_op_impl
from colossalai.tensor import ComputePattern, ComputePattern, ComputeSpec, ColoTensor, distspec, ColoTensorSpec, \
ShardSpec, ReplicaSpec
from ._utils import GeneralTensor, convert_to_colo_tensor
......
from typing import List, Optional
import torch.nn.functional as F
from colossalai.tensor import ColoTensor, ColoTensorSpec, ReplicaSpec, distspec
from colossalai.tensor.op_wrapper import colo_op_impl
from colossalai.tensor import ColoTensor, distspec, ColoTensorSpec, ReplicaSpec
from ._utils import GeneralTensor, convert_to_colo_tensor
......
from typing import Optional
import torch
import torch.nn.functional as F
from typing import Optional
from colossalai.tensor.op_wrapper import colo_op_impl
from colossalai.legacy.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
from colossalai.tensor import ColoTensor, ColoTensorSpec
from colossalai.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D
from colossalai.tensor.op_wrapper import colo_op_impl
from ._utils import GeneralTensor, convert_to_colo_tensor
......
from .colossalai_layer import *
from .parallel_1d import *
from .parallel_2d import *
from .parallel_2p5d import *
from .parallel_3d import *
from .parallel_sequence import *
from .utils import *
from .vanilla import *
from .wrapper import *
from ._utils import partition_batch
from .dropout import Dropout
from .embedding import Embedding, PatchEmbedding
from .linear import Classifier, Linear
from .normalization import LayerNorm
__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch']
from ._utils import partition_batch
from .dropout import Dropout
from .embedding import Embedding, PatchEmbedding
from .linear import Classifier, Linear
from .normalization import LayerNorm
__all__ = ['Linear', 'Classifier', 'Embedding', 'PatchEmbedding', 'LayerNorm', 'Dropout', 'partition_batch']
import math
from typing import Callable
from colossalai.utils import get_current_device
from torch import dtype, nn
from ... import init as init
from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D
from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D
from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D
from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D
from ..utils import get_tensor_parallel_mode
from ..vanilla import VanillaPatchEmbedding
from ._utils import ColossalaiModule
_parallel_embedding = {
'1d': Embedding1D,
'2d': Embedding2D,
'2.5d': Embedding2p5D,
'3d': Embedding3D,
}
_vocab_parallel_embedding = {
'1d': VocabParallelEmbedding1D,
'2d': VocabParallelEmbedding2D,
'2.5d': VocabParallelEmbedding2p5D,
'3d': VocabParallelEmbedding3D
}
_parallel_patchembedding = {
None: VanillaPatchEmbedding,
'1d': PatchEmbedding1D,
'2d': PatchEmbedding2D,
'2.5d': PatchEmbedding2p5D,
'3d': PatchEmbedding3D
}
class Embedding(ColossalaiModule):
r"""Embedding for colossalai.
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
he initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
"""
def __init__(self,
num_embeddings: int,
embedding_dim: int,
padding_idx: int = None,
dtype: dtype = None,
weight_initializer: Callable = init.normal_(),
vocab_parallel_limit: int = 2048,
*args,
**kwargs) -> None:
tensor_parallel = get_tensor_parallel_mode()
if tensor_parallel is None:
embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args,
**kwargs).to(dtype).to(get_current_device())
weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim)
elif num_embeddings <= vocab_parallel_limit:
embed = _parallel_embedding[tensor_parallel](
num_embeddings,
embedding_dim,
padding_idx=padding_idx,
dtype=dtype,
weight_initializer=weight_initializer,
*args,
**kwargs,
)
else:
embed = _vocab_parallel_embedding[tensor_parallel](
num_embeddings,
embedding_dim,
padding_idx=padding_idx,
dtype=dtype,
weight_initializer=weight_initializer,
*args,
**kwargs,
)
super().__init__(embed)
class PatchEmbedding(ColossalaiModule):
"""2D Image to Patch Embedding.
Args:
img_size (int): image size.
patch_size (int): patch size.
in_chans (int): number of channels of input image.
embed_size (int): size of embedding.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
flatten (bool, optional): whether to flatten output tensor, defaults to True.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
position_embed_initializer (:class:`typing.Callable`, optional):
The initializer of position embedding, defaults to zeros initializer.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(
self,
img_size: int,
patch_size: int,
in_chans: int,
embed_size: int,
dtype: dtype = None,
flatten: bool = True,
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
position_embed_initializer: Callable = init.zeros_()
) -> None:
tensor_parallel = get_tensor_parallel_mode()
embed = _parallel_patchembedding[tensor_parallel](
img_size,
patch_size,
in_chans,
embed_size,
dtype=dtype,
flatten=flatten,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer,
position_embed_initializer=position_embed_initializer,
)
super().__init__(embed)
import math
from typing import Callable
from torch import dtype, nn
from colossalai.nn import init
from colossalai.utils import get_current_device
from ..parallel_1d import Embedding1D, PatchEmbedding1D, VocabParallelEmbedding1D
from ..parallel_2d import Embedding2D, PatchEmbedding2D, VocabParallelEmbedding2D
from ..parallel_2p5d import Embedding2p5D, PatchEmbedding2p5D, VocabParallelEmbedding2p5D
from ..parallel_3d import Embedding3D, PatchEmbedding3D, VocabParallelEmbedding3D
from ..utils import get_tensor_parallel_mode
from ..vanilla import VanillaPatchEmbedding
from ._utils import ColossalaiModule
_parallel_embedding = {
'1d': Embedding1D,
'2d': Embedding2D,
'2.5d': Embedding2p5D,
'3d': Embedding3D,
}
_vocab_parallel_embedding = {
'1d': VocabParallelEmbedding1D,
'2d': VocabParallelEmbedding2D,
'2.5d': VocabParallelEmbedding2p5D,
'3d': VocabParallelEmbedding3D
}
_parallel_patchembedding = {
None: VanillaPatchEmbedding,
'1d': PatchEmbedding1D,
'2d': PatchEmbedding2D,
'2.5d': PatchEmbedding2p5D,
'3d': PatchEmbedding3D
}
class Embedding(ColossalaiModule):
r"""Embedding for colossalai.
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
he initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_
"""
def __init__(self,
num_embeddings: int,
embedding_dim: int,
padding_idx: int = None,
dtype: dtype = None,
weight_initializer: Callable = init.normal_(),
vocab_parallel_limit: int = 2048,
*args,
**kwargs) -> None:
tensor_parallel = get_tensor_parallel_mode()
if tensor_parallel is None:
embed = nn.Embedding(num_embeddings, embedding_dim, padding_idx=padding_idx, *args,
**kwargs).to(dtype).to(get_current_device())
weight_initializer(embed.weight, fan_in=num_embeddings, fan_out=embedding_dim)
elif num_embeddings <= vocab_parallel_limit:
embed = _parallel_embedding[tensor_parallel](
num_embeddings,
embedding_dim,
padding_idx=padding_idx,
dtype=dtype,
weight_initializer=weight_initializer,
*args,
**kwargs,
)
else:
embed = _vocab_parallel_embedding[tensor_parallel](
num_embeddings,
embedding_dim,
padding_idx=padding_idx,
dtype=dtype,
weight_initializer=weight_initializer,
*args,
**kwargs,
)
super().__init__(embed)
class PatchEmbedding(ColossalaiModule):
"""2D Image to Patch Embedding.
Args:
img_size (int): image size.
patch_size (int): patch size.
in_chans (int): number of channels of input image.
embed_size (int): size of embedding.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
flatten (bool, optional): whether to flatten output tensor, defaults to True.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
position_embed_initializer (:class:`typing.Callable`, optional):
The initializer of position embedding, defaults to zeros initializer.
More details about ``initializer`` please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(
self,
img_size: int,
patch_size: int,
in_chans: int,
embed_size: int,
dtype: dtype = None,
flatten: bool = True,
weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
position_embed_initializer: Callable = init.zeros_()
) -> None:
tensor_parallel = get_tensor_parallel_mode()
embed = _parallel_patchembedding[tensor_parallel](
img_size,
patch_size,
in_chans,
embed_size,
dtype=dtype,
flatten=flatten,
weight_initializer=weight_initializer,
bias_initializer=bias_initializer,
position_embed_initializer=position_embed_initializer,
)
super().__init__(embed)
......@@ -4,9 +4,9 @@ from typing import Callable
from torch import dtype, nn
from colossalai.nn import init
from colossalai.utils import get_current_device
from ... import init as init
from ..parallel_1d import *
from ..parallel_2d import *
from ..parallel_2p5d import *
......
from colossalai.utils import get_current_device
from torch import nn
from ..parallel_1d import LayerNorm1D
from ..parallel_2d import LayerNorm2D
from ..parallel_2p5d import LayerNorm2p5D
from ..parallel_3d import LayerNorm3D
from ..utils import get_tensor_parallel_mode
from ..vanilla import VanillaLayerNorm
from ._utils import ColossalaiModule
_parallel_layernorm = {
None: VanillaLayerNorm,
"1d": LayerNorm1D,
"2d": LayerNorm2D,
"2.5d": LayerNorm2p5D,
"3d": LayerNorm3D,
}
class LayerNorm(ColossalaiModule):
r"""Layer Normalization for colossalai.
Args:
normalized_shape (int): input shape from an expected input of size.
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
\times \ldots \times \text{normalized_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
eps (float): a value added to the denominator for numerical stability, defaults to 1e-05.
bias (bool, optional): Whether to add a bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
"""
def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None:
tensor_parallel = get_tensor_parallel_mode()
if tensor_parallel is None:
norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device())
else:
norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype)
super().__init__(norm)
from torch import nn
from colossalai.utils import get_current_device
from ..parallel_1d import LayerNorm1D
from ..parallel_2d import LayerNorm2D
from ..parallel_2p5d import LayerNorm2p5D
from ..parallel_3d import LayerNorm3D
from ..utils import get_tensor_parallel_mode
from ..vanilla import VanillaLayerNorm
from ._utils import ColossalaiModule
_parallel_layernorm = {
None: VanillaLayerNorm,
"1d": LayerNorm1D,
"2d": LayerNorm2D,
"2.5d": LayerNorm2p5D,
"3d": LayerNorm3D,
}
class LayerNorm(ColossalaiModule):
r"""Layer Normalization for colossalai.
Args:
normalized_shape (int): input shape from an expected input of size.
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
\times \ldots \times \text{normalized_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
eps (float): a value added to the denominator for numerical stability, defaults to 1e-05.
bias (bool, optional): Whether to add a bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
"""
def __init__(self, normalized_shape: int, eps=1e-05, bias=True, dtype=None) -> None:
tensor_parallel = get_tensor_parallel_mode()
if tensor_parallel is None:
norm = nn.LayerNorm(normalized_shape, eps=eps).to(dtype).to(get_current_device())
else:
norm = _parallel_layernorm[tensor_parallel](normalized_shape, eps=eps, dtype=dtype)
super().__init__(norm)
from .layers import (Classifier1D, Dropout1D, Embedding1D, LayerNorm1D, Linear1D, Linear1D_Col, Linear1D_Row,
PatchEmbedding1D, VocabParallelClassifier1D, VocabParallelEmbedding1D)
from .layers import (
Classifier1D,
Dropout1D,
Embedding1D,
LayerNorm1D,
Linear1D,
Linear1D_Col,
Linear1D_Row,
PatchEmbedding1D,
VocabParallelClassifier1D,
VocabParallelEmbedding1D,
)
__all__ = [
'Linear1D', 'Linear1D_Col', 'Linear1D_Row', 'Embedding1D', 'Dropout1D', 'Classifier1D', 'VocabParallelClassifier1D',
......
......@@ -3,6 +3,7 @@
import torch
import torch.distributed as dist
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
......@@ -124,7 +125,7 @@ class _ReduceInput(torch.autograd.Function):
class _SplitForwardGatherBackward(torch.autograd.Function):
"""
Split the input and keep only the corresponding chuck to the rank.
Args:
input_: input matrix.
parallel_mode: parallel mode.
......
......@@ -10,11 +10,11 @@ import torch.nn.functional as F
from torch import Tensor
from torch.nn.parameter import Parameter
from colossalai.communication import broadcast
from colossalai.context import ParallelMode, seed
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.kernel import LayerNorm
from colossalai.legacy.communication import broadcast
from colossalai.legacy.registry import LAYERS
from colossalai.nn import init as init
from colossalai.utils.checkpointing import (
......
from ._operation import reduce_by_batch_2d, split_batch_2d
from .layers import (Classifier2D, Embedding2D, LayerNorm2D, Linear2D, PatchEmbedding2D, VocabParallelClassifier2D,
VocabParallelEmbedding2D)
from .layers import (
Classifier2D,
Embedding2D,
LayerNorm2D,
Linear2D,
PatchEmbedding2D,
VocabParallelClassifier2D,
VocabParallelEmbedding2D,
)
__all__ = [
'split_batch_2d', 'reduce_by_batch_2d', 'Linear2D', 'LayerNorm2D', 'Classifier2D', 'PatchEmbedding2D',
......
......@@ -2,13 +2,14 @@ from typing import Any, Optional, Tuple
import torch
import torch.distributed as dist
from colossalai.communication.collective import (all_gather, all_reduce, reduce, reduce_scatter)
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.utils import get_current_device
from torch import Tensor
from torch.cuda.amp import custom_bwd, custom_fwd
from colossalai.context.parallel_mode import ParallelMode
from colossalai.core import global_context as gpc
from colossalai.global_variables import tensor_parallel_env as env
from colossalai.legacy.communication.collective import all_gather, all_reduce, reduce, reduce_scatter
from colossalai.utils import get_current_device
def matmul_2d(
......@@ -226,9 +227,9 @@ class Matmul_AB_2D(torch.autograd.Function):
col_group = gpc.get_group(col_parallel_mode)
src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
pipeline_parallel_rank * tensor_parallel_size
pipeline_parallel_rank * tensor_parallel_size
src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
pipeline_parallel_rank * tensor_parallel_size
pipeline_parallel_rank * tensor_parallel_size
opa = [None] * 2
opb = [None] * 2
......@@ -351,9 +352,9 @@ class Matmul_ABT_2D(torch.autograd.Function):
col_group = gpc.get_group(col_parallel_mode)
src_b = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
pipeline_parallel_rank * tensor_parallel_size
pipeline_parallel_rank * tensor_parallel_size
src_c = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
pipeline_parallel_rank * tensor_parallel_size
pipeline_parallel_rank * tensor_parallel_size
opb = [None] * 2
opr = [None] * 2
......@@ -484,9 +485,9 @@ class Matmul_ATB_2D(torch.autograd.Function):
col_group = gpc.get_group(col_parallel_mode)
src_a = summa_dim * row_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
pipeline_parallel_rank * tensor_parallel_size
pipeline_parallel_rank * tensor_parallel_size
src_c = col_rank + data_parallel_rank * pipeline_parallel_size * tensor_parallel_size + \
pipeline_parallel_rank * tensor_parallel_size
pipeline_parallel_rank * tensor_parallel_size
opa = [None] * 2
opr = [None] * 2
......
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or to comment