OpenDAS / ColossalAI / Commits

Commit ec5086c4, authored Mar 25, 2022 by Liang Bowen; committed by アマデウス on Mar 29, 2022
Parent: 53b1b6e3

    Refactored docstring to google style

Changes: 94. Showing 20 changed files with 1206 additions and 1180 deletions (+1206 −1180).
colossalai/nn/layer/colossalai_layer/linear.py          +44  −31
colossalai/nn/layer/colossalai_layer/normalization.py   +9   −12
colossalai/nn/layer/moe/experts.py                      +4   −5
colossalai/nn/layer/moe/layers.py                       +18  −33
colossalai/nn/layer/moe/utils.py                        +4   −4
colossalai/nn/layer/parallel_1d/_operation.py           +10  −10
colossalai/nn/layer/parallel_1d/_utils.py               +17  −14
colossalai/nn/layer/parallel_1d/layers.py               +138 −124
colossalai/nn/layer/parallel_2d/_operation.py           +173 −205
colossalai/nn/layer/parallel_2d/layers.py               +128 −113
colossalai/nn/layer/parallel_2p5d/_operation.py         +173 −212
colossalai/nn/layer/parallel_2p5d/layers.py             +126 −110
colossalai/nn/layer/parallel_3d/_operation.py           +120 −116
colossalai/nn/layer/parallel_3d/layers.py               +126 −112
colossalai/nn/layer/parallel_sequence/layers.py         +7   −8
colossalai/nn/layer/utils/common.py                     +8   −3
colossalai/nn/layer/vanilla/layers.py                   +62  −38
colossalai/nn/layer/wrapper/lambda_wrapper.py           +4   −5
colossalai/nn/loss/loss_1d.py                           +7   −8
colossalai/nn/loss/loss_2d.py                           +28  −17
colossalai/nn/layer/colossalai_layer/linear.py

...
@@ -31,22 +31,35 @@ _vocab_parallel_classifier = {
 class Linear(nn.Module):
-    """
-    Linear layer of colossalai
-    :param in_features: size of each input sample
-    :type in_features: int
-    :param out_features: size of each output sample
-    :type out_features: int
-    :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
-    :type bias: bool, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
-    :param kwargs: Kwargs used for particular parallelisms
-    """
+    """Linear layer of colossalai.
+
+    Args:
+        in_features (int): size of each input sample.
+        out_features (int): size of each output sample.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    Note: ``kwargs`` would contain different parameters when you use different parallelisms.
+    The ``kwargs`` should contain parameters below:
+    ::
+
+        Linear1D:
+            gather_output: bool (optional, default to be false)
+            skip_bias_add: bool (optional, default to be false)
+        Linear2D:
+            skip_bias_add: bool (optional, default to be false)
+        Linear2p5D:
+            skip_bias_add: bool (optional, default to be false)
+        Linear3D:
+            None
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
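A minimal usage sketch of the kwargs dispatch described above (hypothetical sizes; assumes ``colossalai`` has been launched with a 1D tensor-parallel configuration, so ``Linear`` dispatches to ``Linear1D`` and accepts the ``gather_output`` kwarg from the list):

```python
# Minimal usage sketch (hypothetical sizes; assumes a 1D tensor-parallel
# context has been set up, so Linear dispatches to Linear1D).
import torch
from colossalai.nn import Linear

layer = Linear(in_features=512, out_features=256, bias=True, gather_output=True)
x = torch.randn(8, 512)
y = layer(x)  # (8, 256); output all-gathered across tensor-parallel ranks
```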
@@ -88,21 +101,21 @@ class Linear(nn.Module):
 class Classifier(nn.Module):
-    """
-    Classifier layer of colossalai
-    :param in_features: size of each input sample
-    :type in_features: int
-    :param num_classes: number of total classes for the dataset
-    :type num_classes: int
-    :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
-    :type bias: bool, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
-    """
+    """Classifier layer of colossalai.
+
+    Args:
+        in_features (int): size of each input sample.
+        num_classes (int): number of classes.
+        weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
colossalai/nn/layer/colossalai_layer/normalization.py

...
@@ -19,18 +19,15 @@ _parallel_layernorm = {
 class LayerNorm(nn.Module):
-    r"""
-    Layer Normalization for colossalai
-    :param normalized_shape: input shape from an expected input of size.
-        :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
-        If a single integer is used, it is treated as a singleton list, and this module will
-        normalize over the last dimension which is expected to be of that specific size.
-    :type normalized_shape: int
-    :param eps: a value added to the denominator for numerical stability, defaults to 1e-05
-    :type eps: float, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
+    r"""Layer Normalization for colossalai.
+
+    Args:
+        normalized_shape (int): input shape from an expected input of size.
+            :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
+            If a single integer is used, it is treated as a singleton list, and this module will
+            normalize over the last dimension which is expected to be of that specific size.
+        eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+    """
     def __init__(self, normalized_shape: int, eps=1e-05, dtype=None) -> None:
...
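The ``normalized_shape`` rule above mirrors ``torch.nn.LayerNorm``; a plain-PyTorch illustration of the singleton-list behaviour:

```python
import torch

hidden = 64
x = torch.randn(4, 16, hidden)
# normalized_shape=hidden is treated as [hidden]: only the last dimension
# (which must have size `hidden`) is normalized.
norm = torch.nn.LayerNorm(hidden, eps=1e-05)
y = norm(x)
assert torch.allclose(y.mean(-1), torch.zeros(4, 16), atol=1e-5)
```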
colossalai/nn/layer/moe/experts.py

...
@@ -28,11 +28,10 @@ class Experts(MoeExperts):
     moe model parallel group, where E is the number of experts. Every expert
     is an instance of the class 'expert' in the initialization parameters.

-    :param expert: The class of all experts
-    :param num_experts: The number of experts
-    :param expert_args: Args used to initialize experts
-    :type num_experts: int
+    Args:
+        expert_cls (:class:`torch.nn.Module`): The class of all experts.
+        num_experts (int): The number of experts.
+        expert_args: Args used to initialize experts; the args can be found in the corresponding expert class.
     """
     def __init__(self, expert_cls: Type[nn.Module], num_experts: int, **expert_args):
...
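A sketch of how ``**expert_args`` flows through, per the signature above. ``FFN`` and its ``d_model`` argument are hypothetical stand-ins, and the import path simply mirrors this diff's file layout:

```python
import torch.nn as nn
from colossalai.nn.layer.moe import Experts  # path assumed from this diff's layout

class FFN(nn.Module):  # hypothetical expert class, not part of the library
    def __init__(self, d_model: int):
        super().__init__()
        self.net = nn.Sequential(nn.Linear(d_model, 4 * d_model), nn.GELU(),
                                 nn.Linear(4 * d_model, d_model))

    def forward(self, x):
        return self.net(x)

# **expert_args (here d_model=512) is forwarded to every FFN instance
experts = Experts(expert_cls=FFN, num_experts=4, d_model=512)
```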
colossalai/nn/layer/moe/layers.py

...
@@ -18,19 +18,13 @@ class Top1Router(nn.Module):
     for routing usage. More details can be found in the paper about the Switch Transformer
     from Google.

-    :param capacity_factor_train: Capacity factor in routing during training
-    :param capacity_factor_eval: Capacity factor in routing during evaluation
-    :param min_capacity: The minimum number of the capacity of each expert
-    :param select_policy: The policy about tokens selection
-    :param noisy_func: Noisy function used in logits
-    :param drop_tks: Whether drops tokens in evaluation
-    :type capacity_factor_train: float, optional
-    :type capacity_factor_eval: float, optional
-    :type min_capacity: int, optional
-    :type select_policy: str, optional
-    :type noisy_func: Callable, optional
-    :type drop_tks: bool, optional
+    Args:
+        capacity_factor_train (float, optional): Capacity factor in routing during training.
+        capacity_factor_eval (float, optional): Capacity factor in routing during evaluation.
+        min_capacity (int, optional): The minimum number of the capacity of each expert.
+        select_policy (str, optional): The policy of token selection.
+        noisy_func (:class:`typing.Callable`, optional): Noisy function used in logits.
+        drop_tks (bool, optional): Whether to drop tokens in evaluation.
     """
     def __init__(self,
...
@@ -119,17 +113,12 @@ class Top2Router(nn.Module):
"""Top2 router that returns the dispatch mask [s, e, c] and combine weight [s, e, c]
for routing usage. More deailted function can be found in the paper about ViT-MoE.
:param capacity_factor_train: Capacity factor in routing during training
:param capacity_factor_eval: Capacity factor in routing during evaluation
:param min_capacity: The minimum number of the capacity of each expert
:param noisy_func: Noisy function used in logits
:param drop_tks: Whether drops tokens in evaluation
:type capacity_factor_train: float, optional
:type capacity_factor_eval: float, optional
:type min_capacity: int, optional
:type noisy_func: Callable, optional
:type drop_tks: bool, optional
Args:
capacity_factor_train (float, optional): Capacity factor in routing of training.
capacity_factor_eval (float, optional): Capacity factor in routing of evaluation.
min_capacity (int, optional): The minimum number of the capacity of each expert
noisy_func (:class:`typing.Callable`, optional): Noisy function used in logits.
drop_tks (bool, optional): Whether drops tokens in evaluation.
"""
     def __init__(self,
...
@@ -239,15 +228,11 @@ class MoeLayer(nn.Module):
     the moe tensor group by all-to-all communication. Then it will get the output of all
     experts and exchange the output. At last returns the output of the moe system.

-    :param dim_model: Dimension of model
-    :param num_experts: The number of experts
-    :param router: Instance of router used in routing
-    :param experts: Instance of experts generated by Expert
-    :type dim_model: int
-    :type num_experts: int
-    :type router: nn.Module
-    :type experts: nn.Module
+    Args:
+        dim_model (int): Dimension of model.
+        num_experts (int): The number of experts.
+        router (:class:`torch.nn.Module`): Instance of router used in routing.
+        experts (:class:`torch.nn.Module`): Instance of experts generated by Expert.
     """
     def __init__(self, dim_model: int, num_experts: int, router: nn.Module, experts: MoeExperts):
...
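Assembling the pieces per the ``MoeLayer.__init__`` signature above, a sketch reusing the hypothetical ``FFN`` expert from the ``Experts`` example (router constructor arguments and the distributed launch are assumed/omitted):

```python
from colossalai.nn.layer.moe import Experts, MoeLayer, Top2Router  # assumed exports

router = Top2Router(capacity_factor_train=1.25)   # args per the docstring above
experts = Experts(expert_cls=FFN, num_experts=4, d_model=512)
moe = MoeLayer(dim_model=512, num_experts=4, router=router, experts=experts)
# forward(): tokens are dispatched to experts via all-to-all, then combined
```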
colossalai/nn/layer/moe/utils.py

...
@@ -16,8 +16,8 @@ class NormalNoiseGenerator:
     All noise is generated from a normal distribution (0, 1 / E^2), where
     E = the number of experts.

-    :param num_experts: The number of experts
-    :type num_experts: int
+    Args:
+        num_experts (int): The number of experts.
     """
     def __init__(self, num_experts: int):
...
@@ -37,8 +37,8 @@ class UniformNoiseGenerator:
     Makes models more resilient to rounding errors introduced by bfloat16.
     This seems particularly important for logits.

-    :param eps: Epsilon in generator
-    :type eps: float
+    Args:
+        eps (float, optional): Epsilon in generator, defaults to 1e-2.
     """
     def __init__(self, eps: float = 1e-2):
...
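The two distributions are easy to reproduce directly; a plain-PyTorch sketch of :math:`N(0, 1/E^2)` noise on router logits (the additive/multiplicative application sites are assumptions for illustration):

```python
import torch

num_experts = 4                                 # E
logits = torch.randn(8, num_experts)
# N(0, 1/E^2): a standard normal scaled by 1/E has variance 1/E**2
normal_noise = torch.randn_like(logits) / num_experts
noisy_logits = logits + normal_noise            # additive use assumed
# UniformNoiseGenerator-style jitter in [1 - eps, 1 + eps]
eps = 1e-2
jitter = torch.empty_like(logits).uniform_(1.0 - eps, 1.0 + eps)
jittered_logits = logits * jitter               # multiplicative use assumed
```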
colossalai/nn/layer/parallel_1d/_operation.py

...
@@ -7,17 +7,17 @@ except:
 class FusedLayerNormAffineFunction1D(torch.autograd.Function):
-    r"""
-    Layernorm
-    :param input: input maxtrix
-    :param weight: weight matrix
-    :param bias: bias matrix
-    :param normalized_shape: input shape from an expected input of size.
-        :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
-        If a single integer is used, it is treated as a singleton list, and this module will
-        normalize over the last dimension which is expected to be of that specific size.
-    :param eps: a value added to the denominator for numerical stability
+    r"""Layernorm.
+
+    Args:
+        input: input matrix.
+        weight: weight matrix.
+        bias: bias matrix.
+        normalized_shape: input shape from an expected input of size.
+            :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
+            If a single integer is used, it is treated as a singleton list, and this module will
+            normalize over the last dimension which is expected to be of that specific size.
+        eps: a value added to the denominator for numerical stability.
     """
     @staticmethod
...
colossalai/nn/layer/parallel_1d/_utils.py

...
@@ -78,8 +78,9 @@ class _ReduceGrad(torch.autograd.Function):
"""
Pass the input to the model parallel region.
:param input_: input matrix
:param parallel_mode: parallel mode
Args:
input_: input matrix.
parallel_mode: parallel mode.
"""
     @staticmethod
...
@@ -99,9 +100,10 @@ class _ReduceGrad(torch.autograd.Function):
 class _ReduceInput(torch.autograd.Function):
     """
     All-reduce the input from the model parallel region.

-    :param input_: input matrix
-    :param parallel_mode: parallel mode
+    Args:
+        input_: input matrix.
+        parallel_mode: parallel mode.
     """
     @staticmethod
...
@@ -121,9 +123,10 @@ class _SplitForwardGatherBackward(torch.autograd.Function):
"""
Split the input and keep only the corresponding chuck to the rank.
:param input_: input matrix
:param parallel_mode: parallel mode
:param dim: dimension
Args:
input_: input matrix.
parallel_mode: parallel mode.
dim: dimension
"""
     @staticmethod
...
@@ -142,12 +145,12 @@ class _SplitForwardGatherBackward(torch.autograd.Function):
 class _GatherForwardSplitBackward(torch.autograd.Function):
-    """
-    Gather the input from model parallel region and concatinate.
-    :param input_: input matrix
-    :param parallel_mode: parallel mode
-    :param dim: dimension
+    """
+    Gather the input from model parallel region and concatenate.
+
+    Args:
+        input_: input matrix.
+        parallel_mode: parallel mode.
+        dim: dimension.
     """
     @staticmethod
...
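The forward/backward pairing of these autograd functions is easiest to see with plain tensor ops; a one-process sketch with the communication replaced by ``chunk``/``cat``:

```python
import torch

world_size, dim = 2, -1
x = torch.randn(4, 8)

# _SplitForwardGatherBackward: forward keeps only this rank's chunk ...
chunks = x.chunk(world_size, dim=dim)      # rank i would keep chunks[i]
# _GatherForwardSplitBackward: forward concatenates all ranks' chunks back
gathered = torch.cat(chunks, dim=dim)
assert torch.equal(gathered, x)            # gather exactly undoes the split
```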
colossalai/nn/layer/parallel_1d/layers.py

...
@@ -24,24 +24,23 @@ from ._utils import (gather_forward_split_backward, get_parallel_input, reduce_g
 @LAYERS.register_module
 class Linear1D(torch.nn.Module):
-    """
-    Linear layer for 1D parallelism
-    :param in_features: size of each input sample
-    :type in_features: int
-    :param out_features: size of each output sample
-    :type out_features: int
-    :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
-    :type bias: bool, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
-        which is preserved for kernel fusion, defaults to False
-    :type skip_bias_add: bool, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
+    r"""Linear layer for 1D parallelism.
+
+    Args:
+        in_features (int): size of each input sample.
+        out_features (int): size of each output sample.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        gather_output (bool, optional): Whether to call all-gather on output, defaults to False.
+        skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
+            which is preserved for kernel fusion, defaults to False.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
@@ -88,23 +87,21 @@ class Linear1D(torch.nn.Module):
 @LAYERS.register_module
 class Classifier1D(ParallelLayer):
-    """RowLinear with given weight
-    Classifier of 1D parallelism
-    :param in_features: size of input features
-    :type in_features: int
-    :param num_classes: number of classes in the dataset
-    :type num_classes: int
-    :param weight: weight of the classifier, defaults to True
-    :type weight: torch.nn.Parameter, optional
-    :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
-    :type bias: bool, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
+    r"""RowLinear with given weight. Classifier of 1D parallelism.
+
+    Args:
+        in_features (int): size of each input sample.
+        num_classes (int): number of classes.
+        weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
@@ -171,23 +168,21 @@ class Classifier1D(ParallelLayer):
 @LAYERS.register_module
 class VocabParallelClassifier1D(ParallelLayer):
-    """ColLinear with given weight
-    Classifier of 1D parallelism
-    :param in_features: size of input features
-    :type in_features: int
-    :param num_classes: number of classes in the dataset
-    :type num_classes: int
-    :param weight: weight of the classifier, defaults to True
-    :type weight: torch.nn.Parameter, optional
-    :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
-    :type bias: bool, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
+    r"""ColLinear with given weight. Classifier of 1D parallelism.
+
+    Args:
+        in_features (int): size of each input sample.
+        num_classes (int): number of classes.
+        weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
@@ -249,30 +244,28 @@ class VocabParallelClassifier1D(ParallelLayer):
 @LAYERS.register_module
 class Linear1D_Col(ParallelLayer):
-    """Linear layer with column parallelism.
+    r"""Linear layer with column parallelism.
+
     The linear layer is defined as :math:`Y = XA + b`. A is parallelized along
     its second dimension as :math:`A = [A_1, ..., A_p]`.

-    :param in_features: first dimension of matrix A.
-    :type in_features: int
-    :param output_size: second dimension of matrix A.
-    :type output_size: int
-    :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
-    :type bias: bool, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param gather_output: If true, call all-gether on output and make Y avaiable
-        to all GPUs, otherwise, every GPU will have its output
-        which is :math:`Y_i = XA_i`, defaults to False
-    :type gather_output: bool, optional
-    :param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
-        which is preserved for kernel fusion, defaults to False
-    :type skip_bias_add: bool, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
+    Args:
+        in_features (int): size of each input sample.
+        out_features (int): size of each output sample.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        gather_output (bool, optional): If true, call all-gather on output and make Y available
+            to all GPUs, otherwise, every GPU will have its output
+            which is :math:`Y_i = XA_i`, defaults to False.
+        skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
+            which is preserved for kernel fusion, defaults to False.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
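A one-process sanity check of the column partitioning :math:`A = [A_1, ..., A_p]` and of what ``gather_output`` reconstructs:

```python
import torch

X = torch.randn(4, 6)                    # input
A = torch.randn(6, 8)                    # full weight
A1, A2 = A.chunk(2, dim=1)               # column blocks on 2 "ranks"
Y_gathered = torch.cat([X @ A1, X @ A2], dim=1)   # what gather_output does
assert torch.allclose(Y_gathered, X @ A, atol=1e-5)
```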
@@ -343,25 +336,23 @@ class Linear1D_Col(ParallelLayer):
 @LAYERS.register_module
 class Linear1D_Row(ParallelLayer):
-    """ Linear layer with row parallelism
-    :param in_features: size of each input sample
-    :type in_features: int
-    :param out_features: size of each output sample
-    :type out_features: int
-    :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
-    :type bias: bool, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param parallel_input: If set to ``True``, it's assumed that the input is splitted, defaults to False
-    :type parallel_input: bool, optional
-    :param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
-        which is preserved for kernel fusion, defaults to False
-    :type skip_bias_add: bool, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
+    r"""Linear layer with row parallelism.
+
+    Args:
+        in_features (int): size of each input sample.
+        out_features (int): size of each output sample.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        parallel_input (bool, optional): If set to ``True``, it's assumed that the input is split, defaults to False.
+        skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
+            which is preserved for kernel fusion, defaults to False.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
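Row parallelism is the dual construction: the weight is split along its first dimension, the input along its last, and the partial products sum to the full result (an all-reduce in the real layer). A one-process sketch:

```python
import torch

X = torch.randn(4, 6)
A = torch.randn(6, 8)
X1, X2 = X.chunk(2, dim=1)     # parallel_input: X split column-wise
A1, A2 = A.chunk(2, dim=0)     # A split row-wise across 2 "ranks"
# the sum of partial products equals the all-reduced full result
assert torch.allclose(X1 @ A1 + X2 @ A2, X @ A, atol=1e-5)
```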
@@ -432,21 +423,33 @@ class Linear1D_Row(ParallelLayer):
 @LAYERS.register_module
 class Embedding1D(ParallelLayer):
-    """
-    Embedding for 1D parallelism
-    :param num_embeddings: number of embeddings
-    :type num_embeddings: int
-    :param embedding_dim: dimension of embedding
-    :type embedding_dim: int
-    :param padding_idx: index of padding, defaults to None
-    :type padding_idx: int, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param weight_initializer: The intializer of weight, defaults to normal initializer
-    :type weight_initializer: typing.Callable, optional
-    :param args: Args used in F.embedding
-    :param kwargs: Kwargs used in F.embedding
+    r"""Embedding for 1D parallelism.
+
+    Args:
+        num_embeddings (int): number of embeddings.
+        embedding_dim (int): dimension of embedding.
+        padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
+            therefore, the embedding vector at padding_idx is not updated during training,
+            i.e. it remains as a fixed “pad”, defaults to None.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to normal initializer.
+
+    The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
+    ::
+
+        max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
+            renormalized to have norm max_norm. Note: this will modify weight in-place.
+        norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
+        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
+            of frequency of the words in the mini-batch. Default False.
+        sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
+
+    More details about ``args`` and ``kwargs`` could be found in
+    `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
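Since ``args``/``kwargs`` are forwarded straight to ``torch.nn.functional.embedding``, the options listed above behave exactly as in plain PyTorch:

```python
import torch
import torch.nn.functional as F

weight = torch.randn(10, 4)          # 10 embeddings of dimension 4
tokens = torch.tensor([[3, 1, 0]])
out = F.embedding(tokens, weight, padding_idx=0, max_norm=1.0)
# row 0 receives no gradient (fixed "pad"); vectors with norm > 1 are
# renormalized in-place to norm 1 by max_norm
```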
@@ -499,20 +502,33 @@ class Embedding1D(ParallelLayer):
 @LAYERS.register_module
 class VocabParallelEmbedding1D(torch.nn.Module):
-    """Embedding parallelized in the vocabulary dimension.
-    :param num_embeddings: number of embeddings
-    :type num_embeddings: int
-    :param embedding_dim: dimension of embedding
-    :type embedding_dim: int
-    :param padding_idx: index of padding, defaults to None
-    :type padding_idx: int, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param weight_initializer: The intializer of weight, defaults to normal initializer
-    :type weight_initializer: typing.Callable, optional
-    :param args: Args used in F.embedding
-    :param kwargs: Kwargs used in F.embedding
+    r"""Embedding parallelized in the vocabulary dimension.
+
+    Args:
+        num_embeddings (int): number of embeddings.
+        embedding_dim (int): dimension of embedding.
+        padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
+            therefore, the embedding vector at padding_idx is not updated during training,
+            i.e. it remains as a fixed “pad”, defaults to None.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to normal initializer.
+
+    The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
+    ::
+
+        max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
+            renormalized to have norm max_norm. Note: this will modify weight in-place.
+        norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
+        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
+            of frequency of the words in the mini-batch. Default False.
+        sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
+
+    More details about ``args`` and ``kwargs`` could be found in
+    `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
...
@@ -578,13 +594,11 @@ class VocabParallelEmbedding1D(torch.nn.Module):
 @LAYERS.register_module
 class Dropout1D(ParallelLayer):
-    """
-    Dropout layer of 1D parallelism
-    :param p: dropout rate, defaults to 0.5
-    :type p: float, optional
-    :param inplace: If set to ``True``, will do this operation in-place, defaults tp ``False``
-    :type inplace: bool, optional
+    """Dropout layer of 1D parallelism.
+
+    Args:
+        p (float, optional): probability of an element to be zeroed, defaults to 0.5.
+        inplace (bool, optional): whether to do dropout in-place, defaults to False.
     """
     def __init__(self, p: float = 0.5, inplace: bool = False):
...
colossalai/nn/layer/parallel_2d/_operation.py

...
@@ -21,27 +21,26 @@ def matmul_2d(
     row_parallel_mode=ParallelMode.PARALLEL_2D_ROW,
     col_parallel_mode=ParallelMode.PARALLEL_2D_COL,
 ):
"""
Matrix multiplication for 2D parallelism
:param a: matrix :math:`A`
:type a: torch.tensor
:param b: matrix :math:`B`
:type b: torch.tensor
:param summa_dim: dimension of SUMMA fo 2D parallelism
:type summa_dim: int
:param out_shape: shape of output tensor
:type out_shape: tuple
:param row_rank: the rank of row, defaults to None
:type row_rank: int, optional
:param col_rank: the rank of column, defaults to None
:type col_rank: int, optional
:param row_parallel_mode: row parallel mode, defaults to ParallelMode.PARALLEL_2D_ROW
:type row_parallel_mode: str, optional
:param col_parallel_mode: column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL
:type col_parallel_mode: str, optional
:return: :math:`C = AB`
:rtype: torch.tensor
r
"""Matrix multiplication for 2D parallelism.
Args:
a (:class:`torch.tensor`): matrix :math:`A`.
b (:class:`torch.tensor`): matrix :math:`B`.
summa_dim (int): dimension of SUMMA fo 2D parallelism.
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int, optional): the rank of row, defaults to None.
col_rank (int, optional): the rank of column, defaults to None.
row_parallel_mode (:class:`colossalai.context.ParallelMode`, optional):
row parallel mode, defaults to ParallelMode.PARALLEL_2D_ROW.
col_parallel_mode (:class:`colossalai.context.ParallelMode`, optional):
column parallel mode, defaults to ParallelMode.PARALLEL_2D_COL.
Returns:
:class:`torch.tensor`: :math:`C = AB`.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
     if row_rank is None:
        row_rank = gpc.get_local_rank(col_parallel_mode)
...
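For intuition, SUMMA builds :math:`C = AB` by accumulating products of broadcast blocks; a single-process toy with ``summa_dim = 2`` (block bookkeeping only, no communication):

```python
import torch

d = 2                                          # summa_dim (2x2 process grid)
A, B = torch.randn(4, 4), torch.randn(4, 4)
Ab = [r.chunk(d, dim=1) for r in A.chunk(d, dim=0)]   # A's (i, k) blocks
Bb = [r.chunk(d, dim=1) for r in B.chunk(d, dim=0)]   # B's (k, j) blocks
# step k: rank (i, j) receives A[i][k] (row broadcast) and B[k][j] (col broadcast)
C = [[sum(Ab[i][k] @ Bb[k][j] for k in range(d)) for j in range(d)]
     for i in range(d)]
full = torch.cat([torch.cat(row, dim=1) for row in C], dim=0)
assert torch.allclose(full, A @ B, atol=1e-5)
```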
@@ -135,35 +134,26 @@ def classifier_2d(A: Tensor, B: Tensor, bias: Optional[Tensor], summa_dim: int,
                   row_rank: int, col_rank: int, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode,
                   data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
                   tensor_parallel_size: int) -> Tensor:
"""
2D parallel classifier
:param a: matrix :math:`A`
:type a: torch.tensor
:param b: matrix :math:`B`
:type b: torch.tensor
:param bias: matrix of bias
:type bias: torch.tensor, optional
:param summa_dim: dimension of SUMMA fo 2D parallelism
:type summa_dim: int
:param out_shape: shape of output tensor
:type out_shape: tuple
:param row_rank: the rank of row
:type row_rank: int
:param col_rank: the rank of column
:type col_rank: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
r
"""2D parallel classifier.
Args:
A (:class:`torch.tensor`): matrix :math:`A`.
B (:class:`torch.tensor`): matrix :math:`B`.
bias (:class:`torch.tensor`, optional): matrix of bias.
summa_dim (int): dimension of SUMMA fo 2D parallelism.
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int, optional): the rank of row, defaults to None.
col_rank (int, optional): the rank of column, defaults to None.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank
pipeline_parallel_size (int): pipeline parallel size.
tensor_parallel_size (int): tensor parallel size.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
     return _Classifier2D.apply(A, B, bias, summa_dim, out_shape, row_rank, col_rank, row_parallel_mode,
                                col_parallel_mode, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size,
...
@@ -171,33 +161,25 @@ def classifier_2d(A: Tensor, B: Tensor, bias: Optional[Tensor], summa_dim: int,
 class Matmul_AB_2D(torch.autograd.Function):
-    """
-    Matrix multiplication for :math:`C = AB`
-    :param a: matrix :math:`A`
-    :type a: torch.tensor
-    :param b: matrix :math:`B`
-    :type b: torch.tensor
-    :param summa_dim: dimension of SUMMA fo 2D parallelism
-    :type summa_dim: int
-    :param out_shape: shape of output tensor
-    :type out_shape: tuple
-    :param row_rank: the rank of row
-    :type row_rank: int
-    :param col_rank: the rank of column
-    :type col_rank: int
-    :param row_parallel_mode: row parallel mode
-    :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
-    :param col_parallel_mode: column parallel mode
-    :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
-    :param data_parallel_rank: data parallel rank
-    :type data_parallel_rank: int
-    :param pipeline_parallel_rank: pipeline parallel rank
-    :type pipeline_parallel_rank: int
-    :param pipeline_parallel_size: pipeline parallel size
-    :type pipeline_parallel_size: int
-    :param tensor_parallel_size: tensor parallel size
-    :type tensor_parallel_size: int
+    r"""Matrix multiplication for :math:`C = AB`.
+
+    Args:
+        A (:class:`torch.tensor`): matrix :math:`A`.
+        B (:class:`torch.tensor`): matrix :math:`B`.
+        summa_dim (int): dimension of SUMMA for 2D parallelism.
+        out_shape (:class:`torch.Size`): shape of output tensor.
+        row_rank (int, optional): the rank of row, defaults to None.
+        col_rank (int, optional): the rank of column, defaults to None.
+        row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
+        col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+        data_parallel_rank (int): data parallel rank.
+        pipeline_parallel_rank (int): pipeline parallel rank.
+        pipeline_parallel_size (int): pipeline parallel size.
+        tensor_parallel_size (int): tensor parallel size.
+
+    Note:
+        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+    """
     @staticmethod
     @custom_fwd(cast_inputs=torch.float16)
...
@@ -305,33 +287,26 @@ class Matmul_AB_2D(torch.autograd.Function):
 class Matmul_ABT_2D(torch.autograd.Function):
-    """
-    Matrix multiplication for :math:`C = AB^T`
-    :param a: matrix :math:`A`
-    :type a: torch.tensor
-    :param b: matrix :math:`B`
-    :type b: torch.tensor
-    :param summa_dim: dimension of SUMMA fo 2D parallelism
-    :type summa_dim: int
-    :param out_shape: shape of output tensor
-    :type out_shape: tuple
-    :param row_rank: the rank of row
-    :type row_rank: int
-    :param col_rank: the rank of column
-    :type col_rank: int
-    :param row_parallel_mode: row parallel mode
-    :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
-    :param col_parallel_mode: column parallel mode
-    :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
-    :param data_parallel_rank: data parallel rank
-    :type data_parallel_rank: int
-    :param pipeline_parallel_rank: pipeline parallel rank
-    :type pipeline_parallel_rank: int
-    :param pipeline_parallel_size: pipeline parallel size
-    :type pipeline_parallel_size: int
-    :param tensor_parallel_size: tensor parallel size
-    :type tensor_parallel_size: int
+    r"""Matrix multiplication for :math:`C = AB^T`.
+
+    Args:
+        A (:class:`torch.tensor`): matrix :math:`A`.
+        B (:class:`torch.tensor`): matrix :math:`B`.
+        summa_dim (int): dimension of SUMMA for 2D parallelism.
+        out_shape (:class:`torch.Size`): shape of output tensor.
+        row_rank (int, optional): the rank of row, defaults to None.
+        col_rank (int, optional): the rank of column, defaults to None.
+        row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
+        col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+        data_parallel_rank (int): data parallel rank.
+        pipeline_parallel_rank (int): pipeline parallel rank.
+        pipeline_parallel_size (int): pipeline parallel size.
+        tensor_parallel_size (int): tensor parallel size.
+
+    Note:
+        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+    """
     @staticmethod
     @custom_fwd(cast_inputs=torch.float16)
...
@@ -445,33 +420,25 @@ class Matmul_ABT_2D(torch.autograd.Function):
 class Matmul_ATB_2D(torch.autograd.Function):
-    """
-    Matrix multiplication for :math:`C = A^TB`
-    :param a: matrix :math:`A`
-    :type a: torch.tensor
-    :param b: matrix :math:`B`
-    :type b: torch.tensor
-    :param summa_dim: dimension of SUMMA fo 2D parallelism
-    :type summa_dim: int
-    :param out_shape: shape of output tensor
-    :type out_shape: tuple
-    :param row_rank: the rank of row
-    :type row_rank: int
-    :param col_rank: the rank of column
-    :type col_rank: int
-    :param row_parallel_mode: row parallel mode
-    :type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
-    :param col_parallel_mode: column parallel mode
-    :type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
-    :param data_parallel_rank: data parallel rank
-    :type data_parallel_rank: int
-    :param pipeline_parallel_rank: pipeline parallel rank
-    :type pipeline_parallel_rank: int
-    :param pipeline_parallel_size: pipeline parallel size
-    :type pipeline_parallel_size: int
-    :param tensor_parallel_size: tensor parallel size
-    :type tensor_parallel_size: int
+    r"""Matrix multiplication for :math:`C = A^TB`.
+
+    Args:
+        A (:class:`torch.tensor`): matrix :math:`A`.
+        B (:class:`torch.tensor`): matrix :math:`B`.
+        summa_dim (int): dimension of SUMMA for 2D parallelism.
+        out_shape (:class:`torch.Size`): shape of output tensor.
+        row_rank (int, optional): the rank of row, defaults to None.
+        col_rank (int, optional): the rank of column, defaults to None.
+        row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
+        col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
+        data_parallel_rank (int): data parallel rank.
+        pipeline_parallel_rank (int): pipeline parallel rank.
+        pipeline_parallel_size (int): pipeline parallel size.
+        tensor_parallel_size (int): tensor parallel size.
+
+    Note:
+        The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
+        in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
+    """
     @staticmethod
     @custom_fwd(cast_inputs=torch.float16)
...
@@ -639,33 +606,26 @@ def add_bias_2d(input_: Tensor, bias: Tensor, output_size_per_partition: int, ro
                 row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, skip_bias_add: bool,
                 data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int,
                 tensor_parallel_size: int) -> Tensor:
"""
Matrix add bias: :math:`C = A + b`
:param input_: matrix :math:`A`
:type input_: torch.tensor
:param bias: matrix :math:`b`
:type bias: torch.tensor
:param output_size_per_partition: size of ouput per partition
:type output_size_per_partition: int
:param row_rank: the rank of row
:type row_rank: int
:param col_rank: the rank of column
:type col_rank: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion
:type skip_bias_add: bool
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
r
"""Matrix add bias: :math:`C = A + b`.
Args:
input_ (:class:`torch.tensor`): matrix :math:`A`.
bias (:class:`torch.tensor`): matrix :math:`B`.
output_size_per_partition (int): size of output per partition.
row_rank (int, optional): the rank of row, defaults to None.
col_rank (int, optional): the rank of column, defaults to None.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
skip_bias_add (bool):
If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank
pipeline_parallel_size (int): pipeline parallel size.
tensor_parallel_size (int): tensor parallel size.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
     return _Add_Bias_2D.apply(input_, bias, output_size_per_partition, row_rank, col_rank, row_parallel_mode,
                               col_parallel_mode, skip_bias_add, data_parallel_rank, pipeline_parallel_rank,
...
@@ -711,21 +671,19 @@ class _Layernorm_2D(torch.autograd.Function):
 def layernorm_2d(input_: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, row_parallel_mode: ParallelMode,
                  col_parallel_mode: ParallelMode) -> Tensor:
"""
Layernorm
:param input_: input maxtrix
:type input_: torch.tensor
:param E_x: mean
:type E_x: torch.tensor
:param Var_x: variance
:type Var_x: torch.tensor
:param hidden_size: hidden size
:type hidden_size: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
r
"""Layernorm.
Args:
input_ (:class:`torch.tensor`): input matrix.
E_x (:class:`torch.tensor`): mean.
Var_x (:class:`torch.tensor`): variance.
hidden_size (int): hidden size.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
     return _Layernorm_2D.apply(input_, E_x, Var_x, hidden_size, row_parallel_mode, col_parallel_mode)
...
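Whatever the parallel layout, the arithmetic applied from the precomputed statistics is the standard normalization; a plain sketch (the ``eps`` value is assumed here for illustration):

```python
import torch

x = torch.randn(4, 8)                                  # hidden_size = 8
E_x = x.mean(dim=-1, keepdim=True)                     # mean
Var_x = x.var(dim=-1, unbiased=False, keepdim=True)    # variance
y = (x - E_x) / torch.sqrt(Var_x + 1e-5)               # normalized output
```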
@@ -748,27 +706,29 @@ class _AllGatherTensor2D(torch.autograd.Function):
 def all_gather_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
"""
All gather the tensor of 2D parallelism
:param inputs: input maxtrix
:type inputs: torch.tensor
:param dim: dimension to gather
:type dim: int
:param parallel_mode: parallel mode
:type parallel_mode: colossalai.context.parallel_mode.ParallelMode
r
"""All gather the tensor of 2D parallelism.
Args:
tensor (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to gather.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
     return _AllGatherTensor2D.apply(tensor, dim, parallel_mode)


 def split_tensor_2d(input_: Tensor, dim: int = 0) -> Tensor:
"""Splits 2D tensor in specified dimension across cols
:param input_: Input tensor
:param dim: Specified dimension in which to split
:type input_: torch.Tensor
:type dim: int, optional
:return output: Splitted tensor
:rtype output: torch.Tensor
"""Splits 2D tensor in specified dimension across cols.
Args:
input_ (:class:`torch.tensor`): Input tensor.
dim (int): Specified dimension in which to split.
Returns:
:class:`torch.tensor`: The tensor has been split.
"""
     if input_.size(dim) <= 1:
         return input_
...
@@ -787,11 +747,15 @@ class _ReduceTensor2D(torch.autograd.Function):
 def reduce_tensor_2d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor:
"""
All-reduce the input.
:param input_: input tensor
:param parallel_mode: parallel mode
r
"""All-reduce the input.
Args:
input_ (:class:`torch.tensor`): Input tensor.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
     return _ReduceTensor2D.apply(input_, parallel_mode)
...
@@ -809,12 +773,16 @@ class _ReduceScatterTensor2D(torch.autograd.Function):
 def reduce_scatter_tensor_2d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
"""
Reduce-scatter the input.
:param tensor: Input tensor
:param dim: Dimension to scatter
:param parallel_mode: Parallel mode
r
"""Reduce-scatter the input.
Args:
tensor (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to reduce.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
     return _ReduceScatterTensor2D.apply(tensor, dim, parallel_mode)
...
@@ -849,11 +817,11 @@ class _ReduceByBatch2D(torch.autograd.Function):
 def reduce_by_batch_2d(input_, reduce_mean: bool = False) -> Tensor:
"""All-reduce the input from the model parallel region.
r
"""All-reduce the input from the model parallel region.
:param input_: input maxtrix
:type
input_
:
torch.tensor
:param
reduce_mean
: If set to ``True``, it will divide the output by column parallel size, default to False
:type reduce_mean: bool, optional
Args:
input_
(:class:`
torch.tensor
`): input matrix.
reduce_mean
(bool, optional):
If set to ``True``, it will divide the output by column parallel size, default to False.
"""
     return _ReduceByBatch2D.apply(input_, reduce_mean)
colossalai/nn/layer/parallel_2d/layers.py

...
@@ -22,23 +22,22 @@ from ._utils import assert_summa_initialization, get_summa_dim_from_env
 @LAYERS.register_module
 class Linear2D(ParallelLayer):
-    """
-    Linear layer for 2D parallelism
-    :param in_features: size of each input sample
-    :type in_features: int
-    :param out_features: size of each output sample
-    :type out_features: int
-    :param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
-    :type bias: bool, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param skip_bias_add: If set to ``True``, it will skip bias add for linear layer, which is preserved for kernel fusion, defaults to False
-    :type skip_bias_add: bool, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
+    r"""Linear layer for 2D parallelism.
+
+    Args:
+        in_features (int): size of each input sample.
+        out_features (int): size of each output sample.
+        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
+            which is preserved for kernel fusion, defaults to False.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
                  in_features: int,
...
@@ -119,18 +118,16 @@ class Linear2D(ParallelLayer):
 @LAYERS.register_module
 class LayerNorm2D(ParallelLayer):
-    r"""
-    Layer Normalization for 2D parallelism
-    :param normalized_shape: input shape from an expected input of size.
-        :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1] \times \ldots \times \text{normalized_shape}[-1]]`
-        If a single integer is used, it is treated as a singleton list, and this module will
-        normalize over the last dimension which is expected to be of that specific size.
-    :type normalized_shape: int
-    :param eps: a value added to the denominator for numerical stability, defaults to 1e-05
-    :type eps: float, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
+    r"""Layer Normalization for 2D parallelism.
+
+    Args:
+        normalized_shape (int): input shape from an expected input of size.
+            :math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
+            \times \ldots \times \text{normalized_shape}[-1]]`
+            If a single integer is used, it is treated as a singleton list, and this module will
+            normalize over the last dimension which is expected to be of that specific size.
+        eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+    """
     def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None):
         super().__init__()
...
@@ -189,27 +186,24 @@ class LayerNorm2D(ParallelLayer):
 @LAYERS.register_module
 class PatchEmbedding2D(ParallelLayer):
-    """
-    2D Image to Patch Embedding
-    :param img_size: image size
-    :type img_size: int
-    :param patch_size: patch size
-    :type patch_size: int
-    :param in_chans: number of channels of input image
-    :type in_chans: int
-    :param embed_size: size of embedding
-    :type embed_size: int
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param flatten: whether to flatten output tensor, defaults to True
-    :type flatten: bool, optional
-    :param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
-    :type weight_initializer: typing.Callable, optional
-    :param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
-    :type bias_initializer: typing.Callable, optional
-    :param position_embed_initializer: The intializer of position embedding, defaults to zero
-    :type position_embed_initializer: typing.Callable, optional
+    r"""2D Image to Patch Embedding.
+
+    Args:
+        img_size (int): image size.
+        patch_size (int): patch size.
+        in_chans (int): number of channels of input image.
+        embed_size (int): size of embedding.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        flatten (bool, optional): whether to flatten output tensor, defaults to True.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to kaiming uniform initializer.
+        bias_initializer (:class:`typing.Callable`, optional):
+            The initializer of bias, defaults to xavier uniform initializer.
+        position_embed_initializer (:class:`typing.Callable`, optional):
+            The initializer of position embedding, defaults to zeros initializer.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
                  img_size: int,
...
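The shape bookkeeping behind these parameters, with illustrative ViT-style numbers:

```python
# Illustrative numbers only (ViT-base style)
img_size, patch_size, in_chans, embed_size = 224, 16, 3, 768
num_patches = (img_size // patch_size) ** 2    # 14 * 14 = 196 patches
patch_dim = in_chans * patch_size ** 2         # 3 * 256 = 768 inputs per patch
# per image: (num_patches, embed_size) when flatten=True
```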
@@ -291,21 +285,33 @@ class PatchEmbedding2D(ParallelLayer):
 @LAYERS.register_module
 class Embedding2D(ParallelLayer):
-    """
-    Embedding for 2D parallelism
-    :param num_embeddings: number of embeddings
-    :type num_embeddings: int
-    :param embedding_dim: dimension of embedding
-    :type embedding_dim: int
-    :param padding_idx: index of padding, defaults to None
-    :type padding_idx: int, optional
-    :param dtype: The dtype of parameters, defaults to None
-    :type dtype: torch.dtype, optional
-    :param weight_initializer: The intializer of weight, defaults to normal initializer
-    :type weight_initializer: typing.Callable, optional
-    :param args: Args used in F.embedding
-    :param kwargs: Kwargs used in F.embedding
+    r"""Embedding for 2D parallelism.
+
+    Args:
+        num_embeddings (int): number of embeddings.
+        embedding_dim (int): dimension of embedding.
+        padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
+            therefore, the embedding vector at padding_idx is not updated during training,
+            i.e. it remains as a fixed “pad”, defaults to None.
+        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
+        weight_initializer (:class:`typing.Callable`, optional):
+            The initializer of weight, defaults to normal initializer.
+
+    The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
+    ::
+
+        max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
+            renormalized to have norm max_norm. Note: this will modify weight in-place.
+        norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
+        scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
+            of frequency of the words in the mini-batch. Default False.
+        sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
+
+    More details about ``args`` and ``kwargs`` could be found in
+    `Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
+
+    More details about ``initializer`` please refer to
+    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
+    """
     def __init__(self,
                  num_embeddings: int,
...
@@ -358,20 +364,33 @@ class Embedding2D(ParallelLayer):
@
LAYERS
.
register_module
class
VocabParallelEmbedding2D
(
torch
.
nn
.
Module
):
"""Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
r
"""Embedding parallelized in the vocabulary dimension.
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, num_embeddings: int,
...
...
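The vocabulary-dimension partitioning works roughly as follows; a hedged sketch, assuming ``input_ids``, a local weight shard ``local_weight``, the slice bounds, and ``F`` are in scope (this is the general Megatron-style recipe, not the class internals):

vocab_start, vocab_end = 5000, 10000                 # this rank's assumed vocab slice
out_of_range = (input_ids < vocab_start) | (input_ids >= vocab_end)
masked = input_ids.clamp(vocab_start, vocab_end - 1) - vocab_start
local_out = F.embedding(masked, local_weight)
local_out[out_of_range] = 0.0                        # an all-reduce then sums the shards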
@@ -435,23 +454,21 @@ class VocabParallelEmbedding2D(torch.nn.Module):
@LAYERS.register_module
class Classifier2D(ParallelLayer):
"""
Classifier for 2D parallelism
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r
"""Classifier for 2D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, in_features: int,
...
...
@@ -515,23 +532,21 @@ class Classifier2D(ParallelLayer):
@LAYERS.register_module
class VocabParallelClassifier2D(ParallelLayer):
"""
Vocab parallel classifier layer for 2D parallelism
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r
"""Vocab parallel classifier layer for 2D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, in_features: int,
...
...
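The ``weight`` argument of these classifiers lets the output projection reuse an existing parameter instead of allocating its own; a hedged usage sketch, assuming the embedding layer exposes its sharded ``weight``:

embed = Embedding2D(num_embeddings=50304, embedding_dim=1024)
head = Classifier2D(in_features=1024, num_classes=50304, weight=embed.weight)
# both modules now update the same tensor (standard weight tying)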
colossalai/nn/layer/parallel_2p5d/_operation.py
...
...
@@ -100,35 +100,26 @@ def classifier_2p5d(A: Tensor, B: Tensor, bias, tesseract_dim: int, out_shape: T
...], row_rank: int, col_rank: int, row_parallel_mode: ParallelMode, col_parallel_mode: ParallelMode, data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int, tensor_parallel_size: int) -> Tensor:
"""
Classifier
:param a: matrix :math:`A`
:type a: torch.tensor
:param b: matrix :math:`B`
:type b: torch.tensor
:param bias: matrix of bias
:type bias: torch.tensor, optional
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
:type tesseract_dim: int
:param out_shape: shape of output tensor
:type out_shape: tuple
:param row_rank: the rank of row
:type row_rank: int
:param col_rank: the rank of column
:type col_rank: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
r
"""Classifier.
Args:
A (:class:`torch.tensor`): matrix :math:`A`.
B (:class:`torch.tensor`): matrix :math:`B`.
bias (:class:`torch.tensor`): matrix of bias.
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int): the rank of row.
col_rank (int): the rank of column.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank.
pipeline_parallel_size (int): pipeline parallel size.
tensor_parallel_size (int): tensor parallel size.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _Classifier2p5D.apply(A, B, bias, tesseract_dim, out_shape, row_rank, col_rank, row_parallel_mode, col_parallel_mode, data_parallel_rank, pipeline_parallel_rank, pipeline_parallel_size,
...
...
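How the various ranks relate on the tesseract is easiest to see with concrete numbers; an illustrative sketch under an assumed row-major layout (the library's actual mapping may differ):

tesseract_dim, tesseract_dep = 2, 2                  # q = 2, d = 2 -> 8 devices
rank = 5                                             # example global rank
dep_rank = rank // (tesseract_dim * tesseract_dim)                  # -> 1
row_rank = rank % (tesseract_dim * tesseract_dim) // tesseract_dim  # -> 0
col_rank = rank % tesseract_dim                                     # -> 1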
@@ -136,35 +127,26 @@ def classifier_2p5d(A: Tensor, B: Tensor, bias, tesseract_dim: int, out_shape: T
class Matmul_AB_2p5D(torch.autograd.Function):
"""
Matrix multiplication for :math:`C = AB`
:param a: matrix :math:`A`
:type a: torch.tensor
:param b: matrix :math:`B`
:type b: torch.tensor
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
:type tesseract_dim: int
:param out_shape: shape of output tensor
:type out_shape: tuple
:param row_rank: the rank of row
:type row_rank: int
:param col_rank: the rank of column
:type col_rank: int
:param dep_rank: the rank of depth
:type dep_rank: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
r
"""Matrix multiplication for :math:`C = AB`.
Args:
A (:class:`torch.tensor`): matrix :math:`A`.
B (:class:`torch.tensor`): matrix :math:`B`.
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int): the rank of row.
col_rank (int): the rank of column.
dep_rank (int): the rank of depth.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank.
pipeline_parallel_size (int): pipeline parallel size.
tensor_parallel_size (int): tensor parallel size.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
@staticmethod
...
...
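The kernel above distributes the classic block decomposition C[i][j] = sum_k A[i][k] @ B[k][j]; a single-process check of that identity in plain PyTorch:

import torch
q = 2
A, B = torch.randn(4, 6), torch.randn(6, 8)
Ab = [list(t.chunk(q, dim=1)) for t in A.chunk(q, dim=0)]   # q x q blocks of A
Bb = [list(t.chunk(q, dim=1)) for t in B.chunk(q, dim=0)]   # q x q blocks of B
C01 = sum(Ab[0][k] @ Bb[k][1] for k in range(q))            # block C[0][1]
assert torch.allclose(C01, (A @ B).chunk(q, 0)[0].chunk(q, 1)[1], atol=1e-5)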
@@ -270,35 +252,26 @@ class Matmul_AB_2p5D(torch.autograd.Function):
class Matmul_ABT_2p5D(torch.autograd.Function):
"""
Matrix multiplication for :math:`C = AB^T`
:param a: matrix :math:`A`
:type a: torch.tensor
:param b: matrix :math:`B`
:type b: torch.tensor
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
:type tesseract_dim: int
:param out_shape: shape of output tensor
:type out_shape: tuple
:param row_rank: the rank of row
:type row_rank: int
:param col_rank: the rank of column
:type col_rank: int
:param dep_rank: the rank of depth
:type dep_rank: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
r
"""Matrix multiplication for :math:`C = AB^T`.
Args:
A (:class:`torch.tensor`): matrix :math:`A`.
B (:class:`torch.tensor`): matrix :math:`B`.
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int): the rank of row.
col_rank (int): the rank of column.
dep_rank (int): the rank of depth.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank.
pipeline_parallel_size (int): pipeline parallel size.
tensor_parallel_size (int): tensor parallel size.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
@staticmethod
...
...
@@ -409,35 +382,26 @@ class Matmul_ABT_2p5D(torch.autograd.Function):
class Matmul_ATB_2p5D(torch.autograd.Function):
"""
Matrix multiplication for :math:`C = A^TB`
:param a: matrix :math:`A`
:type a: torch.tensor
:param b: matrix :math:`B`
:type b: torch.tensor
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
:type tesseract_dim: int
:param out_shape: shape of output tensor
:type out_shape: tuple
:param row_rank: the rank of row
:type row_rank: int
:param col_rank: the rank of column
:type col_rank: int
:param dep_rank: the rank of depth
:type dep_rank: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
r
"""Matrix multiplication for :math:`C = A^TB`
Args:
A (:class:`torch.tensor`): matrix :math:`A`.
B (:class:`torch.tensor`): matrix :math:`B`.
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
out_shape (:class:`torch.size`): shape of output tensor.
row_rank (int): the rank of row.
col_rank (int): the rank of column.
dep_rank (int): the rank of depth.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank.
pipeline_parallel_size (int): pipeline parallel size.
tensor_parallel_size (int): tensor parallel size.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
@staticmethod
...
...
@@ -629,36 +593,27 @@ def add_bias_2p5d(input: Tensor, bias: Tensor, output_size_per_partition: int, t
col_rank: int, dep_rank: int, col_parallel_mode: ParallelMode, skip_bias_add: bool, data_parallel_rank: int, pipeline_parallel_rank: int, pipeline_parallel_size: int, tensor_parallel_size: int) -> Tensor:
"""
Matrix add bias: :math:`C = A + b`
:param input: matrix :math:`A`
:type input: torch.tensor
:param bias: matrix :math:`b`
:type bias: torch.tensor
:param output_size_per_partition: output size in each partition
:type output_size_per_partition: int
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
:type tesseract_dim: int
:param row_rank: the rank of row
:type row_rank: int
:param col_rank: the rank of column
:type col_rank: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param skip_bias_add: If set to ``True``, it will skip bias add for linear layer,
which is preserved for kernel fusion
:type skip_bias_add: bool
:param data_parallel_rank: data parallel rank
:type data_parallel_rank: int
:param pipeline_parallel_rank: pipeline parallel rank
:type pipeline_parallel_rank: int
:param pipeline_parallel_size: pipeline parallel size
:type pipeline_parallel_size: int
:param tensor_parallel_size: tensor parallel size
:type tensor_parallel_size: int
r
"""Matrix add bias: :math:`C = A + b`.
Args:
input (:class:`torch.tensor`): matrix :math:`A`.
bias (:class:`torch.tensor`): matrix of bias :math:`b`.
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
output_size_per_partition (int): output size in each partition.
row_rank (int): the rank of row.
col_rank (int): the rank of column.
dep_rank (int): the rank of depth.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
skip_bias_add (bool): If set to ``True``, it will skip bias add for linear layer,
which is preserved for kernel fusion.
data_parallel_rank (int): data parallel rank.
pipeline_parallel_rank (int): pipeline parallel rank.
pipeline_parallel_size (int): pipeline parallel size.
tensor_parallel_size (int): tensor parallel size.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _Add_Bias_2p5D.apply(input, bias, output_size_per_partition, tesseract_dim, row_rank, col_rank, dep_rank, col_parallel_mode, skip_bias_add, data_parallel_rank, pipeline_parallel_rank,
...
...
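What ``skip_bias_add`` buys is easiest to see outside the parallel machinery; a hedged single-process sketch of the pattern it enables:

def linear_maybe_fused(x, weight, bias, skip_bias_add=False):
    out = x @ weight.t()
    if skip_bias_add:
        return out, bias        # caller fuses the bias add into a later kernel
    return out + bias, None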
@@ -666,19 +621,18 @@ def add_bias_2p5d(input: Tensor, bias: Tensor, output_size_per_partition: int, t
class _Layernorm2p5D(torch.autograd.Function):
"""
Layernorm
:param input: input maxtrix
:type input: torch.tensor
:param E_x: mean
:type E_x: torch.tensor
:param Var_x: variance
:type Var_x: torch.tensor
:param hidden_size: hidden size
:type hidden_size: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
r
"""Layernorm.
Args:
input (:class:`torch.tensor`): input matrix.
E_x (:class:`torch.tensor`): mean.
Var_x (:class:`torch.tensor`): variance.
hidden_size (int): hidden size.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
@staticmethod
...
...
@@ -718,19 +672,18 @@ class _Layernorm2p5D(torch.autograd.Function):
def layernorm_2p5d(input: Tensor, E_x: Tensor, Var_x: Tensor, hidden_size: int, row_parallel_mode: ParallelMode) -> Tensor:
"""
Layernorm
:param input: input maxtrix
:type input: torch.tensor
:param E_x: mean
:type E_x: torch.tensor
:param Var_x: variance
:type Var_x: torch.tensor
:param hidden_size: hidden size
:type hidden_size: int
:param row_parallel_mode: row parallel mode
:type row_parallel_mode: colossalai.context.parallel_mode.ParallelMode
r
"""Layernorm.
Args:
input (:class:`torch.tensor`): input matrix.
E_x (:class:`torch.tensor`): mean.
Var_x (:class:`torch.tensor`): variance.
hidden_size (int): hidden size.
row_parallel_mode (:class:`colossalai.context.ParallelMode`): row parallel mode.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _Layernorm2p5D.apply(input, E_x, Var_x, hidden_size, row_parallel_mode)
...
...
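Given the precomputed statistics, the normalization itself is the textbook formula y = (x - E_x) / sqrt(Var_x + eps); a plain-PyTorch sketch (affine weight and bias omitted):

import torch
x = torch.randn(2, 8)
E_x = x.mean(dim=-1, keepdim=True)
Var_x = x.var(dim=-1, unbiased=False, keepdim=True)
y = (x - E_x) / torch.sqrt(Var_x + 1e-5)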
@@ -753,29 +706,31 @@ class _AllGatherTensor2p5D(torch.autograd.Function):
def all_gather_tensor_2p5d(inputs: Tensor, dim: int, col_parallel_mode: ParallelMode) -> Tensor:
"""
all gather the weight of 2.5D parallelism
:param inputs: input maxtrix
:type inputs: torch.tensor
:param dim: dimension of all gather
:type dim: int
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
:type tesseract_dim: int
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
r
"""all gather the weight of 2.5D parallelism.
Args:
inputs (:class:`torch.tensor`): input tensor.
dim (int): dimension of all-gather.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _AllGatherTensor2p5D.apply(inputs, dim, col_parallel_mode)
class SplitFirst(torch.autograd.Function):
"""
:param inputs: input maxtrix
:type inputs: torch.tensor
:param tesseract_dim: dimension of TESSERACT fo 2.5D parallelism
:type tesseract_dim: int
:param col_parallel_mode: column parallel mode
:type col_parallel_mode: colossalai.context.parallel_mode.ParallelMode
r
"""
Args:
inputs (:class:`torch.tensor`): input tensor.
tesseract_dim (int): dimension of TESSERACT for 2.5D parallelism.
col_parallel_mode (:class:`colossalai.context.ParallelMode`): column parallel mode.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
@staticmethod
...
...
@@ -801,16 +756,14 @@ class SplitFirst(torch.autograd.Function):
def split_tensor_2p5d(input_: Tensor, dim: int = 0) -> Tensor:
"""Splits 2P5D tensor in specified dimension across cols
"""Splits 2P5D tensor in specified dimension across cols
.
:param input_: Input tensor
:param dim: Specified dimension in which to split
Args:
input_ (:class:`torch.tensor`): Input tensor.
dim (int): Specified dimension in which to split.
:type input_: torch.Tensor
:type dim: int, optional
:return output: Splitted tensor
:rtype output: torch.Tensor
Returns:
:class:`torch.tensor`: The tensor has been split.
"""
if input_.size(dim) <= 1:
return input_
...
...
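Seen from a single process, the split is just chunking along ``dim``; a hedged sketch, assuming ``tesseract_dim`` and this rank's ``col_rank`` are in scope:

chunks = input_.chunk(tesseract_dim, dim=dim)
local = chunks[col_rank].contiguous()    # each column rank keeps one chunk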
@@ -829,11 +782,15 @@ class _ReduceTensor2p5D(torch.autograd.Function):
def reduce_tensor_2p5d(input_: Tensor, parallel_mode: ParallelMode) -> Tensor:
"""
All-reduce the input.
r
"""All-reduce the input.
:param input_: input tensor
:param parallel_mode: parallel mode
Args:
input_ (:class:`torch.tensor`): Input tensor.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
return _ReduceTensor2p5D.apply(input_, parallel_mode)
...
...
@@ -851,11 +808,16 @@ class _ReduceScatterTensor2p5D(torch.autograd.Function):
def reduce_scatter_tensor_2p5d(input_: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
"""
Reduce-scatter the input.
r
"""Reduce-scatter the input.
:param input_: input tensor
:param parallel_mode: parallel mode
Args:
input_ (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to reduce.
parallel_mode (:class:`colossalai.context.ParallelMode`): The parallel mode tensor used.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
return _ReduceScatterTensor2p5D.apply(input_, dim, parallel_mode)
...
...
@@ -890,12 +852,11 @@ class _RreduceByBatch2p5D(torch.autograd.Function):
def reduce_by_batch_2p5d(input_, reduce_mean: bool = False) -> Tensor:
"""
All-reduce the input from the model parallel region.
r
"""All-reduce the input from the model parallel region.
:param input_: input maxtrix
:type
input_
:
torch.tensor
:param
reduce_mean
: If set to ``True``, it will divide the output by column parallel size, default to False
:type reduce_mean: bool, optional
Args:
input_
(:class:`
torch.tensor
`): input matrix.
reduce_mean
(bool, optional):
If set to ``True``, it will divide the output by column parallel size, default to False.
"""
return _RreduceByBatch2p5D.apply(input_, reduce_mean)
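In raw ``torch.distributed`` terms the ``reduce_mean`` option amounts to the following; a hedged sketch, assuming ``output`` and the column-parallel ``group`` exist:

import torch.distributed as dist
dist.all_reduce(output, op=dist.ReduceOp.SUM, group=group)
output /= dist.get_world_size(group)     # applied only when reduce_mean=True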
colossalai/nn/layer/parallel_2p5d/layers.py
...
...
@@ -23,21 +23,22 @@ from ._utils import assert_tesseract_initialization, get_tesseract_dim_dep_from_
@LAYERS.register_module
class Linear2p5D(ParallelLayer):
"""
Linear layer for 2.5D parallelism
:param in_features: size of each input sample
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r
"""Linear layer for 2.5D parallelism.
Args:
in_features (int): size of each input sample.
out_features (int): size of each output sample.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
skip_bias_add (bool, optional): If set to ``True``, it will skip bias add for linear layer,
which is preserved for kernel fusion, defaults to False.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, in_features: int,
...
...
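A hedged usage sketch of the layer (assumes ``colossalai.launch`` has set up a 2.5D tensor-parallel context, e.g. 8 GPUs with depth 2, so the tesseract process groups exist):

import torch
layer = Linear2p5D(in_features=1024, out_features=4096, bias=True)
out = layer(torch.randn(8, 1024).cuda())  # weight and bias are sharded over the tesseract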
@@ -131,19 +132,16 @@ class Linear2p5D(ParallelLayer):
@LAYERS.register_module
class LayerNorm2p5D(ParallelLayer):
r"""Layer Normalization for 2.5D parallelism.
Args:
normalized_shape (int): input shape from an expected input of size.
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
\times \ldots \times \text{normalized_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-05.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
"""
def __init__(self, normalized_shape: int, eps: float = 1e-05, dtype=None):
super().__init__()
...
...
@@ -204,27 +202,24 @@ class LayerNorm2p5D(ParallelLayer):
@LAYERS.register_module
class PatchEmbedding2p5D(ParallelLayer):
"""
2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param in_chans: number of channels of input image
:type in_chans: int
:param embed_size: size of embedding
:type embed_size: int
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param flatten: whether to flatten output tensor, defaults to True
:type flatten: bool, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
r
"""2D Image to Patch Embedding.
Args:
img_size (int): image size.
patch_size (int): patch size.
in_chans (int): number of channels of input image.
embed_size (int): size of embedding.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
flatten (bool, optional): whether to flatten output tensor, defaults to True.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
position_embed_initializer (:class:`typing.Callable`, optional):
The initializer of position embedding, defaults to zeros initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, img_size: int,
...
...
@@ -306,21 +301,33 @@ class PatchEmbedding2p5D(ParallelLayer):
@LAYERS.register_module
class Embedding2p5D(ParallelLayer):
"""
Embedding for 2.5D parallelism
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
r
"""Embedding for 2.5D parallelism.
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, num_embeddings: int,
...
...
@@ -376,18 +383,31 @@ class Embedding2p5D(ParallelLayer):
class VocabParallelEmbedding2p5D(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :class:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` could be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, num_embeddings: int,
...
...
@@ -455,23 +475,21 @@ class VocabParallelEmbedding2p5D(torch.nn.Module):
@LAYERS.register_module
class Classifier2p5D(ParallelLayer):
"""
Classifier for 2.5D parallelism
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r
"""Classifier for 2.5D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, in_features: int,
...
...
@@ -537,23 +555,21 @@ class Classifier2p5D(ParallelLayer):
@LAYERS.register_module
class VocabParallelClassifier2p5D(ParallelLayer):
"""
Vocab parallel classifier layer for 2.5D parallelism
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r
"""Vocab parallel classifier layer for 2.5D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self, in_features: int,
...
...
colossalai/nn/layer/parallel_3d/_operation.py
...
...
@@ -88,27 +88,22 @@ def linear_3d(input_: Tensor,
input_dim: int = 0, weight_dim: int = -1, output_dim: int = 0) -> Tensor:
"""
Linear layer for 3D parallelism
:param input_: matrix of input
:type input_: torch.tensor
:param weight: matrix of weight
:type weight: torch.tensor
:param bias: matrix of bias
:type bias: torch.tensor, optional
:param input_parallel_mode: input parallel mode
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param weight_parallel_mode: weight parallel mode
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param output_parallel_mode: output parallel mode
:type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param input_dim: dimension of input, defaults to 0
:type input_dim: int, optional
:param weight_dim: dimension of weight, defaults to -1
:type weight_dim: int, optional
:param output_dim: dimension of output, defaults to 0
:type output_dim: int, optional
r
"""Linear layer for 3D parallelism.
Args:
input_ (:class:`torch.tensor`): input matrix.
weight (:class:`torch.tensor`): matrix of weight.
bias (:class:`torch.tensor`): matrix of bias.
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
input_dim (int, optional): dimension of input, defaults to 0.
weight_dim (int, optional): dimension of weight, defaults to -1.
output_dim (int, optional): dimension of output, defaults to 0.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _Linear3D.apply(input_, weight, bias, input_parallel_mode, weight_parallel_mode, output_parallel_mode, input_dim, weight_dim, output_dim)
...
...
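The shape bookkeeping for the 3D linear, written out for a cube of q^3 devices (illustrative values; the kernel gathers along the matching modes so that the local product is well-formed):

import torch
q, b, h, o = 2, 8, 16, 32
local_x = torch.randn(b // q, h // q)   # each device's input shard
local_w = torch.randn(h // q, o // q)   # each device's weight shard
# after the gathers, the local output shard has shape (b // q, o // q)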
@@ -174,21 +169,19 @@ class _Classifier3D(torch.autograd.Function):
def classifier_3d(input_: Tensor, weight: Tensor, bias: Optional[Tensor], input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode, output_parallel_mode: ParallelMode) -> Tensor:
"""
3D parallel classifier
:param input_: matrix of input
:type input_: torch.tensor
:param weight: matrix of weight
:type weight: torch.tensor
:param bias: matrix of bias
:type bias: torch.tensor, optional
:param input_parallel_mode: input parallel mode
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param weight_parallel_mode: weight parallel mode
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param output_parallel_mode: output parallel mode
:type output_parallel_mode: colossalai.context.parallel_mode.ParallelMode
r
"""3D parallel classifier.
Args:
input_ (:class:`torch.tensor`): input matrix.
weight (:class:`torch.tensor`): matrix of weight.
bias (:class:`torch.tensor`): matrix of bias.
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _Classifier3D.apply(input_, weight, bias, input_parallel_mode, weight_parallel_mode, output_parallel_mode)
...
...
@@ -244,48 +237,44 @@ class _Layernorm3D(torch.autograd.Function):
def layernorm_3d(input_: Tensor, weight: Tensor, bias: Tensor, normalized_shape: int, eps: float, input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode, output_parallel_mode: ParallelMode) -> Tensor:
r"""3D parallel Layernorm.
Args:
input_ (:class:`torch.tensor`): input matrix.
weight (:class:`torch.tensor`): matrix of weight.
bias (:class:`torch.tensor`): matrix of bias.
normalized_shape (int): input shape from an expected input of size.
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
\times \ldots \times \text{normalized_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
eps (float): a value added to the denominator for numerical stability.
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
output_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): output parallel mode.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _Layernorm3D.apply(input_, weight, bias, normalized_shape, eps, input_parallel_mode, weight_parallel_mode, output_parallel_mode)
def split_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
"""Splits 3D parallel tensor in specified dimension
r
"""Splits 3D parallel tensor in specified dimension
.
:param tensor: Input tensor
:param dim: Specified dimension in which to split
:param parallel_mode: Parallel mode
:param weight_parallel_mode: Weight p
arallel mode
Args:
tensor (:class:`torch.tensor`): Input tensor.
dim (int): Specified dimension in which to split.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): P
arallel mode
.
:type tensor: torch.Tensor
:type dim: int
:type parallel_mode: colossalai.context.parallel_mode.ParallelMode
Returns:
:class:`torch.tensor`: The tensor has been split.
:return output: Splitted tensor
:rtype output: torch.Tensor
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
if tensor.size(dim) <= 1:
return tensor
...
...
@@ -298,17 +287,20 @@ def split_batch_3d(input_: Tensor,
dim: int = 0, input_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_INPUT, weight_parallel_mode: ParallelMode = ParallelMode.PARALLEL_3D_WEIGHT) -> Tensor:
"""Splits 3D tensor in batch
:param input_: Input tensor
:param dim: Specified dimension in which to split
:param input_parallel_mode: Input parallel mode
:param weight_parallel_mode: Weight parallel mode
:type input_: torch.Tensor
:type dim: int, optional
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode, optional
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode, optional
:return output: Splitted tensor
:rtype output: torch.Tensor
r
"""Splits 3D tensor in batch.
Args:
input_ (:class:`torch.tensor`): Input tensor.
dim (int): Specified dimension in which to split.
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`, optional): weight parallel mode.
Returns:
:class:`torch.tensor`: The split tensor.
Note:
The parallel_mode should be one of the modes defined in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
if input_.size(dim) <= 1:
return input_
...
...
@@ -333,11 +325,15 @@ class _ReduceTensor3D(torch.autograd.Function):
def reduce_tensor_3d(tensor: Tensor, parallel_mode: ParallelMode) -> Tensor:
"""
All-reduce the input
r
"""All-reduce the input
Args:
tensor (:class:`torch.tensor`): Input tensor.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
:param tensor: Input tensor
:param parallel_mode: Parallel mode
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _ReduceTensor3D.apply(tensor, parallel_mode)
...
...
@@ -358,11 +354,16 @@ class _AllGatherTensor3D(torch.autograd.Function):
def all_gather_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
"""
All-reduce the gradient in backward pass.
r
"""All-reduce the gradient in backward pass.
Args:
tensor (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to gather.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
:param tensor: Input tensor
:param parallel_mode: Parallel mode
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
return _AllGatherTensor3D.apply(tensor, dim, parallel_mode)
...
...
@@ -382,12 +383,16 @@ class _ReduceScatterTensor3D(torch.autograd.Function):
def reduce_scatter_tensor_3d(tensor: Tensor, dim: int, parallel_mode: ParallelMode) -> Tensor:
"""
Reduce-scatter the input.
r
"""Reduce-scatter the input.
Args:
tensor (:class:`torch.tensor`): Input tensor.
dim (int): Dimension to scatter.
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): Parallel mode.
:param tensor: Input tensor
:param dim: Dimension to scatter
:param
parallel_mode
: P
arallel
mode
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `
parallel_mode
<https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/p
arallel
_
mode
.py>`_
"""
return _ReduceScatterTensor3D.apply(tensor, dim, parallel_mode)
...
...
@@ -423,34 +428,33 @@ def reduce_by_batch_3d(tensor: Tensor,
input_parallel_mode: ParallelMode, weight_parallel_mode: ParallelMode, reduce_mean: bool = False) -> Tensor:
"""
All-reduce the input from the model parallel region.
:param input_: input maxtrix
:type input_: torch.tensor
:param input_parallel_mode: input parallel mode
:type input_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param weight_parallel_mode: weight parallel mode
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param reduce_mean: If set to ``True``, it will divide the output by (input parallel size * weight parallel size),
default to False
:type reduce_mean: int, optional
r
"""All-reduce the input from the model parallel region.
Args:
input_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weight parallel mode.
reduce_mean (bool, optional): If set to ``True``, it will divide the output by
(input parallel size * weight parallel size), default to False.
Note:
The parallel_mode should be concluded in ``ParallelMode``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_
"""
return _ReduceByBatch3D.apply(tensor, input_parallel_mode, weight_parallel_mode, reduce_mean)
class _BroadcastWeight3D_FromDiagonal(torch.autograd.Function):
"""
broadcast weight from diagonal
:param
input_: input ma
x
trix
:type
input_
: torch.tensor
:param input_parallel_mode: inpu
t parallel mode
:type in
put_parallel_mode
:
colossalai.context.parallel_mode.ParallelMode
:param weight_parallel_mode: weight parallel mode
:type weight_parallel_mode: colossalai.context.parallel_mode.ParallelMode
:param weight_p
arallel
_m
ode
: output parallel mode
:type weight_parallel_mode: colossalai.
context
.
parallel_mode.
ParallelMode
r
"""
broadcast weight from diagonal.
Args:
input_
(:class:`torch.tensor`)
: input matrix
.
input_
parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): input parallel mode.
weight_parallel_mode (:class:`colossalai.context.parallel_mode.ParallelMode`): weigh
t parallel mode
.
out
put_parallel_mode
(:class:`
colossalai.context.parallel_mode.ParallelMode
`): output parallel mode.
Note:
The parallel_mode should be concluded in ``P
arallel
M
ode
``. More details about ``ParallelMode`` could be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/
context
/
parallel_mode.
py>`_
"""
@staticmethod
...
...
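A hedged sketch of the "broadcast from diagonal" idea with raw ``torch.distributed`` (``diagonal_src`` is a hypothetical helper; the real class derives the source rank from the 3D process groups):

import torch.distributed as dist
src = diagonal_src(input_rank)                       # hypothetical: rank whose input coord == weight coord
dist.broadcast(weight, src=src, group=output_group)  # output_group assumed to exist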
colossalai/nn/layer/parallel_3d/layers.py
...
...
@@ -24,19 +24,16 @@ from ._utils import get_depth_from_env, get_last_group, get_parallel_mode_from_e
@LAYERS.register_module
class LayerNorm3D(ParallelLayer):
r"""Layer Normalization for 3D parallelism.
Args:
normalized_shape (int): input shape from an expected input of size.
:math:`[* \times \text{normalized_shape}[0] \times \text{normalized_shape}[1]
\times \ldots \times \text{normalized_shape}[-1]]`
If a single integer is used, it is treated as a singleton list, and this module will
normalize over the last dimension which is expected to be of that specific size.
eps (float, optional): a value added to the denominator for numerical stability, defaults to 1e-12.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
"""
def __init__(self, normalized_shape: int, eps: float = 1e-12, dtype=None):
...
...
@@ -71,21 +68,20 @@ class LayerNorm3D(ParallelLayer):
@LAYERS.register_module
class Linear3D(ParallelLayer):
"""
Linear layer for 3D parallelism
:param in_features: size of each input sample
:type in_features: int
:param out_features: size of each output sample
:type out_features: int
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r
"""Linear layer for 3D parallelism.
Args:
in_features (int): size of each input sample.
out_features (int): size of each output sample.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
...
...
@@ -146,23 +142,21 @@ class Linear3D(ParallelLayer):
@LAYERS.register_module
class Classifier3D(ParallelLayer):
"""
Classifier for 3D parallelism
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r
"""Classifier for 3D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
...
...
@@ -225,23 +219,21 @@ class Classifier3D(ParallelLayer):
@LAYERS.register_module
class VocabParallelClassifier3D(ParallelLayer):
"""
Vocab parallel classifier layer for 2D parallelism
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to ``True``
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r"""Vocab parallel classifier layer for 3D parallelism.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
...
...
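Construction is analogous to Classifier3D; per the docstring, the classifier weight is additionally split along the vocabulary dimension. A sketch, under the same assumptions:

from colossalai.nn.layer.parallel_3d.layers import VocabParallelClassifier3D

head = VocabParallelClassifier3D(in_features=768, num_classes=50304, bias=False)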
@@ -311,27 +303,24 @@ class VocabParallelClassifier3D(ParallelLayer):
@LAYERS.register_module
class PatchEmbedding3D(ParallelLayer):
"""
2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param in_chans: number of channels of input image
:type in_chans: int
:param embed_size: size of embedding
:type embed_size: int
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param flatten: whether to flatten output tensor, defaults to True
:type flatten: bool, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
r"""2D Image to Patch Embedding.
Args:
img_size (int): image size.
patch_size (int): patch size.
in_chans (int): number of channels of input image.
embed_size (int): size of embedding.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
flatten (bool, optional): whether to flatten output tensor, defaults to True.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
position_embed_initializer (:class:`typing.Callable`, optional):
The initializer of position embedding, defaults to zeros initializer.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
...
...
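A ViT-style construction sketch (illustrative sizes; 3D parallel context assumed):

from colossalai.nn.layer.parallel_3d.layers import PatchEmbedding3D

# 224x224 RGB images cut into 16x16 patches, embedded into 768 dimensions
embed = PatchEmbedding3D(img_size=224, patch_size=16, in_chans=3, embed_size=768)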
@@ -419,21 +408,33 @@ class PatchEmbedding3D(ParallelLayer):
@LAYERS.register_module
class Embedding3D(ParallelLayer):
"""
Embedding for 3D parallelism
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
r"""Embedding for 3D parallelism.
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` can be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
...
...
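A lookup sketch using only parameters documented above (sizes illustrative; 3D parallel context assumed):

import torch
from colossalai.nn.layer.parallel_3d.layers import Embedding3D

emb = Embedding3D(num_embeddings=50304, embedding_dim=768, padding_idx=0)
tokens = torch.randint(0, 50304, (8, 512)).cuda()   # (batch, seq_len) token indices
hidden = emb(tokens)                                # embedding lookup, sharded across the 3D mesh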
@@ -491,20 +492,33 @@ class Embedding3D(ParallelLayer):
@LAYERS.register_module
class VocabParallelEmbedding3D(torch.nn.Module):
"""Embedding parallelized in the vocabulary dimension.
:param num_embeddings: number of embeddings
:type num_embeddings: int
:param embedding_dim: dimension of embedding
:type embedding_dim: int
:param padding_idx: index of padding, defaults to None
:type padding_idx: int, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to normal initializer
:type weight_initializer: typing.Callable, optional
:param args: Args used in F.embedding
:param kwargs: Kwargs used in F.embedding
r"""Embedding parallelized in the vocabulary dimension.
Args:
num_embeddings (int): number of embeddings.
embedding_dim (int): dimension of embedding.
padding_idx (int, optional): If specified, the entries at padding_idx do not contribute to the gradient;
therefore, the embedding vector at padding_idx is not updated during training,
i.e. it remains as a fixed “pad”, defaults to None.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to normal initializer.
The ``args`` and ``kwargs`` used in :func:`torch.nn.functional.embedding` should contain:
::
max_norm (float, optional): If given, each embedding vector with norm larger than max_norm is
renormalized to have norm max_norm. Note: this will modify weight in-place.
norm_type (float, optional): The p of the p-norm to compute for the max_norm option. Default 2.
scale_grad_by_freq (bool, optional): If given, this will scale gradients by the inverse
of frequency of the words in the mini-batch. Default False.
sparse (bool, optional): If True, gradient w.r.t. weight will be a sparse tensor. Default False.
More details about ``args`` and ``kwargs`` can be found in
`Embedding <https://pytorch.org/docs/stable/generated/torch.nn.functional.embedding.html#torch.nn.functional.embedding>`_.
For more details about ``initializer``, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
...
...
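The API mirrors Embedding3D; the difference is that the vocabulary itself is partitioned across ranks. A one-line sketch under the same assumptions:

from colossalai.nn.layer.parallel_3d.layers import VocabParallelEmbedding3D

emb = VocabParallelEmbedding3D(num_embeddings=50304, embedding_dim=768)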
colossalai/nn/layer/parallel_sequence/layers.py
...
...
@@ -24,14 +24,13 @@ class TransformerSelfAttentionRing(nn.Module):
Self-attention layer takes input with size [b, s, h]
and returns output of the same size.
:param hidden_size: hidden size
:type hidden_size: int
:param kv_channels: channels of key/value tensor
:type kv_channels: int
:param num_attention_heads: number of attention heads
:type num_attention_heads: int
:param attention_dropout: dropout probability for attention layer
:type attention_dropout: float
Args:
hidden_size (int): hidden size.
num_attention_heads (int): number of attention heads.
attention_dropout (float): dropout probability for attention layer.
attention_mask_func (:class:`typing.Callable`): Mask function to be applied.
layer_number (int): number of layers.
"""
def __init__(self,
...
...
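A construction sketch using the keyword names from the Args list above; the exact signature (argument order, any parameters elided by this diff) should be treated as an assumption, and an initialized sequence-parallel context is required:

from colossalai.nn.layer.parallel_sequence.layers import TransformerSelfAttentionRing

def mask_func(attention_scores, attention_mask):
    # hypothetical mask function: suppress masked positions before softmax
    return attention_scores.masked_fill(attention_mask, -10000.0)

attn = TransformerSelfAttentionRing(hidden_size=1024, num_attention_heads=16,
                                    attention_dropout=0.1,
                                    attention_mask_func=mask_func, layer_number=1)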
colossalai/nn/layer/utils/common.py
...
...
@@ -38,11 +38,16 @@ class CheckpointModule(nn.Module):
def divide(numerator, denominator):
"""Only allow exact division
"""Only allow exact division
.
:param numerator: Numerator of the division
:param denominator: Denominator of the division
Args:
numerator (int): Numerator of the division.
denominator (int): Denominator of the division.
Returns:
int: the result of exact division.
"""
assert denominator != 0, 'denominator can not be zero'
assert numerator % denominator == 0, \
    '{} is not divisible by {}'.format(numerator, denominator)
return numerator // denominator
...
...
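The helper is self-contained, so its behavior is easy to pin down:

from colossalai.nn.layer.utils.common import divide

heads_per_rank = divide(16, 4)   # -> 4
divide(16, 5)                    # AssertionError: 16 is not divisible by 5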
colossalai/nn/layer/vanilla/layers.py
...
...
@@ -15,11 +15,16 @@ from ..utils import to_2tuple
def drop_path(x, drop_prob: float = 0., training: bool = False):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
This is the same as the DropConnect impl I created for EfficientNet, etc networks, however,
the original name is misleading as 'Drop Connect' is a different form of dropout in a separate paper...
See discussion: https://github.com/tensorflow/tpu/issues/494#issuecomment-532968956 ... I've opted for
changing the layer and argument names to 'drop path' rather than mix DropConnect as a layer name and use
'survival rate' as the argument.
Args:
drop_prob (float, optional): probability of dropping path, defaults to 0.0.
training (bool, optional): whether in training mode, defaults to False.
"""
if drop_prob == 0. or not training:
    return x
...
...
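A quick sketch of the functional form (the DropPath module below is a thin wrapper around it):

import torch
from colossalai.nn.layer.vanilla.layers import drop_path

x = torch.randn(4, 197, 768)                    # (batch, tokens, hidden)
y = drop_path(x, drop_prob=0.1, training=True)  # drops whole samples, rescales survivors by 1/(1-p)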
@@ -35,6 +40,9 @@ class DropPath(nn.Module):
"""
Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Adapted from https://github.com/rwightman/pytorch-image-models/blob/master/timm/models/layers/drop.py
Args:
drop_prob (float, optional): probability of dropping path, defaults to None.
"""
def __init__(self, drop_prob=None):
...
...
@@ -46,7 +54,19 @@ class DropPath(nn.Module):
class WrappedDropout(nn.Module):
"""Same as torch.nn.Dropout. But it is wrapped with the context of seed manager.
r"""Same as torch.nn.Dropout. But it is wrapped with the context of seed manager. During training, randomly zeroes
some elements of the input tensor with probability p using samples from a Bernoulli distribution. Each
channel will be zeroed out independently on every forward call. Furthermore, the outputs are scaled by a factor of
1/(1-p) during training. This means that during evaluation the module simply computes an identity function.
Args:
p (float, optional): probability of an element to be zeroed, defaults to 0.5.
inplace (bool, optional): whether to do dropout in-place, defaults to False.
mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
def __init__(self, p: float = 0.5, inplace: bool = False, mode=None):
...
...
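A construction sketch; ParallelMode.TENSOR is an illustrative choice, and the seed manager must already hold a seed for that mode (set up by colossalai.launch):

from colossalai.context import ParallelMode
from colossalai.nn.layer.vanilla.layers import WrappedDropout

# dropout whose RNG state comes from the seed manager for the given mode
drop = WrappedDropout(p=0.1, mode=ParallelMode.TENSOR)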
@@ -74,8 +94,16 @@ class WrappedDropout(nn.Module):
class WrappedDropPath(nn.Module):
"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
r"""Drop paths (Stochastic Depth) per sample (when applied in main path of residual blocks).
Here, it is wrapped with the context of seed manager.
Args:
p (float, optional): probability of dropping path, defaults to 0.0.
mode (:class:`colossalai.context.ParallelMode`): The chosen parallel mode.
Note:
The parallel_mode should be included in ``ParallelMode``. More details about ``ParallelMode`` can be found
in `parallel_mode <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/context/parallel_mode.py>`_.
"""
def __init__(self, p: float = 0., mode=None):
...
...
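The same pattern applies here, with stochastic depth instead of element dropout (same assumptions as the WrappedDropout sketch):

from colossalai.context import ParallelMode
from colossalai.nn.layer.vanilla.layers import WrappedDropPath

stochastic_depth = WrappedDropPath(p=0.1, mode=ParallelMode.TENSOR)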
@@ -101,27 +129,25 @@ class WrappedDropPath(nn.Module):
@LAYERS.register_module
class VanillaPatchEmbedding(nn.Module):
"""
r"""
2D Image to Patch Embedding
:param img_size: image size
:type img_size: int
:param patch_size: patch size
:type patch_size: int
:param in_chans: number of channels of input image
:type in_chans: int
:param embed_size: size of embedding
:type embed_size: int
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param flatten: whether to flatten output tensor, defaults to True
:type flatten: bool, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
:param position_embed_initializer: The intializer of position embedding, defaults to zero
:type position_embed_initializer: typing.Callable, optional
Args:
img_size (int): image size.
patch_size (int): patch size.
in_chans (int): number of channels of input image.
embed_size (int): size of embedding.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
flatten (bool, optional): whether to flatten output tensor, defaults to True.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
position_embed_initializer (:class:`typing.Callable`, optional):
The initializer of position embedding, defaults to zeros initializer.
For more details about initializer, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
...
...
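Unlike the 3D variant, this layer needs no parallel context, so a sketch can run stand-alone (sizes illustrative):

import torch
from colossalai.nn.layer.vanilla.layers import VanillaPatchEmbedding

embed = VanillaPatchEmbedding(img_size=224, patch_size=16, in_chans=3, embed_size=768)
patches = embed(torch.randn(2, 3, 224, 224))   # output shape depends on flatten and any cls/position handling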
@@ -174,23 +200,21 @@ class VanillaPatchEmbedding(nn.Module):
@LAYERS.register_module
class VanillaClassifier(nn.Module):
"""
Dense linear classifier
:param in_features: size of each input sample
:type in_features: int
:param num_classes: number of classes
:type num_classes: int
:param weight: weight of the classifier, defaults to True
:type weight: torch.nn.Parameter, optional
:param bias: If set to ``False``, the layer will not learn an additive bias, defaults to True
:type bias: bool, optional
:param dtype: The dtype of parameters, defaults to None
:type dtype: torch.dtype, optional
:param weight_initializer: The intializer of weight, defaults to kaiming uniform initializer
:type weight_initializer: typing.Callable, optional
:param bias_initializer: The intializer of bias, defaults to xavier uniform initializer
:type bias_initializer: typing.Callable, optional
r"""Dense linear classifier.
Args:
in_features (int): size of each input sample.
num_classes (int): number of classes.
weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
weight_initializer (:class:`typing.Callable`, optional):
The initializer of weight, defaults to kaiming uniform initializer.
bias_initializer (:class:`typing.Callable`, optional):
The initializer of bias, defaults to xavier uniform initializer.
For more details about initializer, please refer to
`init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
"""
def __init__(self,
...
...
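A stand-alone sketch (sizes illustrative):

import torch
from colossalai.nn.layer.vanilla.layers import VanillaClassifier

clf = VanillaClassifier(in_features=768, num_classes=1000)
logits = clf(torch.randn(8, 768))   # (8, 1000)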
colossalai/nn/layer/wrapper/lambda_wrapper.py
...
...
@@ -9,12 +9,11 @@ from colossalai.registry import LAYERS
@LAYERS.register_module
class LambdaWrapper(nn.Module):
"""Wrap a function to nn.Module, which takes a config of layers and can fully access them
"""Wrap a function to nn.Module, which takes a config of layers and can fully access them
.
:param func: User customed function
:type func: Callable
:param layers_cfg: Config of layers, defaults to None
:type layers_cfg: dict, optional
Args:
func (``Callable``): User-defined function.
layers_cfg (dict, optional): Config of layers, defaults to None.
"""
def __init__(self, func, layers_cfg: dict = None):
...
...
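A construction sketch only; how ``func`` is invoked at forward time is left to the implementation, so no call is shown:

from colossalai.nn.layer.wrapper.lambda_wrapper import LambdaWrapper

def my_func(*args, **kwargs):
    ...   # placeholder user function

wrapper = LambdaWrapper(my_func, layers_cfg=None)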
colossalai/nn/loss/loss_1d.py
...
...
@@ -86,12 +86,10 @@ class _VocabParallelCrossEntropy1D(torch.autograd.Function):
@LOSSES.register_module
class VocabParallelCrossEntropyLoss1D(_Loss):
"""
Vocab parallel cross entropy loss for 1D parallelism
:param reduction: whether to average the loss, defaults to True
"""Vocab parallel cross entropy loss for 1D parallelism.
:type reduction: bool, optional
Args:
reduction (bool, optional): whether to average the loss, defaults to True.
"""
def __init__(self, reduction=True):
...
...
@@ -99,10 +97,11 @@ class VocabParallelCrossEntropyLoss1D(_Loss):
self.reduction_mean = reduction
def forward(self, logits, targets):
"""Calculate loss between logits and targets
"""Calculate loss between logits and targets
.
:param logits: Output logits of model
:param targets: True targets from data
Args:
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
"""
loss = _VocabParallelCrossEntropy1D.apply(logits, targets)
if self.reduction_mean:
...
...
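A usage sketch; the logits are assumed to come from a 1D vocab-parallel output layer inside an initialized parallel context:

from colossalai.nn.loss.loss_1d import VocabParallelCrossEntropyLoss1D

criterion = VocabParallelCrossEntropyLoss1D(reduction=True)
# `logits` must be the vocab-sharded output of the 1D-parallel head;
# `targets` are ordinary class indices over the full vocabulary
loss = criterion(logits, targets)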
colossalai/nn/loss/loss_2d.py
...
...
@@ -13,14 +13,22 @@ from torch.nn.modules.loss import _Loss
@LOSSES.register_module
class CrossEntropyLoss2D(_Loss):
"""
Cross entropy loss for 2D parallelism
r
"""Cross entropy loss for 2D parallelism
Args:
reduction (bool, optional): whether to average the loss, defaults to True.
The ``args`` and ``kwargs`` should include parameters below:
::
:param reduction: whether to average the loss, defaults to True
:param args: Args for loss function
:param kwargs: Kwargs for loss function
weight (Tensor, optional)
size_average (bool, optional)
ignore_index (int, optional)
reduce (bool, optional)
label_smoothing (float, optional)
:type reduction: bool, optional
More details about args, kwargs and torch.nn.functional.cross_entropy could be found in
`Cross_entropy <https://pytorch.org/docs/stable/generated/torch.nn.functional.cross_entropy.html#torch.nn.functional.cross_entropy>`_.
"""
def __init__(self, reduction=True, *args, **kwargs):
...
...
@@ -31,10 +39,14 @@ class CrossEntropyLoss2D(_Loss):
self.loss_kwargs = kwargs
def forward(self, logits, targets):
"""Calculate loss between logits and targets
"""Calculate loss between logits and targets
.
:param logits: Output logits of model
:param targets: True targets from data
Args:
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
Returns:
float: the loss between logits and targets.
"""
targets = split_tensor_2d(targets)
loss = cross_entropy(logits, targets, reduction='none', *self.loss_args, **self.loss_kwargs)
...
...
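A sketch showing how extra kwargs flow through to torch.nn.functional.cross_entropy (a 2D parallel context is assumed at forward time):

from colossalai.nn.loss.loss_2d import CrossEntropyLoss2D

# label_smoothing is forwarded to torch.nn.functional.cross_entropy
criterion = CrossEntropyLoss2D(reduction=True, label_smoothing=0.1)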
@@ -116,12 +128,10 @@ class _VocabParallelCrossEntropy2D(torch.autograd.Function):
@LOSSES.register_module
class VocabParallelCrossEntropyLoss2D(_Loss):
"""
Vocab parallel cross entropy loss for 2D parallelism
:param reduction: whether to average the loss, defaults to True
"""Vocab parallel cross entropy loss for 2D parallelism.
:type reduction: bool, optional
Args:
reduction (bool, optional): whether to average the loss, defaults to True.
"""
def __init__(self, reduction=True):
...
...
@@ -129,10 +139,11 @@ class VocabParallelCrossEntropyLoss2D(_Loss):
self.reduction_mean = reduction
def forward(self, logits, targets):
"""Calculate loss between logits and targets
"""Calculate loss between logits and targets
.
:param logits: Output logits of model
:param targets: True targets from data
Args:
logits (:class:`torch.tensor`): Predicted unnormalized scores (often referred to as logits).
targets (:class:`torch.tensor`): Ground truth class indices or class probabilities.
"""
targets = split_tensor_2d(targets)
loss = _VocabParallelCrossEntropy2D.apply(
...
...
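As with the 1D variant, construction is a one-liner; the inputs must come from a 2D-parallel model:

from colossalai.nn.loss.loss_2d import VocabParallelCrossEntropyLoss2D

criterion = VocabParallelCrossEntropyLoss2D()   # averages the loss by default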