Commit 653b0a62 authored by zbian, committed by アマデウス

added skip_bias_add for non-tp linear

parent e5b1a0c9

import inspect
import math
from typing import Callable

from torch import dtype, nn

from colossalai.utils import get_current_device

from ... import init as init
from ..parallel_1d import *
from ..parallel_2d import *
from ..parallel_2p5d import *
from ..parallel_3d import *
from ..utils import get_tensor_parallel_mode
from ..vanilla import *
from ._utils import ColossalaiModule

_parallel_linear = {None: VanillaLinear, '1d': Linear1D, '2d': Linear2D, '2.5d': Linear2p5D, '3d': Linear3D}

_parallel_classifier = {
    None: VanillaClassifier,
    '1d': Classifier1D,
    '2d': Classifier2D,
    '2.5d': Classifier2p5D,
    '3d': Classifier3D
}

_vocab_parallel_classifier = {
    '1d': VocabParallelClassifier1D,
    '2d': VocabParallelClassifier2D,
    '2.5d': VocabParallelClassifier2p5D,
    '3d': VocabParallelClassifier3D
}

class Linear(ColossalaiModule):
    """Linear layer of colossalai.

    Args:
        in_features (int): size of each input sample.
        out_features (int): size of each output sample.
        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
        weight_initializer (:class:`typing.Callable`, optional):
            The initializer of weight, defaults to kaiming uniform initializer.
        bias_initializer (:class:`typing.Callable`, optional):
            The initializer of bias, defaults to xavier uniform initializer.

    Note: ``kwargs`` takes different parameters depending on the parallelism mode.
    The ``kwargs`` may contain the parameters below:
    ::

        Linear1D:
            gather_output: bool (optional, default to be false)
            skip_bias_add: bool (optional, default to be false)
        Linear2D:
            skip_bias_add: bool (optional, default to be false)
        Linear2p5D:
            skip_bias_add: bool (optional, default to be false)
        Linear3D:
            None

    For more details about ``initializer``, please refer to
    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
    """
    def __init__(self,
                 in_features: int,
                 out_features: int,
                 bias: bool = True,
                 dtype: dtype = None,
                 weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
                 **kwargs) -> None:
        tensor_parallel = get_tensor_parallel_mode()
        linear_cls = _parallel_linear[tensor_parallel]
        gather_output = kwargs.pop('gather_output', None)
        if 'gather_output' in inspect.signature(linear_cls.__init__).parameters.keys():  # gather_output arg is available
            kwargs['gather_output'] = gather_output
        layer = linear_cls(
            in_features,
            out_features,
            bias=bias,
            dtype=dtype,
            weight_initializer=weight_initializer,
            bias_initializer=bias_initializer,
            **kwargs,
        )
        super().__init__(layer)
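
The constructor above forwards ``gather_output`` only when the selected linear class actually declares that parameter, so callers can pass the same keyword arguments regardless of the tensor parallel mode. Below is a minimal, self-contained sketch of that inspection pattern; the class and function names are hypothetical illustrations, not ColossalAI APIs.

import inspect

class GatherLinear:  # hypothetical: accepts gather_output
    def __init__(self, in_features, out_features, gather_output=False):
        self.gather_output = gather_output

class PlainLinear:  # hypothetical: does not accept gather_output
    def __init__(self, in_features, out_features):
        pass

def build(linear_cls, in_features, out_features, **kwargs):
    gather_output = kwargs.pop('gather_output', None)
    # Forward the kwarg only if the target constructor declares it.
    if 'gather_output' in inspect.signature(linear_cls.__init__).parameters:
        kwargs['gather_output'] = gather_output
    return linear_cls(in_features, out_features, **kwargs)

print(build(GatherLinear, 4, 8, gather_output=True).gather_output)  # True
build(PlainLinear, 4, 8, gather_output=True)  # no TypeError: kwarg was dropped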

class Classifier(ColossalaiModule):
    """Classifier layer of colossalai.

    Args:
        in_features (int): size of each input sample.
        num_classes (int): number of classes.
        weight (:class:`torch.nn.Parameter`, optional): weight of the classifier, defaults to None.
        bias (bool, optional): If set to ``False``, the layer will not learn an additive bias, defaults to ``True``.
        dtype (:class:`torch.dtype`, optional): The dtype of parameters, defaults to None.
        weight_initializer (:class:`typing.Callable`, optional):
            The initializer of weight, defaults to kaiming uniform initializer.
        bias_initializer (:class:`typing.Callable`, optional):
            The initializer of bias, defaults to xavier uniform initializer.
        vocab_parallel_limit (int, optional): the number of classes above which a
            vocab-parallel classifier is used when a tensor parallel mode is set, defaults to 2048.

    For more details about ``initializer``, please refer to
    `init <https://github.com/hpcaitech/ColossalAI/blob/main/colossalai/nn/init.py>`_.
    """

    def __init__(self,
                 in_features: int,
                 num_classes: int,
                 weight: nn.Parameter = None,
                 bias: bool = True,
                 dtype: dtype = None,
                 weight_initializer: Callable = init.kaiming_uniform_(a=math.sqrt(5)),
                 bias_initializer: Callable = init.xavier_uniform_(a=1, scale=1),
                 vocab_parallel_limit: int = 2048) -> None:
        tensor_parallel = get_tensor_parallel_mode()
        if num_classes <= vocab_parallel_limit or tensor_parallel is None:
            layer = _parallel_classifier[tensor_parallel](
                in_features,
                num_classes,
                weight=weight,
                bias=bias,
                dtype=dtype,
                weight_initializer=weight_initializer,
                bias_initializer=bias_initializer,
            )
        else:
            layer = _vocab_parallel_classifier[tensor_parallel](
                in_features,
                num_classes,
                weight=weight,
                bias=bias,
                dtype=dtype,
                weight_initializer=weight_initializer,
                bias_initializer=bias_initializer,
            )
        super().__init__(layer)
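
The branch in ``Classifier.__init__`` encodes a simple dispatch rule: small output dimensions stay on the regular (possibly vanilla) classifier, while large vocabularies switch to the vocab-parallel variant whenever a tensor parallel mode is active. A self-contained sketch of just that rule, with hypothetical names:

def pick_classifier(num_classes, tensor_parallel, vocab_parallel_limit=2048):
    # Mirrors the condition above: fall back to the regular classifier
    # when the output is small or no tensor parallelism is configured.
    if num_classes <= vocab_parallel_limit or tensor_parallel is None:
        return 'regular'
    return 'vocab_parallel'

assert pick_classifier(1000, '1d') == 'regular'
assert pick_classifier(50257, '1d') == 'vocab_parallel'  # GPT-2-sized vocabulary
assert pick_classifier(50257, None) == 'regular'         # no tensor parallelism

The second hunk of the commit, below, adds VanillaLinear to the vanilla layer package's exports.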

from .layers import (
    DropPath,
    VanillaClassifier,
    VanillaLayerNorm,
    VanillaLinear,
    VanillaPatchEmbedding,
    WrappedDropout,
    WrappedDropPath,
)

__all__ = [
    "VanillaLayerNorm", "VanillaPatchEmbedding", "VanillaClassifier", "DropPath", "WrappedDropout", "WrappedDropPath",
    "VanillaLinear"
]
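
``VanillaLinear``, newly exported here, is the non-tensor-parallel linear that this commit extends with ``skip_bias_add``. The sketch below shows the usual contract of that flag as an assumption about the behavior, not ColossalAI's actual VanillaLinear source: when enabled, the layer returns the un-biased output together with the bias so a downstream kernel can fuse the addition.

import torch
from torch import nn
from torch.nn import functional as F

class SkipBiasLinear(nn.Linear):  # hypothetical stand-in, not VanillaLinear
    def __init__(self, in_features, out_features, bias=True, skip_bias_add=False):
        super().__init__(in_features, out_features, bias=bias)
        self.skip_bias_add = skip_bias_add

    def forward(self, x):
        if self.skip_bias_add:
            # Leave the bias to the caller, e.g. for a fused bias+activation op.
            return F.linear(x, self.weight), self.bias
        return F.linear(x, self.weight, self.bias)

layer = SkipBiasLinear(4, 8, skip_bias_add=True)
out, bias = layer(torch.randn(2, 4))  # bias is returned, not yet added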